In [2]:
#Data Science in TSE Systems/MISP Project
#Code written by: Anshak Mallik

In [3]:
#Importing libraries
import numpy as np
import pandas as pd
import nltk
import re, string

In [None]:
#Load Attributes.csv from the data folder into a dataframe
data_folder = "Data"
attributes = pd.read_csv(data_folder + "/Attributes.csv")
# attributes = attributes[attributes["Orgc ID"] != 1203]
column_names = attributes.columns  #column labels of the dataframe

## Normalization

In [None]:
#Columns used by the normalization step: the raw values and their attribute types
value, attribute_type = attributes['Value'], attributes['Attribute Type']

def normalize(tokens, attribute_types=None):
    '''
    Normalizes attribute values so that they can be compared as plain strings.

    Non-IP tokens lose a leading 'http[s]://', a leading 'www' and all
    punctuation. IP tokens keep their punctuation. Every token is lower-cased.

    INPUT:  - tokens: list of tokens represented as strings
            - attribute_types: optional sequence with the attribute type of each
              token, paired with tokens by position. When omitted, the
              module-level `attribute_type` column is used.
    OUTPUT: - normalized_tokens: list of input tokens which have been normalized
    RAISES: - ValueError: if there are fewer attribute types than tokens
    '''

    #Attribute types whose punctuation must survive normalization
    ip_types = ('ip-dst', 'ip-dst|port', 'ip-src', 'ip-src|port')

    if attribute_types is None:
        attribute_types = attribute_type

    if len(attribute_types) < len(tokens):
        raise ValueError('normalize() needs one attribute type per token')

    normalized_tokens = []

    #zip pairs by position, so a non-default dataframe index cannot mismatch rows
    for token, kind in zip(tokens, attribute_types):

        #A chained `!= ('a' or 'b' ...)` only ever compares against 'a';
        #a membership test checks every IP type
        if kind not in ip_types:

            #Removing 'http[s]://' and 'www' from beginning of urls only
            #(anchored with ^ so occurrences inside the token are kept)
            token = re.sub(r'^https?://', '', token)
            token = re.sub(r'^www', '', token)

            #Remove punctuation
            token = re.sub(r'[^\w\s]', '', token)

        #Making all tokens lower case and appending to list
        normalized_tokens.append(token.lower())

    return normalized_tokens

In [None]:
#Changing values to normalised values in dataframe
normalised_values = normalize(value)
#Build the Series on the dataframe's own index: column assignment aligns on labels,
#so a Series with a fresh 0..n-1 index would produce NaN for any row whose label differs
attributes['Value'] = pd.Series(normalised_values, index=attributes.index)
value = attributes['Value']

In [None]:
from collections import OrderedDict

#Distinct event IDs, listed in order of first appearance
event_ids = attributes['Event ID']
events = [*OrderedDict.fromkeys(event_ids)]

## Inheritance

In [None]:
# from difflib import SequenceMatcher

# #Working with first event ID

# #Getting relevant rows and their attribute IDs
# event_df = attributes.loc[attributes['Event ID'] == events[0]]
# n = len(event_df)
# attribute_ids = event_df['Attribute ID']
# values = event_df['Value']

# #Creating inheritance matrix
# I = np.zeros((n,n))
# for i in range(n):
#     for j in range(n):
#         if (i >= j):
#             I[i,j] = SequenceMatcher(None, values[i], values[j]).ratio()


In [None]:
from difflib import SequenceMatcher

#Position in `events` of the event to process; only events[m:m+1] is handled for now
m = 1
#Empty list to append with (Event ID, inheritance matrix)
event_inheritances = []

#Generalising for all event IDs
#enumerate(..., start=m) keeps k equal to the event's real position in `events`;
#iterating over range(len(slice)) would restart at 0 and process the wrong event
for k, event_id in enumerate(events[m:m+1], start=m):

    #Getting relevant rows and their attribute IDs
    event_df = attributes.loc[attributes['Event ID'] == event_id]
    n = len(event_df)
    attribute_ids = event_df['Attribute ID']
    values = event_df['Value']

    #Plain list for positional access: `values` keeps the row labels of
    #`attributes`, so values[i] would look up label i rather than the i-th row
    value_list = values.tolist()

    #Creating inheritance matrix: similarity ratio of every pair of values,
    #filled for the lower triangle and diagonal only (upper triangle stays 0)
    I = np.zeros((n,n))
    for i in range(n):
        for j in range(i + 1):
            I[i,j] = SequenceMatcher(None, value_list[i], value_list[j]).ratio()

    #Appending to list
    event_inheritances.append((event_id, I))
    
# print(event_inheritances)
