In [2]:
#Data Science in TSE Systems/MISP Project
#Code written by: Anshak Mallik

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
# import nltk
import re

In [2]:
#Getting the dataframe from Attributes.csv
data_folder = "Data"
attributes = pd.read_csv(f"{data_folder}/Attributes.csv")  #dataframe

## Normalization

In [3]:
value = attributes['Value']
attribute_type = attributes['Attribute Type']

def normalize(tokens):
    '''
    INPUT:  - tokens: list of tokens represented as strings
    OUTPUT: - normalized_tokens: list of input tokens which have been normalized
    '''
    
    normalized_tokens = []
    
    for token, i in zip(tokens, range(len(tokens))):
        
        #Removing punctuation
        if attribute_type[i] != ('ip-dst' or 'ip-dst|port' or 'ip-src' or 'ip-src|port'):
            
            #Removing 'http[s]://' and 'www' from beginning of urls
            if (token.startswith('http')):
                token = re.sub('http[s]?://','', token)
            if (token.startswith('www')):
                token = re.sub('www', '', token)
                
            #Remove punctuation
            token = re.sub(r'[^\w\s]', '', token)
        
        #Making all tokens lower case
        token = token.lower()
        
        #Appending to list
        normalized_tokens.append(token)
    
    return normalized_tokens

In [4]:
#Changing values to normalised values in dataframe
normalised_values = normalize(value)
attributes['Value'] = pd.Series(normalised_values)
value = attributes['Value'] 

In [5]:
from collections import OrderedDict

#Creating list with all event IDs (without repetition)
event_ids = attributes['Event ID']
events = list(OrderedDict.fromkeys(event_ids))

## Inheritance

In [6]:
#Loop to see which events have number of values of tolerance
tol = 5e3
counter = 0
big_events = []
big_indices = []

for k in range(len(events)):
    #Getting relevant rows and their attribute IDs
    event_df = attributes.loc[attributes['Event ID'] == events[k]]
    n = len(event_df)
    attribute_ids = event_df['Attribute ID']
    values = np.array(event_df['Value'])
    if len(values) >= tol: 
        print('The following event has too many values (more than %d)!' % (tol))
        print('Index %d, Event ID %d, Values %d' % (k, events[k], len(values)))
        print('----------')
        counter += 1
        big_events.append(events[k])
        big_indices.append(k)
        
print('Total number of events with values over %d: %d' % (tol, counter))

The following event has too many values (more than 5000)!
Index 218, Event ID 79819, Values 29641
----------
The following event has too many values (more than 5000)!
Index 335, Event ID 79837, Values 30752
----------
The following event has too many values (more than 5000)!
Index 602, Event ID 62222, Values 41630
----------
The following event has too many values (more than 5000)!
Index 2534, Event ID 86173, Values 7814
----------
The following event has too many values (more than 5000)!
Index 2721, Event ID 14027, Values 33457
----------
The following event has too many values (more than 5000)!
Index 2939, Event ID 14168, Values 41542
----------
The following event has too many values (more than 5000)!
Index 3045, Event ID 14223, Values 40520
----------
The following event has too many values (more than 5000)!
Index 3394, Event ID 62248, Values 34623
----------
The following event has too many values (more than 5000)!
Index 4333, Event ID 6271, Values 5832
----------
The following ev

In [7]:
#Filtering events before getting attribute IDs
filtered_events = [x for x in events if x not in big_events]

In [8]:
# from timeit import default_timer as timer
# from datetime import timedelta
# from difflib import SequenceMatcher
# # import jellyfish as jl

# #Creating subset matrix for example

# #Begin timer before loop
# start = timer()

# #Getting subset data
# N = 100
# attribute_ids_N = np.array(attributes['Attribute ID'][:N]) #Series
# values_N = value[:N] #After normalization

# #Empty matrix
# I = np.zeros((N, N))
# for i in range(N):
#     for j in range(N):
#         if (i >= j):
#             I[i,j] = round(SequenceMatcher(None, values_N[i], values_N[j]).ratio(), 3)
#             # I[i,j] = round(jl.jaro_similarity(values_N[i], values_N[j]), 3)
            
# #Matrix as dataframe
# df = pd.DataFrame(I, index=attribute_ids_N, columns=attribute_ids_N)
# # df.to_csv(r'Data/Inheritance_subset.csv')

# #End timer after loop    
# end = timer()
# print(timedelta(seconds=end-start))

In [9]:
from timeit import default_timer as timer
from datetime import timedelta

#Getting list of all event and attribute IDs separately
start = timer()
e, a, index = [], [], []
for k in range(len(filtered_events)):
    
    #Filtering using events
    df = attributes.loc[attributes['Event ID'] == filtered_events[k]]
    #Attribute ID
    attribute_ids = df['Attribute ID']
    
    #Indices of rows
    index_list = list(df.index)
    index += index_list
    #Appending e and a
    for l in range(len(attribute_ids)):
        e.append(filtered_events[k])
        a.append(np.array(attribute_ids)[l])
        
    #Checkpoints
    if k%1000 == 0:
        print('%d iterations done in: ' % (k))
        print(timedelta(seconds=timer()-start))
        print('----------')
print(timedelta(seconds=timer()-start))

0 iterations done in: 
0:00:00.003724
----------
1000 iterations done in: 
0:00:02.614534
----------
2000 iterations done in: 
0:00:05.670118
----------
3000 iterations done in: 
0:00:07.702613
----------
4000 iterations done in: 
0:00:11.083422
----------
5000 iterations done in: 
0:00:13.636675
----------
6000 iterations done in: 
0:00:16.747232
----------
7000 iterations done in: 
0:00:19.084246
----------
8000 iterations done in: 
0:00:22.750406
----------
0:00:23.320976


In [12]:
pip install dask

Collecting dask
  Downloading dask-2022.5.0-py3-none-any.whl (1.1 MB)
Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
Collecting toolz>=0.8.2
  Downloading toolz-0.11.2-py3-none-any.whl (55 kB)
Collecting locket
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: toolz, locket, partd, fsspec, dask
Successfully installed dask-2022.5.0 fsspec-2022.3.0 locket-1.0.0 partd-1.2.0 toolz-0.11.2
Note: you may need to restart the kernel to use updated packages.


In [10]:
from difflib import SequenceMatcher
#Creating matrix

#Getting the data
N = len(index)
labels = [(ev, at) for ev, at in zip(e, a)]
values = value[index]

#Begin timer before loop
start = timer()

#Empty matrix
I = np.zeros((N, N))    
for i in range(N):
    for j in range(N):
        if (i >= j):
            I[i,j] = round(SequenceMatcher(None, values[i], values[j]).ratio(), 3)
    #Checkpoints
    if i%1000 == 0:
        print('%d iterations done in: ' % (k))
        print(timedelta(seconds=timer()-start))
        print('----------')
        
#Matrix as dataframe
print('-------------------- \n%Saving to .csv%')
df = pd.DataFrame(I, index=labels, columns=labels)
df.to_csv(r'Data/Inheritance.csv')

#End timer after loop    
end = timer()
print(timedelta(seconds=end-start))

MemoryError: Unable to allocate 683. GiB for an array with shape (302875, 302875) and data type float64