In [1]:
#Data Science in TSE Systems/MISP Project
#Code written by: Anshak Mallik

In [2]:
#Importing libraries
import numpy as np
import pandas as pd
# import nltk
import re


In [3]:
#Getting the dataframe from Attributes.csv
data_folder = "/home/gillard/Bureau/MISP_Analysis/Back-up/WEIS2022"
attributes = pd.read_csv(f"{data_folder}/Attributes.csv")  #dataframe

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Normalization

In [7]:
value = attributes['Value'].astype(str)

attribute_type = attributes['Attribute Type']

def normalize(tokens):
    '''
    INPUT:  - tokens: list of tokens represented as strings
    OUTPUT: - normalized_tokens: list of input tokens which have been normalized
    '''
    
    normalized_tokens = []
    
    for token, i in zip(tokens, range(len(tokens))):
        
        #Removing punctuation
        if attribute_type[i] != ('ip-dst' or 'ip-dst|port' or 'ip-src' or 'ip-src|port'):
            
            #Removing 'http[s]://' and 'www' from beginning of urls
            if (token.startswith('http')):
                token = re.sub('http[s]?://','', token)
            if (token.startswith('www')):
                token = re.sub('www', '', token)
                
            #Remove punctuation
            token = re.sub(r'[^\w\s]', '', token)
        
        #Making all tokens lower case
        token = token.lower()
        
        #Appending to list
        normalized_tokens.append(token)
    
    return normalized_tokens

0                      http://fastchem.co.id/muri/config.bin
1                         http://fastchem.co.id/muri/bot.exe
2                        http://fastchem.co.id/muri/gate.php
3                      http://fastchem.co.id/kays/config.bin
4                         http://fastchem.co.id/kays/bot.exe
                                 ...                        
9423337                                      188.120.241.150
9423338    http://audio.mesomedia.co.uk/forums/viewtopic....
9423339    http://audio.mesomedia.co.uk/factor.xpd?import...
9423340    http://audio.mesomedia.co.uk/forums/viewtopic....
9423341                                    Driveby:Angler EK
Name: Value, Length: 9423342, dtype: object


In [6]:
#Changing values to normalised values in dataframe
normalised_values = normalize(value)
attributes['Value'] = pd.Series(normalised_values)
value = attributes['Value'] 

AttributeError: 'float' object has no attribute 'startswith'

In [None]:
from collections import OrderedDict

#Creating list with all event IDs (without repetition)
event_ids = attributes['Event ID']
events = list(OrderedDict.fromkeys(event_ids))

## Inheritance

In [None]:
#Loop to see which events have number of values of tolerance
tol = 5e3
counter = 0
big_events = []
big_indices = []

for k in range(len(events)):
    #Getting relevant rows and their attribute IDs
    event_df = attributes.loc[attributes['Event ID'] == events[k]]
    n = len(event_df)
    attribute_ids = event_df['Attribute ID']
    values = np.array(event_df['Value'])
    if len(values) >= tol: 
        print('The following event has too many values (more than %d)!' % (tol))
        print('Index %d, Event ID %d, Values %d' % (k, events[k], len(values)))
        print('----------')
        counter += 1
        big_events.append(events[k])
        big_indices.append(k)
        
print('Total number of events with values over %d: %d' % (tol, counter))

In [None]:
#Filtering events before getting attribute IDs
filtered_events = [x for x in events if x not in big_events]

In [None]:
# from timeit import default_timer as timer
# from datetime import timedelta
# from difflib import SequenceMatcher
# # import jellyfish as jl

# #Creating subset matrix for example

# #Begin timer before loop
# start = timer()

# #Getting subset data
# N = 100
# attribute_ids_N = np.array(attributes['Attribute ID'][:N]) #Series
# values_N = value[:N] #After normalization

# #Empty matrix
# I = np.zeros((N, N))
# for i in range(N):
#     for j in range(N):
#         if (i >= j):
#             I[i,j] = round(SequenceMatcher(None, values_N[i], values_N[j]).ratio(), 3)
#             # I[i,j] = round(jl.jaro_similarity(values_N[i], values_N[j]), 3)
            
# #Matrix as dataframe
# df = pd.DataFrame(I, index=attribute_ids_N, columns=attribute_ids_N)
# # df.to_csv(r'Data/Inheritance_subset.csv')

# #End timer after loop    
# end = timer()
# print(timedelta(seconds=end-start))

In [None]:
from timeit import default_timer as timer
from datetime import timedelta

#Getting list of all event and attribute IDs separately
start = timer()
e, a, index = [], [], []
for k in range(len(filtered_events)):
    
    #Filtering using events
    df = attributes.loc[attributes['Event ID'] == filtered_events[k]]
    #Attribute ID
    attribute_ids = df['Attribute ID']
    
    #Indices of rows
    index_list = list(df.index)
    index += index_list
    #Appending e and a
    for l in range(len(attribute_ids)):
        e.append(filtered_events[k])
        a.append(np.array(attribute_ids)[l])
        
    #Checkpoints
    if k%1000 == 0:
        print('%d iterations done in: ' % (k))
        print(timedelta(seconds=timer()-start))
        print('----------')
print(timedelta(seconds=timer()-start))

In [None]:
from difflib import SequenceMatcher
#Creating matrix

#Getting the data
N = len(index)
labels = [(ev, at) for ev, at in zip(e, a)]
values = value[index]

#Begin timer before loop
start = timer()

#Empty matrix
I = np.zeros((N, N))    
for i in range(N):
    for j in range(N):
        if (i >= j):
            I[i,j] = round(SequenceMatcher(None, values[i], values[j]).ratio(), 3)
    #Checkpoints
    if i%1000 == 0:
        print('%d iterations done in: ' % (k))
        print(timedelta(seconds=timer()-start))
        print('----------')
        
#Matrix as dataframe
print('-------------------- \n%Saving to .csv%')
df = pd.DataFrame(I, index=labels, columns=labels)
df.to_csv(r'Data/Inheritance.csv')

#End timer after loop    
end = timer()
print(timedelta(seconds=end-start))