In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from skmultilearn.adapt import MLkNN
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
     
from nltk.stem.snowball import SnowballStemmer

import pandas as pd
import json
import os
import re

import gensim 
from gensim.models import Word2Vec
import gensim.downloader

import pickle 

import nltk
from nltk.tokenize import word_tokenize

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
# Locations of the two input CSVs, relative to the notebook's working directory:
# the rcATT training set and the scraped training set.
rcatt_data, scraped_data = (
    '../data/rcatt_training_data_original.csv',
    '../data/training_dataset_full.csv',
)

In [3]:
# Load both corpora as DataFrames.

# rcATT reports: keep only the first row for each distinct text.
df_r = (
    pd.read_csv(rcatt_data)
    .reset_index(drop=True)
    .drop_duplicates(subset='Text', keep='first')
)

# Scraped pages: rename the text column so it matches the rcATT frame.
df_s = (
    pd.read_csv(scraped_data)
    .reset_index(drop=True)
    .rename(columns={'text': 'Text'})
)

In [4]:
# These columns are read from the CSV as strings; parse each cell back into the
# Python literal it spells out.
list_valued_columns = ('mitre_domain', 'tech_name', 'tech_id', 'tactic_id', 'software_id')
for col in list_valued_columns:
    df_s[col] = df_s[col].map(literal_eval)

In [5]:
# One-hot encode the technique ids of the scraped data: one column per id,
# one row per document.
mlb = MultiLabelBinarizer()
technique_matrix = mlb.fit_transform(df_s['tech_id'])
Y_s = pd.DataFrame(data=technique_matrix, columns=mlb.classes_)

In [6]:
# One-hot encode the tactic ids of the scraped data. Note that `mlb` is rebound
# here, so the binarizer fitted on the technique ids is no longer reachable.
mlb = MultiLabelBinarizer()
tactic_matrix = mlb.fit_transform(df_s['tactic_id'])
Y_s_tactic = pd.DataFrame(data=tactic_matrix, columns=mlb.classes_)

In [7]:
# Technique label columns are named "T" followed only by decimal digits, which
# excludes both the "Text" column and the "TA..." tactic columns.
technique_columns = [
    name
    for name in df_r.columns
    if name.startswith('T') and name[1:].isdecimal()
]
Y_r = df_r[technique_columns]
X_r = df_r['Text']

In [8]:
# Tactic labels are the columns whose name starts with "TA".
Y_r_tactic = df_r[[col for col in df_r.columns if col.startswith('TA')]]

# Compare the technique vocabularies of the two datasets. The results are sorted
# so the column lists are identical on every run: iteration order of a set of
# strings changes between interpreter sessions because of hash randomisation.
scraped_techniques = set(Y_s.columns)
rcatt_techniques = set(Y_r.columns)
s_r = sorted(scraped_techniques - rcatt_techniques)  # only in the scraped data
r_s = sorted(rcatt_techniques - scraped_techniques)  # only in the rcATT data
rs = sorted(scraped_techniques & rcatt_techniques)   # present in both

In [9]:
# Project the scraped technique labels onto the rcATT technique set: keep the
# shared columns (`rs`) and add the rcATT-only ones (`r_s`) as all-zero columns.
# reindex builds a new frame in one step, so nothing is assigned into a slice of
# Y_s (the previous `Y1[r_s] = 0` raised SettingWithCopyWarning per column).
Y1 = Y_s.reindex(columns=list(rs) + list(r_s), fill_value=0)

# Keep only documents that map to at least one of the techniques in Y1.
# The mask is computed once and reused so all three objects stay row-aligned.
has_technique = Y1.sum(axis=1) > 0
Y_s = Y1[has_technique]
X_s = df_s['Text'][has_technique]
Y_s_tactic = Y_s_tactic[has_technique]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y1[r_s] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y1[r_s] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y1[r_s] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:/

In [10]:
# Hold out 30% of the rcATT rows as the test set. Texts, technique labels and
# tactic labels go through one call so their rows are split identically.
(
    X_r_train, X_test_text,
    Y_r_train, Y_test,
    Y_r_train_tactic, Y_test_tactic,
) = train_test_split(
    X_r,
    Y_r,
    Y_r_tactic,
    test_size=0.3,
    random_state=10,
)

In [11]:
# Training texts: rcATT training split followed by the scraped texts, renumbered 0..n-1.
X_train_text = pd.concat([X_r_train, X_s], ignore_index=True)

In [12]:
# Technique labels stacked in the same order as the training texts, renumbered 0..n-1.
Y_train = pd.concat([Y_r_train, Y_s], ignore_index=True)

In [13]:
# Tactic labels stacked in the same order as the training texts, renumbered 0..n-1.
Y_train_tactic = pd.concat([Y_r_train_tactic, Y_s_tactic], ignore_index=True)

In [14]:
# Keep only the tactic columns that have no missing value in any row.
# (A column present in only one of the concatenated frames is NaN for the
# other frame's rows.)
Y_train_tactic = Y_train_tactic.loc[:, Y_train_tactic.notna().all(axis=0)]

In [15]:
def feature_extraction(featureExtract, X_train_text, X_test_text, average = False, embedding_type = None):
    """Convert raw train/test texts into numeric feature DataFrames.

    Parameters
    ----------
    featureExtract : str
        'CountVectorizer' or 'TfIdfVectorizer' (exact spelling) selects a
        bag-of-words vectoriser that is fitted on the training texts only.
        Any other value selects the word-embedding path.
    X_train_text, X_test_text : pandas.Series
        The documents to featurise.
    average : bool, default False
        Embedding path only: average a document's word vectors when True,
        sum them when False.
    embedding_type : object, optional
        Embedding path only: the word-vector model. It must support
        ``word in model`` and ``model[word]`` / ``model[list_of_words]``
        (presumably gensim KeyedVectors -- confirm against the caller).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(X_train, X_test)``. Both frames get a fresh 0..n-1 index; the index
        of the input Series is not carried over.

    Raises
    ------
    ValueError
        If the embedding path is selected and ``embedding_type`` is None.
    """
    if featureExtract in ('CountVectorizer', 'TfIdfVectorizer'):
        vectorizer_cls = CountVectorizer if featureExtract == 'CountVectorizer' else TfidfVectorizer
        # Ignore English stop words, terms that occur in fewer than 2 documents
        # (min_df) and terms that occur in more than 99% of documents (max_df).
        fe = vectorizer_cls(analyzer='word', stop_words='english', lowercase=True, min_df=2, max_df=0.99)

        train_matrix = fe.fit_transform(X_train_text)
        test_matrix = fe.transform(X_test_text)

        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
        # 1.2; prefer the replacement and fall back for older installations.
        if hasattr(fe, 'get_feature_names_out'):
            feature_names = fe.get_feature_names_out()
        else:
            feature_names = fe.get_feature_names()

        X_train = pd.DataFrame(train_matrix.toarray(), columns=feature_names)
        X_test = pd.DataFrame(test_matrix.toarray(), columns=feature_names)
        return X_train, X_test

    if embedding_type is None:
        raise ValueError("Missing embedding method")
    model = embedding_type

    # Series.progress_apply only exists once tqdm has been registered with
    # pandas; registering again on later calls is harmless.
    tqdm.pandas()

    def get_embeddings(sent):
        """Combine the vectors of the in-vocabulary tokens of one tokenised text."""
        words_in_vocab = [word for word in sent if word in model]
        if not words_in_vocab:
            # No known token: return a zero vector shaped like a real one.
            # NOTE(review): this relies on 'the' being in the model's vocabulary.
            return np.zeros_like(model['the'])
        emb = model[words_in_vocab]
        return np.mean(emb, axis=0) if average else np.sum(emb, axis=0)

    def embed(texts):
        """Tokenise every text and stack the per-document vectors into a frame."""
        vectors = texts.progress_apply(nltk.word_tokenize).progress_apply(get_embeddings)
        return pd.DataFrame(vectors.values.tolist())

    X_train = embed(X_train_text)
    X_test = embed(X_test_text)
    return X_train, X_test
        
        
        

In [16]:
# TF-IDF features for the merged training texts and the held-out test texts.
X_train, X_test = feature_extraction(
    featureExtract='TfIdfVectorizer',
    X_train_text=X_train_text,
    X_test_text=X_test_text,
)



In [17]:
# Pairwise cosine similarity: rows follow X_train, columns follow X_test.
similarities = cosine_similarity(X=X_train, Y=X_test)

In [18]:
# Flag every training row whose cosine similarity to at least one test row is
# above the threshold. `duplicates` holds the row positions in `similarities`
# (i.e. in X_train) as plain Python ints.
SIMILARITY_THRESHOLD = 0.9
near_duplicate_rows = (similarities > SIMILARITY_THRESHOLD).any(axis=1)
duplicates = set(np.flatnonzero(near_duplicate_rows).tolist())

In [19]:
def _without_flagged_rows(obj):
    """Return `obj` without the rows whose index label is in `duplicates`."""
    keep = ~obj.index.isin(duplicates)
    return obj[keep]


# Remove the flagged rows from the texts, the features and both label frames.
X_train_text, X_train, Y_train, Y_train_tactic = (
    _without_flagged_rows(obj)
    for obj in (X_train_text, X_train, Y_train, Y_train_tactic)
)

In [20]:
# Persist the raw-text splits and their labels (no feature extraction applied)
# as a single tuple in one pickle file.
merged_splits = (X_train_text, X_test_text, Y_train_tactic, Y_test_tactic, Y_train, Y_test)
with open('merged_data_no_duplicates.pickle', 'wb') as pickle_file:
    pickle.dump(merged_splits, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
# Inspect the training texts that remain after dropping the flagged rows.
X_train_text

0       Exploit Public-Facing Application - Enterprise...
1       Emergency Incident ResponseReport a Confirmed ...
2       Pass the Hash - Enterprise | MITRE ATT&CK\xe2\...
3       Extra Window Memory Injection - Enterprise | M...
4       Tropic Trooper Targets Taiwanese Government an...
                              ...                        
2142     Molerats Delivers  MALWARE_NAME  Backdoor to ...
2143     Transparent Tribe  Evolution analysis  part  ...
2144     WWW FIDELISSECURITY COM  Fidelis Cybersecurit...
2145     OilRig uses  MALWARE_NAME  IIS Backdoor on Ta...
2146     The OilRig Campaign  Attacks on Saudi Arabian...
Name: Text, Length: 2072, dtype: object

In [22]:
# Inspect the tactic labels of the remaining training rows.
Y_train_tactic

Unnamed: 0,TA0006,TA0002,TA0040,TA0003,TA0004,TA0008,TA0005,TA0010,TA0007,TA0009,TA0011,TA0001
0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,0,0
4,0,1,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2142,0,1,0,0,0,0,1,1,1,0,1,0
2143,1,1,0,1,1,1,1,0,1,1,1,1
2144,0,1,0,1,1,0,1,0,0,0,1,0
2145,0,1,0,1,0,0,1,0,1,1,1,0


In [23]:
# Inspect the technique labels of the remaining training rows.
Y_train

Unnamed: 0,T1066,T1047,T1156,T1113,T1067,T1037,T1033,T1003,T1129,T1492,...,T1124,T1035,T1086,T1490,T1216,T1094,T1043,T1211,T1127,T1077
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2142,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2143,0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2145,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Inspect the held-out test texts produced by train_test_split.
X_test_text

319     Duqu - Wikipedia Duqu From Wikipedia, the free...
336     Disk Content Wipe - Enterprise | MITRE ATT&CK\...
1077    Skip to main contentSkip to footerSkip to main...
486     Access Token Manipulation - Enterprise | MITRE...
371     SSH Agent Hijacking - Clockwork Skip to Conten...
                              ...                        
1477    Services | Microsoft Docs Skip to main content...
1376    Leo Loobeek op Twitter: "Subtle change I just ...
415     BITS Jobs - Enterprise | MITRE ATT&CK\xe2\x84\...
880     Retail Technology Life Sciences & Healthcare T...
1201    Net time | Microsoft Docs Skip to main content...
Name: Text, Length: 441, dtype: object

In [26]:
# Inspect the tactic labels of the held-out test rows.
Y_test_tactic

Unnamed: 0,TA0006,TA0002,TA0040,TA0003,TA0004,TA0008,TA0005,TA0010,TA0007,TA0009,TA0011,TA0001
319,0,0,0,0,0,0,1,0,0,0,0,0
336,0,0,1,0,0,0,0,0,0,0,0,0
1077,0,1,0,1,1,0,1,0,1,0,0,0
486,0,0,0,0,1,0,1,0,0,0,0,0
371,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1477,0,0,0,1,1,0,0,0,0,0,0,0
1376,0,1,0,1,1,0,0,0,0,0,0,0
415,0,0,0,1,0,0,1,0,0,0,0,0
880,0,0,0,1,0,0,1,0,0,0,1,0


In [27]:
# Inspect the technique labels of the held-out test rows.
Y_test

Unnamed: 0,T1066,T1047,T1156,T1113,T1067,T1037,T1033,T1003,T1129,T1492,...,T1124,T1035,T1086,T1490,T1216,T1094,T1043,T1211,T1127,T1077
319,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1077,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
486,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
371,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1376,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
880,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
