In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from skmultilearn.adapt import MLkNN
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputClassifier


import pandas as pd
import json
import os
import re

import pickle
import nltk
from nltk.tokenize import word_tokenize

import gensim 
from gensim.models import Word2Vec
import gensim.downloader


import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('seaborn')

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

Matplotlib is building the font cache; this may take a moment.


In [2]:
rcatt_data = '../src/rcatt_training_data_original.csv'
scraped_data = '../src/training_dataset_full.csv'

In [3]:
# transform into dataframes: 

df_r = pd.read_csv(rcatt_data).reset_index(drop = True)
df_r = df_r[~df_r['Text'].duplicated()]
df_s = pd.read_csv(scraped_data).reset_index(drop = True).rename(columns={'text': 'Text'})

In [4]:
# Converting from string to list using literal_eval:

for col in ['mitre_domain', 'tech_name', 'tech_id', 'tactic_id', 'software_id']:
    df_s[col] = df_s[col].apply(literal_eval)

In [5]:
mlb = MultiLabelBinarizer()
Y_s = mlb.fit_transform(df_s['tech_id'])
Y_s = pd.DataFrame(Y_s, columns=mlb.classes_)

In [6]:
mlb = MultiLabelBinarizer()
Y_s_tactic = mlb.fit_transform(df_s['tactic_id'])
Y_s_tactic = pd.DataFrame(Y_s_tactic, columns=mlb.classes_)

In [7]:
X_r = df_r['Text']
Y_r = df_r[[col for col in df_r.columns if col.startswith('T') and col[1:].isdecimal()]]

In [8]:
Y_r_tactic = df_r[[col for col in df_r.columns if col.startswith('TA')]]

In [9]:
s_r = list(set(Y_s.columns).difference(set(Y_r.columns)))

In [10]:
r_s = list(set(Y_r.columns).difference(set(Y_s.columns)))

In [11]:
rs = list(set(Y_s.columns).intersection(set(Y_r.columns)))

In [12]:
Y1 = Y_s[rs]
Y1[r_s] = 0
Y_s = Y1[Y1.sum(axis=1)>0] 
X_s = df_s['Text']
X_s = X_s[Y1.sum(axis=1)>0] # all urls who map at least one of the techniques in Y1
Y_s_tactic = Y_s_tactic[Y1.sum(axis=1)>0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y1[r_s] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y1[r_s] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y1[r_s] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:/

In [13]:
X_r_train, X_test_text, Y_r_train, Y_test, Y_r_train_tactic, Y_test_tactic = train_test_split(X_r, Y_r, Y_r_tactic, test_size=0.3,
                                                    random_state = 10)

In [14]:
X_train_text = pd.concat([X_r_train, X_s])

In [15]:
Y_train = pd.concat([Y_r_train, Y_s]).reset_index(drop=True)

In [16]:
Y_train_tactic = pd.concat([Y_r_train_tactic, Y_s_tactic]).reset_index(drop=True)

In [17]:
Y_train_tactic = Y_train_tactic.dropna(axis=1)

In [18]:
def feature_extraction(featureExtract, X_train_text, X_test_text, average = False):
    if featureExtract in ['CountVectorizer', 'TfIdfVectorizer']:
        if featureExtract== 'CountVectorizer':
            fe = CountVectorizer(analyzer ='word', stop_words ='english', lowercase = True, min_df = 2, max_df = 0.99) # if words used less than 0.001 % --> ignore  
        else:
            fe = TfidfVectorizer(analyzer = 'word', stop_words='english', lowercase=True, min_df = 2, max_df=0.99)
        
        X_train = fe.fit_transform(X_train_text)
        X_train = pd.DataFrame(X_train.toarray(), columns = fe.get_feature_names()) 
        X_test = fe.transform(X_test_text)
        X_test = pd.DataFrame(X_test.toarray(), columns = fe.get_feature_names())
    
    else:
        model = gensim.downloader.load(featureExtract)
        # sent is tokenised sentence on which we do the embedding
        def get_embeddings(sent):
            # if text not in vocab:
            words_in_vocab = [word for word in sent if word in model]
            if not words_in_vocab:
                return np.zeros_like(model['the'])
            emb = model[words_in_vocab]
            return np.mean(emb, axis=0) if average else np.sum(emb, axis=0)
        #perform tokenisation
        X_train = pd.DataFrame(X_train_text.progress_apply(nltk.word_tokenize).progress_apply(get_embeddings).values.tolist())
        X_test = pd.DataFrame(X_test_text.progress_apply(nltk.word_tokenize).progress_apply(get_embeddings).values.tolist())
    return X_train, X_test
        
        
        

In [19]:
X_train, X_test = feature_extraction('TfIdfVectorizer', X_train_text, X_test_text)



In [None]:
# save tf-idf:

# Save as pickle: 

with open('tfi.pickle', 'wb') as handle:
    pickle.dump(sv_classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
similarities = cosine_similarity(X_train, X_test)

In [21]:
# check if articles have a cosine similarity of more than 90%
duplicates = set() 
for i in range(similarities.shape[0]):
    for j in range(similarities.shape[1]):
        if similarities[i][j] > 0.9:
            
            duplicates.add(i)

In [22]:
# remove similar articles
X_train_text = X_train_text[~X_train_text.index.isin(duplicates)]
X_train = X_train[~X_train.index.isin(duplicates)]
Y_train = Y_train[~Y_train.index.isin(duplicates)]
Y_train_tactic = Y_train_tactic[~Y_train_tactic.index.isin(duplicates)]

In [23]:
# save as pickle

with open('technique_y_train.pickle', 'wb') as handle:
    pickle.dump(Y_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [46]:
def evaluation(Y_pred, Y_test):
    macro_precision = precision_score(Y_test, Y_pred, average ='macro')
    micro_precision = precision_score(Y_test, Y_pred, average ='micro')
    macro_recall = recall_score(Y_test, Y_pred, average='macro')
    micro_recall = recall_score(Y_test, Y_pred, average='micro')
    macro_fscore = fbeta_score(Y_test, Y_pred, beta=0.5, average ='macro')
    micro_fscore = fbeta_score(Y_test, Y_pred, beta=0.5, average ='micro')
    l_metric = ['macro precision', 'micro precision', 'macro recall', 'micro recall', 'macro fscore', 'micro fscore']
    l_result = [macro_precision, micro_precision, macro_recall, micro_recall, macro_fscore, micro_fscore]
    df_res = pd.DataFrame({'metric': l_metric, 'result': l_result})
    return df_res

In [23]:
# Train and test: First delete techniques less than 9 
# We fix the random state to have the same dataset in our different tests@

sv_classifier = OneVsRestClassifier(LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, max_iter = 1000, class_weight = 'balanced', random_state=42), n_jobs = 1)
sv_classifier.fit(X_train, Y_train)


OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced', dual=False,
                                        random_state=42),
                    n_jobs=1)

In [24]:
Y_pred = pd.DataFrame(sv_classifier.predict(X_test), columns=Y_test.columns)

In [None]:
evaluation(Y_pred, Y_test)

In [25]:
# Save as pickle: 

with open('technique_model.pickle', 'wb') as handle:
    pickle.dump(sv_classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Open Tactic Performance: 

In [51]:
# Open Pickle: 

with open('tactic_model.pickle', 'rb') as handle:
    tactic_model = pickle.load(handle)

In [None]:
data = '../src/scraping/tactic_dataset.json'

In [None]:
with open(data) as file:
    tactic_data = json.load(file)

In [188]:
# Open Pickle: 

with open('tactic_train_test.pickle', 'rb') as handle:
    X_train_tactic_pickle, X_test_tactic_pickle, Y_train_tactic_pickle, Y_test_tactic_pickle = pickle.load(handle)



In [189]:
Y_test_tactic_pickle

Unnamed: 0,TA0006,TA0002,TA0040,TA0003,TA0004,TA0008,TA0005,TA0010,TA0007,TA0009,TA0011,TA0001
319,0,0,0,0,0,0,1,0,0,0,0,0
336,0,0,1,0,0,0,0,0,0,0,0,0
1077,0,1,0,1,1,0,1,0,1,0,0,0
486,0,0,0,0,1,0,1,0,0,0,0,0
371,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1477,0,0,0,1,1,0,0,0,0,0,0,0
1376,0,1,0,1,1,0,0,0,0,0,0,0
415,0,0,0,1,0,0,1,0,0,0,0,0
880,0,0,0,1,0,0,1,0,0,0,1,0


In [195]:
Y_test_pred_tactic = pd.DataFrame(tactic_model.predict(X_test_tactic_pickle), columns = Y_test_tactic.columns)

# Hanging Node:

In [None]:
# list of tactics and list of techniques:
from sklearn.preprocessing import MinMaxScaler

In [None]:
tactic_prob = pd.DataFrame(tactic_model.decision_function(X_test_tactic), columns=Y_test_tactic.columns)

In [None]:
# if scaling at (-1,1) then threshold of .5 
# scaling determines improvement 
# (0-1) performs poorly with threshold of .5 

technique_prob = pd.DataFrame(MinMaxScaler((-1, 1)).fit_transform(sv_classifier.decision_function(X_test)), columns=Y_test.columns)

In [None]:
def hanging_node(i, tactic_proba, technique_proba,tactic_threshold=0.5, technique_threshold=0.5, a = 1,b = 0, c=1, d=0):
    assert a > technique_threshold and c > technique_threshold and b < tactic_threshold and d < tactic_threshold
    pred_tactics = [tactic for tactic in tactic_proba.columns if tactic_proba.iloc[i][tactic] > tactic_threshold]
    pred_techniques = [technique for technique in technique_proba.columns if technique_proba.iloc[i][technique] > technique_threshold]
    #return pred_tactics, pred_techniques
    print((pred_tactics, pred_techniques))
    for tactic in tactic_proba.columns:
        for technique in tactic_data[tactic]['Technique_ID'][0]:
            if technique in pred_techniques and tactic not in pred_tactics:
                if technique_proba.iloc[i][technique] > a and tactic_proba.iloc[i][tactic] > b:
                    pred_tactics.append(tactic)
                elif technique_proba.iloc[i][technique] < c and tactic_proba.iloc[i][tactic] < d:
                    pred_techniques.remove(technique)
    print(pred_tactics, pred_techniques)
    print('-----------------------------')
    return pred_tactics, pred_techniques
    

In [None]:
Y_test_tactics_hanging_node = []
Y_test_techniques_hanging_node = []
for i in range(len(X_test)):
    pred_tactics, pred_techniques = hanging_node(i, tactic_prob, technique_prob,tactic_threshold=0, technique_threshold=0.5,  a = 0.55, b = -0.9, c = 0.95, d = -0.30)
    Y_test_tactics_hanging_node.append([1 if tactic in pred_tactics else 0 for tactic in Y_test_tactic.columns])
    Y_test_techniques_hanging_node.append([1 if technique in pred_techniques else 0 for technique in Y_test.columns])

Y_test_tactics_hanging_node = pd.DataFrame(Y_test_tactics_hanging_node, columns=Y_test_tactic.columns)
Y_test_techniques_hanging_node = pd.DataFrame(Y_test_techniques_hanging_node, columns=Y_test.columns)

In [200]:
def evaluation(Y_pred, Y_test):
    macro_precision = precision_score(Y_test, Y_pred, average ='macro')
    micro_precision = precision_score(Y_test, Y_pred, average ='micro')
    macro_recall = recall_score(Y_test, Y_pred, average='macro')
    micro_recall = recall_score(Y_test, Y_pred, average='micro')
    macro_fscore = fbeta_score(Y_test, Y_pred, beta=0.5, average ='macro')
    micro_fscore = fbeta_score(Y_test, Y_pred, beta=0.5, average ='micro')
    l_metric = ['macro precision', 'micro precision', 'macro recall', 'micro recall', 'macro fscore', 'micro fscore']
    l_result = [macro_precision, micro_precision, macro_recall, micro_recall, macro_fscore, micro_fscore]
    df_res = pd.DataFrame({'metric': l_metric, 'result': l_result})
    return df_res

In [None]:
evaluation(Y_test_tactics_hanging_node, Y_test_tactic)

In [None]:
evaluation(Y_test_techniques_hanging_node, Y_test)

# Train Model on Reports Tactics:

In [None]:
# see if training on only techniques improves or not
# only train techniques on reports belonging to a techniques' tactic

In [183]:
# Key as technique, value as tactic.

from collections import defaultdict
tactic_to_technique = defaultdict(list)
for tactic in tactic_data:
    for technique in tactic_data[tactic]['Technique_ID'][0]:
        if tactic in Y_train_tactic.columns:
            tactic_to_technique[technique].append(tactic)

In [184]:
# check for matching between tactic and technique
# find technique that are not in dictionary. 
# for Y_r, find report that only contains that technique and find what tactics is has 

for technique in Y_train.columns:
    if technique not in tactic_to_technique:
        index = Y_r[(Y_r.sum(axis=1)==1) & (Y_r[technique] == 1)].index[0]
        tacics = [tactic for tactic in Y_train_tactic.columns if 
                       int(Y_r_tactic.loc[index, tactic])==1]
        tactic_to_technique[technique] = tacics
    
    

In [185]:
classifiers = []
for technique in Y_train.columns:
    indexes = Y_train_tactic[Y_train_tactic.apply(lambda x: any(x[tactic] ==1 for tactic in tactic_to_technique[technique]), axis=1)].index
    X_train_technique = X_train.loc[indexes]
    Y_train_technique = Y_train.loc[indexes, technique]
    clf = LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, max_iter = 1000, class_weight = 'balanced', random_state=42)
    clf.fit(X_train_technique, Y_train_technique)
    classifiers.append(clf)
    

In [199]:
Y_pred = pd.DataFrame(0, index=X_test.index, columns=Y_test.columns)
for i, technique in enumerate(Y_test.columns):
    indexes = Y_test_pred_tactic[Y_test_pred_tactic.apply(lambda x: any(x[tactic] ==1 for tactic in tactic_to_technique[technique]), axis=1)].index
    Y_pred.loc[indexes, technique] = classifiers[i].predict(X_test.loc[indexes])

In [201]:
evaluation(Y_pred, Y_test) # performance for technique 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Unnamed: 0,metric,result
0,macro precision,0.499174
1,micro precision,0.505065
2,macro recall,0.341547
3,micro recall,0.40138
4,macro fscore,0.429969
5,micro fscore,0.480253


# Linking Model:

In [56]:
data = '../src/scraping/tactic_dataset.json'

with open(data) as file:
    tactic_data = json.load(file)

In [60]:
df_tactic = pd.DataFrame(tactic_data).transpose()


In [65]:
df_tactic['Technique_ID'] = df_tactic['Technique_ID'].apply(lambda x: x[0])

In [67]:
df_tactic.columns

Index(['Tactic_ID', 'Link', 'Tactic_Name', 'Description', 'Technique_ID'], dtype='object')

In [68]:
df_tactic = df_tactic[['Tactic_ID', 'Technique_ID']]

In [69]:
df_tactic

Unnamed: 0,Tactic_ID,Technique_ID
TA0043,TA0043,"[T1595, T1592, T1589, T1590, T1591, T1598, T15..."
TA0042,TA0042,"[T1583, T1586, T1584, T1587, T1585, T1588, T1608]"
TA0001,TA0001,"[T1189, T1190, T1133, T1200, T1566, T1091, T11..."
TA0002,TA0002,"[T1059, T1609, T1610, T1203, T1559, T1106, T10..."
TA0003,TA0003,"[T1098, T1197, T1547, T1037, T1176, T1554, T11..."
TA0004,TA0004,"[T1548, T1134, T1547, T1037, T1543, T1484, T16..."
TA0005,TA0005,"[T1548, T1134, T1197, T1612, T1622, T1140, T16..."
TA0006,TA0006,"[T1557, T1110, T1555, T1212, T1187, T1606, T10..."
TA0007,TA0007,"[T1087, T1010, T1217, T1580, T1538, T1526, T16..."
TA0008,TA0008,"[T1210, T1534, T1570, T1563, T1021, T1091, T10..."


In [None]:
# technique model
# Open Pickle: 

with open('tactic_model.pickle', 'rb') as handle:
    tactic_model = pickle.load(handle)