In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from skmultilearn.adapt import MLkNN
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputClassifier


import pandas as pd
import json
import os
import re

import pickle
import nltk
from nltk.tokenize import word_tokenize

import gensim 
from gensim.models import Word2Vec
import gensim.downloader


import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('seaborn')

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
nltk.download('punkt')
tqdm.pandas()

[nltk_data] Downloading package punkt to /homes/lgf21/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Open Pickle without lemmatisation: 

with open('merged_data_no_duplicates.pickle', 'rb') as handle:
    (X_train_text, X_test_text, Y_train_tactic, Y_test_tactic, Y_train, Y_test) = pickle.load(handle)

In [4]:
X_train_text

0       Exploit Public-Facing Application - Enterprise...
1       Emergency Incident ResponseReport a Confirmed ...
2       Pass the Hash - Enterprise | MITRE ATT&CK\xe2\...
3       Extra Window Memory Injection - Enterprise | M...
4       Tropic Trooper Targets Taiwanese Government an...
                              ...                        
2142     Molerats Delivers  MALWARE_NAME  Backdoor to ...
2143     Transparent Tribe  Evolution analysis  part  ...
2144     WWW FIDELISSECURITY COM  Fidelis Cybersecurit...
2145     OilRig uses  MALWARE_NAME  IIS Backdoor on Ta...
2146     The OilRig Campaign  Attacks on Saudi Arabian...
Name: Text, Length: 2072, dtype: object

# Feature Extraction:

In [5]:
# call methods for feature extraction and evaluation:
import sys
sys.path.append('../src')

from methods import feature_extraction, evaluation

In [6]:
X_train, X_test = feature_extraction('TfIdfVectorizer', X_train_text, X_test_text)



In [7]:
# Train and test: First delete techniques less than 9 
# We fix the random state to have the same dataset in our different tests@

sv_classifier = OneVsRestClassifier(LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, max_iter = 1000, class_weight = 'balanced', random_state=42), n_jobs = 1)
sv_classifier.fit(X_train, Y_train)


OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced', dual=False,
                                        random_state=42),
                    n_jobs=1)

In [8]:
Y_pred = pd.DataFrame(sv_classifier.predict(X_test), columns=Y_test.columns)

In [None]:
evaluation(Y_pred, Y_test)

In [None]:
# Save as pickle technique performance: 

#with open('postprocessing.pickle', 'wb') as handle:
 #   pickle.dump(sv_classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Open Tactic Performance: 

In [9]:
# Open Pickle: 

with open('tactic_MLP_TFIDF_model.pickle', 'rb') as handle:
    tactic_model = pickle.load(handle)

In [10]:
data = '../src/scraping/scraped_data/tactic_dataset.json'

In [11]:
with open(data) as file:
    tactic_data = json.load(file)

In [12]:
Y_test_pred_tactic = pd.DataFrame(tactic_model.predict(X_test), columns = Y_test_tactic.columns)

# Hanging Node:

In [13]:
# list of tactics and list of techniques:
from sklearn.preprocessing import MinMaxScaler

In [15]:
tactic_prob = pd.DataFrame(tactic_model(X_test), columns=Y_test_tactic.columns)

TypeError: 'MLPClassifier' object is not callable

In [16]:
# if scaling at (-1,1) then threshold of .5 
# scaling determines improvement 
# (0-1) performs poorly with threshold of .5 

technique_prob = pd.DataFrame(MinMaxScaler((-1, 1)).fit_transform(sv_classifier.decision_function(X_test)), columns=Y_test.columns)

In [18]:
def hanging_node(i, tactic_proba, technique_proba,tactic_threshold=0.5, technique_threshold=0.5, a = 1,b = 0, c=1, d=0):
    assert a > technique_threshold and c > technique_threshold and b < tactic_threshold and d < tactic_threshold
    pred_tactics = [tactic for tactic in tactic_proba.columns if tactic_proba.iloc[i][tactic] > tactic_threshold]
    pred_techniques = [technique for technique in technique_proba.columns if technique_proba.iloc[i][technique] > technique_threshold]
    #return pred_tactics, pred_techniques
    print((pred_tactics, pred_techniques))
    for tactic in tactic_proba.columns:
        for technique in tactic_data[tactic]['Technique_ID'][0]:
            if technique in pred_techniques and tactic not in pred_tactics:
                if technique_proba.iloc[i][technique] > a and tactic_proba.iloc[i][tactic] > b:
                    pred_tactics.append(tactic)
                elif technique_proba.iloc[i][technique] < c and tactic_proba.iloc[i][tactic] < d:
                    pred_techniques.remove(technique)
    print(pred_tactics, pred_techniques)
    print('-----------------------------')
    return pred_tactics, pred_techniques
    

In [19]:
Y_test_tactics_hanging_node = []
Y_test_techniques_hanging_node = []
for i in range(len(X_test)):
    pred_tactics, pred_techniques = hanging_node(i, tactic_prob, technique_prob,tactic_threshold=0, technique_threshold=0.5,  a = 0.55, b = -0.9, c = 0.95, d = -0.30)
    Y_test_tactics_hanging_node.append([1 if tactic in pred_tactics else 0 for tactic in Y_test_tactic.columns])
    Y_test_techniques_hanging_node.append([1 if technique in pred_techniques else 0 for technique in Y_test.columns])

Y_test_tactics_hanging_node = pd.DataFrame(Y_test_tactics_hanging_node, columns=Y_test_tactic.columns)
Y_test_techniques_hanging_node = pd.DataFrame(Y_test_techniques_hanging_node, columns=Y_test.columns)

NameError: name 'tactic_prob' is not defined

In [None]:
evaluation(Y_test_tactics_hanging_node, Y_test_tactic)

In [None]:
evaluation(Y_test_techniques_hanging_node, Y_test)

# Train Model on Reports Tactics:

In [None]:
# see if training on only techniques improves or not
# only train techniques on reports belonging to a techniques' tactic

In [None]:
# Key as technique, value as tactic.

from collections import defaultdict
tactic_to_technique = defaultdict(list)
for tactic in tactic_data:
    for technique in tactic_data[tactic]['Technique_ID'][0]:
        if tactic in Y_train_tactic.columns:
            tactic_to_technique[technique].append(tactic)

In [None]:
Y = pd.concat([Y_train, Y_test]).reset_index(drop=True)
Y_tactic = pd.concat([Y_train_tactic, Y_test_tactic]).reset_index(drop=True)

In [None]:
# check for matching between tactic and technique
# find technique that are not in dictionary. 
# for Y_r, find report that only contains that technique and find what tactics is has 

for technique in Y.columns:
    if technique not in tactic_to_technique:
        index = Y[(Y.sum(axis=1)==1) & (Y[technique] == 1)].index[-1]
        tactics = [tactic for tactic in Y_tactic.columns if 
                       int(Y_tactic.loc[index, tactic])==1]
        if len(tactics) == 0:
            print(technique)
            print(Y[(Y.sum(axis=1)==1) & (Y[technique] == 1)].index)
        tactic_to_technique[technique] = tactics
    
    

In [None]:
Y_train_tactic[Y_train_tactic.sum(axis=1)==0]

In [None]:
X_train_text.iloc[977]

In [None]:
Y_train = Y_train.reset_index(drop=True)
Y_train_tactic = Y_train_tactic.reset_index(drop=True)


In [None]:
Y_train_tactic

In [None]:
X_train

In [None]:
Y_train[Y_train['T1108']==1]

In [None]:
tactic_to_technique["T1108"]

In [None]:
classifiers = []
for technique in Y_train.columns:
    indexes = Y_train_tactic[Y_train_tactic.apply(lambda x: any(x[tactic] ==1 for tactic in tactic_to_technique[technique]), axis=1)].index
    X_train_technique = X_train.loc[indexes]
    Y_train_technique = Y_train.loc[indexes, technique]
    clf = LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, max_iter = 1000, class_weight = 'balanced', random_state=42)
    clf.fit(X_train_technique, Y_train_technique)
    classifiers.append(clf)
    

In [None]:
Y_pred = pd.DataFrame(0, index=X_test.index, columns=Y_test.columns)
for i, technique in enumerate(Y_test.columns):
    indexes = Y_test_pred_tactic[Y_test_pred_tactic.apply(lambda x: any(x[tactic] ==1 for tactic in tactic_to_technique[technique]), axis=1)].index
    Y_pred.loc[indexes, technique] = classifiers[i].predict(X_test.loc[indexes])

In [None]:
evaluation(Y_pred, Y_test) # performance for technique 

# Linking Model:

In [None]:
data = '../src/scraping/tactic_dataset.json'

with open(data) as file:
    tactic_data = json.load(file)

In [None]:
df_tactic = pd.DataFrame(tactic_data).transpose()


In [None]:
df_tactic['Technique_ID'] = df_tactic['Technique_ID'].apply(lambda x: x[0])

In [None]:
df_tactic.columns

In [None]:
df_tactic = df_tactic[['Tactic_ID', 'Technique_ID']]

In [None]:
df_tactic

In [None]:
# technique model
# Open Pickle: 

with open('tactic_model.pickle', 'rb') as handle:
    tactic_model = pickle.load(handle)