In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from skmultilearn.adapt import MLkNN
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
     
from nltk.stem.snowball import SnowballStemmer

import pandas as pd
import json
import os
import re

import gensim 
from gensim.models import Word2Vec
import gensim.downloader

import pickle 

import nltk
from nltk.tokenize import word_tokenize

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
import flair
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

In [23]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /homes/lgf21/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
tqdm.pandas()

# Opening Files: 

In [25]:
# Load the pre-split dataset from disk: train/test texts and train/test labels.
# The last two elements of the pickled 6-tuple are discarded here.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# open pickle files that come from a trusted source.

with open('merged_data_no_duplicates.pickle', 'rb') as handle:
    (X_train_text, X_test_text, Y_train, Y_test, _, _) = pickle.load(handle)

# Pre-processing: 

In [26]:
nlp = spacy.load("en_core_web_sm", disable=['ner']) 

In [10]:
# Lemmatisation:

def lemmatise(text):
    """Return `text` with every token replaced by its spaCy lemma.

    Texts longer than ``nlp.max_length`` (1,000,000 characters by default) are
    returned unchanged, because the spaCy pipeline raises on inputs above that
    limit. Reading the limit from the pipeline keeps this guard in sync if
    ``nlp.max_length`` is ever changed.
    """
    if len(text) > nlp.max_length:
        return text
    return " ".join(token.lemma_ for token in nlp(text))


# NOTE: both series are overwritten in place, so re-running this cell
# lemmatises already-lemmatised text.
X_train_text = X_train_text.progress_apply(lemmatise)
X_test_text = X_test_text.progress_apply(lemmatise)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2072/2072 [08:35<00:00,  4.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 441/441 [01:29<00:00,  4.94it/s]


In [None]:
# Stemming: 

#stemmer = SnowballStemmer(language='english')
#df['stemmer'] = df['text'].apply(lambda x: " ".join([stemmer.stem(token) for token in x]))


# Feature Extraction:

In [15]:
import sys

In [16]:
sys.path.append('../src')

In [19]:
from feature_extraction import feature_extraction

In [8]:
pd.DataFrame([1,2,3,4]).reset_index().apply(lambda row: print(row.index),axis=1)

Index(['index', 0], dtype='object')
Index(['index', 0], dtype='object')
Index(['index', 0], dtype='object')
Index(['index', 0], dtype='object')


0    None
1    None
2    None
3    None
dtype: object

In [18]:
a = pd.Series(['1', '1', '2'], index=['a', 'b', 'c'])

In [23]:
a.get('d', 2)

2

## Count Vectorizer:

In [27]:
X_train, X_test = feature_extraction('CountVectorizer', X_train_text, X_test_text)



## TF-IDF:

In [8]:
#X_train, X_test = feature_extraction('TfIdfVectorizer', X_train_text, X_test_text)

## word2vec Google news:

In [37]:
w2v_google = gensim.downloader.load('word2vec-google-news-300')

In [16]:
#X_train, X_test = feature_extraction('embedding', X_train_text, X_test_text, embedding_type = w2v_google, weighted=True)

100%|█| 2072/2072 [00:41<00:00, 50.16it/s]
100%|█| 2072/2072 [00:22<00:00, 90.16it/s]
100%|███| 441/441 [00:06<00:00, 69.87it/s]
100%|██| 441/441 [00:02<00:00, 218.35it/s]


## Glove:

In [None]:
#glv = gensim.downloader.load('glove-wiki-gigaword-100')

In [None]:
# X_train, X_test = feature_extraction('embedding', X_train_text, X_test_text, embedding_type = glv)

## Trained word2vec:

In [30]:
w2v = Word2Vec.load("word2vec.model").wv

In [31]:
X_train, X_test = feature_extraction("embedding", X_train_text, X_test_text, embedding_type = w2v, weighted=False)

2072it [01:06, 31.35it/s] 
441it [00:08, 49.01it/s]


# Visualisation:

In [None]:
Y_train.sum(axis=0).sort_values(ascending=False).plot()

In [None]:
Y_train.sum(axis=0).sort_values(ascending=False).plot(kind='bar')

In [None]:
# t-SNE projection of the current feature matrix: one 2-D point per training
# document, computed from whatever `X_train` holds after the feature-extraction
# section (not necessarily the GloVe vectors).

X = list(X_train.values)
# Fixed seed so the projection (and therefore the plot) is reproducible.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(X)

# Y_train is a multi-label indicator frame (one column per label), so it cannot
# be assigned to a single column directly; collapse each row into one string
# listing its active labels.
label_names = Y_train.columns.astype(str)
labels = [', '.join(label_names[row]) for row in Y_train.values.astype(bool)]

# assumes X_train rows are in the same order as X_train_text — TODO confirm
assert len(X_train_text) == len(X_embedded) == len(labels)

df_embeddings = pd.DataFrame(X_embedded, columns=['x', 'y']).assign(
    label=labels,
    # short snippet only: full documents are too long for a hover tooltip
    text=X_train_text.str.slice(0, 200).values,
)

In [None]:
# Scatter plot of the 2-D projection stored in df_embeddings ('x', 'y'),
# coloured by the 'label' column.
# Only request hover columns that actually exist, so the cell works whether or
# not a 'text' column has been added to df_embeddings.
hover_columns = [col for col in ['text'] if col in df_embeddings.columns]

fig = px.scatter(
    df_embeddings, x='x', y='y',
    color='label', labels={'color': 'label'},
    hover_data=hover_columns or None,
    title='Document Embedding Visualization (t-SNE)')
fig.show()

# Evaluation:

In [27]:
def evaluation(Y_pred, Y_test, beta=0.5):
    """Summarise predictions with macro/micro precision, recall and F-beta.

    Note the argument order: predictions first, ground truth second (the
    scikit-learn scorers are called with the ground truth first internally).

    Parameters
    ----------
    Y_pred : predicted labels, in any form accepted by the sklearn scorers.
    Y_test : true labels, same layout as `Y_pred`.
    beta : weight of recall in the F-beta score. Defaults to 0.5, the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame with columns 'metric' and 'result' and six rows, in the
    order: macro/micro precision, macro/micro recall, macro/micro fscore.
    """
    scorers = [
        ('precision', precision_score, {}),
        ('recall', recall_score, {}),
        ('fscore', fbeta_score, {'beta': beta}),
    ]
    rows = [
        {
            'metric': average + ' ' + name,
            'result': scorer(Y_test, Y_pred, average=average, **extra),
        }
        for name, scorer, extra in scorers
        for average in ('macro', 'micro')
    ]
    return pd.DataFrame(rows, columns=['metric', 'result'])

## Naive Bayes:

In [None]:
# One-vs-rest Multinomial Naive Bayes: one binary classifier per label column of Y_train.
# NOTE(review): MultinomialNB rejects negative feature values, so this presumably needs
# X_train to hold the CountVectorizer / TF-IDF features rather than word embeddings — confirm
# which feature-extraction cell was run last before executing this one.
naive_bayes_classifier = OneVsRestClassifier(MultinomialNB())
naive_bayes_classifier.fit(X_train, Y_train)

In [None]:
y_pred_proba = pd.DataFrame(naive_bayes_classifier.predict_proba(X_test), columns = Y_test.columns)

In [None]:
y_pred = (y_pred_proba > 0.005).astype(int) # if increase threshold, recall decreases and precision (could) increase

In [None]:
evaluation(y_pred, Y_test)

In [13]:
X_train_text

1417    Exploit Public-Facing Application - Enterprise...
805     Emergency Incident ResponseReport a Confirmed ...
376     Pass the Hash - Enterprise | MITRE ATT&CK\xe2\...
20      Extra Window Memory Injection - Enterprise | M...
815     Tropic Trooper Targets Taiwanese Government an...
                              ...                        
1514     Molerats Delivers  MALWARE_NAME  Backdoor to ...
1515     Transparent Tribe  Evolution analysis  part  ...
1516     WWW FIDELISSECURITY COM  Fidelis Cybersecurit...
1518     OilRig uses  MALWARE_NAME  IIS Backdoor on Ta...
1519     The OilRig Campaign  Attacks on Saudi Arabian...
Name: Text, Length: 2036, dtype: object

## SVC:

In [28]:
# Train a linear SVM per label (one-vs-rest) on the current X_train / Y_train.
# random_state is fixed so that repeated runs produce the same model.
# NOTE(review): an earlier comment said techniques with fewer than 9 examples are deleted
# first; no such filtering happens in this cell — confirm it is done upstream.

sv_classifier = OneVsRestClassifier(LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, max_iter = 1000, class_weight = 'balanced', random_state=42), n_jobs = 1)
sv_classifier.fit(X_train, Y_train)




OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced', dual=False,
                                        random_state=42),
                    n_jobs=1)

In [84]:
len(X_train)

2513

In [85]:
X_train_text

0       Exploit Public-Facing Application - Enterprise...
1       Emergency Incident ResponseReport a Confirmed ...
2       Pass the Hash - Enterprise | MITRE ATT&CK\xe2\...
3       Extra Window Memory Injection - Enterprise | M...
4       Tropic Trooper Targets Taiwanese Government an...
                              ...                        
2142     Molerats Delivers  MALWARE_NAME  Backdoor to ...
2143     Transparent Tribe  Evolution analysis  part  ...
2144     WWW FIDELISSECURITY COM  Fidelis Cybersecurit...
2145     OilRig uses  MALWARE_NAME  IIS Backdoor on Ta...
2146     The OilRig Campaign  Attacks on Saudi Arabian...
Name: Text, Length: 2072, dtype: object

In [29]:
Y_pred = pd.DataFrame(sv_classifier.predict(X_test), columns=Y_test.columns)

In [30]:
evaluation(Y_pred, Y_test)

Unnamed: 0,metric,result
0,macro precision,0.406016
1,micro precision,0.436754
2,macro recall,0.585306
3,micro recall,0.598528
4,macro fscore,0.425718
5,micro fscore,0.461713


## Multi-label KNN: 

In [28]:
knn = MLkNN(k = 3)

In [None]:
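# MLkNN only works with older versions of scikit-learn; with a newer version the cell below fails with the TypeError shown in its output.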

In [29]:
# train
knn.fit(X_train.values, Y_train.values)

# predict
predictions = knn.predict(X_test.values)

TypeError: __init__() takes 1 positional argument but 2 were given

In [None]:
evaluation(predictions, Y_test)

## Logistic Regression:

In [91]:
# Reduce dimension using PCA: fit 75 components on the training features only,
# then project both the training and the test features with the same fitted model.
# NOTE(review): pca_result / x_test_result are not referenced by any other cell visible in
# this notebook (the classifiers are fit on X_train / X_test directly) — confirm that is intended.

pca = PCA(n_components=75)
pca.fit(X_train)
pca_result = pca.transform(X_train)
x_test_result = pca.transform(X_test )

In [33]:
X_train.shape

(2072, 100)

In [92]:
# One-vs-rest logistic regression: build the wrapper, fit it on the training
# split, then predict the label matrix for the test split.
log_reg = OneVsRestClassifier(
    LogisticRegression(
        random_state=0,
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000,
    )
)
log_reg.fit(X_train, Y_train)
predictions = log_reg.predict(X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [93]:
evaluation(predictions, Y_test)

Unnamed: 0,metric,result
0,macro precision,0.500705
1,micro precision,0.569536
2,macro recall,0.350402
3,micro recall,0.421913
4,macro fscore,0.450703
5,micro fscore,0.532288


## DT AdaBoost: 

In [None]:
dt_adaboost = OneVsRestClassifier(AdaBoostClassifier(n_estimators=100, random_state=0)).fit(X_train, Y_train)
predictions_ada = dt_adaboost.predict(X_test)

In [None]:
evaluation(predictions_ada, Y_test)

## Random Forest:

In [None]:
rf = OneVsRestClassifier(RandomForestClassifier(max_depth=2, random_state=0)).fit(X_train, Y_train)

# Classifier Chain: 

In [None]:
def chain_model(model):
    """Wrap `model` in a ClassifierChain that uses a random label order with a fixed seed (0)."""
    return ClassifierChain(model, order='random', random_state=0)

In [None]:
#chain = chain_model(naive_bayes_classifier) # change model appropriately

In [None]:
chain = chain_model(log_reg) # change model appropriately

In [None]:
chainModel = chain.fit(X_train.values, Y_train.values)
predictions = chainModel.predict(X_test.values)

In [None]:
evaluation(predictions, Y_test)

# Neural Networks:

## Multi Layer Perceptron: 

In [None]:
mlp = MLPClassifier(random_state=1, max_iter=300).fit(X_train, Y_train)

In [None]:
predictions_mlp = mlp.predict(X_test)

In [None]:
evaluation(predictions_mlp, Y_test)

## Loading data from flair:

In [None]:
def write_fasttext_file(path, texts, labels):
    """Write documents to `path` in fastText format, one document per line.

    Each line is ``__label__A __label__B <text>``, listing every column of
    `labels` whose value is 1 for that row.

    Parameters
    ----------
    path : destination file; overwritten if it exists.
    texts : iterable of document strings, in the same row order as `labels`.
    labels : DataFrame of 0/1 indicators, one column per label.
    """
    label_names = [str(col) for col in labels.columns]
    # utf-8 is set explicitly so the file does not depend on the platform default encoding.
    with open(path, 'w', encoding='utf-8') as file:
        for text, row in zip(texts, labels.values):
            tags = ['__label__' + name for name, flag in zip(label_names, row) if flag == 1]
            # A line break inside a document would split it into several
            # records, so flatten the text onto one line.
            single_line = text.replace('\r', ' ').replace('\n', ' ')
            file.write(' '.join(tags) + ' ' + single_line + '\n')


# NOTE(review): the corpus cell also reads 'fasttext_format_train.txt', which no visible
# cell writes; it can be produced the same way from X_train_text / Y_train.
write_fasttext_file('fasttext_format_test.txt', X_test_text, Y_test)

In [32]:

# Run flair on the CPU.
# NOTE(review): flair's documentation sets this to a torch.device object — confirm that a
# plain string is accepted everywhere flair reads it.
flair.device = 'cpu'
# this is the folder in which train, test and dev files reside
data_folder = '.'

# load corpus containing training, test and dev data
# NOTE(review): dev_file and test_file point to the same file, so the dev split is the test
# split — any decision made on dev scores has seen the test data. Confirm this is intended.
corpus = ClassificationCorpus(data_folder,
                                      test_file='fasttext_format_test.txt',
                                      dev_file='fasttext_format_test.txt',
                                      train_file='fasttext_format_train.txt',
                                      label_type='tactic',
                                      )

2022-08-01 15:23:32,769 Reading data from .
2022-08-01 15:23:32,770 Train: fasttext_format_train.txt
2022-08-01 15:23:32,770 Dev: fasttext_format_test.txt
2022-08-01 15:23:32,770 Test: fasttext_format_test.txt
2022-08-01 15:23:34,274 Initialized corpus . (label type name is 'tactic')


In [33]:
# 2. what label do we want to predict?
label_type = 'tactic'

# 3. create the label dictionary
label_dict = corpus.make_label_dictionary(label_type=label_type)

2022-08-01 15:23:49,689 Computing label dictionary. Progress:


2152it [02:25, 14.77it/s]

2022-08-01 15:26:15,785 Dictionary created for label 'tactic' with 13 values: TA0005 (seen 1237 times), TA0003 (seen 857 times), TA0002 (seen 756 times), TA0004 (seen 742 times), TA0011 (seen 688 times), TA0007 (seen 659 times), TA0006 (seen 487 times), TA0009 (seen 465 times), TA0008 (seen 312 times), TA0001 (seen 248 times), TA0010 (seen 199 times), TA0040 (seen 190 times)





## Transformers:

In [None]:
# Fine-tune a transformer document classifier on `corpus`; results are written under 'test_model'.
# NOTE: `document_embeddings`, `classifier` and `trainer` are reassigned by the LSTM section below.

# initialize transformer document embeddings (many models are available)
document_embeddings = TransformerDocumentEmbeddings('binay1999/text_classification_cybertexts', fine_tune=True)

# create the text classifier (multi_label=True: a document may carry several labels)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type, multi_label=True)

# initialize trainer
trainer = ModelTrainer(classifier, corpus)

# run training with fine-tuning
trainer.fine_tune('test_model',
                  learning_rate=5.0e-5,
                  mini_batch_size=4,
                  max_epochs=10,
                  )

## LSTM (with word2vec):

In [None]:
embedding = WordEmbeddings('en')

document_embeddings = DocumentRNNEmbeddings([embedding])

In [None]:
# Train a classifier on top of the RNN document embeddings built in the previous cell.
# NOTE: `classifier` and `trainer` reuse the names from the transformer section above.

# create the text classifier (multi_label=True: a document may carry several labels)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type, multi_label=True)

# initialize trainer
trainer = ModelTrainer(classifier, corpus)

# run training; results are written under 'test_model_word2vec'
# NOTE(review): same fine_tune call and hyper-parameters as the transformer section —
# confirm this learning rate is also appropriate for an RNN over word embeddings.
trainer.fine_tune('test_model_word2vec',
                  learning_rate=5.0e-5,
                  mini_batch_size=4,
                  max_epochs=10,
                  )