In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from skmultilearn.adapt import MLkNN
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


import pandas as pd
import json
import os
import re

import pickle
import nltk
from nltk.tokenize import word_tokenize

import gensim 
from gensim.models import Word2Vec
import gensim.downloader

import flair
from flair.datasets import ClassificationCorpus
from flair.embeddings import DocumentRNNEmbeddings, TransformerDocumentEmbeddings, WordEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases dropped the old "seaborn" alias. Use the new name when this Matplotlib
# offers it and fall back to the legacy name on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import sys

In [None]:
# Fetch NLTK's 'punkt' resource (needed by NLTK tokenizers such as word_tokenize).
nltk.download('punkt')

In [None]:
# Register tqdm's pandas integration (adds the progress_* variants of apply/map).
tqdm.pandas()

# Opening Files:

In [None]:
# Open Pickle: 

#with open('merged_data_no_duplicates.pickle', 'rb') as handle:
#    (X_train_text, X_test_text, _, _, Y_train, Y_test) = pickle.load(handle)

In [None]:
# Load the pickled train/test split from 'merged_data_lemma.pickle'
# (presumably the lemmatised variant of the data, going by the file name — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only open pickles from a trusted source.

with open('merged_data_lemma.pickle', 'rb') as handle:
    # The pickle holds a 6-tuple; its third and fourth entries are discarded here.
    (X_train_text, X_test_text, _, _, Y_train, Y_test) = pickle.load(handle)

# Feature Extraction:

In [None]:

# Import the project's feature-extraction and evaluation helpers.
# '../src' is appended to the module search path so that `methods` can be found;
# the entry is relative, so it depends on the notebook's working directory.
import sys
sys.path.append('../src')

from methods import feature_extraction, evaluation

In [None]:
#X_train, X_test = feature_extraction('CountVectorizer', X_train_text, X_test_text)

In [None]:
# Build the train/test feature matrices with the 'TfIdfVectorizer' option of the
# project's feature_extraction helper (imported from `methods`).
X_train, X_test = feature_extraction('TfIdfVectorizer', X_train_text, X_test_text)

In [None]:
#X_train, X_test = feature_extraction('glove-wiki-gigaword-100', X_train_text, X_test_text)

In [None]:
#X_train, X_test = feature_extraction('glove-wiki-gigaword-100', X_train_text, X_test_text)

# Classifiers:

## Naive Bayes:

In [None]:

# One-vs-rest Multinomial Naive Bayes: OneVsRestClassifier fits a separate
# MultinomialNB for each label column of Y_train.
naive_bayes_classifier = OneVsRestClassifier(estimator=MultinomialNB())
naive_bayes_classifier.fit(X=X_train, y=Y_train)

In [None]:
# Per-label probabilities for the test set, wrapped in a DataFrame that reuses Y_test's column names.
# NOTE(review): the new frame gets a default index, which may differ from Y_test's index —
# confirm that `evaluation` does not rely on index alignment.
y_pred_proba = pd.DataFrame(naive_bayes_classifier.predict_proba(X_test), columns = Y_test.columns)

In [None]:
# Binarise the probabilities with a fixed 0.005 cut-off.
# Raising the threshold lowers recall and may raise precision.
y_pred = (y_pred_proba > 0.005).astype(int)

In [None]:
# Pass the thresholded Naive Bayes predictions and the test labels to the project's evaluation helper.
evaluation(y_pred, Y_test)

## SVC: 

In [None]:
# One-vs-rest linear SVM with balanced class weights, trained on the full training split.
# random_state is fixed so that repeated runs give the same model.
# NOTE(review): the original comment said to first delete techniques "less than 9";
# no such filtering happens in this cell — confirm it is done upstream.

sv_classifier = OneVsRestClassifier(
    LinearSVC(
        penalty='l2',
        loss='squared_hinge',
        dual=False,
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
    ),
    n_jobs=1,
)
sv_classifier.fit(X_train, Y_train)

In [None]:
# Predicted label matrix of the SVC for the test set, labelled with Y_test's column names.
Y_pred = pd.DataFrame(sv_classifier.predict(X_test), columns=Y_test.columns)

In [None]:
# Pass the SVC predictions and the test labels to the project's evaluation helper.
evaluation(Y_pred, Y_test)

## Calibrated Classifier for SVC:

In [None]:
# Wrap the SVC in CalibratedClassifierCV and fit one calibrated copy per label column
# through MultiOutputClassifier.
# NOTE(review): `sv_classifier` is itself a OneVsRestClassifier, so every per-label model is a
# one-vs-rest wrapper around a LinearSVC — calibrating the bare LinearSVC may be what was intended.
# NOTE(review): `multioutput_clf` is not used by the cells that follow in this notebook.
clf = CalibratedClassifierCV(sv_classifier) 
multioutput_clf = MultiOutputClassifier(clf).fit(X_train, Y_train)

In [None]:
# Positional indices of the test reports for which Y_pred (the SVC output above) has no positive label.
np.flatnonzero(Y_pred.sum(axis=1) == 0)

In [None]:
# Fraction of test reports that receive no predicted label (27% according to the original note).
len(np.flatnonzero(Y_pred.sum(axis=1) == 0))/len(Y_test)

## Multi-label kNN:

In [None]:
# Multi-label kNN model from scikit-multilearn, configured with k = 3.
knn = MLkNN(k = 3)

In [None]:
# Train on the underlying arrays of X_train / Y_train
# (`.values` is used, so these are presumably pandas objects).
knn.fit(X_train.values, Y_train.values)

# Predict the label matrix for the test features.
predictions = knn.predict(X_test.values)

In [None]:
# Macro-averaged F-beta score with beta = 0.5 (precision is weighted more heavily than recall).
fbeta_score(Y_test, predictions, beta=0.5, average ='macro')

In [None]:

# Pass the MLkNN predictions and the test labels to the project's evaluation helper.
evaluation(predictions, Y_test)

## Logistic Regression:

In [None]:
# Reduce the feature space to 150 principal components: fit on the training
# features only, then project both splits with the same fitted PCA.
# NOTE(review): `pca_result` and `x_test_result` are not used by the later cells of
# this notebook, which keep training on X_train / X_test — confirm whether that is intended.

pca = PCA(n_components=150).fit(X_train)
pca_result, x_test_result = pca.transform(X_train), pca.transform(X_test)

In [None]:

# One-vs-rest logistic regression (lbfgs solver, up to 1000 iterations, fixed seed),
# fitted on the full feature matrix X_train.
log_reg = OneVsRestClassifier(
    LogisticRegression(
        random_state=0,
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000,
    )
)
log_reg.fit(X_train, Y_train)

# Label predictions for the test split.
predictions = log_reg.predict(X_test)

In [None]:
# Pass the logistic-regression predictions and the test labels to the project's evaluation helper.
evaluation(predictions, Y_test)

## DT AdaBoost:

In [None]:
# One-vs-rest AdaBoost (100 boosting rounds, default base learner, fixed seed).
dt_adaboost = OneVsRestClassifier(AdaBoostClassifier(n_estimators=100, random_state=0))
dt_adaboost.fit(X_train, Y_train)

# Label predictions for the test split.
predictions_ada = dt_adaboost.predict(X_test)

In [None]:
# Pass the AdaBoost predictions and the test labels to the project's evaluation helper.
evaluation(predictions_ada, Y_test)

## Random Forest:

In [None]:
# One-vs-rest random forest (trees limited to depth 2, fixed seed).
# The fitted model is kept under its own name and `predictions_rf` holds the actual
# test-set predictions; previously the fitted classifier itself was stored in
# `predictions_rf` and predict() was never called.
rf_classifier = OneVsRestClassifier(RandomForestClassifier(max_depth=2, random_state=0)).fit(X_train, Y_train)
predictions_rf = rf_classifier.predict(X_test)

In [None]:

# Pass the Random Forest output held in `predictions_rf` and the test labels to the project's evaluation helper.
evaluation(predictions_rf, Y_test)

# Classifier Chain:

In [None]:
def chain_model(model, order='random', random_state=0):
    """Wrap ``model`` in a scikit-learn ``ClassifierChain``.

    Parameters
    ----------
    model : estimator
        Base estimator handed to ``ClassifierChain``.
    order : str or sequence, default 'random'
        Forwarded as the chain's ``order`` argument. The default keeps the
        previous hard-coded behaviour.
    random_state : int or None, default 0
        Forwarded as the chain's ``random_state`` argument. The default keeps
        the previous hard-coded behaviour.

    Returns
    -------
    ClassifierChain
        The (unfitted) chain built around ``model``.
    """
    return ClassifierChain(model, order=order, random_state=random_state)

In [None]:
# Build a classifier chain around the chosen base model (swap `log_reg` for another estimator as needed).
# NOTE(review): `log_reg` is already a OneVsRestClassifier; a plain binary estimator
# (e.g. the LogisticRegression inside it) may be the intended base — confirm.
chain = chain_model(log_reg)

In [None]:
# Fit the chain on the underlying arrays of the training split, then predict the test labels.
# `chainModel` is bound to the return value of fit().
chainModel = chain.fit(X_train.values, Y_train.values)
predictions = chainModel.predict(X_test.values)

In [None]:
# Pass the classifier-chain predictions and the test labels to the project's evaluation helper.
evaluation(predictions, Y_test)

# Neural Networks: 

## Multi Layer Perceptron:

In [None]:
# Multi-layer perceptron fitted directly on the label matrix (no one-vs-rest wrapper);
# random_state fixes the seed and max_iter caps training at 300 iterations.
mlp = MLPClassifier(random_state=1, max_iter=300).fit(X_train, Y_train)

In [None]:
# Label predictions of the MLP for the test split.
predictions_mlp = mlp.predict(X_test)

In [None]:
# Pass the MLP predictions and the test labels to the project's evaluation helper.
evaluation(predictions_mlp, Y_test)

## Preparing and loading data for flair:

In [None]:
def _write_fasttext_file(path, labels, texts):
    """Write documents to ``path`` in fastText format, one document per line.

    Each line is the space-separated ``__label__<column>`` tags of every column
    whose value is 1 for that row, followed by a space and the document text.

    Parameters
    ----------
    path : str
        Output file; it is overwritten if it already exists.
    labels : pandas.DataFrame
        Binary indicator frame with one column per label.
    texts : iterable of str
        Document texts, in the same row order as ``labels``.
    """
    label_names = [str(name) for name in labels.columns]
    # Explicit UTF-8 so the file does not depend on the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as handle:
        for flags, text in zip(labels.to_numpy(), texts):
            tags = ' '.join('__label__' + name for name, flag in zip(label_names, flags) if flag == 1)
            # Collapse whitespace so an embedded newline cannot split one document over two lines.
            handle.write(tags + ' ' + ' '.join(str(text).split()) + '\n')


# The corpus built below reads both files, so write the training split as well as the
# test split (previously only the test file was produced by this notebook).
_write_fasttext_file('fasttext_format_train.txt', Y_train, X_train_text)
_write_fasttext_file('fasttext_format_test.txt', Y_test, X_test_text)

In [None]:
# Run flair on the CPU.
flair.device = 'cpu'
# Folder in which the train, test and dev files reside.
data_folder = '.'

# Load the corpus containing training, test and dev data.
# NOTE(review): dev_file and test_file point to the same file, so any dev-based
# monitoring or model selection would see the test data — confirm this is intended.
corpus = ClassificationCorpus(data_folder,
                                      test_file='fasttext_format_test.txt',
                                      dev_file='fasttext_format_test.txt',
                                      train_file='fasttext_format_train.txt',
                                      label_type='tactic',
                                      )

In [None]:
# Name of the label type to predict; it is the same string that was passed as
# `label_type` when the corpus was created.
label_type = 'tactic'

# Build the label dictionary for that label type from the corpus.
label_dict = corpus.make_label_dictionary(label_type=label_type)

## Transformers:

In [None]:

# Initialise transformer document embeddings from the
# 'binay1999/text_classification_cybertexts' checkpoint; fine_tune=True is passed so the
# embeddings are trainable.
document_embeddings = TransformerDocumentEmbeddings('binay1999/text_classification_cybertexts', fine_tune=True)

# Create the multi-label text classifier on top of the embeddings.
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type, multi_label=True)

# Initialise the trainer for this classifier and corpus.
trainer = ModelTrainer(classifier, corpus)

# Run fine-tuning; 'test_model' is the first argument passed to fine_tune
# (presumably the output location — TODO confirm).
trainer.fine_tune('test_model',
                  learning_rate=5.0e-5,
                  mini_batch_size=4,
                  max_epochs=10,
                  )

## LSTM (with word2Vec):

In [None]:

# Pre-trained flair word embeddings identified by 'en'.
# NOTE(review): confirm that flair's 'en' vectors are the word2vec embeddings this section's heading refers to.
embedding = WordEmbeddings('en')

# DocumentRNNEmbeddings builds a GRU unless told otherwise; request an LSTM explicitly
# so the model matches the LSTM this section is meant to train.
document_embeddings = DocumentRNNEmbeddings([embedding], rnn_type='LSTM')

In [None]:
# Create the multi-label text classifier on top of the RNN document embeddings.
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type, multi_label=True)

# Initialise the trainer for this classifier and corpus.
trainer = ModelTrainer(classifier, corpus)

# Run training via fine_tune; 'test_model_word2vec' is the first argument passed to it
# (presumably the output location — TODO confirm).
# NOTE(review): fine_tune() and the 5e-5 learning rate are copied from the transformer cell;
# confirm they are appropriate for RNN-based embeddings.
trainer.fine_tune('test_model_word2vec',
                  learning_rate=5.0e-5,
                  mini_batch_size=4,
                  max_epochs=10,
                  )