In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from skmultilearn.adapt import MLkNN
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier 
     
from nltk.stem.snowball import SnowballStemmer

import pandas as pd
import json
import os
import re

import gensim 
from gensim.models import Word2Vec
import gensim.downloader
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import pickle 

import nltk
from nltk.tokenize import word_tokenize

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

import pickle

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
import flair
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

from sklearn.model_selection import GridSearchCV

import sys

In [None]:
nltk.download('punkt')

In [None]:
tqdm.pandas()

# Opening Files: 

In [None]:
# Load the pre-split dataset (the variant without lemmatisation).
# NOTE(review): pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('merged_data_no_duplicates.pickle', 'rb') as pickle_file:
    dataset_parts = pickle.load(pickle_file)

# The pickle holds six items; the last two are not needed in this notebook.
X_train_text, X_test_text, Y_train, Y_test, _, _ = dataset_parts

# Feature Extraction:

In [None]:
# Make ../src importable so that the `methods` import in the next cell can be resolved.
# Guarded so that re-running this cell does not append the same entry again.
if '../src' not in sys.path:
    sys.path.append('../src')

In [None]:
# call method
from methods import feature_extraction, evaluation

## Count Vectorizer:

In [None]:
# Bag-of-words count features.
# NOTE(review): fe_filename is an absolute, user-specific path - consider a configurable location.
# How feature_extraction uses that file is defined in src/methods (not visible here).
X_train, X_test = feature_extraction(
    'CountVectorizer',
    X_train_text,
    X_test_text,
    fe_filename='/homes/lgf21/API/app/CV_tactic.pickle',
)

## TF-IDF:

In [None]:
# TF-IDF features; this overwrites the X_train / X_test produced by any earlier feature cell.
X_train, X_test = feature_extraction(
    'TfIdfVectorizer',
    X_train_text,
    X_test_text,
    fe_filename="tfidf_min10.pickle",
)

## word2vec Google news:

In [None]:
w2v_google = gensim.downloader.load('word2vec-google-news-300')

In [None]:
X_train, X_test = feature_extraction('embedding', X_train_text, X_test_text, embedding_type = w2v_google, weighted=False)

## Glove:

In [None]:
glv = gensim.downloader.load('glove-wiki-gigaword-100')

In [None]:
X_train, X_test = feature_extraction('embedding', X_train_text, X_test_text, embedding_type = glv)

## Trained word2vec:

In [None]:
w2v = Word2Vec.load("word2vec.model").wv

In [None]:
X_train, X_test = feature_extraction("embedding", X_train_text, X_test_text, embedding_type = w2v, weighted=False)

## Trained Doc2Vec:

In [None]:
doc2vec = Doc2Vec.load("doc2vec.model")

In [None]:
X_train, X_test = feature_extraction("embedding", X_train_text, X_test_text, embedding_type = doc2vec, weighted=False)

# Linear Classifiers:

## Naive Bayes:

In [None]:
naive_bayes_classifier = OneVsRestClassifier(MultinomialNB())
naive_bayes_classifier.fit(X_train, Y_train)

In [None]:
y_pred_proba = pd.DataFrame(naive_bayes_classifier.predict_proba(X_test), columns = Y_test.columns)

In [None]:
y_pred = (y_pred_proba > 0.005).astype(int) # if increase threshold, recall decreases and precision (could) increase

In [None]:
evaluation(y_pred, Y_test)

## SVC:

In [None]:
# Train and test: First delete techniques less than 9 
# We fix the random state to have the same dataset in our different tests

sv_classifier = OneVsRestClassifier(LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, max_iter = 1000, class_weight = 'balanced', random_state=42), n_jobs = 1)
sv_classifier.fit(X_train, Y_train)


In [None]:
Y_pred = pd.DataFrame(sv_classifier.predict(X_test), columns=Y_test.columns)

In [None]:
evaluation(Y_pred, Y_test)

## Logistic Regression:

## Non-Linear Classifiers:

In [None]:
# Dimensionality reduction: fit a 75-component PCA on the training features and project them.
# NOTE(review): pca_result is not referenced by any later cell visible here, and X_test is
# not transformed - confirm whether this step is still needed.
pca = PCA(n_components=75).fit(X_train)
pca_result = pca.transform(X_train)


In [None]:
log_reg = OneVsRestClassifier(LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs', max_iter = 1000)).fit(X_train, Y_train)

predictions = log_reg.predict(X_test)


In [None]:
evaluation(predictions, Y_test)

## Decision Tree:

In [None]:
dt = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))
dt.fit(X_train, Y_train)

In [None]:
Y_pred = pd.DataFrame(dt.predict(X_test), columns=Y_test.columns)

In [None]:
evaluation(Y_pred, Y_test)

## DT AdaBoost: 

In [None]:
dt_adaboost = OneVsRestClassifier(AdaBoostClassifier(n_estimators=100, random_state=0)).fit(X_train, Y_train)
predictions_ada = dt_adaboost.predict(X_test)

In [None]:
evaluation(predictions_ada, Y_test)

## Multi-label KNN: 

In [None]:
# train
knn.fit(X_train.values, Y_train.values)

# predict
predictions = knn.predict(X_test.values)

In [None]:
knn = MLkNN(k = 3)

In [None]:
evaluation(predictions, Y_test)

# Classifier Chain: 

In [None]:
def chain_model(model):
    """Wrap `model` in a ClassifierChain.

    The chain is created with order='random' and random_state=0.

    Args:
        model: the base estimator handed to ClassifierChain.

    Returns:
        The (unfitted) ClassifierChain instance.
    """
    return ClassifierChain(model, order='random', random_state=0)

In [None]:
chain = chain_model(dt_adaboost) # change model appropriately

In [None]:
chainModel = chain.fit(X_train, Y_train)
predictions = chainModel.predict(X_test)

In [None]:
evaluation(predictions, Y_test)

# Neural Networks:

In [None]:
model = GridSearchCV(MLPClassifier(random_state=1), 
                     {'hidden_layer_sizes': [[100,100], [100, 100, 1000], [200, 200], [1000, 200], [300, 300], [300, 200]],'learning_rate':['adaptive', 'constant']},
                    scoring='f1_macro')

model.fit(X_train, Y_train)

In [None]:
pd.DataFrame(model.cv_results_)

In [None]:
mlp = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes = [1000, 200]).fit(X_train, Y_train)

In [None]:
predictions_mlp = mlp.predict(X_test)

In [None]:
evaluation(predictions_mlp, Y_test)

## Multi Layer Perceptron: 

In [None]:
mlp = MLPClassifier(random_state=1, max_iter=100, hidden_layer_sizes = [100]).fit(X_train, Y_train)

In [None]:
predictions_mlp = mlp.predict(X_test)

In [None]:
evaluation(predictions_mlp, Y_test)

## Loading data into flair:

In [None]:
def write_fasttext_file(path, texts, labels):
    """Write one FastText-style line per document: '__label__A __label__B <text>'.

    Args:
        path: destination file; it is overwritten.
        texts: positionally indexable (`.iloc`) collection of document strings,
            aligned row-by-row with `labels`.
        labels: DataFrame with one column per label; a cell equal to 1 means the
            label applies to that row.

    Line breaks inside a document are replaced by spaces, because the format is
    one document per line and an embedded newline would split a document in two.
    """
    label_names = ['__label__' + col for col in labels.columns]
    # Evaluate the "== 1" test once for the whole frame instead of once per cell.
    is_set = labels.to_numpy() == 1
    # Explicit encoding so the output does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as out:
        for i in range(len(labels)):
            tags = ' '.join(name for name, flag in zip(label_names, is_set[i]) if flag)
            text = texts.iloc[i].replace('\r', ' ').replace('\n', ' ')
            out.write(tags + ' ' + text + '\n')


write_fasttext_file('fasttext_format_test.txt', X_test_text, Y_test)

# The corpus cell below also reads fasttext_format_train.txt, which no visible cell creates.
# Create it from the training split when it is missing; an existing file is left untouched.
if not os.path.exists('fasttext_format_train.txt'):
    write_fasttext_file('fasttext_format_train.txt', X_train_text, Y_train)

In [None]:

import torch  # local import: only this cell needs it, to build the device object

# Run flair on the CPU. flair documents flair.device as a torch.device, so assign a
# device object rather than the plain string 'cpu'.
flair.device = torch.device('cpu')
# this is the folder in which train, test and dev files reside
data_folder = '.'

# load corpus containing training, test and dev data
# NOTE(review): dev_file is the same file as test_file, so validation scores reported
# during training are computed on the test set - consider a separate dev split.
corpus = ClassificationCorpus(
    data_folder,
    test_file='fasttext_format_test.txt',
    dev_file='fasttext_format_test.txt',
    train_file='fasttext_format_train.txt',
    label_type='tactic',
)

In [None]:
# Name of the label layer to predict ('tactic' is also the label_type given to ClassificationCorpus).
label_type = 'tactic'
# Build the label dictionary for that layer from the corpus.
label_dict = corpus.make_label_dictionary(
    label_type=label_type,
)

## Transformers:

In [None]:
# Transformer document embeddings from the Hugging Face model id below, with fine_tune=True.
document_embeddings = TransformerDocumentEmbeddings(
    'binay1999/text_classification_cybertexts',
    fine_tune=True,
)

# Multi-label text classifier on top of those embeddings.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
    label_type=label_type,
    multi_label=True,
)

# Trainer that couples the classifier with the corpus.
trainer = ModelTrainer(classifier, corpus)

# Fine-tune for 5 epochs; 'test_model' is the first argument given to fine_tune.
# NOTE(review): 5.0e-4 is a fairly high learning rate for transformer fine-tuning - confirm it is intended.
fine_tune_options = dict(
    learning_rate=5.0e-4,
    mini_batch_size=30,
    max_epochs=5,
)
trainer.fine_tune('test_model', **fine_tune_options)