In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import pickle

In [None]:
y_train = pd.read_csv('../preprocessed_data/trn_title.csv', delimiter = ',', names=['title','label']).label
y_test = pd.read_csv('../preprocessed_data/tst_title.csv', delimiter = ',', names=['title','label']).label

# BoW

In [None]:
x_train_bow = pickle.load(open("../preprocessed_embeddings/bow_trn.pkl", "rb"))
x_test_bow = pickle.load(open("../preprocessed_embeddings/bow_tst.pkl", "rb"))

In [None]:
classifier_bow = LogisticRegression(fit_intercept=True, penalty="l2", C=1, max_iter=200)
classifier_bow.fit(x_train_bow, y_train)

LogisticRegression(C=1, max_iter=200)

In [None]:
# Performance on Training and Test data
print("Training accuracy = {}".format(classifier_bow.score(x_train_bow, y_train)))
print("Test accuracy = {}".format(classifier_bow.score(x_test_bow, y_test)))

Training accuracy = 0.9845091331569096
Test accuracy = 0.9582204155374887


In [None]:
predictions_bow = classifier_bow.predict(x_test_bow)
print(classification_report(y_test, predictions_bow))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      2286
           1       0.97      0.95      0.96      2142

    accuracy                           0.96      4428
   macro avg       0.96      0.96      0.96      4428
weighted avg       0.96      0.96      0.96      4428



# ELMo

In [None]:
y_train = pickle.load(open("../preprocessed_embeddings/elmo_trn_title_labels.pkl", "rb"))
y_test = pickle.load(open("../preprocessed_embeddings/elmo_tst_title_labels.pkl", "rb"))

In [None]:
data = pickle.load(open("../preprocessed_embeddings/elmo_trn_title.pkl", "rb")).tolist()
for i in range(len(data)):
    data[i] = data[i].mean(axis=0).tolist()
x_train_elmo = np.array(data)

In [None]:
data = pickle.load(open("../preprocessed_embeddings/elmo_tst_title.pkl", "rb")).tolist()
for i in range(len(data)):
    data[i] = data[i].mean(axis=0).tolist()
x_test_elmo = np.array(data)

In [None]:
classifier_elmo = LogisticRegression(fit_intercept=True, penalty="l2", C=1, max_iter=200)
classifier_elmo.fit(x_train_elmo, y_train)

LogisticRegression(C=1, max_iter=200)

In [None]:
print("Training accuracy = {}".format(classifier_elmo.score(x_train_elmo, y_train)))
print("Test accuracy = {}".format(classifier_elmo.score(x_test_elmo, y_test)))

Training accuracy = 0.9999031820822307
Test accuracy = 0.9952574525745257


In [None]:
predictions_elmo = classifier_elmo.predict(x_test_elmo)
print(classification_report(y_test, predictions_elmo))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2286
           1       1.00      0.99      1.00      2142

    accuracy                           1.00      4428
   macro avg       1.00      1.00      1.00      4428
weighted avg       1.00      1.00      1.00      4428



# GloVe

In [None]:
x_train_glove = pickle.load(open("../preprocessed_embeddings/GloVe_trn_title.pkl", "rb"))
x_test_glove = pickle.load(open("../preprocessed_embeddings/GloVe_tst_title.pkl", "rb"))

In [None]:
classifier_glove = LogisticRegression(fit_intercept=True, penalty="l2", C=1, max_iter=200)
classifier_glove.fit(x_train_glove, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
print("Training accuracy = {}".format(classifier_glove.score(x_train_glove, y_train)))
print("Test accuracy = {}".format(classifier_glove.score(x_test_glove, y_test)))

Training accuracy = 0.9513651326405473
Test accuracy = 0.9417344173441734


In [None]:
predictions_glove = classifier_glove.predict(x_test_elmo)
print(classification_report(y_test, predictions_glove))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      2286
           1       0.94      0.94      0.94      2142

    accuracy                           0.94      4428
   macro avg       0.94      0.94      0.94      4428
weighted avg       0.94      0.94      0.94      4428



# Skipgram

In [None]:
!pip install fasttext
import fasttext
import pandas as pd

trn = pd.read_csv('../preprocessed_data/trn_title.csv', delimiter = ',', names=['title','label'])
tst = pd.read_csv('../preprocessed_data/tst_title.csv', delimiter = ',', names=['title','label'])


In [3]:
model_sg = fasttext.load_model("model_sg.bin")

f1 = open("sg_train.txt", "r")
f2 = open("sg_test.txt", "r")

trn_embeddings = []
tst_embeddings = [] 

for line in f1:
    line_stripped = line.strip()  
    trn_embeddings.append(model_sg.get_sentence_vector(line_stripped))

f1.close()


for line in f2:
    line_stripped = line.strip()
    tst_embeddings.append(model_sg.get_sentence_vector(line_stripped))

y_train = trn.label
y_test = tst.label


In [4]:
from sklearn.linear_model import LogisticRegression


classifier_sg = LogisticRegression(fit_intercept=True, penalty="l2", C=1, max_iter=200)
classifier_sg.fit(trn_embeddings, y_train)


print("Training accuracy = {}".format(classifier_sg.score(trn_embeddings, y_train)))
print("Test accuracy = {}".format(classifier_sg.score(tst_embeddings, y_test)))

Training accuracy = 0.9949406561237153
Test accuracy = 0.9913178984861977


In [6]:
predictions_sg = classifier_sg.predict(tst_embeddings)
print(classification_report(y_test, predictions_sg))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2349
           1       1.00      0.99      0.99      2143

    accuracy                           0.99      4492
   macro avg       0.99      0.99      0.99      4492
weighted avg       0.99      0.99      0.99      4492

