In [1]:
import spacy
import re
import numpy as np
import pandas as pd
import pickle
import gensim
from gensim.models.word2vec import Word2Vec
import nltk


#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import log_loss, recall_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier


import xgboost as xgb

nlp = spacy.load('en')

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [3]:
df_ori = pd.read_csv("dataset.csv", sep = ",")
df_w2v = pd.read_csv("data/w2v_intermediates/w2v_300_dim.csv", sep = ",")

In [4]:
df = pd.concat([df_ori, df_w2v], axis = 1)

## Split to train and validation data

In [5]:
x, y = df.drop("label", axis = 1).drop("text", axis = 1), df.label

x_train, x_val, y_train, y_val = train_test_split(x, y, 
                                                  stratify=y, 
                                                  random_state=0, 
                                                  test_size=0.2)

In [36]:
nb_parameters = {'var_smoothing': (1e-8, 1e-9, 1e-10),
             }

logreg_parameters = {'C': (0.01, 0.1, 1),
             }

svc_parameters = {'C': (0.01, 0.1, 1),
             }

rforest_parameters = {
    'criterion':['gini','entropy'],
    'max_features':['sqrt','log2',0.3,0.5,0.8],
    'max_depth':[2,5,15,20,25,None]
    }

In [37]:
nb_model = GridSearchCV(GaussianNB(), nb_parameters, n_jobs = 10, verbose = 1, cv = 3)
nb_model.fit(x_train, y_train)

logreg_model = GridSearchCV(LogisticRegression(solver = 'liblinear'), logreg_parameters, n_jobs = 10, verbose = 1, cv = 3)
logreg_model.fit(x_train, y_train)

svc_model = GridSearchCV(SVC(kernel = "linear", probability = True), svc_parameters, n_jobs = 10, verbose = 1, cv = 3)
svc_model.fit(x_train, y_train)

rforest_model = GridSearchCV(RandomForestClassifier(), rforest_parameters, n_jobs = 10, verbose = 1, cv = 3)
rforest_model.fit(x_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:    3.0s finished


Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:    1.6s finished


Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:  1.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'C': (0.01, 0.1, 1)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=1)

In [67]:
y_pred_nb = nb_model.predict(x_val)

print("Train accuracy for nb: " + str(nb_model.score(x_train, y_train)))
print("Val accuracy for nb: " + str(nb_model.score(x_val, y_val)))
print("Val precision for nb: " + str(precision_score(y_val, y_pred_nb)))
print("Val recall for nb: " + str(recall_score(y_val, y_pred_nb)))

Train accuracy for nb: 0.7715060192936299
Val accuracy for nb: 0.7713647959183674
Val precision for nb: 0.8494911622924478
Val recall for nb: 0.7843719090009891


In [68]:
y_pred_logreg = logreg_model.predict(x_val)

print("Train accuracy for logreg: " + str(logreg_model.score(x_train, y_train)))
print("Val accuracy for logreg: " + str(logreg_model.score(x_val, y_val)))
print("Val precision for logreg: " + str(precision_score(y_val, y_pred_logreg)))
print("Val recall for logreg: " + str(recall_score(y_val, y_pred_logreg)))

Train accuracy for logreg: 0.8779398867894443
Val accuracy for logreg: 0.876594387755102
Val precision for logreg: 0.88183092013078
Val recall for logreg: 0.933728981206726


In [69]:
y_pred_svc = svc_model.predict(x_val)

print("Train accuracy for svc: " + str(svc_model.score(x_train, y_train)))
print("Val accuracy for svc: " + str(svc_model.score(x_val, y_val)))
print("Val precision for svc: " + str(precision_score(y_val, y_pred_svc)))
print("Val recall for svc: " + str(recall_score(y_val, y_pred_svc)))

Train accuracy for svc: 0.8856732839033724
Val accuracy for svc: 0.8855229591836735
Val precision for svc: 0.8965188364329996
Val recall for svc: 0.9297725024727992


In [60]:
y_pred_rforest = rforest_model.predict(x_val)

print("Train accuracy for rforest: " + str(rforest_model.score(x_train, y_train)))
print("Val accuracy for rforest: " + str(rforest_model.score(x_val, y_val)))
print("Val precision for rforest: " + str(precision_score(y_val, y_pred_rforest)))
print("Val recall for rforest: " + str(recall_score(y_val, y_pred_rforest)))

Train accuracy for rforest: 0.9956150841106594
Val accuracy for rforest: 0.8405612244897959
Val precision for rforest: 0.8658653846153846
Val recall for rforest: 0.890702274975272


In [66]:
y_pred_proba_logreg = logreg_model.predict_proba(x)
y_pred_proba_nb = nb_model.predict_proba(x)
y_pred_proba_svc = svc_model.predict_proba(x)
y_pred_proba_rforest = rforest_model.predict_proba(x)

In [77]:
y_pred_proba_logreg = y_pred_proba_logreg[:,1]
y_pred_proba_nb = y_pred_proba_nb[:,1]
y_pred_proba_svc = y_pred_proba_svc[:,1]
y_pred_proba_rforest = y_pred_proba_rforest[:,1]

IndexError: too many indices for array

In [81]:
filename = "models/w2v/nb_model.pkl"
with open(filename,'wb') as file:
    pickle.dump(nb_model ,file)

In [82]:
filename = "models/w2v/logreg_model.pkl"
with open(filename,'wb') as file:
    pickle.dump(logreg_model ,file)

In [83]:
filename = "models/w2v/svc_model.pkl"
with open(filename,'wb') as file:
    pickle.dump(svc_model ,file)

In [84]:
filename = "models/w2v/rforest_model.pkl"
with open(filename,'wb') as file:
    pickle.dump(rforest_model ,file)

In [93]:
new_df = pd.DataFrame(
    {
     'prob_nb': y_pred_proba_nb,
     'prob_logreg': y_pred_proba_logreg,
     'prob_svc': y_pred_proba_svc,
     'prob_rforest': y_pred_proba_rforest
    })

new_df.to_csv("w2v_output.csv", index = False)

testing

In [7]:
filename = "models/w2v/w2v_logreg_model.pkl"
with open(filename,'rb') as file:
    logreg_model = pickle.load(file)

In [8]:
y_pred_logreg = logreg_model.predict(x_val)

print("Train accuracy for logreg: " + str(logreg_model.score(x_train, y_train)))
print("Val accuracy for logreg: " + str(logreg_model.score(x_val, y_val)))
print("Val precision for logreg: " + str(precision_score(y_val, y_pred_logreg)))
print("Val recall for logreg: " + str(recall_score(y_val, y_pred_logreg)))

Train accuracy for logreg: 0.8779398867894443
Val accuracy for logreg: 0.876594387755102
Val precision for logreg: 0.88183092013078
Val recall for logreg: 0.933728981206726


In [9]:
from sklearn.metrics import classification_report

In [10]:
print(classification_report(y_val, y_pred_logreg))

              precision    recall  f1-score   support

           0       0.87      0.77      0.82      1114
           1       0.88      0.93      0.91      2022

   micro avg       0.88      0.88      0.88      3136
   macro avg       0.87      0.85      0.86      3136
weighted avg       0.88      0.88      0.87      3136

