In [103]:
import spacy
import re
import numpy as np
import pandas as pd
import pickle
import gensim
from gensim.models.word2vec import Word2Vec
import nltk


#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, recall_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): shortcut links such as 'en' only resolve on spaCy 2.x; spaCy 3 expects a
# full package name (e.g. 'en_core_web_sm') — confirm which version this notebook targets.
# `nlp` is not referenced by any cell visible in this section.
nlp = spacy.load('en')

In [42]:
# Read the comma-separated dataset; later cells use its `text` and `label` columns.
df = pd.read_csv("dataset.csv", sep = ",")

## Split into train and validation data

In [43]:
# Features are the raw documents; targets are their labels.
x = df.text
y = df.label

# Hold out 20% for validation, keeping the label proportions of `y` in both parts.
# The fixed seed makes the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

## TF-IDF models using grid search

We tune three classifiers on TF-IDF features: multinomial naive Bayes, logistic regression, and a linear-kernel support vector classifier.

In [77]:
def _make_tfidf():
    """Return a fresh, unfitted TfidfVectorizer with the settings shared by every pipeline.

    A new instance is built on each call so that no two pipelines share a vectorizer.
    `smooth_idf` and `sublinear_tf` are passed as real booleans: they are documented as
    bool parameters, and recent scikit-learn releases validate parameter types and reject
    integer stand-ins such as 1.
    """
    return TfidfVectorizer(
        min_df=3,                  # drop terms seen in fewer than 3 documents
        max_features=None,         # no cap on vocabulary size
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',   # any run of one or more word characters is a token
        smooth_idf=True,
        sublinear_tf=True,
        stop_words='english',
    )


# Each pipeline is TF-IDF features followed by one classifier. The step names
# ('tfidf', 'nb_clf', 'logreg_clf', 'svc_clf') are the prefixes used by the
# grid-search parameter keys, so they must not change.
nb_pipeline = Pipeline([
    ('tfidf', _make_tfidf()),
    ('nb_clf', MultinomialNB()),
])

logreg_pipeline = Pipeline([
    ('tfidf', _make_tfidf()),
    ('logreg_clf', LogisticRegression(solver='liblinear')),
])

svc_pipeline = Pipeline([
    ('tfidf', _make_tfidf()),
    ('svc_clf', SVC(kernel="linear")),
])

In [72]:
def _tfidf_param_grid(**clf_grid):
    """Build a search grid: the shared TF-IDF axes followed by classifier-specific axes.

    Returns a new dict (with a new n-gram list) on every call, so the three grids do
    not share any mutable state.
    """
    return {
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'tfidf__use_idf': (True, False),
        **clf_grid,
    }


# Keys follow the `<pipeline step>__<parameter>` convention.
nb_parameters = _tfidf_param_grid(nb_clf__alpha=(1e-2, 1e-3))

logreg_parameters = _tfidf_param_grid(logreg_clf__C=(0.01, 0.1, 1))

svc_parameters = _tfidf_param_grid(svc_clf__C=(0.01, 0.1, 1))

In [73]:
# Exhaustive 3-fold grid search per pipeline, each using up to 10 parallel workers.
# Assignment and fit stay separate statements so the name is bound even if a fit is interrupted.
nb_model = GridSearchCV(
    estimator=nb_pipeline,
    param_grid=nb_parameters,
    cv=3,
    n_jobs=10,
    verbose=1,
)
nb_model.fit(x_train, y_train)

logreg_model = GridSearchCV(
    estimator=logreg_pipeline,
    param_grid=logreg_parameters,
    cv=3,
    n_jobs=10,
    verbose=1,
)
logreg_model.fit(x_train, y_train)

svc_model = GridSearchCV(
    estimator=svc_pipeline,
    param_grid=svc_parameters,
    cv=3,
    n_jobs=10,
    verbose=1,
)
svc_model.fit(x_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  36 out of  36 | elapsed:  2.5min finished


Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  2.0min
[Parallel(n_jobs=10)]: Done  54 out of  54 | elapsed:  3.7min finished


Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


KeyboardInterrupt: 

In [78]:
# Stand-alone SVC grid search: same estimator, grid and settings as the combined fitting cell.
svc_model = GridSearchCV(
    estimator=svc_pipeline,
    param_grid=svc_parameters,
    cv=3,
    n_jobs=10,
    verbose=1,
)
svc_model.fit(x_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


KeyboardInterrupt: 

In [74]:
# Hard label predictions on the validation split; precision and recall are computed from these.
y_pred_nb = nb_model.predict(x_val)

# `!s` forces str() on each metric, matching plain string concatenation.
# precision_score / recall_score use their defaults here, so the labels are presumably
# binary with 1 as the positive class — confirm against the dataset.
print(f"Train accuracy for nb: {nb_model.score(x_train, y_train)!s}")
print(f"Val accuracy for nb: {nb_model.score(x_val, y_val)!s}")
print(f"Val precision for nb: {precision_score(y_val, y_pred_nb)!s}")
print(f"Val recall for nb: {recall_score(y_val, y_pred_nb)!s}")

Train accuracy for nb: 0.9959339870844296
Val accuracy for nb: 0.9528061224489796
Val precision for nb: 0.9487547892720306
Val recall for nb: 0.9797230464886251


In [75]:
# Hard label predictions on the validation split; precision and recall are computed from these.
y_pred_logreg = logreg_model.predict(x_val)

# `!s` forces str() on each metric, matching plain string concatenation.
# precision_score / recall_score use their defaults here, so the labels are presumably
# binary with 1 as the positive class — confirm against the dataset.
print(f"Train accuracy for logreg: {logreg_model.score(x_train, y_train)!s}")
print(f"Val accuracy for logreg: {logreg_model.score(x_val, y_val)!s}")
print(f"Val precision for logreg: {precision_score(y_val, y_pred_logreg)!s}")
print(f"Val recall for logreg: {recall_score(y_val, y_pred_logreg)!s}")

Train accuracy for logreg: 0.9775970660926413
Val accuracy for logreg: 0.9604591836734694
Val precision for logreg: 0.9588974854932302
Val recall for logreg: 0.9807121661721068


In [98]:
# Class probabilities from the logistic-regression search for every row of `x`.
# `x` is the full text column, so this includes the rows the model was trained on.
y_pred_proba_logreg = logreg_model.predict_proba(x)

In [99]:
# Class probabilities from the naive-Bayes search for every row of `x`.
# `x` is the full text column, so this includes the rows the model was trained on.
y_pred_proba_nb = nb_model.predict_proba(x)

In [102]:
# Reduce each probability matrix to its second column (index 1), i.e. the probability of
# the second entry in the model's class ordering — presumably the positive label; confirm.
# The ndim guard makes the cell idempotent: the names are overwritten in place, so an
# unguarded re-run would index a 1-D array with [:, 1] and raise IndexError.
if y_pred_proba_logreg.ndim == 2:
    y_pred_proba_logreg = y_pred_proba_logreg[:, 1]
if y_pred_proba_nb.ndim == 2:
    y_pred_proba_nb = y_pred_proba_nb[:, 1]

In [105]:
# Serialise the naive-Bayes grid-search object to disk with pickle.
# open() does not create missing directories, so `tfidf/` must already exist.
filename = "tfidf/nb_model.pkl"
with open(filename, mode='wb') as file:
    pickle.dump(nb_model, file)

In [106]:
# Serialise the logistic-regression grid-search object to disk with pickle.
# open() does not create missing directories, so `tfidf/` must already exist.
filename = "tfidf/logreg_model.pkl"
with open(filename, mode='wb') as file:
    pickle.dump(logreg_model, file)

In [121]:
# One row per document: the text plus each model's probability column.
# Start from the text column, then attach the two probability arrays in order.
new_df = pd.DataFrame({'text': x}).assign(
    prob_nb=y_pred_proba_nb,
    prob_logreg=y_pred_proba_logreg,
)

# Write without the index so the CSV holds only the three columns above.
new_df.to_csv("tfidf_output.csv", index=False)