In [66]:
import spacy
import re
import numpy as np
import pandas as pd
import pickle
import gensim
from gensim.models.word2vec import Word2Vec
import nltk
import time

#sklearn imports
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import log_loss, recall_score, precision_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib

import xgboost as xgb

nlp = spacy.load('en')

In [10]:
df = pd.read_csv('features_and_label.csv')

In [12]:
X, y = df.drop("is_fake", axis = 1), df.is_fake

X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.2)

In [41]:
X2, y = df.iloc[:,:-7], df.is_fake

X2_train, X2_val, y_train, y_val = train_test_split(X2, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.2)

In [18]:
pipeline = Pipeline([('std_scaler', StandardScaler()),
                     ('xgb', xgb.XGBClassifier())
                    ])

In [19]:
param = {
    'xgb__learning_rate':[0.01,0.015,0.025,0.05,0.1],
    'xgb__gamma':[0.05,0.1,0.3,0.5,0.7,0.9,1.0],
    'xgb__max_depth':[3,5,7,9,12,15,17,25],
    'xgb__min_child_weight':[1,3,5,7],
    'xgb__subsample':[0.6,0.7,0.8,0.9,1.0],
    'xgb__reg_lambda':[0.01,0.05,0.1,1.0],
    'xgb__reg_alpha':[0,0.1,0.5,1.0]
}

We train the model with only the manual features.

In [51]:
start = time.time()
xgb_model = RandomizedSearchCV(pipeline, param, cv=3,verbose=1, n_iter = 1000, n_jobs = 10)
#fit model
xgb_model.fit(X2_train,y_train)
time.time() - start

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   19.2s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  1.2min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  3.3min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  6.2min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:  9.8min
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed: 14.2min
[Parallel(n_jobs=10)]: Done 2430 tasks      | elapsed: 19.3min
[Parallel(n_jobs=10)]: Done 3000 out of 3000 | elapsed: 24.2min finished


1460.838391304016

In [54]:
y_val_pred_xgb = xgb_model.predict(X2_val)
y_all_pred_xgb = xgb_model.predict(X2)

print("Train accuracy for xgb: " + str(xgb_model.score(X2_train, y_train)))
print("Val accuracy for xgb: " + str(xgb_model.score(X2_val, y_val)))
print("Val precision for xgb: " + str(precision_score(y_val, y_val_pred_xgb)))
print("Val recall for xgb: " + str(recall_score(y_val, y_val_pred_xgb)))
print("f1_score for xgb: " + str(f1_score(y_val, y_val_pred_xgb))) 

Train accuracy for xgb: 0.9732121502033007
Val accuracy for xgb: 0.7908163265306123
Val precision for xgb: 0.8124428179322964
Val recall for xgb: 0.8783382789317508
f1_score for xgb: 0.844106463878327


In [61]:
print(classification_report(y_val, y_val_pred_xgb))

              precision    recall  f1-score   support

           0       0.74      0.63      0.68      1114
           1       0.81      0.88      0.84      2022

   micro avg       0.79      0.79      0.79      3136
   macro avg       0.78      0.76      0.76      3136
weighted avg       0.79      0.79      0.79      3136



In [46]:
logreg_pipeline = Pipeline([('std_scaler', StandardScaler()),
                     ('lr', LogisticRegression(solver = 'liblinear'))
                    ])

logreg_parameters = {'lr__C': (0.01, 0.1, 1),
             }

logreg_model = GridSearchCV(logreg_pipeline, logreg_parameters, n_jobs = 10, verbose = 1, cv = 3)
logreg_model.fit(X2_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:    0.2s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'lr__C': (0.01, 0.1, 1)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=1)

In [50]:
y_pred_logreg = logreg_model.predict(X2_val)
y_all_pred_logreg = logreg_model.predict(X2)

print("Train accuracy for logreg: " + str(logreg_model.score(X2_train, y_train)))
print("Val accuracy for logreg: " + str(logreg_model.score(X2_val, y_val)))
print("Val precision for logreg: " + str(precision_score(y_val, y_pred_logreg)))
print("Val recall for logreg: " + str(recall_score(y_val, y_pred_logreg)))
print("f1_score for logreg: " + str(f1_score(y, y_all_pred_logreg))) 

Train accuracy for logreg: 0.691301921390417
Val accuracy for logreg: 0.6935586734693877
Val precision for logreg: 0.7008708822415751
Val recall for logreg: 0.9154302670623146
f1_score for logreg: 0.7933113800624386
