In [1]:
import spacy
import re
import numpy as np
import pandas as pd
import pickle
import gensim
from gensim.models.word2vec import Word2Vec
import nltk
import time

#sklearn imports
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import log_loss, recall_score, precision_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib

import xgboost as xgb

# Load the spaCy pipeline registered under the name 'en' and keep it in `nlp`.
# NOTE(review): `nlp` is not referenced by any cell visible in this section —
# confirm it is still needed. The bare 'en' name appears to rely on a spaCy 2.x
# shortcut link; verify against the spaCy version this notebook targets.
nlp = spacy.load('en')

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
# Feature table: one row per sample; the `is_fake` column is the label used below.
FEATURES_CSV = 'features_and_label.csv'
df = pd.read_csv(FEATURES_CSV)

In [23]:
# Target is the `is_fake` column; every other column of `df` is a predictor.
y = df["is_fake"]
X = df.drop(columns="is_fake")

# 80/20 train/validation split, stratified on the label and seeded so the
# same rows land in each part on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Show the column index of `df` (predictors plus the `is_fake` label).
df_columns = df.columns
print(df_columns)

Index(['total_sentences', 'num_sentences_with_adj_in_phrase',
       'num_sentences_with_adv_in_phrase', 'total_word_count',
       'avg_word_length', 'lexical_diversity', 'repetition_top',
       'repetition_all', 'NNP_percent', 'NNPS_percent', 'noun_percent',
       'verb_percent', 'part_percent', 'det_percent',
       'unknown_or_foreign_percent', 'tfidf_prob_nb', 'tfidf_prob_logreg',
       'w2v_prob_nb', 'w2v_prob_logreg', 'w2v_prob_svc', 'w2v_prob_rforest',
       'is_fake'],
      dtype='object')


In [25]:
# Two-step estimator: standardise the inputs, then fit an XGBoost classifier.
# The step names matter: the search grid addresses the classifier through
# the 'xgb__' prefix.
pipeline_steps = [
    ('std_scaler', StandardScaler()),
    ('xgb', xgb.XGBClassifier()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [26]:
# Candidate values for each XGBoost hyper-parameter, keyed by the plain
# estimator argument name.
xgb_search_space = {
    'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
    'gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_lambda': [0.01, 0.05, 0.1, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
}

# Prefix every key with 'xgb__' so the values are routed to the pipeline
# step named 'xgb'.
param = {
    'xgb__' + arg_name: candidates
    for arg_name, candidates in xgb_search_space.items()
}

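We train the model on the feature columns of `df`, i.e. every column except the `is_fake` label. Note: the intent is to use only the manual features, but as loaded `df` also contains the `tfidf_prob_*` and `w2v_prob_*` model-output columns, so those must be dropped first if they are meant to be excluded.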

In [27]:
# Randomized hyper-parameter search over `param`: 10 sampled candidates,
# each scored with 3-fold cross-validation on the training split.
start = time.time()
xgb_model = RandomizedSearchCV(
    pipeline,
    param,
    n_iter=10,
    cv=3,
    n_jobs=10,
    verbose=1,
    # Pin the sampler's seed (same value as the train/validation split) so
    # the same 10 candidates are drawn on every run; without it the selected
    # model and the metrics reported below change between runs.
    random_state=42,
)
# fit model
xgb_model.fit(X_train, y_train)
# Wall-clock seconds spent on the whole search; shown as the cell's output.
time.time() - start

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 out of  30 | elapsed:   13.1s finished


15.557334184646606

In [28]:
# Predictions of the tuned search object on the held-out validation split.
y_val_pred_xgb = xgb_model.predict(X_val)

# (label, value) pairs in the order they are reported. The two accuracy rows
# come from the search object's own `score`; the remaining rows compare the
# validation labels against the predictions above.
xgb_metric_rows = [
    ("Train accuracy for xgb: ", xgb_model.score(X_train, y_train)),
    ("Val accuracy for xgb: ", xgb_model.score(X_val, y_val)),
    ("Val precision for xgb: ", precision_score(y_val, y_val_pred_xgb)),
    ("Val recall for xgb: ", recall_score(y_val, y_val_pred_xgb)),
    ("f1_score for xgb: ", f1_score(y_val, y_val_pred_xgb)),
]
for metric_label, metric_value in xgb_metric_rows:
    print(metric_label + str(metric_value))

Train accuracy for xgb: 0.9988041138483617
Val accuracy for xgb: 0.9933035714285714
Val precision for xgb: 0.9935865811544153
Val recall for xgb: 0.9960435212660732
f1_score for xgb: 0.9948135342059767


In [29]:
# Per-class precision / recall / F1 on the validation split.
val_report_xgb = classification_report(y_val, y_val_pred_xgb)
print(val_report_xgb)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1114
           1       0.99      1.00      0.99      2022

   micro avg       0.99      0.99      0.99      3136
   macro avg       0.99      0.99      0.99      3136
weighted avg       0.99      0.99      0.99      3136

