# Importing all the dependancies 

In [455]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from scipy import stats
import statsmodels.api as sm
import math
import re
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import sys
import pandas.core.algorithms as algos
pd.pandas.set_option('display.max_columns',None)

In [456]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Vishal
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [457]:
#Loading Dataset
#df = pd.read_csv('/content/drive/MyDrive/train.csv')
#test_data = pd.read_csv('E:\Fake News Detection/test.csv')
#submit = pd.read_csv('E:\Fake News Detection/submit.csv')
#df = pd.read_csv('train.csv')
df = pd.read_csv('E:\Fake News Detection/train.csv')


In [458]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


# Data Preprocessing

In [459]:
df = df.fillna('')

In [460]:
#merging the train and test data across rows
#df = pd.concat([train_data, test_data], axis = 'rows')
#df.info()

In [461]:
df['author'] = df['author'].str.lower()

### Don't execute the following if you are using Word lemmatization. 
      
Because lemmatization selects the meaningful part of a wor, it's better to apply it over the "title" column
and then add the lowercased "author" column later to form the new "Content" column 

### Word Stemmer 


In [296]:
#merging the "author" and "title" columns to form "Content"
df['content'] = df['author']+' '+df['title'] #+df['text']
df['content'].head()

0    darrell lucus House Dem Aide: We Didn’t Even S...
1    daniel j. flynn FLYNN: Hillary Clinton, Big Wo...
2    consortiumnews.com Why the Truth Might Get You...
3    jessica purkiss 15 Civilians Killed In Single ...
4    howard portnoy Iranian woman jailed for fictio...
Name: content, dtype: object

In [299]:
#Defining a function "clean" to stem to the words in content
stemmer = PorterStemmer()
def clean(text):
    text="".join([re.sub('[^a-zA-Z]',' ',char) for char in text])
    text=text.lower()
    text=text.split()
    text=[stemmer.stem(word) for word in text if word not in set(stopwords.words("english"))]
    text=" ".join(text)
    return text

In [300]:
df['content'] = df['content'].apply(clean)

In [453]:
# separating the data & label
X = df.drop(columns='label', axis=1)
y = df['label']

In [454]:
print(df['content'])

0        darrell lucus house dem aide even see comey le...
1        daniel j. flynn flynn hillary clinton big woma...
2                 consortiumnews.com truth might get fired
3        jessica purkiss civilian killed single u airst...
4        howard portnoy iranian woman jailed fictional ...
                               ...                        
20795    jerome hudson rapper trump poster child white ...
20796    benjamin hoffman n f l playoff schedule matchu...
20797    michael j. de la merced and rachel abrams macy...
20798    alex ansary nato russia hold parallel exercise...
20799                           david swanson keep f alive
Name: content, Length: 20800, dtype: object


### Word Lemmatization
 Here I have implemented word lemmatization and Stemmer. Only one should be executed before moving onto modelling. 

In [462]:
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()

def clean(text):
    text="".join([re.sub('[^a-zA-Z]',' ',char) for char in text])
    text=text.lower()
    text=text.split()
    text=[wordnet.lemmatize(word) for word in text if word not in set(stopwords.words("english"))]
    text=" ".join(text)
    return text

In [463]:
df['title'] = df['title'].apply(clean)


In [464]:
df['content'] = df['author']+' '+df['title'] #+df['text']
df['content'].head()

0    darrell lucus house dem aide even see comey le...
1    daniel j. flynn flynn hillary clinton big woma...
2             consortiumnews.com truth might get fired
3    jessica purkiss civilian killed single u airst...
4    howard portnoy iranian woman jailed fictional ...
Name: content, dtype: object

In [465]:
# separating the data & label
X = df.drop(columns='label', axis=1)
y = df['label']

In [466]:
print(df['content'])

0        darrell lucus house dem aide even see comey le...
1        daniel j. flynn flynn hillary clinton big woma...
2                 consortiumnews.com truth might get fired
3        jessica purkiss civilian killed single u airst...
4        howard portnoy iranian woman jailed fictional ...
                               ...                        
20795    jerome hudson rapper trump poster child white ...
20796    benjamin hoffman n f l playoff schedule matchu...
20797    michael j. de la merced and rachel abrams macy...
20798    alex ansary nato russia hold parallel exercise...
20799                           david swanson keep f alive
Name: content, Length: 20800, dtype: object


### After using either one of Lemmatization OR Stemmer run the following cells 

In [467]:
#Separating lables and values
X = df['content'].values
y = df['label'].values

In [468]:
print(X)
#print(y)

['darrell lucus house dem aide even see comey letter jason chaffetz tweeted'
 'daniel j. flynn flynn hillary clinton big woman campus breitbart'
 'consortiumnews.com truth might get fired' ...
 'michael j. de la merced and rachel abrams macy said receive takeover approach hudson bay new york time'
 'alex ansary nato russia hold parallel exercise balkan'
 'david swanson keep f alive']


In [469]:
X.shape

(20800,)

In [470]:
#Converting the textual data into numerical data using vertorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [471]:
print(X)

  (0, 19633)	0.3898334584250517
  (0, 16877)	0.2434231886663421
  (0, 11241)	0.33838114561245775
  (0, 10892)	0.27186640212962043
  (0, 9847)	0.23066376406357905
  (0, 8918)	0.20553409740940168
  (0, 6426)	0.25787634753604916
  (0, 4822)	0.2800374370965301
  (0, 4555)	0.3349354477607908
  (0, 3676)	0.22972642932451354
  (0, 3097)	0.34215548056905637
  (0, 443)	0.27922090210909
  (1, 20970)	0.26470934209118435
  (1, 8698)	0.19372949495800215
  (1, 7155)	0.726569378492845
  (1, 4517)	0.2704004256207564
  (1, 3492)	0.19421743536442002
  (1, 2793)	0.373249665793938
  (1, 2401)	0.15787930205664566
  (1, 1930)	0.29902641324955254
  (2, 19548)	0.397654988300042
  (2, 12050)	0.4717261145949637
  (2, 7716)	0.3412820420603414
  (2, 7018)	0.4651214075065619
  (2, 3912)	0.44062273209258024
  :	:
  (20797, 15418)	0.3098813010103687
  (20797, 15155)	0.24267060114886227
  (20797, 12847)	0.07857340412208622
  (20797, 12015)	0.17062859352394522
  (20797, 11942)	0.2887777850945119
  (20797, 11341)	0.353

In [472]:
#Splitting test and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=42)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, stratify=y_train, random_state=42)

# Classification using Logistic Regression

In [340]:
model = LogisticRegression()
model.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [341]:
# Accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9882211538461538


In [342]:
#Accuracy score on test data
X_test_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_pred, y_test)
print ('Accuracy score of the testing data :', test_data_accuracy)

Accuracy score of the testing data : 0.973798076923077


In [343]:
#Implementing Bagging ensemble on Logistic Regression Classifier
from sklearn.ensemble import BaggingClassifier
bagging_clf = BaggingClassifier(base_estimator = LogisticRegression(), n_estimators = 100, max_features = 15000, random_state = 42)
bagging_clf.fit(X_train, y_train).score(X_test, y_test)

0.9699519230769231

# Classification using XGBoost

In [473]:
import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [345]:
XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [346]:
model_xgboost = xgboost.XGBClassifier(n_estimators = 100,
                                     eval_metric = 'auc')

In [347]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(model_xgboost, X, y, cv =10)

In [348]:
print("The mean validaiton score on 10 fold CV is : {:.4f}\nThe standard deviation of the spread is : {:.4f} "
      .format(score.mean(),score.std()))


The mean validaiton score on 10 fold CV is : 0.9897
The standard deviation of the spread is : 0.0016 


## Hyperparameter tuning on XGBoost

In [428]:
from sklearn.model_selection import RandomizedSearchCV


In [429]:
classifier = XGBClassifier()

In [351]:
#Selecting superset of parameters to train on
params = {
    "learning_rate" : [0.01, 0.10, 0.25, 0.5, 1],
    "max_depth" : [3,5,10],
    "min_child_weight" : [1,3,5,7, 10, 13,17, 20],
    "gamma" : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "colsample_bytree" : [0.3, 0.4, 0.5, 0.7, 1],
    
}

### RandomizedSearchCV

In [30]:
#Hyperparameter tuning using Randomized Search CV
random_search =  RandomizedSearchCV(classifier, 
                                    param_distributions = params, 
                                    n_iter=10, #random_state = 1,
                                    scoring ="accuracy",  n_jobs =-1, cv=10, verbose =3)

In [31]:
random_search.fit(X,y)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.1min finished


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=None,
                                           gpu_id=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=N...
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.3, 0.4, 0.5, 0.7

In [32]:
#Getting the best params from Random search
random_search.best_params_

{'min_child_weight': 1,
 'max_depth': 10,
 'learning_rate': 0.25,
 'gamma': 0.4,
 'colsample_bytree': 0.4}

In [366]:
#Fitting the model with the best params we got
classifier = xgboost.XGBClassifier(min_child_weight= 1,
  max_depth= 9,
  learning_rate= 0.5,
  gamma= 0.7,
  colsample_bytree= 1,n_estimators = 100,eval_metric = 'auc')

In [367]:
score_r = cross_val_score(classifier, X, y, cv =10, scoring = 'accuracy')


In [368]:

print("The mean validaiton score on 10 fold CV using RandomizedSearchCV : {:.4f}\nThe standard deviation of the spread is : {:.4f} "
      .format(score_r.mean(),score_r.std()))


The mean validaiton score on 10 fold CV using RandomizedSearchCV : 0.9915
The standard deviation of the spread is : 0.0021 


### GridSearchCV

In [36]:
random_search.best_params_

{'min_child_weight': 1,
 'max_depth': 10,
 'learning_rate': 0.25,
 'gamma': 0.4,
 'colsample_bytree': 0.4}

In [372]:
params2 = {
    "learning_rate" : [ 0.5, 1],
    "max_depth" : [5,9, 10],
    "min_child_weight" : [1,3],
    "gamma" : [ 0.2, 0.5, 0.7],
    "colsample_bytree" : [ 0.5, 0.7, 1]
    
}

In [373]:
grid_search = GridSearchCV( 
    estimator = classifier, param_grid = params2, scoring ="accuracy", n_jobs =-1, cv=5, verbose =3)

grid_search = grid_search.fit(X,y)

#print(model.best_score_)
#print(model.best_estimator_.get_params())

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed: 19.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 20.7min finished


In [374]:
grid_search.best_params_


{'colsample_bytree': 1,
 'gamma': 0.7,
 'learning_rate': 0.5,
 'max_depth': 9,
 'min_child_weight': 1}

In [375]:
#Fitting the model with the best params we got
classifier = xgboost.XGBClassifier(min_child_weight= 1,
  max_depth= 9,
  learning_rate= 0.5,
  gamma= 0.7,
  colsample_bytree= 1,n_estimators = 100,eval_metric = 'auc')

In [376]:
#Using Stratified Kfold validation for accuracy
skf = StratifiedKFold(n_splits=10)

score_g = cross_val_score(classifier, X, y, cv =skf, scoring = 'accuracy')

In [377]:
print("The mean validaiton score on 10 fold CV using GridSearchCV : {:.4f}\nThe standard deviation of the spread is : {:.4f} "
      .format(score_g.mean(),score_g.std()))


The mean validaiton score on 10 fold CV using GridSearchCV : 0.9915
The standard deviation of the spread is : 0.0021 


### Bayesian Optimization using HyperOpt

In [378]:
from sklearn.metrics import precision_recall_fscore_support as score
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval

In [379]:
# Space
space = {
    'learning_rate': hp.choice('learning_rate', [ 0.01, 0.1, 0.25, 0.5, 1]),
    'max_depth' : hp.choice('max_depth', [3,5,9,11]),
    'min_child_weight' : hp.choice('min_child_weight', range(1,4)),
    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,8)]),
    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,11)])
    #'reg_alpha' : hp.choice('reg_alpha', [0, 1e-5, 1e-2, 0.1, 1, 10, 100]), 
    #'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100])
}
# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=5)
# Objective function
def objective(params):
    
    xgboost = XGBClassifier(seed=42, **params)
    scores = cross_val_score(xgboost, X, y, cv=kfold, scoring='accuracy', n_jobs=-1)
    # Extract the best score
    best_score = max(scores)
    # Loss must be minimized
    loss = - best_score
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}
# Trials to track progress
bayes_trials = Trials()
# Optimize
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = bayes_trials)

100%|█████████████████████████████████████████████| 100/100 [23:17<00:00, 13.97s/trial, best loss: -0.9935096153846154]


In [380]:
# Print the index of the best parameters
# print(best)
# Print the values of the best parameters
print(space_eval(space, best))


{'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.5, 'max_depth': 9, 'min_child_weight': 1}


In [386]:
#Fitting the model with the best params we got
classifier = xgboost.XGBClassifier(min_child_weight= 1,
  max_depth= 9,
  learning_rate= 0.5,
  gamma= 0.1,
  colsample_bytree= 1,
  n_estimators = 100,eval_metric = 'auc')

In [387]:
#Using Stratified Kfold validation for accuracy
skf = StratifiedKFold(n_splits=10)
score_b = cross_val_score(classifier, X, y, cv =skf, scoring = 'accuracy')

In [388]:
print("The mean validaiton score on 10 fold CV using GridSearchCV : {:.4f} \nThe standard deviation of the spread is : {:.4f}"
      .format(score_b.mean(),score_b.std()))


The mean validaiton score on 10 fold CV using GridSearchCV : 0.9916 
The standard deviation of the spread is : 0.0023


### TPOT Classifier

In [430]:
import torch
from torch import nn
from tpot import TPOTClassifier

In [397]:
params_tpot = {
    "learning_rate" : [0.1, 0.25, 0.5, 1],
    "max_depth" : [3,5,9,11],
    "min_child_weight" : [1,3],
    "gamma" : [0.0,0.1,0.3, 0.5, 0.6, 0.7],
    "colsample_bytree" : [0.4, 0.5, 0.7, 1]
    
}

In [400]:
tpot_classifier = TPOTClassifier(generations =10, population_size = 24, offspring_size = 12,
                              verbosity = 2, early_stop = 12, random_state = 42,
                              config_dict = {'xgboost.XGBClassifier' : params_tpot},
                               cv = 5, scoring = 'accuracy') #cv = 5

#skf = StratifiedKFold(n_splits=5)
#score_tpot = cross_val_score(tpot_classifier, X, y, cv =skf, scoring = 'accuracy')
tpot_classifier.fit(X, y) ##if cv = 5 is passed as a parameter inside TPOTClassifier then run this. 



HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=144.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.9907211538461539

Generation 2 - Current best internal CV score: 0.9907211538461539

Generation 3 - Current best internal CV score: 0.9910096153846155

Generation 4 - Current best internal CV score: 0.9910096153846155

Generation 5 - Current best internal CV score: 0.9910096153846155

Generation 6 - Current best internal CV score: 0.9910096153846155

Generation 7 - Current best internal CV score: 0.9910096153846155

Generation 8 - Current best internal CV score: 0.9910096153846155

Generation 9 - Current best internal CV score: 0.9910096153846155

Generation 10 - Current best internal CV score: 0.9910096153846155

Best pipeline: XGBClassifier(CombineDFs(input_matrix, input_matrix), colsample_bytree=0.5, gamma=0.5, learning_rate=0.5, max_depth=9, min_child_weight=1)


TPOTClassifier(config_dict={'xgboost.XGBClassifier': {'colsample_bytree': [0.4,
                                                                           0.5,
                                                                           0.7,
                                                                           1],
                                                      'gamma': [0.0, 0.1, 0.3,
                                                                0.5, 0.6, 0.7],
                                                      'learning_rate': [0.1,
                                                                        0.25,
                                                                        0.5,
                                                                        1],
                                                      'max_depth': [3, 5, 9,
                                                                    11],
                                                      'min_child

In [401]:
#Fitting the model with the best params we got
classifier = xgboost.XGBClassifier(min_child_weight= 1,
  max_depth= 9,
  learning_rate= 0.5,
  gamma= 0.5,
  colsample_bytree= 0.5,
  n_estimators = 100,eval_metric = 'auc')

In [407]:
#Using Stratified Kfold validation for accuracy
skf = StratifiedKFold(n_splits=10)
score_tpot = cross_val_score(classifier, X, y, cv =skf, scoring = 'accuracy')

In [408]:
print("The mean validaiton score on 10 fold CV using TPOTClassifier : {:.4f} \nThe standard deviation of the spread is : {:.4f}"
      .format(score_tpot.mean(),score_tpot.std()))

The mean validaiton score on 10 fold CV using TPOTClassifier : 0.9917 
The standard deviation of the spread is : 0.0023


In [404]:
#accuracy = tpot_classifier.score(X_test, y_test)
#print('The accuracy on test data is :' ,accuracy)
#print(score_tpot.mean())

In [405]:
#skf = StratifiedKFold(n_splits=5)
#score_tpot = cross_val_score(tpot_classifier, X, y, cv =skf, scoring = 'accuracy')

In [406]:
#score_tpot.mean()

# Classification using AdaBoost

In [474]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [475]:
AdaBoostClassifier().get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [476]:
tree = DecisionTreeClassifier(criterion = 'entropy',
                              #random_state = 42,
                              )

In [477]:
boost = AdaBoostClassifier(base_estimator = tree,
                           n_estimators = 100,
                           algorithm = 'SAMME.R')
                           #random_state = 42)

#boost.fit(X_train, y_train)

In [478]:
score_ada = cross_val_score(boost, X, y, cv =10)

In [480]:
print("The mean validaiton score on 10 fold CV : {:.4f} \nThe standard deviation of the spread is : {:.4f}"
      .format(score_ada.mean(),score_ada.std()))

The mean validaiton score on 10 fold CV : 0.9938 
The standard deviation of the spread is : 0.0015
