# Fake News Detection

In [1]:
# Download the dataset

!wget -O "liar.zip" "https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"

--2021-05-07 05:49:35--  https://www.cs.ucsb.edu/~william/data/liar_dataset.zip
Resolving www.cs.ucsb.edu (www.cs.ucsb.edu)... 23.185.0.3, 2620:12a:8001::3, 2620:12a:8000::3
Connecting to www.cs.ucsb.edu (www.cs.ucsb.edu)|23.185.0.3|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://sites.cs.ucsb.edu/~william/data/liar_dataset.zip [following]
--2021-05-07 05:49:35--  https://sites.cs.ucsb.edu/~william/data/liar_dataset.zip
Resolving sites.cs.ucsb.edu (sites.cs.ucsb.edu)... 128.111.27.164
Connecting to sites.cs.ucsb.edu (sites.cs.ucsb.edu)|128.111.27.164|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1013571 (990K) [application/zip]
Saving to: ‘liar.zip’


2021-05-07 05:49:36 (3.09 MB/s) - ‘liar.zip’ saved [1013571/1013571]



In [2]:
# Module downloads

!pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/2b/21/d13081805e1e1afc71f5bb743ece324c8bd576237c51b899ecb38a717502/optuna-2.7.0-py3-none-any.whl (293kB)
[K     |█▏                              | 10kB 15.9MB/s eta 0:00:01[K     |██▎                             | 20kB 14.7MB/s eta 0:00:01[K     |███▍                            | 30kB 10.4MB/s eta 0:00:01[K     |████▌                           | 40kB 9.0MB/s eta 0:00:01[K     |█████▋                          | 51kB 5.5MB/s eta 0:00:01[K     |██████▊                         | 61kB 5.8MB/s eta 0:00:01[K     |███████▉                        | 71kB 6.3MB/s eta 0:00:01[K     |█████████                       | 81kB 6.8MB/s eta 0:00:01[K     |██████████                      | 92kB 6.3MB/s eta 0:00:01[K     |███████████▏                    | 102kB 5.3MB/s eta 0:00:01[K     |████████████▎                   | 112kB 5.3MB/s eta 0:00:01[K     |█████████████▍                  | 122kB 5.3MB/s eta 0:

In [66]:
# Required modules

import os
import nltk
import string
import optuna
import numpy as np
import pandas as pd

from zipfile import ZipFile
from matplotlib import pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [4]:
# Package downloads

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Initial Configration

%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 7)

In [6]:
# Extracting the dataset

with ZipFile('liar.zip', 'r') as zf:
    zf.extractall('./')

In [7]:
# Use this column names

columns = ['ID', 'Label', 'Statement', 'Subject', 'Speaker', 'Speaker Job', 'State info', 'Party Affiliation', 'BT', 'F', 'HT', 'MT', 'Pants on fire', 'Context']

In [8]:
# Loading the dataset

train = pd.read_csv('train.tsv', sep='\t', header=None, names=columns)
train.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Speaker Job,State info,Party Affiliation,BT,F,HT,MT,Pants on fire,Context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [9]:
# Inspecting the data

train.info()
train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 10240 non-null  object 
 1   Label              10240 non-null  object 
 2   Statement          10240 non-null  object 
 3   Subject            10238 non-null  object 
 4   Speaker            10238 non-null  object 
 5   Speaker Job        7343 non-null   object 
 6   State info         8032 non-null   object 
 7   Party Affiliation  10238 non-null  object 
 8   BT                 10238 non-null  float64
 9   F                  10238 non-null  float64
 10  HT                 10238 non-null  float64
 11  MT                 10238 non-null  float64
 12  Pants on fire      10238 non-null  float64
 13  Context            10138 non-null  object 
dtypes: float64(5), object(9)
memory usage: 1.1+ MB


Unnamed: 0,BT,F,HT,MT,Pants on fire
count,10238.0,10238.0,10238.0,10238.0,10238.0
mean,11.53321,13.286482,17.133718,16.434265,6.201407
std,18.973764,24.112936,35.846511,36.151688,16.128927
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,2.0,2.0,3.0,3.0,1.0
75%,12.0,12.0,13.0,11.0,5.0
max,70.0,114.0,160.0,163.0,105.0


In [10]:
# Loading the validation data

valid = pd.read_csv('valid.tsv', sep='\t', header=None, names=columns)

## Preprocessing

In [11]:
# Applying some preprocessing

train['Subject'] = train['Subject'].replace(np.nan, "")
train['Speaker'] = train['Speaker'].replace(np.nan, "")
train['Speaker Job'] = train['Speaker Job'].replace(np.nan, "")
train['State info'] = train['State info'].replace(np.nan, "")
train['Party Affiliation'] = train['Party Affiliation'].replace(np.nan, "")

valid['Subject'] = valid['Subject'].replace(np.nan, "")
valid['Speaker'] = valid['Speaker'].replace(np.nan, "")
valid['Speaker Job'] = valid['Speaker Job'].replace(np.nan, "")
valid['State info'] = valid['State info'].replace(np.nan, "")
valid['Party Affiliation'] = valid['Party Affiliation'].replace(np.nan, "")

In [12]:
# Important functions

def remove_punctuation(text, puncts):
    return "".join([c for c in text if c not in puncts])

def remove_stopwords(text, stopwords):
    tokens = word_tokenize(text)

    return " ".join([tok for tok in tokens if tok not in stopwords])

In [13]:
stems = PorterStemmer()

MAPPER = {'pants-fire': 'Fake', 'false': 'Fake', 'barely-true': 'Fake', 'half-true': 'Real', 'mostly-true': 'Real', 'true': 'Real'}
english_sw = stopwords.words('english')
punctuations = string.punctuation

In [14]:
# Preprocessing features

train['preprocessed'] = train['Statement'].apply(lambda x: remove_stopwords(x, english_sw))
train['preprocessed'] = train['preprocessed'].apply(lambda x: remove_punctuation(x, punctuations))
train['preprocessed'] = train['preprocessed'].apply(lambda x: stems.stem(x))

valid['preprocessed'] = valid['Statement'].apply(lambda x: remove_stopwords(x, english_sw))
valid['preprocessed'] = valid['preprocessed'].apply(lambda x: remove_punctuation(x, punctuations))
valid['preprocessed'] = valid['preprocessed'].apply(lambda x: stems.stem(x))

train['Label_mapped'] = train['Label'].apply(lambda x: MAPPER[x])
valid['Label_mapped'] = valid['Label'].apply(lambda x: MAPPER[x])

In [15]:
# Separating out features and labels

X_train = train['preprocessed']
y_train = train['Label_mapped']

X_valid = valid['preprocessed']
y_valid = valid['Label_mapped']

## Hyperparameter Tuning

### Count Vectorizer

In [16]:
# Creating a count vectorizer

count_vec = CountVectorizer()
count_vec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [17]:
# Getting the train and valid vectors

count_train = count_vec.transform(X_train)
count_valid = count_vec.transform(X_valid)

### Logistic Regression

In [64]:
# Parameter grid

def objective(trial):
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet', 'none'])
    C = trial.suggest_loguniform('C', 10e-6, 1)

    params = {'penalty': penalty, 'C': C}

    clf = LogisticRegression(**params)

    return cross_val_score(clf, count_train, y_train, cv=10, n_jobs=-1).mean()

In [65]:
# Logistic Regression fitting

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[32m[I 2021-05-07 08:35:13,711][0m A new study created in memory with name: no-name-b7586bb1-b4ee-4aa5-8d36-3e7934db427f[0m
[33m[W 2021-05-07 08:35:13,823][0m Trial 0 failed, because the objective function returned nan.[0m
[33m[W 2021-05-07 08:35:13,923][0m Trial 1 failed, because the objective function returned nan.[0m
[32m[I 2021-05-07 08:35:16,809][0m Trial 2 finished with value: 0.55 and parameters: {'penalty': 'none', 'C': 0.00030054665176308193}. Best is trial 2 with value: 0.55.[0m
[32m[I 2021-05-07 08:35:19,550][0m Trial 3 finished with value: 0.55 and parameters: {'penalty': 'none', 'C': 0.0001681449258638127}. Best is trial 2 with value: 0.55.[0m
[32m[I 2021-05-07 08:35:22,357][0m Trial 4 finished with value: 0.55 and parameters: {'penalty': 'none', 'C': 0.003097480684858737}. Best is trial 2 with value: 0.55.[0m
[32m[I 2021-05-07 08:35:22,835][0m Trial 5 finished with value: 0.5810546875 and parameters: {'penalty': 'l2', 'C': 0.003944515869288046}. Best i

In [47]:
# Best trial

trial = study.best_trial

best_logreg = LogisticRegression(**trial.params)
best_logreg.fit(count_train, y_train)
(best_logreg.predict(count_train) == y_train).mean(), (best_logreg.predict(count_valid) == y_valid).mean()


Setting penalty='none' will ignore the C and l1_ratio parameters


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



(0.999609375, 0.5654205607476636)

### Multinomail Naive Bayes

In [18]:
# Parametr grid

def objective(trial):
    alpha = trial.suggest_int('alpha', 1.0, 8.0)

    clf = MultinomialNB(alpha=alpha)

    return cross_val_score(clf, count_train, y_train, cv=10, n_jobs=-1).mean()

In [19]:
# Multinomail navie bayes fitting

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[32m[I 2021-05-07 05:51:42,821][0m A new study created in memory with name: no-name-784f0fcf-87b0-4deb-8be0-608d6e07ce94[0m
[32m[I 2021-05-07 05:51:44,366][0m Trial 0 finished with value: 0.60791015625 and parameters: {'alpha': 6}. Best is trial 0 with value: 0.60791015625.[0m
[32m[I 2021-05-07 05:51:44,653][0m Trial 1 finished with value: 0.60654296875 and parameters: {'alpha': 8}. Best is trial 0 with value: 0.60791015625.[0m
[32m[I 2021-05-07 05:51:44,954][0m Trial 2 finished with value: 0.60791015625 and parameters: {'alpha': 6}. Best is trial 0 with value: 0.60791015625.[0m
[32m[I 2021-05-07 05:51:45,230][0m Trial 3 finished with value: 0.60615234375 and parameters: {'alpha': 1}. Best is trial 0 with value: 0.60791015625.[0m
[32m[I 2021-05-07 05:51:45,518][0m Trial 4 finished with value: 0.6095703125 and parameters: {'alpha': 2}. Best is trial 4 with value: 0.6095703125.[0m
[32m[I 2021-05-07 05:51:45,815][0m Trial 5 finished with value: 0.60927734375 and parame

In [20]:
# Best params

best_trial = study.best_trial

nb_best = MultinomialNB(**best_trial.params)
nb_best.fit(count_train, y_train)
(nb_best.predict(count_train) == y_train).mean(), (nb_best.predict(count_valid) == y_valid).mean()

(0.77001953125, 0.6043613707165109)

### Support vector Machines

In [29]:
# Parameter grid

def objective(trial):
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    if kernel is 'poly':
        degree = trial.suggest_int('degree', 1, 4)
        gamma = trial.suggest_categorical('gamma', ['scale','auto'])
        clf = SVC(kernel=kernel, degree=degree, gamma=gamma)

    if kernel in ['rbf', 'sigmoid']:
        gamma = trial.suggest_categorical('gamma', ['scale','auto'])
        clf = SVC(kernel=kernel, gamma=gamma)

    clf = SVC(kernel=kernel)

    return cross_val_score(clf, count_train, y_train, cv=10, n_jobs=-1).mean()

In [30]:
# Support Vector Machines fitting

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[32m[I 2021-05-07 06:23:51,305][0m A new study created in memory with name: no-name-14f953e5-8fcb-40b2-8f14-d70004b5b20f[0m
[32m[I 2021-05-07 06:25:36,906][0m Trial 0 finished with value: 0.573046875 and parameters: {'kernel': 'poly', 'degree': 4, 'gamma': 'auto'}. Best is trial 0 with value: 0.573046875.[0m
[32m[I 2021-05-07 06:27:10,317][0m Trial 1 finished with value: 0.60732421875 and parameters: {'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 1 with value: 0.60732421875.[0m
[32m[I 2021-05-07 06:28:56,327][0m Trial 2 finished with value: 0.573046875 and parameters: {'kernel': 'poly', 'degree': 1, 'gamma': 'auto'}. Best is trial 1 with value: 0.60732421875.[0m
[32m[I 2021-05-07 06:30:42,531][0m Trial 3 finished with value: 0.573046875 and parameters: {'kernel': 'poly', 'degree': 2, 'gamma': 'scale'}. Best is trial 1 with value: 0.60732421875.[0m
[32m[I 2021-05-07 06:32:36,256][0m Trial 4 finished with value: 0.61474609375 and parameters: {'kernel': 'rbf', 'gam

In [31]:
# Best params

best_trial = study.best_trial

svc_best = SVC(**best_trial.params)
svc_best.fit(count_train, y_train)
(svc_best.predict(count_train) == y_train).mean(), (svc_best.predict(count_valid) == y_valid).mean()

(0.92197265625, 0.617601246105919)

### Random Forest

In [48]:
# Parameters grid

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, 100)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    max_depth = trial.suggest_categorical('max_depth', [4, 5, 6, None])
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None])

    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, max_features=max_features)

    return cross_val_score(clf, count_train, y_train, cv=10, n_jobs=-1).mean()

In [49]:
# Random Forest fitting

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[32m[I 2021-05-07 06:57:50,171][0m A new study created in memory with name: no-name-3de7f1da-7c3e-4ea9-90e1-722a767244c9[0m
[32m[I 2021-05-07 07:05:23,609][0m Trial 0 finished with value: 0.61162109375 and parameters: {'n_estimators': 300, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto'}. Best is trial 0 with value: 0.61162109375.[0m

A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.

[32m[I 2021-05-07 07:12:37,172][0m Trial 1 finished with value: 0.61591796875 and parameters: {'n_estimators': 300, 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2'}. Best is trial 1 with value: 0.61591796875.[0m
[32m[I 2021-05-07 07:13:06,751][0m Trial 2 finished with value: 0.56171875 and parameters: {'n_estimators': 600, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.61591796875.[0m
[32m[I 2021-05-07 07:13:29,151][0m Trial 3 finished wi

In [50]:
# Best params

best_trial = study.best_trial

rf_best = RandomForestClassifier(**best_trial.params)
rf_best.fit(count_train, y_train)
(rf_best.predict(count_train) == y_train).mean(), (rf_best.predict(count_valid) == y_valid).mean()

(0.999609375, 0.6082554517133957)

### Passive Aggressive 

In [56]:
# Parameters grid

def objective(trial):
    C = trial.suggest_loguniform('C', 1, 10e6)
    max_iter = trial.suggest_int('max_iter', 1000, 2000, 100)
    tol = trial.suggest_loguniform('tol', 10e-6, 1)

    clf = PassiveAggressiveClassifier(C=C, max_iter=max_iter, tol=tol)

    return cross_val_score(clf, count_train, y_train, cv=10, n_jobs=-1).mean()

In [57]:
# Passive Agressive fitting

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[32m[I 2021-05-07 08:20:03,729][0m A new study created in memory with name: no-name-698776a7-ae14-4b26-b4c9-9d5b1f5ff167[0m
[32m[I 2021-05-07 08:20:05,382][0m Trial 0 finished with value: 0.551953125 and parameters: {'C': 360.13768802278463, 'max_iter': 1500, 'tol': 0.0055916187503166764}. Best is trial 0 with value: 0.551953125.[0m
[32m[I 2021-05-07 08:20:06,823][0m Trial 1 finished with value: 0.54462890625 and parameters: {'C': 89071.76681055999, 'max_iter': 1600, 'tol': 1.6445577883763665e-05}. Best is trial 0 with value: 0.551953125.[0m
[32m[I 2021-05-07 08:20:08,289][0m Trial 2 finished with value: 0.54541015625 and parameters: {'C': 7.289980359492993, 'max_iter': 1300, 'tol': 1.7858525837341203e-05}. Best is trial 0 with value: 0.551953125.[0m
[32m[I 2021-05-07 08:20:08,747][0m Trial 3 finished with value: 0.5544921875 and parameters: {'C': 37661.44746002921, 'max_iter': 1100, 'tol': 0.006076774807693098}. Best is trial 3 with value: 0.5544921875.[0m
[32m[I 2021-

In [58]:
# Best trial

best_trial = study.best_trial

pa_best = PassiveAggressiveClassifier(**best_trial.params)
pa_best.fit(count_train, y_train)
(pa_best.predict(count_train) == y_train).mean(), (pa_best.predict(count_valid) == y_valid).mean()

(0.88486328125, 0.5763239875389408)

## Creating Count Vectorizer piplines

In [67]:
# logreg pipeline

logreg_pipeline = Pipeline([
     ('count_vec', CountVectorizer()),
     ('logreg', LogisticRegression(penalty='l2', C=0.003944515869288046))
])

In [68]:
# nb pipeline

nb_pipeline = Pipeline([
      ('count_vec', CountVectorizer()),
      ('nb', MultinomialNB(alpha=3))
])

In [80]:
# svc pipeline

svc_pipeline = Pipeline([
      ('count_vec', CountVectorizer()),
      ('svc', SVC(kernel='rbf', gamma='scale', probability=True))
])

In [70]:
# rf pipeline

rf_pipeline = Pipeline([
      ('count_vec', CountVectorizer()),
      ('rf', RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=None, max_features='log2'))
])

In [71]:
# pa pipeline

pa_pipeline = Pipeline([
      ('count_vec', CountVectorizer()),
      ('pa', PassiveAggressiveClassifier(C=410.5848058864682, max_iter=1200, tol=0.05548504547958647))
])

In [None]:
# Ensemble of the above classifiers

list_clfs = [('logreg_pipeline', logreg_pipeline), ('nb_pipeline', nb_pipeline), ('svc_pipeline', svc_pipeline), ('rf_pipeline', rf_pipeline), ('pa_pipeline', pa_pipeline)]

count_detector = VotingClassifier(estimators=list_clfs, voting='soft', n_jobs=-1)
count_detector.fit(X_train, y_train)


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [79]:
# Prediction from voting classifier

(count_detector.predict(X_train) == y_train).mean()

AttributeError: ignored

In [None]:
tfidf = TfidfVectorizer()
tfidf.fit(X_train, y_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
tfidf_train = tfidf.transform(X_train)

In [None]:
nb.fit(tfidf_train, y_train)

MultinomialNB(alpha=6.8, class_prior=None, fit_prior=True)

In [None]:
(nb.predict(tfidf_train) == y_train).mean()

0.6060546875