# IT-550 Information Retrieval Assignment - 8
### Student ID - 202011032

## Importing required libraries and setting paths

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

TRAIN_PATH = "./fake-news/train.csv"
TEST_PATH = "./fake-news/test.csv"

## Loading and cleaning Training and Testing data

In [2]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [3]:
train.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


In [8]:
train = train.fillna('unknown', axis=1)
train.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,unknown,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,unknown,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


## Generating TF-IDF representation of `title` and `text` columns

In [12]:
import string

def tokenize(s):
    regexp_token = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return regexp_token.sub(r' \\1 ', s).split()

In [13]:
vect = TfidfVectorizer(strip_accents='unicode', tokenizer=tokenize, stop_words='english', sublinear_tf=True)

In [14]:
title_tfidf = vect.fit_transform(train['title'])
text_tfidf = vect.fit_transform(train['text'])

In [16]:
title_tfidf, text_tfidf

(<20800x22484 sparse matrix of type '<class 'numpy.float64'>'
 	with 195398 stored elements in Compressed Sparse Row format>,
 <20800x183655 sparse matrix of type '<class 'numpy.float64'>'
 	with 5474937 stored elements in Compressed Sparse Row format>)

## Fitting Voting and Stacking Ensemble Classifier on the Train dataset using title and text columns.
### Following classifiers will be used for the ensemble approach:
1. Mulitnomial Naive Bayes
2. RandomForest Classifier
3. Logistic Regression

In [17]:
# Creating classifiers to use
clf1 = MultinomialNB()
clf2 = RandomForestClassifier(n_estimators=50, random_state=3)
clf3 = LogisticRegression(random_state=3)

In [18]:
# Creating ensemble classifiers
eclf1 = VotingClassifier(estimators=[
    ('mnb', clf1), ('rf', clf2), ('lr', clf3)
], verbose=1)

eclf2 = StackingClassifier(estimators=[
    ('mnb', clf1), ('rf', clf2), ('lr', clf3)
], verbose=1)

## Checking the performance of the ensemble classifiers using cross-validation with 5 folds

### Using title column

In [19]:
cv_score_eclf1_title = cross_val_score(eclf1, title_tfidf, train['label'])
cv_score_eclf2_title = cross_val_score(eclf2, title_tfidf, train['label'])

[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  19.7s
[Voting] ....................... (3 of 3) Processing lr, total=   0.3s
[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  19.3s
[Voting] ....................... (3 of 3) Processing lr, total=   0.2s
[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  20.4s
[Voting] ....................... (3 of 3) Processing lr, total=   0.4s
[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  19.4s
[Voting] ....................... (3 of 3) Processing lr, total=   0.2s
[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  19.4s
[Votin

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [22]:
print("Cross-validation score for 5 folds for Voting ensemble classifier using title column:")
print(cv_score_eclf1_title)
print(f"Mean Accuracy: {np.mean(cv_score_eclf1_title):.3f}")
print("----------------------------------------------------------------")
print("Cross-validation score for 5 folds for Stacking ensemble classifier using title column:")
print(cv_score_eclf2_title)
print(f"Mean Accuracy: {np.mean(cv_score_eclf2_title):.3f}")

Cross-validation score for 5 folds for Voting ensemble classifier using title column:
[0.94014423 0.93966346 0.940625   0.92908654 0.93341346]
Mean Accuracy: 0.937
----------------------------------------------------------------
Cross-validation score for 5 folds for Stacking ensemble classifier using title column:
[0.94615385 0.95       0.94495192 0.93798077 0.93990385]
Mean Accuracy: 0.944


### Using text column

In [24]:
cv_score_eclf1_text = cross_val_score(eclf1, text_tfidf, train['label'])
cv_score_eclf2_text = cross_val_score(eclf2, text_tfidf, train['label'])

[Voting] ...................... (1 of 3) Processing mnb, total=   0.1s
[Voting] ....................... (2 of 3) Processing rf, total=  37.8s
[Voting] ....................... (3 of 3) Processing lr, total=   2.7s
[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  35.0s
[Voting] ....................... (3 of 3) Processing lr, total=   2.4s
[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  35.0s
[Voting] ....................... (3 of 3) Processing lr, total=   3.0s
[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  35.3s
[Voting] ....................... (3 of 3) Processing lr, total=   3.4s
[Voting] ...................... (1 of 3) Processing mnb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=  34.1s
[Votin

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [25]:
print("Cross-validation score for 5 folds for Voting ensemble classifier using text column:")
print(cv_score_eclf1_text)
print(f"Mean Accuracy: {np.mean(cv_score_eclf1_text):.3f}")
print("----------------------------------------------------------------")
print("Cross-validation score for 5 folds for Stacking ensemble classifier using text column:")
print(cv_score_eclf2_text)
print(f"Mean Accuracy: {np.mean(cv_score_eclf2_text):.3f}")

Cross-validation score for 5 folds for Voting ensemble classifier using text column:
[0.94326923 0.93990385 0.95216346 0.946875   0.94254808]
Mean Accuracy: 0.945
----------------------------------------------------------------
Cross-validation score for 5 folds for Stacking ensemble classifier using text column:
[0.96826923 0.96442308 0.97019231 0.96634615 0.97091346]
Mean Accuracy: 0.968


## We can observe from the results that Stacking ensemble classifier performs better in fitting the data and the `text` column gives a better score

### Best classifier: ***Stacking Classifier***
### Best data column: ***`text` column***