In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import itertools
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
fakedata=pd.read_csv('Fakenews_Fake_data.csv')
truedata=pd.read_csv('Fakenews_True_data.csv')

In [3]:
df1=pd.DataFrame(fakedata)
df2=pd.DataFrame(truedata)

In [4]:
#adding labels to the datasets
df1['label'] = 0
df2['label'] = 1

In [5]:
#concating both the dataframes
df=pd.concat([df1, df2])
df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [6]:
#Shuffles the dataset and reindex
df = df.sample(frac=1).reset_index(drop=True)

In [7]:
df

Unnamed: 0,title,text,subject,date,label
0,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0
1,DODGY DOSSIER: The Trump-Russia Dossier Was Fu...,21st Century Wire says This week an unverified...,US_News,"January 13, 2017",0
2,FLINT RESIDENTS TOLD TO PAY BILLS FOR POISON W...,You seriously can t make this up. This Democra...,politics,"Jan 25, 2016",0
3,LOL! DEMOCRATS Express Concerns Over Possible ...,A progressive group charged Saturday that the ...,politics,"Feb 26, 2017",0
4,Vehemently Anti-Gay Pastor Arrested On 70 Cou...,"On May 9, Pastor David Reynolds, formerly of C...",News,"May 15, 2016",0
...,...,...,...,...,...
44893,COMEDY GENIUS: [Video] “Bob Ross” Paints Sick ...,Steven Crowder knocks it out of the park with ...,politics,"Sep 17, 2016",0
44894,"Medicaid, pension costs create budget complica...",NEW YORK (Reuters) - A sluggish forecast for U...,politicsNews,"July 24, 2017",1
44895,Warren to keep up assault on White House hopef...,WASHINGTON (Reuters) - Democratic U.S. Senator...,politicsNews,"June 9, 2016",1
44896,FLASHBACK [VIDEO]: Libertarian Gary Johnson Di...,Libertarian Presidential candidate Gary Johnso...,left-news,"Sep 8, 2016",0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [9]:
#Making a copy of dataset to messages for preprocessing

messages=df.copy()

We can see that there are no null data in the dataset

In [10]:

 #stemmatize news titles

ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [11]:
corpus[:10]

['new low trump exploit la vega shoot rais money campaign tweet',
 'dodgi dossier trump russia dossier fund polit rival',
 'flint resid told pay bill poison water may children taken away',
 'lol democrat express concern possibl cheat democrat dnc chair vote',
 'vehement anti gay pastor arrest count child porn',
 'senat panel releas draft controversi encrypt bill',
 'watch creepi young ted cruz say want teen film rule everyth',
 'trump continu destroy relat china insan twitter tirad',
 'pull nfl relat ad papa john get unlik sponsor nazi',
 'day announc shameless money grab clock boy explain want come back texa']

In [12]:
#vectorize all preprocessed titles in corpus
## Applying Countvectorizer
# Creating the Bag of Words model
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
x = cv.fit_transform(corpus).toarray()

In [13]:
y=messages.iloc[:,-1]

In [14]:
# Divide the dataset into Train and Test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [15]:
cv.get_feature_names()[:20]

['abadi',
 'abandon',
 'abba',
 'abc',
 'abc news',
 'abe',
 'abedin',
 'abil',
 'abl',
 'abort',
 'abort law',
 'abroad',
 'absolut',
 'abus',
 'academ',
 'accept',
 'access',
 'accid',
 'accident',
 'accomplish']

In [16]:
count_df = pd.DataFrame(x_train, columns=cv.get_feature_names())

In [17]:
count_df.head()

Unnamed: 0,abadi,abandon,abba,abc,abc news,abe,abedin,abil,abl,abort,...,zero,zika,zika fund,zimbabw,zimbabw mnangagwa,zimbabw mugab,zone,zor,zuckerberg,zuma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Building the models

In [18]:
mn = MultinomialNB()
lg = LogisticRegression()
dtc = DecisionTreeClassifier()
knc = KNeighborsClassifier()
rfc = RandomForestClassifier()

In [19]:
model=[mn,lg,dtc,knc,rfc]
for m in model:
    m.fit(x_train,y_train)
    m.score(x_train,y_train)
    pred=m.predict(x_test)
    print("Accuracy score of ",m,"is : ")
    print(accuracy_score(y_test,pred))
    print(confusion_matrix(y_test,pred))
    print(classification_report(y_test,pred))
    print('\n')

Accuracy score of  MultinomialNB() is : 
0.9335412026726058
[[5565  329]
 [ 417 4914]]
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      5894
           1       0.94      0.92      0.93      5331

    accuracy                           0.93     11225
   macro avg       0.93      0.93      0.93     11225
weighted avg       0.93      0.93      0.93     11225



Accuracy score of  LogisticRegression() is : 
0.9496659242761692
[[5556  338]
 [ 227 5104]]
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      5894
           1       0.94      0.96      0.95      5331

    accuracy                           0.95     11225
   macro avg       0.95      0.95      0.95     11225
weighted avg       0.95      0.95      0.95     11225



Accuracy score of  DecisionTreeClassifier() is : 
0.9019153674832963
[[5461  433]
 [ 668 4663]]
              precision    recall  f1-score   support

           0

From the above models accuracy scores and matrics we can observe that MultinomialNB(), LogisticRegression() and RandomForestClassifier() are giving a better score, now let us consider these and check for cross validation and Parameter tuning.

In [20]:
from sklearn.model_selection import cross_val_score

#Cross validation

m1=[mn,lg,rfc]
for m in m1:
    score=cross_val_score(m,x_train,y_train,cv=5)
    print("Score of : ",m )
    print(score)
    print(score.mean()
         )
    print(score.std())
    print('\n')

Score of :  MultinomialNB()
[0.92917595 0.92739421 0.93184855 0.93451143 0.92604693]
0.929795413759779
0.003054731153845713


Score of :  LogisticRegression()
[0.94491463 0.94580549 0.94877506 0.94772795 0.94371844]
0.9461883131816317
0.0018401520894550261


Score of :  RandomForestClassifier()
[0.92531552 0.93259094 0.92858203 0.93154143 0.92649243]
0.9289044701962297
0.0028052150716863706




In [22]:
#Parameter tuning for LogisticRegression

from sklearn.model_selection import GridSearchCV

params1={'penalty':['l1','l2','elasticnet']}
grd1=GridSearchCV(estimator=lg,param_grid=params1,cv=5)
grd1.fit(x_train,y_train)

In [23]:
grd1.best_score_

0.9461883131816317

In [24]:
grd1.best_params_

{'penalty': 'l2'}

As we can see that LogisticRegression model is giving us the best score of 94.61% where penalty is 'l2', we can finalize this as our final model

In [27]:
# Finalizing the best model

final_classifier=LogisticRegression(penalty='l2')
final_classifier.fit(x_train,y_train)
pred=final_classifier.predict(x_test)
print(accuracy_score(y_test,pred))
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

0.9496659242761692
[[5556  338]
 [ 227 5104]]
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      5894
           1       0.94      0.96      0.95      5331

    accuracy                           0.95     11225
   macro avg       0.95      0.95      0.95     11225
weighted avg       0.95      0.95      0.95     11225



In [28]:
# We can save the final model now

import joblib
joblib.dump(final_classifier,'Fake_news_classifier.obj')

['Fake_news_classifier.obj']

Loading the saved model and predicting the values

In [30]:
Fake_news=joblib.load('Fake_news_classifier.obj')

In [31]:
pred=Fake_news.predict(x_test)
print("Predicted values :",pred)

Predicted values : [0 1 1 ... 1 0 0]


In [32]:
y_test

22216    0
27917    1
25007    1
1377     0
32476    0
        ..
15578    1
29394    0
3120     1
25388    0
14337    0
Name: label, Length: 11225, dtype: int64

In [33]:
# Making a DataFrame of Predicted values and Original values

df_predicted=pd.DataFrame({'Predicted values':pred,'Original values':y_test})
df_predicted

Unnamed: 0,Predicted values,Original values
22216,0,0
27917,1,1
25007,1,1
1377,0,0
32476,1,0
...,...,...
15578,1,1
29394,0,0
3120,1,1
25388,0,0
