# Fake News Detection Using Naive Bayes Classifier

In [77]:
##Import the necessary modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [78]:
## Read the news dataset from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='fake_or_real_news.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [79]:
## Pull the "label" column out as the target Series and preview it
y = df["label"]
y.head(n=5)

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [80]:
## Split the article text and labels: 33% is held out for testing,
## and the fixed random_state keeps the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    y,
    test_size=0.33,
    random_state=53,
)

## Creating Features Using Count Vectorizer

In [81]:
## Set up the CountVectorizer that will produce the bag-of-words features,
## configured with the built-in English stop-word list
count_vectorizer = CountVectorizer(
    stop_words='english',
)

In [82]:
## Build the bag-of-words matrices: the vocabulary is learned from the
## training text only, and the test text is encoded with that same vocabulary
count_train = count_vectorizer.fit_transform(raw_documents=X_train)
count_test = count_vectorizer.transform(raw_documents=X_test)

In [83]:
## Print the first 10 features (vocabulary terms) of count_vectorizer.
## get_feature_names() was removed in scikit-learn 1.2; its replacement
## get_feature_names_out() returns a NumPy array, so the slice is converted
## with .tolist() to keep the plain Python list printout.
print(count_vectorizer.get_feature_names_out()[:10].tolist())

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']


## Creating Tfidf vectors for our document

In [84]:
## Bring in the TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Set up the TfidfVectorizer that will produce the TF-IDF features,
## configured with the built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
)

In [85]:
## Build the TF-IDF matrices: fit on the training text only, then encode
## the test text with the fitted vectorizer
tfidf_train = tfidf_vectorizer.fit_transform(raw_documents=X_train)
tfidf_test = tfidf_vectorizer.transform(raw_documents=X_test)

In [86]:
## Print the first 10 features (vocabulary terms) of tfidf_vectorizer.
## get_feature_names() was removed in scikit-learn 1.2; its replacement
## get_feature_names_out() returns a NumPy array, so the slice is converted
## with .tolist() to keep the plain Python list printout.
print(tfidf_vectorizer.get_feature_names_out()[:10].tolist())

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']


In [87]:
## Print the first 5 TF-IDF vectors of the training set.
## Slice the sparse matrix first and densify only those 5 rows; converting
## the whole matrix to a dense array just to show 5 rows wastes memory.
print(tfidf_train[:5].toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Inspecting the vectors created by count vectorizer and tfidf vectorizer

In [88]:
## Create a DataFrame for each feature matrix, one column per vocabulary term.
## NOTE: toarray() materializes the full dense matrix, which can be large.
## get_feature_names() was removed in scikit-learn 1.2, so the column labels
## come from get_feature_names_out() instead.
count_df = pd.DataFrame(
    count_train.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)
tfidf_df = pd.DataFrame(
    tfidf_train.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [89]:
## Preview the first five rows of the count-feature table
count_df.head(n=5)

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000ft,000km,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
## Preview the first five rows of the TF-IDF feature table
tfidf_df.head(n=5)

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000ft,000km,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
## Check whether the count and TF-IDF DataFrames hold identical contents
count_df.equals(other=tfidf_df)

False

In [92]:
##For classification use naive bayes classifier
##import necessary libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


## Training and testing using count vectorizer

In [93]:
## Train and score a Multinomial Naive Bayes classifier on the count features
def train_and_predict(alpha):
    """Fit MultinomialNB on the count features and score it on the test set.

    Parameters
    ----------
    alpha : float
        Smoothing parameter passed straight to ``MultinomialNB``.

    Returns
    -------
    float
        Accuracy of the predictions for ``count_test`` measured against
        ``y_test``.
    """
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(count_train, y_train)
    y_pred = nb_classifier.predict(count_test)
    return metrics.accuracy_score(y_test, y_pred)
## y_pred exists only inside the function, so it cannot be displayed at cell
## level; doing so raises NameError on a fresh kernel.

In [94]:
## Sweep alpha from 0.0 up to 0.9 in steps of 0.1 and report the test
## accuracy obtained for each value
import numpy as np

alphas = list(np.arange(0, 1, 0.1))
for alpha in alphas:
    print('Alpha:', alpha)
    score = train_and_predict(alpha)
    print('Score', score)

Alpha: 0.0
Score 0.8847441415590627
Alpha: 0.1
Score 0.8928742228598756
Alpha: 0.2




Score 0.8923959827833573
Alpha: 0.30000000000000004
Score 0.893352462936394
Alpha: 0.4
Score 0.893352462936394
Alpha: 0.5
Score 0.8943089430894309
Alpha: 0.6000000000000001
Score 0.8938307030129125
Alpha: 0.7000000000000001
Score 0.8928742228598756
Alpha: 0.8
Score 0.8928742228598756
Alpha: 0.9
Score 0.8928742228598756


In [95]:
## With alpha=0.5 we are getting the best accuracy (0.8943)

## Training and testing using tfidf vectorizer

In [96]:
## Train and score a Multinomial Naive Bayes classifier on the TF-IDF features
def train_and_predict(alpha):
    """Fit MultinomialNB on the TF-IDF features and score it on the test set.

    Parameters
    ----------
    alpha : float
        Smoothing parameter passed straight to ``MultinomialNB``.

    Returns
    -------
    float
        Accuracy of the predictions for ``tfidf_test`` measured against
        ``y_test``.
    """
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    y_pred = nb_classifier.predict(tfidf_test)
    return metrics.accuracy_score(y_test, y_pred)
## y_pred exists only inside the function, so it cannot be displayed at cell
## level; doing so raises NameError on a fresh kernel.

In [97]:
## Sweep alpha from 0.0 up to 0.9 in steps of 0.1 and report the test
## accuracy obtained for each value
import numpy as np

alphas = list(np.arange(0, 1, 0.1))
for alpha in alphas:
    print('Alpha:', alpha)
    score = train_and_predict(alpha)
    print('Score', score)

Alpha: 0.0
Score 0.8813964610234337
Alpha: 0.1
Score 0.8976566236250598
Alpha: 0.2
Score 0.8938307030129125
Alpha: 0.30000000000000004




Score 0.8900047824007652
Alpha: 0.4
Score 0.8857006217120995
Alpha: 0.5
Score 0.8842659014825442
Alpha: 0.6000000000000001
Score 0.874701099952176
Alpha: 0.7000000000000001
Score 0.8703969392635102
Alpha: 0.8
Score 0.8660927785748446
Alpha: 0.9
Score 0.8589191774270684


In [98]:
## Performing best with alpha=0.1

# Result: Naive Bayes with the TF-IDF vectorizer performed best, with an accuracy of 89.77% (alpha=0.1)