In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

In [2]:
#Read the data
df=pd.read_csv('train.csv')

#Get shape and head
df.shape
df.head()

Unnamed: 0,title,text,subject,date,label
0,PRESIDENT TRUMP Explains New “America First” R...,That s what we re talking about! Another campa...,politics,2-Aug-17,Fake
1,TERMINALLY ILL FORMER MISS WI: “Until my last ...,How is it that Sean Hannity is the only media ...,politics,4-Oct-16,Fake
2,Cruz Humiliated By Moderator After Lie About ...,Almost immediately after learning that longtim...,News,13-Feb-16,Fake
3,"Russia revels in Trump victory, looks to sanct...",MOSCOW (Reuters) - For all their mutual praise...,politicsNews,9-Nov-16,Real
4,Trump's bid to open U.S. monuments to developm...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,26-May-17,Real


In [37]:
columnsList = df.columns
columnsList

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [38]:
df.count()

title      40019
text       40019
subject    40000
date       40000
label      40000
dtype: int64

In [39]:
df.isna().sum()
#as can be observed there are null values in the dataset and we can drop them since they arent too many

title      38
text       38
subject    57
date       57
label      57
dtype: int64

In [40]:
df.describe()

Unnamed: 0,title,text,subject,date,label
count,40019,40019.0,40000,40000,40000
unique,35086,34976.0,19,1023,13
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,20-Dec-17,Fake
freq,12,552.0,10075,171,20868


In [41]:
df = df.dropna()
df.count()
df.head(5)

Unnamed: 0,title,text,subject,date,label
0,PRESIDENT TRUMP Explains New “America First” R...,That s what we re talking about! Another campa...,politics,2-Aug-17,Fake
1,TERMINALLY ILL FORMER MISS WI: “Until my last ...,How is it that Sean Hannity is the only media ...,politics,4-Oct-16,Fake
2,Cruz Humiliated By Moderator After Lie About ...,Almost immediately after learning that longtim...,News,13-Feb-16,Fake
3,"Russia revels in Trump victory, looks to sanct...",MOSCOW (Reuters) - For all their mutual praise...,politicsNews,9-Nov-16,Real
4,Trump's bid to open U.S. monuments to developm...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,26-May-17,Real


In [42]:
df['label'].value_counts()

Fake                                                                                                                                                                                                                                                                                                                                                                                                                              20868
Real                                                                                                                                                                                                                                                                                                                                                                                                                              19113
 Stone enlisted some 200 hardened criminals for the realistic riot scene.Cult Crimes & The Daily ShooterWhen concerning America s cult crimes           

In [43]:
df = df.loc[(df['label'] >= 'Fake') & (df['label'] <= 'Real' )]
labels = df['label'].loc[(df['label'] >= 'Fake') & (df['label'] <= 'Real' )]


In [44]:
labels.value_counts()

Fake    20868
Real    19113
Name: label, dtype: int64

**More Pre-Processing**

In [45]:
df['title'] = df['title'].str.lower()
df['text'] = df['text'].str.lower()

#Can also use stemming

In [46]:
#Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['title'], labels, test_size=0.2, random_state=7)

In [47]:
#Initialize a TfidfVectorizer
#Stop words from english defined so as to ignore words that add no meaning.
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [72]:
# print(tfidf_test)

In [48]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

#Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.69%


In [49]:
#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['Fake','Real'])

#4000 Fake were correctly predicted, while 243 were not
#255 Real were not correctly predicted while 3500 were

array([[3933,  252],
       [ 253, 3559]], dtype=int64)

In [50]:
from IPython.display import Image

In [52]:
predHeadline = " Sarah Palin Just Openly Admitted She Prefers Being Interviewed By Children Over The Press".lower()
predVec = tfidf_vectorizer.transform([predHeadline])
pred = pac.predict(predVec)

In [53]:
predHeadline = " Trump has attained Victory in the elections in which Russia Revels ".lower()
predVec = tfidf_vectorizer.transform([predHeadline])
pred = pac.predict(predVec)

In [54]:
pred

array(['Real'], dtype='<U4')

In [55]:
#Testing it on our testing dataset of 4000 datapoints:
dtest=pd.read_csv('test.csv')

In [56]:
predTitles = dtest['text']
predTitles

0        Fantastic testimony on the disastrous results...
1       MEXICO CITY (Reuters) - Mexican President Enri...
2       Hillary Clinton has picked up a huge endorseme...
3       Did anyone else think it was the ultimate iron...
4       There have been a lot of strange this that hav...
                              ...                        
3995     Advocates for big government and progressive ...
3996    WASHINGTON (Reuters) - U.S. President Donald T...
3997    It doesn t take a rocket scientist to know tha...
3998    On New Jersey 101.5, Chris Christie made the a...
3999    The Donald Trump Administration has to be the ...
Name: text, Length: 4000, dtype: object

In [57]:
predictionTestVector = tfidf_vectorizer.transform(predTitles)
predTest = pac.predict(predictionTestVector)

In [58]:
predTest

array(['Fake', 'Real', 'Fake', ..., 'Fake', 'Fake', 'Fake'], dtype='<U4')

In [59]:
#Exporting our trained model:
joblib.dump(pac,'pblModel.joblib')

['pblModel.joblib']

In [None]:
from sklearn.externals import joblib
from firebase_admin import storage
bucket = storage.bucket(name='dspbl-6a563.appspot.com')
b = bucket.blob('model-v1/model.joblib')
b.upload_from_filename('model.joblib')
print('model uploaded!')