In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [69]:
df = pd.read_csv("fakenews.csv")

In [70]:
pd.set_option("display.max_columns",None)

In [71]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [72]:
df.drop(columns=["Unnamed: 0"],axis = 1,inplace = True)

In [73]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [74]:
df["label"].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [75]:
df["label"] = df["label"].map({"FAKE":1,"REAL":0})

In [76]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


In [77]:
df.isnull().sum()

title    0
text     0
label    0
dtype: int64

In [78]:
df.duplicated().sum()

29

In [79]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
6330    False
6331    False
6332    False
6333    False
6334    False
Length: 6335, dtype: bool

In [80]:
df.drop_duplicates(inplace = True,keep = False)

In [81]:
df.duplicated().sum()

0

In [82]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
Ps = PorterStemmer()
from nltk.tokenize import word_tokenize
import string

In [83]:
import re
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content)
    stemmed_content=stemmed_content.split()
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content

In [84]:
df["trans_text"] = df["text"].apply(stemming)

In [85]:
df["trans_title"] = df["title"].apply(stemming)

In [86]:
df.head()

Unnamed: 0,title,text,label,trans_text,trans_title
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1,Daniel Greenfield a Shillman Journalism Fellow...,You Can Smell Hillary s Fear
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1,Google Pinterest Digg Linkedin Reddit Stumbleu...,Watch The Exact Moment Paul Ryan Committed Pol...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0,U S Secretary of State John F Kerry said Monda...,Kerry to go to Paris in gesture of sympathy
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1,Kaydee King KaydeeKing November The lesson fro...,Bernie supporters on Twitter erupt in anger ag...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0,It s primary day in New York and front runners...,The Battle of New York Why This Primary Matters


In [87]:
df.drop(columns=["title","text"],axis = 1,inplace=True)

In [88]:
df["Label"] = df["label"]
df.drop(columns = ["label"],axis = 1, inplace = True)

In [89]:
df.head()

Unnamed: 0,trans_text,trans_title,Label
0,Daniel Greenfield a Shillman Journalism Fellow...,You Can Smell Hillary s Fear,1
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,Watch The Exact Moment Paul Ryan Committed Pol...,1
2,U S Secretary of State John F Kerry said Monda...,Kerry to go to Paris in gesture of sympathy,0
3,Kaydee King KaydeeKing November The lesson fro...,Bernie supporters on Twitter erupt in anger ag...,1
4,It s primary day in New York and front runners...,The Battle of New York Why This Primary Matters,0


In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer()

In [97]:
X = vector.fit_transform(df["trans_text"],df["trans_title"]).toarray()
y = df.iloc[:,-1].values

In [98]:
from sklearn.model_selection import train_test_split

In [99]:
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 2)

In [100]:
from sklearn.naive_bayes import MultinomialNB
MnB = MultinomialNB()
MnB.fit(X_train,y_train)

In [101]:
y_pred = MnB.predict(X_test)

In [102]:
from sklearn.metrics import accuracy_score, precision_score

In [103]:
accuracy_score(y_test,y_pred= y_pred)

0.7415074309978769

In [104]:
precision_score(y_test,y_pred)

0.9939516129032258

In [105]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(n_estimators=5)
RFC.fit(X_train,y_train)
y_pred_1 = RFC.predict(X_test)

In [106]:
accuracy_score(y_test,y_pred_1)

0.7823779193205945

In [107]:
precision_score(y_test,y_pred_1)

0.8058252427184466

In [108]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(X_train,y_train)
y_pred_2 = log.predict(X_test)

In [109]:
accuracy_score(y_test,y_pred_2)

0.9124203821656051

In [110]:
precision_score(y_test,y_pred_2)

0.9101010101010101

In [111]:
from sklearn.neighbors import KNeighborsClassifier  
classifier= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 )  
classifier.fit(X_train, y_train)  

In [112]:
y_pred_3 = classifier.predict(X_test)


In [113]:
accuracy_score(y_test,y_pred_3)

0.7452229299363057

In [114]:
precision_score(y_test,y_pred_3)

0.6875471698113208

In [115]:
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier(criterion='entropy', random_state=0)
DTC.fit(X_train,y_train)

In [116]:
y_pred_4 = DTC.predict(X_test)

In [117]:
accuracy_score(y_test,y_pred_4)

0.7924628450106157

In [118]:
precision_score(y_test,y_pred_4)

0.7989795918367347

In [119]:
import pickle
pickle.dump(vector,open("vectorize.pkl","wb"))
pickle.dump(log,open("model.pkl","wb"))