In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [32]:
fake_news = pd.read_csv('Fake.csv')
true_news = pd.read_csv('True.csv')

In [34]:
true_news.head(2)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"


In [35]:
fake_news.head(2)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"


In [36]:
fake_news['class'] = 0
true_news['class'] = 1

In [37]:
fake_news.shape,true_news.shape

((23481, 5), (21417, 5))

In [38]:
# Create a test dataframe with the last 10 rows of each dataset
# and drop the 10 columns from the original dataset

fake_news_manual_testing = fake_news.tail(10)

for i in range (23480, 23470, -1):
    fake_news.drop([i], axis=0, inplace = True)
    
true_news_manual_testing = true_news.tail(10)

for i in range (21416, 21406, -1):
    true_news.drop([i], axis=0, inplace = True)

In [39]:
true_news_manual_testing.head(1)

Unnamed: 0,title,text,subject,date,class
21407,"Mata Pires, owner of embattled Brazil builder ...","SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",worldnews,"August 22, 2017",1


In [40]:
news_merge = pd.concat([fake_news, true_news], axis=0)
news_merge.head(2)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0


In [41]:
# drop unnecessary columns
news = news_merge.drop(['title','subject','date'], axis=1)
news.head()

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [42]:
# Give me the number of null values
news.isnull().sum()

text     0
class    0
dtype: int64

In [43]:
# Random shuffling of the data
news = news.sample(frac=1)
news.head()

Unnamed: 0,text,class
7384,"In February, video surfaced of evangelical pas...",0
17709,When a disgusting ad appeared on TV showing a ...,0
7556,CLEVELAND (Reuters) - As a perennial swing sta...,1
12821,GAZA (Reuters) - A senior Palestinian official...,1
18151,BEIRUT (Reuters) - Syria s army and its allies...,1


In [44]:
news.reset_index(inplace=True)
news.drop(['index'], axis=1, inplace=True)
news.head()

Unnamed: 0,text,class
0,"In February, video surfaced of evangelical pas...",0
1,When a disgusting ad appeared on TV showing a ...,0
2,CLEVELAND (Reuters) - As a perennial swing sta...,1
3,GAZA (Reuters) - A senior Palestinian official...,1
4,BEIRUT (Reuters) - Syria s army and its allies...,1


In [45]:
# Function to process the text

def textprocessor(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.S+', '',text)
    text = re.sub("<.*?>+", "", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub("\n", "", text)
    text = re.sub("\w*\d\w*", "", text)
    return text

In [17]:
news['text'] = news['text'].apply(textprocessor)

In [46]:
x = news['text']
y = news['class']

In [47]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.25)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(X_train)
xv_test = vectorization.transform(X_test)

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
LR = LogisticRegression()
LR.fit(xv_train, y_train)

In [53]:
pred-lr = LR.predict(xv_test)

In [56]:
score = LR.score(xv_test, y_test)
score

0.9868983957219252

In [57]:
print(classification_report(y_test,pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5855
           1       0.98      0.99      0.99      5365

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [59]:
from sklearn.tree import DecisionTreeClassifier

In [60]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

In [61]:
pred_dt = DT.predict(xv_test)

In [62]:
DT.score(xv_test, y_test)

0.9959001782531194

In [63]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5855
           1       1.00      1.00      1.00      5365

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [64]:
from sklearn.ensemble import GradientBoostingClassifier

In [67]:
GB = GradientBoostingClassifier(random_state = 0)
GB.fit(xv_train, y_train)

In [68]:
pred_gb = GB.predict(xv_test)

In [69]:
GB.score(xv_test, y_test)

0.9949197860962566

In [70]:
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5855
           1       0.99      1.00      0.99      5365

    accuracy                           0.99     11220
   macro avg       0.99      1.00      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [71]:
from sklearn.ensemble import RandomForestClassifier

In [74]:
RF = RandomForestClassifier(random_state = 0)
RF.fit(xv_train,y_train)

In [75]:
pred_rf = RF.predict(xv_test)

In [76]:
RF.score(xv_test, y_test)

0.9888591800356507

In [77]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5855
           1       0.99      0.99      0.99      5365

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [None]:
def output_label(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"

