In [100]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string


In [101]:
df_fake = pd.read_csv('./data/Fake.csv')
df_true = pd.read_csv('./data/True.csv')
print(df_fake.head())
print(df_true.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

In [102]:
df_fake['class'] = 0
df_true['class'] = 1


In [103]:
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

In [104]:
#null values
df_fake.isnull().sum()

title      0
text       0
subject    0
date       0
class      0
dtype: int64

In [105]:
df_true.isnull().sum()

title      0
text       0
subject    0
date       0
class      0
dtype: int64

In [106]:
#duplicate checking
duplicate = df_fake[df_fake.duplicated(subset=['text'], keep=False)]
duplicate

Unnamed: 0,title,text,subject,date,class
9087,BARBRA STREISAND Gives Up On Dream Of Impeachi...,Barbra Streisand was an Obama sycophant and on...,politics,"Dec 31, 2017",0
9088,WATCH: SENATOR LINDSEY GRAHAM DROPS BOMBSHELL…...,Everyone suspected the sketchy Steele Dossier ...,politics,"Dec 31, 2017",0
9089,“CONSERVATIVE GAY GUY” BLASTS Pence’s Aspen Ne...,It s been said that good fences make good neig...,politics,"Dec 30, 2017",0
9091,BILL NYE The FAKE Science Guy THREATENS Conser...,"Friday on MSNBC, climate activist Bill Nye war...",politics,"Dec 30, 2017",0
9093,EMBARRASSING: Anti-Trump “THE HILL” Gets SLAMM...,"#JokeNewsAfter record, cold temperatures were ...",politics,"Dec 30, 2017",0
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [107]:
df_fake = df_fake.drop_duplicates()

In [108]:
df_fake.shape

(23478, 5)

In [109]:
duplicate = df_true[df_true.duplicated(subset=['text'], keep=False)]
duplicate


Unnamed: 0,title,text,subject,date,class
165,What is in the Republicans' final tax bill,(Reuters) - Republicans in the U.S. Congress r...,politicsNews,"December 14, 2017",1
166,Factbox: What is in the U.S. Republicans' fina...,(Reuters) - Republicans in the U.S. Congress r...,politicsNews,"December 14, 2017",1
247,Aide tries to refocus tax debate after Trump's...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,"December 8, 2017",1
250,Aide tries to refocus U.S. tax debate after Tr...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,"December 7, 2017",1
416,Senate tax bill stalls on deficit-focused 'tri...,WASHINGTON (Reuters) - The U.S. Senate on Thur...,politicsNews,"November 30, 2017",1
...,...,...,...,...,...
21290,"Europeans, Africans agree renewed push to tack...",PARIS (Reuters) - Europe s big four continen...,worldnews,"August 28, 2017",1
21345,Thailand's ousted PM Yingluck has fled abroad:...,BANGKOK (Reuters) - Ousted Thai prime minister...,worldnews,"August 25, 2017",1
21353,Thailand's ousted PM Yingluck has fled abroad:...,BANGKOK (Reuters) - Ousted Thai prime minister...,worldnews,"August 25, 2017",1
21406,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1


In [110]:
df_true = df_true.drop_duplicates()
df_true.shape

(21211, 5)

In [111]:
#balance
from sklearn.utils import resample
df_fake = resample(df_fake, 
                            replace=False, 
                            n_samples=len(df_true), 
                            random_state=42)

In [112]:
df_fake.shape, df_true.shape

((21211, 5), (21211, 5))

In [113]:
#combine datasets
df = pd.concat([df_fake, df_true], axis=0)
df.head()

Unnamed: 0,title,text,subject,date,class
13055,IDENTITY OF HILLARY’S MYSTERY “HANDLER” Is Fin...,"About a month ago, people started noticing a l...",politics,"Sep 9, 2016",0
16621,OBAMA’S LAST MOVE: Here’s How He’ll Force Subu...,Obama and HUD want to give one last freebie to...,Government News,"May 9, 2016",0
23224,FOX News Anchor Shepard Smith Finally ‘Comes O...,"21st Century Wire says For many viewers, this ...",Middle-east,"October 18, 2016",0
8274,Trump Flip Flops On Super Bowl Pick After Fin...,"As usual, Donald Trump s support only goes as ...",News,"February 7, 2016",0
6433,Ann Coulter: Trump Doesn’t Need GOP’s Support...,Conservative columnist Ann Coulter took her vo...,News,"May 11, 2016",0


In [114]:
df = df.drop(['title', 'subject', 'date'], axis=1)
df.head()

Unnamed: 0,text,class
13055,"About a month ago, people started noticing a l...",0
16621,Obama and HUD want to give one last freebie to...,0
23224,"21st Century Wire says For many viewers, this ...",0
8274,"As usual, Donald Trump s support only goes as ...",0
6433,Conservative columnist Ann Coulter took her vo...,0


In [115]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,text,class
0,"TOKYO (Reuters) - As Japan looks for a quick, ...",1
1,This Benghazi timeline by Rep Roskam explains ...,0
2,WASHINGTON (Reuters) - President Donald Trump ...,1
3,BEIRUT (Reuters) - Syria s Foreign Ministry on...,1
4,What the heck! Hillary and Bill really are the...,0


In [116]:
def wordopt(text):
    text = text.lower()                                     
    text = re.sub(r'\[.*?\]', ' ', text)                    
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)     
    text = re.sub(r'<.*?>+', ' ', text)                  
    text = re.sub(r'[^a-z\s]', ' ', text)               
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip() 
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text
    

In [117]:
df['text'] = df['text'].apply(wordopt)
df.head()

Unnamed: 0,text,class
0,tokyo reuters as japan looks for a quick resol...,1
1,this benghazi timeline by rep roskam explains ...,0
2,washington reuters president donald trump will...,1
3,beirut reuters syria s foreign ministry on thu...,1
4,what the heck hillary and bill really are the ...,0


In [118]:
x = df['text']
y = df['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [119]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
x_train = vectorization.fit_transform(x_train)
x_test = vectorization.transform(x_test)

In [120]:

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train, y_train)

In [121]:
pred_lr = LR.predict(x_test)
LR.score(x_test, y_test)

0.9857627757872902

In [122]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5282
           1       0.98      0.99      0.99      5324

    accuracy                           0.99     10606
   macro avg       0.99      0.99      0.99     10606
weighted avg       0.99      0.99      0.99     10606



In [123]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()
DT.fit(x_train, y_train)

In [124]:
pred_dt = DT.predict(x_test)
DT.score(x_test, y_test)

0.9959456911182349

In [125]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5282
           1       1.00      1.00      1.00      5324

    accuracy                           1.00     10606
   macro avg       1.00      1.00      1.00     10606
weighted avg       1.00      1.00      1.00     10606



In [126]:
from sklearn.ensemble import GradientBoostingClassifier

GB = GradientBoostingClassifier()
GB.fit(x_train, y_train)


In [130]:
pred_gb = GB.predict(x_test)
GB.score(x_test, y_test)

0.9950971148406562

In [131]:
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5282
           1       0.99      1.00      1.00      5324

    accuracy                           1.00     10606
   macro avg       1.00      1.00      1.00     10606
weighted avg       1.00      1.00      1.00     10606



In [132]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(random_state=0)
RF.fit(x_train, y_train)
pred_rf = RF.predict(x_test)
RF.score(x_test, y_test)

0.9872713558363191

In [134]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5282
           1       0.98      0.99      0.99      5324

    accuracy                           0.99     10606
   macro avg       0.99      0.99      0.99     10606
weighted avg       0.99      0.99      0.99     10606



In [135]:
def output_labels(n):
    if n == 0:
        return 'Fake News'
    elif n==1:
        return 'Not a Fake News'

def testing(news):
    testing_news = {'text':[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test['text'] = new_def_test['text'].apply(wordopt)
    new_x_test = new_def_test['text']
    new_x_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_x_test)
    pred_DT = LR.predict(new_x_test)
    pred_GB = LR.predict(new_x_test)
    pred_RF = LR.predict(new_x_test)

    return print('\n\n LR prediction: {} \n DT prediction: {} \nGB prediction:{} \nRF predcition: {}'.format(output_labels(pred_LR[0]),
                                                                                                            output_labels(pred_DT[0]),
                                                                                                            output_labels(pred_GB[0]),
                                                                                                            output_labels(pred_RF[0])))

In [139]:
news = str(input())
testing(news)

 NASA’s Perseverance rover has successfully collected its first rock sample from Mars, a core from Jezero Crater that could provide key clues about past life.




 LR prediction: Fake News 
 DT prediction: Fake News 
GB prediction:Fake News 
RF predcition: Fake News
