In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
import re
import string

In [2]:
data_fake = pd.read_csv("Fake.csv")
data_true = pd.read_csv("True.csv")

In [3]:
data_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
data_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
data_fake["class"] = 0
data_true["class"] = 1

# Removing 10 data points for testing

In [6]:
data_fake.shape, data_true.shape

((23481, 5), (21417, 5))

In [7]:
data_fake_manual_test = data_fake.tail(10)
for i in range(23480, 23470, -1):
  data_fake.drop([i], axis = 0, inplace = True)

data_true_manual_test = data_true.tail(10)
for i in range(21416, 21406, -1):
  data_true.drop([i], axis = 0, inplace = True)

In [8]:
data_fake.shape, data_true.shape

((23471, 5), (21407, 5))

In [9]:
data_fake_manual_test["class"] = 0
data_true_manual_test["class"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fake_manual_test["class"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_true_manual_test["class"] = 1


In [10]:
data_true_manual_test.head()

Unnamed: 0,title,text,subject,date,class
21407,"Mata Pires, owner of embattled Brazil builder ...","SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",worldnews,"August 22, 2017",1
21408,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1
21409,"U.S., North Korea clash at U.N. arms forum on ...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1
21410,Headless torso could belong to submarine journ...,COPENHAGEN (Reuters) - Danish police said on T...,worldnews,"August 22, 2017",1
21411,North Korea shipments to Syria chemical arms a...,UNITED NATIONS (Reuters) - Two North Korean sh...,worldnews,"August 21, 2017",1


# Merge true and false datasets and process the merged dataset


In [11]:
data_merge = pd.concat([data_fake, data_true], axis = 0)

In [12]:
data = data_merge.drop(['title', 'subject', 'date'], axis = 1)

In [13]:
data.isnull().sum()

text     0
class    0
dtype: int64

In [14]:
data = data.sample(frac = 1)

In [15]:
data.head()

Unnamed: 0,text,class
15917,WASHINGTON (Reuters) - The United States urged...,1
4118,"NEW YORK (Reuters) - Hillary Clinton, who unti...",1
11438,AROUND 70 per cent of female refugees in north...,0
21072,We hear so much talk from the Left about worki...,0
152,WASHINGTON (Reuters) - The final version of co...,1


In [16]:
data.reset_index(inplace = True)
data.drop(['index'], axis = 1, inplace = True)

data.head()

Unnamed: 0,text,class
0,WASHINGTON (Reuters) - The United States urged...,1
1,"NEW YORK (Reuters) - Hillary Clinton, who unti...",1
2,AROUND 70 per cent of female refugees in north...,0
3,We hear so much talk from the Left about worki...,0
4,WASHINGTON (Reuters) - The final version of co...,1


# Function to define text

This function is designed to clean and normalize text data by converting it to lowercase, removing specific patterns (such as URLs and HTML tags), replacing non-word characters with spaces, removing punctuation, and eliminating words containing digits.

In [17]:
def wordopt(text):
  text = text.lower()
  text = re.sub('\[.*?\]', '', text)
  text = re.sub("\\W"," ", text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub('\n', '', text)
  text = re.sub('\w*\d\w*', '', text)
  return text

In [18]:
data['text'] = data['text'].apply(wordopt)

In [19]:
x = data['text']
y = data['class']

In [20]:
x.head()

0    washington  reuters    the united states urged...
1    new york  reuters    hillary clinton  who unti...
2    around  per cent of female refugees in norther...
3    we hear so much talk from the left about worki...
4    washington  reuters    the final version of co...
Name: text, dtype: object

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25)

# Vectorization of text

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(xv_train, y_train)

In [24]:
pred_lr = reg.predict(xv_test)

In [25]:
reg.score(xv_test, y_test)

0.9854723707664884

In [26]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5869
           1       0.98      0.99      0.98      5351

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



# Decision Tree Classification

In [27]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(xv_train, y_train)

In [28]:
pred_dt = dt.predict(xv_test)

In [29]:
dt.score(xv_test, y_test)

0.9945632798573975

In [30]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      5869
           1       0.99      0.99      0.99      5351

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



# Gradient Boosting Classifier

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=0)
gb.fit(xv_train, y_train)

In [32]:
pred_gb = gb.predict(xv_test)

In [33]:
gb.score(xv_test, y_test)

0.9950089126559715

In [34]:
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5869
           1       0.99      1.00      0.99      5351

    accuracy                           1.00     11220
   macro avg       0.99      1.00      0.99     11220
weighted avg       1.00      1.00      1.00     11220



# Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=0)
rf.fit(xv_train, y_train)

In [36]:
pred_rf = rf.predict(xv_test)

In [37]:
rf.score(xv_test, y_test)

0.9849376114081997

In [38]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5869
           1       0.98      0.98      0.98      5351

    accuracy                           0.98     11220
   macro avg       0.98      0.98      0.98     11220
weighted avg       0.98      0.98      0.98     11220



In [39]:
def output_label(n):
  if n==0:
    return "Fake News"
  elif n==1:
    return "Not a Fake News"

def manual_testing(news):
  testing_news = {"text":[news]}
  new_def_test = pd.DataFrame(testing_news)
  new_def_test["text"] = new_def_test["text"].apply(wordopt)
  new_x_test = new_def_test["text"]
  new_xv_test = vectorization.transform(new_x_test)
  pred_lr = reg.predict(new_xv_test)
  pred_dt = dt.predict(new_xv_test)
  pred_gb = gb.predict(new_xv_test)
  pred_rf = rf.predict(new_xv_test)

  return print("\n\nLogistic Regression Prediction: {} \nDecision Tree Prediction: {} \nGradient Boost Classifier Prediction: {} \nRandom Forest Prediction: {}".format(output_label(pred_lr[0]), output_label(pred_dt[0]), output_label(pred_gb[0]), output_label(pred_rf[0])))

In [40]:
news = str(input())
manual_testing(news)



Logistic Regression Prediction: Fake News 
Decision Tree Prediction: Fake News 
Gradient Boost Classifier Prediction: Fake News 
Random Forest Prediction: Fake News


In [41]:
news = str(input())
manual_testing(news)



Logistic Regression Prediction: Fake News 
Decision Tree Prediction: Fake News 
Gradient Boost Classifier Prediction: Fake News 
Random Forest Prediction: Fake News


In [42]:
news = str(input())
manual_testing(news)



Logistic Regression Prediction: Fake News 
Decision Tree Prediction: Fake News 
Gradient Boost Classifier Prediction: Fake News 
Random Forest Prediction: Fake News
