<a href="https://colab.research.google.com/github/ChanderValasai/ML-Projects/blob/main/FakeNewsDetectionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Importing Dependencies**

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads from.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the English stop words; stemming() drops these tokens from the article text.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
# Read the fake and the real news articles from their zipped CSV files.
fake_path, real_path = 'Fake.csv.zip', 'True.csv.zip'
df_fake = pd.read_csv(fake_path)
df_real = pd.read_csv(real_path)

## **Data Preprocessing and Exploratory Data Analysis (EDA)**

---


In [5]:
# Target encoding used throughout the notebook: 0 = fake, 1 = real.
df_fake = df_fake.assign(label=0)
df_real = df_real.assign(label=1)

In [6]:
# Inspect the fake-news frame after the label column has been added.
df_fake

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [7]:
# Stack fake rows on top of real rows and renumber the index from 0.
labelled_frames = [df_fake, df_real]
df_combined = pd.concat(labelled_frames, ignore_index=True)
df_combined

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [8]:
# Shuffle the rows so fake and real articles are interleaved (the concat
# stacks every fake row before the first real one).
# random_state pins the permutation so the displayed rows are reproducible
# from run to run; 2 matches the seed used for train_test_split later on.
df_combined = df_combined.sample(frac=1, random_state=2).reset_index(drop=True)
df_combined

Unnamed: 0,title,text,subject,date,label
0,OBAMA To Stay On Famous Private Island To “Wri...,After signing a multi-million dollar book deal...,politics,"Mar 27, 2017",0
1,BLACK MEN FOR BERNIE FOUNDER Campaigns In Swin...,"Apparently, using thug Michael Ferguson s mom ...",left-news,"Aug 27, 2016",0
2,The Daily Show’s Hasan Minaj Breaks Character...,The effects of a Donald Trump presidency are a...,News,"November 10, 2016",0
3,Hillary Has BIG Lead Over Trump In Latest Nat...,It appears Clinton s post convention bump ha...,News,"August 25, 2016",0
4,It’s Happening: FBI Carries Out Predawn Raid ...,As Donald Trump finds himself plagued with sca...,News,"August 9, 2017",0
...,...,...,...,...,...
44893,WOW! ANOTHER YOUNG MAN Found DEAD After Servin...,That s the third suspicious death of a man tie...,left-news,"Aug 5, 2016",0
44894,"Gorka, a Trump adviser and Bannon ally, is out...",WASHINGTON (Reuters) - White House adviser Seb...,politicsNews,"August 26, 2017",1
44895,Austria promises to consult Rome on passport o...,VIENNA (Reuters) - Austria s new coalition gov...,worldnews,"December 19, 2017",1
44896,Pathetic: Jeb Bush Gets His Mommy To Take On ...,One of the hallmarks about Jeb Bush s failing ...,News,"January 22, 2016",0


In [9]:
# (rows, columns) of the combined, shuffled frame.
df_combined.shape

(44898, 5)

In [10]:
# Class balance: number of articles per label (0 = fake, 1 = real).
df_combined['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,23481
1,21417


In [11]:
# Count missing values in every column.
df_combined.isna().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


## **Stemming**
Stemming reduces each word to its root form by stripping affixes (the Porter stemmer used here removes suffixes). This shrinks the vocabulary, which makes the text easier for the model to learn from.

In [26]:
# One shared Porter stemmer instance, reused by stemming() for every token.
port_Stemmer = PorterStemmer()

In [27]:
# Anything that is not an ASCII letter is treated as a token separator.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# English stop words as a frozenset; filled on the first call to stemming().
_ENGLISH_STOPWORDS = None


def stemming(content):
    """Normalise a piece of text before TF-IDF vectorisation.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are dropped,
    and each remaining token is stemmed with ``port_Stemmer``.

    Args:
        content: Raw text as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # The original called stopwords.words('english') once per token,
        # producing a fresh list and scanning it linearly every time.
        # Loading it once into a frozenset keeps the same membership test
        # but makes it O(1) per token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    tokens = _NON_LETTERS.sub(' ', content).lower().split()
    stemmed_tokens = [
        port_Stemmer.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(stemmed_tokens)

In [None]:
# tqdm.pandas() enables `progress_apply` on pandas objects, so the
# long-running stemming pass can display a progress bar.
from tqdm import tqdm
tqdm.pandas()

In [None]:
# df_combined['text'] = df_combined['text'].progress_apply(stemming)

In [None]:
# df_combined.to_pickle('stemmed_fake_news_combined_df.pkl')

In [12]:
# Fast path: load the pre-stemmed dataset from the cached pickle.
# If the cache is missing, stem the 'text' column here and write the cache,
# so the notebook still runs top to bottom on a fresh checkout (the stemming
# pass can take a long time on the full dataset).
# NOTE: unpickling can execute arbitrary code - only load a cache file that
# you created yourself or otherwise trust.
STEMMED_CACHE_PATH = 'stemmed_fake_news_combined_df.pkl'

try:
    df_combined = pd.read_pickle(STEMMED_CACHE_PATH)
except FileNotFoundError:
    # progress_apply is made available by the tqdm.pandas() call earlier on.
    df_combined = df_combined.assign(
        text=df_combined['text'].progress_apply(stemming)
    )
    df_combined.to_pickle(STEMMED_CACHE_PATH)

In [13]:
# Inspect the frame whose 'text' column now holds the stemmed text.
df_combined

Unnamed: 0,title,text,subject,date,label
0,Darrell Issa: Trump’s Muslim Ban Is OK Becaus...,gop congressman darrel issa offer disgust defe...,News,"January 30, 2017",0
1,Chicago touts new debt structure aimed at savi...,chicago reuter plan announc wednesday mayor of...,politicsNews,"August 9, 2017",1
2,What Mississippi Just Passed Isn’t Only Ridic...,across nation mainli south number religi freed...,News,"March 31, 2016",0
3,Speed up Brexit transition talks or deal will ...,london reuter british financ minist philip ham...,worldnews,"October 11, 2017",1
4,U.S. Spy Chief James Clapper: U.S. Must Be Pre...,well get busi peopl cyber attack happen place ...,Government News,"Sep 10, 2015",0
...,...,...,...,...,...
44893,Energy secretary took charter flight day befor...,new york reuter u energi secretari rick perri ...,politicsNews,"October 4, 2017",1
44894,Even Trump Vodka Was A Charity Scam,donald trump undeni wealthi appar never taught...,News,"June 8, 2016",0
44895,Supreme Court has option to duck travel ban ru...,washington reuter trump administr announc sund...,politicsNews,"September 25, 2017",1
44896,Japan weighs plans to deal with North Korean e...,tokyo reuter japan studi plan cope influx perh...,worldnews,"November 16, 2017",1


In [14]:
# Features are the stemmed article bodies; the target is the 0/1 label.
X, Y = df_combined['text'], df_combined['label']

In [15]:
# Print the feature series followed by the label series.
print(X, Y, sep='\n')

0        gop congressman darrel issa offer disgust defe...
1        chicago reuter plan announc wednesday mayor of...
2        across nation mainli south number religi freed...
3        london reuter british financ minist philip ham...
4        well get busi peopl cyber attack happen place ...
                               ...                        
44893    new york reuter u energi secretari rick perri ...
44894    donald trump undeni wealthi appar never taught...
44895    washington reuter trump administr announc sund...
44896    tokyo reuter japan studi plan cope influx perh...
44897    berni sander known minc word shi speak mind de...
Name: text, Length: 44898, dtype: object
0        0
1        1
2        0
3        1
4        0
        ..
44893    1
44894    0
44895    1
44896    1
44897    0
Name: label, Length: 44898, dtype: int64


In [16]:
# Learn the TF-IDF vocabulary and IDF weights and vectorise the corpus in a
# single pass. fit_transform() gives the same matrix as fit() followed by
# transform(), without processing every document twice.
# Reading from df_combined['text'] (rather than from X, which this cell
# overwrites) keeps the cell safe to re-run.
# NOTE(review): the vectoriser is fitted on the full corpus before the
# train/test split, so the IDF statistics include test documents. Consider
# splitting the raw text first and fitting on the training portion only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_combined['text'])

In [17]:
# Print the sparse TF-IDF matrix: its shape, stored-element count and a sample of (row, column) weights.
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6792176 stored elements and shape (44898, 89633)>
  Coords	Values
  (0, 494)	0.05983347048934866
  (0, 724)	0.09080011544552011
  (0, 1512)	0.05893058944094748
  (0, 2242)	0.04201182842923036
  (0, 2703)	0.05017397503352947
  (0, 3538)	0.03566377066498064
  (0, 4281)	0.05992735077111989
  (0, 4292)	0.06028656304447946
  (0, 5742)	0.17612135501203705
  (0, 5851)	0.13175844694704716
  (0, 5897)	0.03779782626106717
  (0, 7603)	0.06278975676566997
  (0, 8068)	0.09230521379671819
  (0, 8697)	0.05152767111244872
  (0, 8992)	0.068270176014046
  (0, 11553)	0.08556684029870885
  (0, 11894)	0.04088116681066075
  (0, 14055)	0.033724230903091026
  (0, 14119)	0.036447853649461105
  (0, 14259)	0.09187316007457685
  (0, 14624)	0.057109545899495795
  (0, 14694)	0.028656127445964776
  (0, 14880)	0.1093915951437824
  (0, 15062)	0.0834004395166358
  (0, 15109)	0.061076474788386556
  :	:
  (44897, 70318)	0.09145360518072645
  (44897, 71133)	0.0

## Splitting the dataset into training and testing sets

In [31]:
# Hold out 30% of the rows for testing. stratify=Y keeps the fake/real ratio
# the same in both splits, and random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=2,
)

## **Training the Logistic Regression Model**

In [61]:
# class_weight='balanced' weights each class inversely to its frequency, which
# compensates for the slightly larger number of fake articles in the data.
model = LogisticRegression(class_weight='balanced')

In [62]:
# Train on the training split only; the test split is kept for evaluation.
model.fit(X_train, Y_train)

In [63]:
# Accuracy on the data the model was fitted on.
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred): the ground-truth labels go first.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [64]:
# Report the accuracy measured on the training split.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9907725595010818


In [65]:
# Accuracy on the held-out test split.
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred): the ground-truth labels go first.
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [66]:
# Report the accuracy measured on the held-out test split. The original
# message mislabelled this figure as the training accuracy.
print('Accuracy score of the testing data : ', testing_data_accuracy)

Accuracy score of the training data :  0.9832962138084632


In [69]:
# A single unseen headline, passed through the same stemming step as the
# training text so the vectoriser sees tokens in the form it was fitted on.
raw_news = ["The UN has reported that over 735 million people face hunger globally, urging immediate international response."]
X_new = list(map(stemming, raw_news))

In [70]:
# Vectorise the new sample with the already-fitted TF-IDF vocabulary and classify it.
X_new_transformed = vectorizer.transform(X_new)
prediction = model.predict(X_new_transformed)

print(prediction)

# Label 0 means fake; any other label is reported as real.
verdict = 'The news is Fake' if prediction[0] == 0 else 'The news is Real'
print(verdict)

[0]
The news is Fake
