In [121]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [122]:
# Fetch the NLTK stop-word corpus; `stopwords.words('english')` below reads from it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [123]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [124]:
# Load the two source CSVs into separate frames (fake articles and true articles).
# NOTE(review): the absolute '/content/...' paths look environment-specific
# (presumably a Colab runtime) — adjust them when running elsewhere.
fake_data = pd.read_csv('/content/Fake.csv')
true_data = pd.read_csv('/content/True.csv')

In [125]:
fake_data.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [126]:
true_data.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [127]:
# Label convention used by the rest of the notebook: 1 = fake news, 0 = real news.
fake_data['label'] = 1
true_data['label'] = 0

In [128]:
news_data = pd.concat([fake_data, true_data], ignore_index=True)

In [129]:
# Shuffle the combined rows so fake and real articles are interleaved.
# A fixed seed keeps the row order (and every row-indexed output) reproducible
# across Restart & Run All.
news_data = news_data.sample(frac=1, random_state=3).reset_index(drop=True)

In [130]:
news_data

Unnamed: 0,title,text,subject,date,label
0,HILLARY SUPPORTER BRAGS About Looting “White B...,In addition to be a disgusting criminal and an...,left-news,"Aug 15, 2016",1
1,Libertarians Want Security Briefings – Maybe ...,Libertarians are convinced that their presiden...,News,"September 15, 2016",1
2,MARIA BARTIROMO Gets Into Heated Interview Wit...,The DNC Chair Tom Perez took his delusional an...,politics,"Nov 8, 2017",1
3,WATCH: New Clinton Ad Brilliantly Goes After ...,If you listen to Donald Trump at any of his ma...,News,"July 1, 2016",1
4,Factbox: What to watch for at China's Communis...,BEIJING (Reuters) - China s ruling Communist P...,worldnews,"October 15, 2017",0
...,...,...,...,...,...
45734,HOW PRESIDENT TRUMP Can Remove Congress From T...,President Trump tweeted that he was considerin...,politics,"Jul 30, 2017",1
45735,Fugitive Italian 'cocaine king' arrested in Ur...,ROME/MONTEVIDEO (Reuters) - One of Italy s mos...,worldnews,"September 4, 2017",0
45736,U.S. senators vote to bolster travel security ...,WASHINGTON (Reuters) - The U.S. Senate voted o...,politicsNews,"April 7, 2016",0
45737,Samantha Bee BLASTS GOP Over ‘C*ckblocking’ O...,Full Frontal host Samantha Bee is NOT one to m...,News,"March 22, 2016",1


In [131]:
 news_data.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [132]:
# Merge the title and text columns into a single 'content' column,
# which is the field later cleaned and vectorized for the model.
news_data['content'] = news_data['title']+' '+news_data['text']

In [133]:
news_data['content']

Unnamed: 0,content
0,HILLARY SUPPORTER BRAGS About Looting “White B...
1,Libertarians Want Security Briefings – Maybe ...
2,MARIA BARTIROMO Gets Into Heated Interview Wit...
3,WATCH: New Clinton Ad Brilliantly Goes After ...
4,Factbox: What to watch for at China's Communis...
...,...
45734,HOW PRESIDENT TRUMP Can Remove Congress From T...
45735,Fugitive Italian 'cocaine king' arrested in Ur...
45736,U.S. senators vote to bolster travel security ...
45737,Samantha Bee BLASTS GOP Over ‘C*ckblocking’ O...


In [134]:
news_data = news_data.drop(['subject'], axis=1)

In [135]:
news_data = news_data.drop(['date'], axis=1)

In [136]:
news_data

Unnamed: 0,title,text,label,content
0,HILLARY SUPPORTER BRAGS About Looting “White B...,In addition to be a disgusting criminal and an...,1,HILLARY SUPPORTER BRAGS About Looting “White B...
1,Libertarians Want Security Briefings – Maybe ...,Libertarians are convinced that their presiden...,1,Libertarians Want Security Briefings – Maybe ...
2,MARIA BARTIROMO Gets Into Heated Interview Wit...,The DNC Chair Tom Perez took his delusional an...,1,MARIA BARTIROMO Gets Into Heated Interview Wit...
3,WATCH: New Clinton Ad Brilliantly Goes After ...,If you listen to Donald Trump at any of his ma...,1,WATCH: New Clinton Ad Brilliantly Goes After ...
4,Factbox: What to watch for at China's Communis...,BEIJING (Reuters) - China s ruling Communist P...,0,Factbox: What to watch for at China's Communis...
...,...,...,...,...
45734,HOW PRESIDENT TRUMP Can Remove Congress From T...,President Trump tweeted that he was considerin...,1,HOW PRESIDENT TRUMP Can Remove Congress From T...
45735,Fugitive Italian 'cocaine king' arrested in Ur...,ROME/MONTEVIDEO (Reuters) - One of Italy s mos...,0,Fugitive Italian 'cocaine king' arrested in Ur...
45736,U.S. senators vote to bolster travel security ...,WASHINGTON (Reuters) - The U.S. Senate voted o...,0,U.S. senators vote to bolster travel security ...
45737,Samantha Bee BLASTS GOP Over ‘C*ckblocking’ O...,Full Frontal host Samantha Bee is NOT one to m...,1,Samantha Bee BLASTS GOP Over ‘C*ckblocking’ O...


In [137]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

# Build the stop-word lookup once. Evaluating `stopwords.words('english')` for
# every token rebuilds the word list each time and then scans it linearly,
# which dominates the runtime on a corpus of this size.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def stemming(content):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps, in order:
      1. replace every character outside a-z / A-Z with a space,
      2. lower-case and split on whitespace,
      3. drop English stop words,
      4. stem each remaining token with the Snowball stemmer.

    Parameters
    ----------
    content : str
        Raw text (e.g. an article's title and body).

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty string
        when nothing survives the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return ' '.join(
        stemmer.stem(token) for token in tokens if token not in ENGLISH_STOPWORDS
    )


In [138]:
# Clean and stem every article with `stemming`. The previous call referenced
# `clean_and_stem`, which is not defined anywhere in this notebook and raises
# NameError on a fresh kernel. `stemming` is also what the interactive
# prediction cell applies to user input, so both paths share one preprocessing step.
news_data['content'] = news_data['content'].apply(stemming)

In [139]:
print(news_data)

                                                   title  \
0      HILLARY SUPPORTER BRAGS About Looting “White B...   
1       Libertarians Want Security Briefings – Maybe ...   
2      MARIA BARTIROMO Gets Into Heated Interview Wit...   
3       WATCH: New Clinton Ad Brilliantly Goes After ...   
4      Factbox: What to watch for at China's Communis...   
...                                                  ...   
45734  HOW PRESIDENT TRUMP Can Remove Congress From T...   
45735  Fugitive Italian 'cocaine king' arrested in Ur...   
45736  U.S. senators vote to bolster travel security ...   
45737   Samantha Bee BLASTS GOP Over ‘C*ckblocking’ O...   
45738  U.S. will only talk to North Korea about freei...   

                                                    text  label  \
0      In addition to be a disgusting criminal and an...      1   
1      Libertarians are convinced that their presiden...      1   
2      The DNC Chair Tom Perez took his delusional an...      1   
3      If y

In [140]:
x = news_data['content'].values
y = news_data['label'].values

In [141]:
print(x)
print(y)

['hillari support brag loot “white businesses” milwaukee…honor hillari equal opportun card…br sister along video addit disgust crimin idiot jerom also hillari support twitter account clear show besid open crimin racist also hillari support bio show support blacklivesmatt hillari imwithherjayrom william also tweet support hillaryher jayrom damn autocorrect keep tri make name jerom brag go loot busi take advantag cri riot death arm black man run polic refus drop stolen gun specif mention white own busi gonna loot jayrom brag loot got hard night work steal white own busi jayrom brag bring sister along loot true hillari support will give sister opportun steal white man would retweet hillari support jayrom william wonder support hillari want take gun hand legal gun owner leav thug gun dont send threat gang aint war caus strap gun yall cant afford forgi jayforgi august'
 'libertarian want secur brief – mayb gari johnson would know aleppo libertarian convinc presidenti ticket get secur brief 

In [142]:
y.shape

(45739,)

In [143]:
# converting textual data into numerical data
# TF-IDF features over the cleaned text, with scikit-learn's built-in English
# stop-word list applied and the vocabulary capped at 10000 features.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so its vocabulary and IDF weights have seen the test
# articles; consider fitting on the training split only and transforming the test split.
# NOTE(review): `x` is rebound from raw strings to the sparse matrix here, so
# this cell is not safe to re-run on its own.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000
)
x = vectorizer.fit_transform(x)

In [144]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5920804 stored elements and shape (45739, 10000)>
  Coords	Values
  (0, 4090)	0.3211151000864323
  (0, 8652)	0.2555310673995522
  (0, 1071)	0.303251798695266
  (0, 5221)	0.4745730761123932
  (0, 9765)	0.1409640012344703
  (0, 4158)	0.06611880311045634
  (0, 2942)	0.06465034270591559
  (0, 6204)	0.11877726845886588
  (0, 1332)	0.0726958785863122
  (0, 8116)	0.23604714727579543
  (0, 9548)	0.0347578752008041
  (0, 89)	0.0547401616274891
  (0, 2521)	0.0691885829834871
  (0, 2058)	0.10832952358933326
  (0, 4278)	0.07697384373315252
  (0, 4683)	0.22164648884503457
  (0, 9201)	0.04699011198781139
  (0, 57)	0.05061906672429856
  (0, 1616)	0.04020284197289747
  (0, 858)	0.08019918548931312
  (0, 6196)	0.0432588864450473
  (0, 7059)	0.05756005646239268
  (0, 931)	0.09524561611024045
  (0, 9800)	0.14134866883831393
  (0, 9194)	0.049535515378227785
  :	:
  (45738, 2620)	0.05405954697478213
  (45738, 8756)	0.2321703897790139
  (45738, 7

In [145]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2,stratify=y, random_state=3)

In [146]:
print(x.shape, x_train.shape, x_test.shape)

(45739, 10000) (36591, 10000) (9148, 10000)


In [147]:
model = LogisticRegression()

In [148]:
model.fit(x_train, y_train)

In [149]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): ground truth
# goes first. Accuracy itself is symmetric, but keeping the documented order
# avoids silently wrong results if another metric is swapped in later.

# training data accuracy score
x_train_pred = model.predict(x_train)
train_data_accuracy = accuracy_score(y_train, x_train_pred)

# testing data accuracy score
x_test_pred = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_pred)

# result
print(f"Accuracy score of training data: {train_data_accuracy:.2f}")
print(f"Accuracy score of testing data: {test_data_accuracy:.2f}")

Accuracy score of training data: 0.99
Accuracy score of testing data: 0.99


In [150]:
print(x_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1192555 stored elements and shape (9148, 10000)>
  Coords	Values
  (0, 4390)	0.08985761409361372
  (0, 3734)	0.2264160024354978
  (0, 573)	0.08509016326284527
  (0, 76)	0.07668322826453189
  (0, 7749)	0.04359092535807261
  (0, 7661)	0.06750006050897699
  (0, 7481)	0.05998539296445679
  (0, 7427)	0.044219009503693396
  (0, 7610)	0.2260440156135236
  (0, 1994)	0.11268834873507412
  (0, 1426)	0.19828626081924788
  (0, 5493)	0.18940460598615172
  (0, 6636)	0.06739569716396207
  (0, 7237)	0.2536177162029586
  (0, 8348)	0.17153565294001857
  (0, 9611)	0.06751625848940351
  (0, 4291)	0.09031616983794542
  (0, 9746)	0.11965590222946676
  (0, 4124)	0.16041592381112157
  (0, 7256)	0.17884132498924704
  (0, 1598)	0.12719076552380357
  (0, 7393)	0.21446881693642844
  (0, 422)	0.09036528978698395
  (0, 3012)	0.09475227440379884
  (0, 3501)	0.07381635497353352
  :	:
  (9147, 2840)	0.04246110174138328
  (9147, 6410)	0.0606196060127129
  (9

In [151]:
print(x.shape)  # should be (num_samples, num_features)
print(vectorizer.get_feature_names_out()[:10])  # sample features


(45739, 10000)
['aaplo' 'aaron' 'aba' 'abadi' 'abandon' 'abba' 'abbasi' 'abbott' 'abc'
 'abcpolit']


In [152]:
news_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,23850
0,21889


# Making a predictive system

In [153]:
# Classify one held-out row (index 111 of the test matrix) and report the verdict.
x_new = x_test[111]

prediction = model.predict(x_new)
print(prediction)

# Label 0 is reported as real; any other label is reported as fake.
print('The news is real' if prediction[0] == 0 else 'The news is fake')

[1]
The news is fake


In [154]:
print(y_test[111])

1


In [155]:
# Interactive check: read an article from the user, run it through `stemming`
# and the fitted vectorizer, then classify it with the trained model.
user_input = input("Enter the news: ")
user_input_stemmed = stemming(user_input)
user_input_vectorized = vectorizer.transform([user_input_stemmed])
prediction = model.predict(user_input_vectorized)

# Label 0 is reported as real; any other label is reported as fake.
print('The news is real' if prediction[0] == 0 else 'The news is fake')

Enter the news: Donald Trump spent a good portion of his day at his golf club, marking the 84th day he s done so since taking the oath of office. It must have been a bad game because just after that, Trump lashed out at FBI Deputy Director Andrew McCabe on Twitter following a report saying McCabe plans to retire in a few months. The report follows McCabe s testimony in front of congressional committees this week, as well as mounting criticism from Republicans regarding the Russia probe.So, naturally, Trump attacked McCabe with a lie. How can FBI Deputy Director Andrew McCabe, the man in charge, along with leakin  James Comey, of the Phony Hillary Clinton investigation (including her 33,000 illegally deleted emails) be given $700,000 for wife s campaign by Clinton Puppets during investigation?  Trump tweeted.How can FBI Deputy Director Andrew McCabe, the man in charge, along with leakin  James Comey, of the Phony Hillary Clinton investigation (including her 33,000 illegally deleted emai

# Deployment on Streamlit

In [156]:
import pickle

In [157]:
# Persist the trained classifier. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; the earlier bare
# open(...) call left the handle for the garbage collector to close.
file_name = 'model_news.sav'
with open(file_name, 'wb') as model_file:
    pickle.dump(model, model_file)

In [158]:
# Persist the fitted TF-IDF vectorizer so new text can be transformed with the
# same vocabulary when the saved model is loaded elsewhere.
vectorizer_path = "feature_extraction.pkl"
with open(vectorizer_path, "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)