<a href="https://colab.research.google.com/github/Deepanshu-Pal7701/Fake-News-Prediction-Using-Logistic-Regression/blob/main/Fake_News_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing the** **dependencies**

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
#printing the stopwords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

### Data Pre-processing

In [4]:
df_fake = pd.read_csv("/content/Fake.csv")
df_true = pd.read_csv("/content/True.csv")

In [5]:
df_fake.shape

(23481, 4)

In [6]:
df_true.shape

(21417, 4)

In [7]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [8]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Inserting a column "Label" as target feature

In [9]:
df_fake["label"] = 0
df_true["label"] = 1

In [10]:
df_true.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [11]:
df_fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


**Merging True and Fake Dataframe**

In [12]:
news_dataset = pd.concat([df_fake, df_true], axis =0 )
news_dataset.head(10)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",0
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017",0
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",0
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",0
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017",0


In [13]:
news_dataset.shape

(44898, 5)

**counting the number of missing values in the dataset**

In [14]:
news_dataset.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


**Random Shuffling the dataframe**

In [15]:
news_dataset = news_dataset.sample(frac = 1)

In [16]:
news_dataset.head()

Unnamed: 0,title,text,subject,date,label
4990,Trump Fans Threaten Jewelers Who Donated Ivan...,Jewelry designers Jill Martinelli and Sabine L...,News,"August 18, 2016",0
19045,WATCH: G.W. BUSH Gushes Over Kimmel’s Anti-Tru...,George W. Bush heaped praise on Jimmy Kimmel f...,left-news,"Mar 3, 2017",0
1848,Former GOP Rep. Thinks Adam Schiff Should Rec...,"Earlier this week, Devin Nunes finally recused...",News,"April 8, 2017",0
19553,Czech ruling party says wage growth must be pr...,PRAGUE (Reuters) - The next Czech government s...,worldnews,"September 20, 2017",1
17110,STATE DEPT EMPLOYEE TAPPED TO OVERSEE HILLARY ...,You seriously can t make up this stuff State D...,Government News,"Sep 8, 2015",0


**Removing columns which are not required**

In [17]:
news_dataset = news_dataset.drop(["date"], axis = 1)

**Merging title and subject columns**

In [18]:
news_dataset['content'] = news_dataset['subject'] + ' - ' + news_dataset['title']

In [19]:
print(news_dataset['content'])

4990     News -  Trump Fans Threaten Jewelers Who Donat...
19045    left-news - WATCH: G.W. BUSH Gushes Over Kimme...
1848     News -  Former GOP Rep. Thinks Adam Schiff Sho...
19553    worldnews - Czech ruling party says wage growt...
17110    Government News - STATE DEPT EMPLOYEE TAPPED T...
                               ...                        
5046     News -  BOMBSHELL: Ivanka Trump Just Went On V...
20226    left-news - ALL KIDDING ASIDE…DID HILLARY JUST...
12255    politics - BRILLIANT! TUCKER CARLSON Humiliate...
8297     politicsNews - Democrat Clinton raised more th...
20752    worldnews - Malaysia says foils hijacking of T...
Name: content, Length: 44898, dtype: object


**Stemming: Process of reducing a word to its Root word**

example: actor, actress, acting  -->  act

In [23]:
port_stem = PorterStemmer()

In [28]:
# here we make a stemming function as we don't have any pre-built function
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)   # it substitutes all punctuation and numbers to whitespace except alphabet from (a-z or A-Z)
  stemmed_content = stemmed_content.lower()           # convert all the upper content to lower case
  stemmed_content = stemmed_content.split()           # split by whitespaces and store in a list
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]      # Stemming only those words which are not stopwords
  stemmed_content = ' '.join(stemmed_content)         # at last joining these words back
  return stemmed_content

In [30]:
# now we apply stemming function to our content column
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [31]:
print(news_dataset['content'])

4990     news trump fan threaten jewel donat ivanka mon...
19045    left news watch g w bush gush kimmel anti trum...
1848     news former gop rep think adam schiff recu rus...
19553    worldnew czech rule parti say wage growth must...
17110    govern news state dept employ tap overs hillar...
                               ...                        
5046     news bombshel ivanka trump went vacat putin se...
20226    left news kid asid hillari seizur middl q jour...
12255    polit brilliant tucker carlson humili jill ste...
8297     politicsnew democrat clinton rai million augus...
20752    worldnew malaysia say foil hijack thai tanker ...
Name: content, Length: 44898, dtype: object


**Separating the data and label**

here we don't take text column in our data as it have paragraphs of words

In [42]:
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [43]:
print(X)

['news trump fan threaten jewel donat ivanka money hillari karma b tch'
 'left news watch g w bush gush kimmel anti trump oscar monologu refu speak obama good countri former presid undermin current presid'
 'news former gop rep think adam schiff recu russia probe reason' ...
 'polit brilliant tucker carlson humili jill stein campaign manag phoni recount scam video'
 'politicsnew democrat clinton rai million august campaign'
 'worldnew malaysia say foil hijack thai tanker pirat arrest']


In [44]:
print(Y)

[0 0 0 ... 0 1 1]


In [45]:
X.shape

(44898,)

In [46]:
Y.shape

(44898,)

**Converting textual data to numerical data** (feature vector)


TF-IDF (Term Frequency-Inverse Document Frequency)  -->   used to reflect how important a word is to a document in a collection

In [47]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)              # convert all those numbers to feature vector
X = vectorizer.transform(X)

In [48]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 463173 stored elements and shape (44898, 13094)>
  Coords	Values
  (0, 3369)	0.3152317238769449
  (0, 4080)	0.3020710952470091
  (0, 5378)	0.1840441089286443
  (0, 6002)	0.3124813371014669
  (0, 6091)	0.47556125173864
  (0, 6232)	0.3540186965574195
  (0, 7498)	0.28191290655968115
  (0, 7805)	0.09281983322198128
  (0, 11457)	0.3991910739323789
  (0, 11616)	0.2631246695796045
  (0, 11908)	0.09848318825769302
  (1, 456)	0.17320516470069264
  (1, 1616)	0.2276454150517352
  (1, 2567)	0.2125553042306516
  (1, 2753)	0.2895569243657263
  (1, 4433)	0.18471250351549198
  (1, 4846)	0.22526147185008236
  (1, 5071)	0.3490844904780645
  (1, 6308)	0.275546697408225
  (1, 6567)	0.11257971599422217
  (1, 7509)	0.3490844904780645
  (1, 7805)	0.06963906468192341
  (1, 7990)	0.12595899497320887
  (1, 8172)	0.2747336370423704
  (1, 8919)	0.28831972598191347
  :	:
  (44895, 6100)	0.3403729629985975
  (44895, 6978)	0.2835039369666607
  (44895, 861

**Splitting the dataset into training and testing data**

In [49]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [50]:
print(X.shape, X_train.shape, X_test.shape)

(44898, 13094) (35918, 13094) (8980, 13094)


**Training the Model: Logistic Regression**

In [51]:
model = LogisticRegression()

In [52]:
model.fit(X_train, Y_train)

**Evaluation**--> accuracy score

In [53]:
# accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [54]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9999443176123393


In [55]:
# accuracy score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [56]:
print('Accuracy score of the testing data : ', testing_data_accuracy)

Accuracy score of the testing data :  0.9998886414253898


**Making a Predictive System**


Fake News --> 0

True News --> 1

In [64]:
X_new = X_test[1000]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Fake')
else:
  print('The news is True')

[1]
The news is True


In [65]:
print(Y_test[1000])

1
