In [2]:
import pandas as pd
import numpy as np
import re           #useful for searching words in a text or documents
from nltk.corpus import stopwords # words that don't add much value to the text or data like was were is am etc
from nltk.stem.porter import PorterStemmer  # nltk = natural language tool kit used for stemming process
from sklearn.feature_extraction.text import TfidfVectorizer # used to convert text into feature vectors
from sklearn.model_selection import train_test_split # used to train data by splitting
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [3]:
import nltk
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Show the English stopword list that the stemming step filters out.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Data pre-processing

In [5]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('fake_news_dataset.csv')

In [6]:
# Preview the first rows to check the columns and the raw label values.
df.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [7]:
# (rows, columns) of the loaded frame.
df.shape

(20000, 7)

In [8]:
# Encode the target column: 'real' -> 0, 'fake' -> 1.
# Values that are not keys of the mapping pass through unchanged, so re-running this
# cell leaves already-encoded 0/1 labels intact (a plain .map(dict) would turn them
# into NaN on the second run).
label_codes = {'real': 0, 'fake': 1}
df['label'] = df['label'].map(lambda raw: label_codes.get(raw, raw))
# Fail loudly on any label outside the expected set instead of training on it.
assert df['label'].isin(list(label_codes.values())).all(), "unexpected value in 'label' column"

In [9]:
# Preview again to confirm the label column now holds 0/1.
df.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,0
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,1
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,1
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,1
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,1


In [10]:
# Count missing values per column.
df.isnull().sum()

title          0
text           0
date           0
source      1000
author      1000
category       0
label          0
dtype: int64

In [11]:
# Replace every missing value with an empty string so that the string
# concatenation of author and title below does not produce NaN.
df = df.fillna('') # replacing null value with empty string

In [12]:
# Build the text feature: author and title joined by a single space.
df['content'] = df['author']+' '+df['title'] # merging the two columns

In [13]:
# Inspect the combined author + title text before cleaning.
print(df['content'])

0                     Paula George Foreign Democrat final.
1          Joseph Hill To offer down resource great point.
2              Julia Robinson Himself church myself carry.
3                Mr. David Foster DDS You unit its should.
4        Austin Walker Billion believe employee summer ...
                               ...                        
19995                         Gary Miles House party born.
19996    Maria Mcbride Though nation people maybe price...
19997     Kristen Franklin Yet exist with experience unit.
19998                  David Wise School wide itself item.
19999        James Peterson Offer chair cover senior born.
Name: content, Length: 20000, dtype: object


In [14]:
# Discard the columns that are not used further on in the notebook.
unused_columns = ['date', 'source', 'category']
df = df.drop(columns=unused_columns)

In [15]:
# separating data and label
y = df['label']
x = df.drop(columns='label')  # every column except the target
print(x)
print(y)

                                       title  \
0                    Foreign Democrat final.   
1        To offer down resource great point.   
2               Himself church myself carry.   
3                       You unit its should.   
4       Billion believe employee summer how.   
...                                      ...   
19995                      House party born.   
19996  Though nation people maybe price box.   
19997        Yet exist with experience unit.   
19998               School wide itself item.   
19999         Offer chair cover senior born.   

                                                    text  \
0      more tax development both store agreement lawy...   
1      probably guess western behind likely next inve...   
2      them identify forward present success risk sev...   
3      phone which item yard Republican safe where po...   
4      wonder myself fact difficult course forget exa...   
...                                                  ...   
199

In [16]:
# (rows, columns) of the feature frame.
x.shape

(20000, 4)

In [17]:
# Confirm no missing values remain in the feature columns.
x.isnull().sum()

title      0
text       0
author     0
content    0
dtype: int64

Stemming:
Stemming is the process of reducing a word to its root form (its stem).
E.g.: acting, acted, acts -> act

In [18]:
# Single stemmer instance reused by every call to stemming().
port_stem = PorterStemmer()

In [19]:
_NON_LETTERS = re.compile('[^a-zA-Z]')  # anything that is not an ASCII letter
_STOPWORD_CACHE = {}  # language name -> frozenset of stopwords, filled on first use


def _stopword_set(language='english'):
    """Return NLTK's stopword list for `language` as a frozenset, asking NLTK only once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def stemming(content):
    """Normalise a text string for vectorisation.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, English stopwords are dropped, and each remaining
    word is reduced to its stem with the shared ``port_stem`` stemmer.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when
        no word survives.
    """
    # Cached set lookup: the original called stopwords.words('english') and
    # scanned the resulting list once for every single word.
    stop_words = _stopword_set()
    words = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

In [20]:
# Clean and stem every row of 'content'; the column is overwritten with the result.
df['content'] = df['content'].apply(stemming)

In [21]:
# Inspect the text after cleaning and stemming.
print(df['content'])

0                      paula georg foreign democrat final
1                   joseph hill offer resourc great point
2                             julia robinson church carri
3                                 mr david foster dd unit
4             austin walker billion believ employe summer
                               ...                       
19995                           gari mile hous parti born
19996    maria mcbride though nation peopl mayb price box
19997              kristen franklin yet exist experi unit
19998                         david wise school wide item
19999         jame peterson offer chair cover senior born
Name: content, Length: 20000, dtype: object


In [23]:
# separating data and label (as NumPy arrays this time)
x, y = df['content'].values, df['label'].values
print(x)
print(y)

['paula georg foreign democrat final'
 'joseph hill offer resourc great point' 'julia robinson church carri' ...
 'kristen franklin yet exist experi unit' 'david wise school wide item'
 'jame peterson offer chair cover senior born']
[0 1 1 ... 0 1 1]


In [24]:
# Converting the textual data to numerical data.
# TF-IDF: tf = term frequency, idf = inverse document frequency.
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test
# split further down, so its vocabulary and IDF weights are learned from the test
# rows as well; consider fitting on the training split only.
vectorizer = TfidfVectorizer()

# fit_transform learns the vocabulary and returns the document-term matrix in one pass.
x = vectorizer.fit_transform(x)

In [25]:
# Print the TF-IDF matrix; the recorded output lists (row, column) pairs with their weights.
print(x)

  (0, 553)	0.39378500362567437
  (0, 729)	0.38563113779991653
  (0, 760)	0.4303325215423282
  (0, 820)	0.4702667357326091
  (0, 1607)	0.5384098017728571
  (1, 855)	0.4075140492253222
  (1, 945)	0.4349807003583625
  (1, 1106)	0.39395595700897607
  (1, 1544)	0.4059231913258755
  (1, 1650)	0.40644929622403225
  (1, 1761)	0.3994347584771229
  (2, 340)	0.4619493340010572
  (2, 402)	0.46849498889312446
  (2, 1116)	0.5803950203356232
  (2, 1802)	0.4798508920150325
  (3, 529)	0.3893432059572742
  (3, 535)	0.46879786583185773
  (3, 765)	0.5331103685464007
  (3, 1474)	0.36176526198760245
  (3, 2168)	0.4621251552704844
  (4, 128)	0.4431492340687908
  (4, 182)	0.39333180511748317
  (4, 209)	0.39858312187055117
  (4, 655)	0.41121899358509134
  (4, 2054)	0.3943487167650092
  :	:
  (19996, 1316)	0.38290410625708515
  (19996, 1345)	0.3396980161532583
  (19996, 1349)	0.44657233844084904
  (19996, 1494)	0.3011404108931225
  (19996, 1617)	0.338246234612171
  (19996, 1677)	0.31677886528565996
  (19996, 21

Splitting the dataset into training and test sets

In [26]:
# Hold out 20% of the rows for testing (test_size=0.2). stratify=y splits in a
# stratified fashion using the labels, and random_state=2 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

Training the model : Logistic Regression

In [27]:
# Logistic regression classifier constructed with no arguments, i.e. library defaults.
model = LogisticRegression()

In [28]:
# Fit the classifier on the training split only.
model.fit(X_train,Y_train)

Evaluation 
Accuracy Score

In [29]:
# Accuracy score on the training data.
# Logistic regression is a common baseline for binary classification problems.
X_train_prediction = model.predict(X_train)
# accuracy_score's signature is (y_true, y_pred); the original passed them swapped.
# Accuracy is symmetric, so the value is unchanged, but the order now matches the API.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [30]:
# Report the accuracy measured on the rows the model was trained on.
print("Accuracy score of the training data : ", training_data_accuracy)

Accuracy score of the training data :  0.6490625


In [31]:
# Accuracy score on the held-out test data.
X_test_prediction = model.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); the original passed them swapped.
# Accuracy is symmetric, so the value is unchanged, but the order now matches the API.
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
# NOTE(review): in the recorded run this is ~0.49 against ~0.65 on the training split,
# which looks close to chance for a two-class problem — confirm the features carry signal.

In [32]:
# Report the accuracy measured on the held-out test rows.
print("Accuracy score of the test data : ", testing_data_accuracy)

Accuracy score of the test data :  0.4945
