<a href="https://colab.research.google.com/github/Avinash-2803/Avinash-2803/blob/main/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import dependencies

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fetch the NLTK 'stopwords' corpus; the stop-word list is read from it in the
# cells that follow.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Print the English stop-word list provided by NLTK
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Preprocessing

In [None]:
# Load the dataset into a pandas DataFrame.
# on_bad_lines='skip' drops malformed rows without raising, and the file is
# decoded as latin-1.
# NOTE(review): the path is absolute (presumably the Colab working directory);
# adjust it when running elsewhere.
news_dataset= pd.read_csv('/content/IFND.csv', on_bad_lines='skip', encoding='latin-1')

In [None]:
# (rows, columns) of the loaded frame
news_dataset.shape

(46953, 7)

In [None]:
# Show the first 5 rows of the dataset
news_dataset.head()

Unnamed: 0,id,Statement,Image,Web,Category,Date,Label
0,2,"WHO praises India's Aarogya Setu app, says it ...",https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,COVID-19,Oct-20,True
1,3,"In Delhi, Deputy US Secretary of State Stephen...",https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,VIOLENCE,Oct-20,True
2,4,LAC tensions: China's strategy behind delibera...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,TERROR,Oct-20,True
3,5,India has signed 250 documents on Space cooper...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,COVID-19,Oct-20,True
4,6,Tamil Nadu chief minister's mother passes away...,https://cdn.dnaindia.com/sites/default/files/s...,DNAINDIA,ELECTION,Oct-20,True


In [None]:
# Count the missing values in each column
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
Statement,0
Image,0
Web,1
Category,1
Date,1982
Label,1


In [None]:
# Replace every null, in every column, with an empty string.
# This includes 'Label': a row whose label is missing ends up with '' as its
# label value.
news_dataset= news_dataset.fillna('')

In [None]:
# Build the text feature by joining Category and Statement with a single space
# NOTE(review): some Category values (e.g. 'MISLEADING') may be tied to the
# label itself — confirm this does not leak the target into the features.
news_dataset['content'] = news_dataset['Category'] + ' '+ news_dataset['Statement']

In [None]:
# Inspect the merged text column
print(news_dataset['content'])

0        COVID-19 WHO praises India's Aarogya Setu app,...
1        VIOLENCE In Delhi, Deputy US Secretary of Stat...
2        TERROR LAC tensions: China's strategy behind d...
3        COVID-19 India has signed 250 documents on Spa...
4        ELECTION Tamil Nadu chief minister's mother pa...
                               ...                        
46948    VIOLENCE Video from West Bengal passed off as ...
46949    VIOLENCE Bihar Mob Violence Shared With A Comm...
46950    POLITICS Samajwadi Party workers did not shout...
46951    MISLEADING No, advocates are not exempted from...
46952     Times Now tries fact-checking: Declares India...
Name: content, Length: 46953, dtype: object


In [None]:
# Target: the 'Label' column; features: every remaining column
Y = news_dataset['Label']
X = news_dataset.drop(columns='Label')

In [None]:
# Inspect the feature frame and the label column
print(X)
print(Y)

          id                                          Statement  \
0          2  WHO praises India's Aarogya Setu app, says it ...   
1          3  In Delhi, Deputy US Secretary of State Stephen...   
2          4  LAC tensions: China's strategy behind delibera...   
3          5  India has signed 250 documents on Space cooper...   
4          6  Tamil Nadu chief minister's mother passes away...   
...      ...                                                ...   
46948  46950  Video from West Bengal passed off as communal ...   
46949  46951  Bihar Mob Violence Shared With A Communal Twis...   
46950  46952  Samajwadi Party workers did not shout ÔPakista...   
46951  46953  No, advocates are not exempted from paying tol...   
46952  46954  Times Now tries fact-checking: Declares Indian...   

                                                   Image       Web  \
0      https://cdn.dnaindia.com/sites/default/files/s...  DNAINDIA   
1      https://cdn.dnaindia.com/sites/default/files/s..

Stemming: stemming is the process of reducing a word to its root form. Example: actor, actress, acting -> act

In [None]:
# Porter stemmer instance used by stemming() to reduce each word to its stem
port_stem = PorterStemmer()

In [None]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the per-word filter rebuilt the whole list for every token and then scanned
# it linearly; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a piece of text into a string of space-separated word stems.

    Steps: every character that is not an ASCII letter is replaced by a space,
    the text is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining word is reduced with the Porter stemmer.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed words joined by single spaces (empty string when no word
        survives the filtering).
    """
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [None]:
# Apply stemming() to every row. This overwrites 'content' in place, so
# re-running the cell would stem the already-stemmed text again.
news_dataset['content']= news_dataset['content'].apply(stemming)

In [None]:
# Inspect the stemmed text column
print(news_dataset['content'])

0        covid prais india aarogya setu app say help id...
1        violenc delhi deputi us secretari state stephe...
2        terror lac tension china strategi behind delib...
3        covid india sign document space cooper countri...
4           elect tamil nadu chief minist mother pass away
                               ...                        
46948    violenc video west bengal pass commun violenc ...
46949    violenc bihar mob violenc share commun twist s...
46950    polit samajwadi parti worker shout pakistan zi...
46951                    mislead advoc exempt pay toll fee
46952    time tri fact check declar indian parodi handl...
Name: content, Length: 46953, dtype: object


In [None]:
#separating the data and labels
X= news_dataset['content'].values
Y= news_dataset['Label'].values

In [None]:
# Inspect the stemmed text array
print(X)

['covid prais india aarogya setu app say help identifi covid cluster'
 'violenc delhi deputi us secretari state stephen biegun pitch pax indo pacifica'
 'terror lac tension china strategi behind deliber fail talk india' ...
 'polit samajwadi parti worker shout pakistan zindabad mumbai railway station'
 'mislead advoc exempt pay toll fee'
 'time tri fact check declar indian parodi handl pakistani propaganda']


In [None]:
# Inspect the raw label strings
print(Y)

['TRUE' 'TRUE' 'TRUE' ... 'Fake' 'Fake' '']


In [None]:
# Number of labels (one per row of the dataset)
Y.shape

(46953,)

In [None]:
#converting textual data to numerical data
vectorizer= TfidfVectorizer()
vectorizer.fit(news_dataset['content'].values)
X= vectorizer.transform(news_dataset['content'].values)

# Encode the labels
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(news_dataset['Label'].values)

In [None]:
# Inspect the sparse TF-IDF matrix and the encoded labels
print(X)
print(Y)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 457468 stored elements and shape (46953, 17161)>
  Coords	Values
  (0, 23)	0.39619489635653626
  (0, 788)	0.3122501728655736
  (0, 2960)	0.4436695089527491
  (0, 3377)	0.24670591239991485
  (0, 6534)	0.2572428573989403
  (0, 6900)	0.3605396107053589
  (0, 7091)	0.15458448184391702
  (0, 11716)	0.31709331950600395
  (0, 13476)	0.1663636626117061
  (0, 13727)	0.3735490478488615
  (1, 1725)	0.3788934228191084
  (1, 3836)	0.14967219074883512
  (1, 3923)	0.2610794844625723
  (1, 7113)	0.28088508557322067
  (1, 10894)	0.4206646331757944
  (1, 11181)	0.3858256155316364
  (1, 11451)	0.29396823902315666
  (1, 13598)	0.26436183308754474
  (1, 14636)	0.18047469746348246
  (1, 14674)	0.34949535600318166
  (1, 16258)	0.19782702547639827
  (1, 16571)	0.10488462757671702
  (2, 1519)	0.31063120986337844
  (2, 2763)	0.25858090431520614
  (2, 3842)	0.44266277091564676
  :	:
  (46950, 10931)	0.26259773697489464
  (46950, 11085)	0.2289745645457

Splitting the dataset into training and test data

In [None]:
# Identify and remove classes with only one member
unique_classes, class_counts = np.unique(Y, return_counts=True)
classes_to_keep = unique_classes[class_counts >= 2]

# Filter X and Y to keep only samples from classes with at least 2 members
filtered_indices = np.isin(Y, classes_to_keep)
X_filtered = X[filtered_indices]
Y_filtered = Y[filtered_indices]

# Splitting the datasets to training and test data
X_train, X_test, Y_train, Y_test= train_test_split(X_filtered, Y_filtered, test_size=0.2, stratify=Y_filtered, random_state=2)

Training the model: Logistic Regression


In [None]:
# Logistic regression classifier with scikit-learn's default hyperparameters
model= LogisticRegression()

In [None]:
# Fit the classifier on the training split
model.fit(X_train, Y_train)

Evaluation


Accuracy Score

In [None]:
#accuracy score on training data
X_train_prediction= model.predict(X_train)
training_data_accuracy= accuracy_score(X_train_prediction, Y_train)

In [None]:
# Report the fraction of training samples that were classified correctly
print('Accuaracy score of the training data: ', training_data_accuracy)

Accuaracy score of the training data:  0.9617954793535848


In [None]:
#accuracy score on test data
X_test_prediction= model.predict(X_test)
test_data_accuracy= accuracy_score(X_test_prediction, Y_test)

In [None]:
# Report the fraction of test samples that were classified correctly
print('Accuaracy score of the test data: ', test_data_accuracy)

Accuaracy score of the test data:  0.9562346927909701


Making a predictive system

In [None]:
# Take one already-vectorised sample from the test split
X_new= X_test[1]

prediction= model.predict(X_new)
print(prediction)

# LabelEncoder numbers the classes in sorted order of the original label
# strings, so the integer codes must not be hard-coded: with the labels
# '', 'Fake' and 'TRUE' the code 2 stands for 'TRUE', not for fake news.
# Decode the prediction back to its original label before interpreting it.
predicted_label = label_encoder.inverse_transform(prediction)[0]

if str(predicted_label).strip().lower() == 'true':
  print('The news is Real')
else:
  print('The news is Fake')

[2]
The news is Fake


In [None]:
# Encoded ground-truth label of the same test sample, for comparison
print(Y_test[1])

2
