IMPORT DEPENDENCIES


In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Pre-processing


In [49]:
news_dataset = pd.read_csv('/content/combined_fake_news_3.csv')

In [50]:
news_dataset.shape

(80771, 5)

In [51]:
# print first 5 rows from the dataset
news_dataset.head()

Unnamed: 0,source,text,label,title,url
0,BBC News,Highlights from Grace Road in Leicester as Eng...,1,Women's ODI Cricket,https://www.bbc.co.uk/iplayer/episode/m002d4yf...
1,CNN,"For months, President Donald Trump has suggest...",1,Why Trump has struggled to get Xi on the phone...,https://www.cnn.com/2025/06/04/politics/trump-...
2,CNN,A group of migrants the Trump administration s...,1,Migrants flown to El Salvador under Alien Enem...,https://www.cnn.com/2025/06/04/politics/migran...
3,CNN,The family of a teenager from Massachusetts wh...,1,"‘Please, get him out’: Little brother of detai...",https://www.cnn.com/2025/06/04/us/video/teen-d...
4,CNN,Hackers are tricking employees at companies in...,1,Hackers are using a modified Salesforce app to...,https://www.cnn.com/2025/06/04/tech/hackers-ab...


In [52]:
# count the number of missing values in the dataset
news_dataset.isnull().sum()

Unnamed: 0,0
source,0
text,1
label,0
title,0
url,51729


In [53]:
# rreplacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [54]:
# merging the other name and news title
news_dataset['content'] = news_dataset['title'] + ' ' + news_dataset['text']

print(news_dataset['content'])

In [56]:
# separating the data & label
X = news_dataset['content']
Y = news_dataset['label']

In [57]:
print(X)
print(Y)

0        Women's ODI Cricket Highlights from Grace Road...
1        Why Trump has struggled to get Xi on the phone...
2        Migrants flown to El Salvador under Alien Enem...
3        ‘Please, get him out’: Little brother of detai...
4        Hackers are using a modified Salesforce app to...
                               ...                        
80766    Russian President Implements Revolutionary Cli...
80767    Japanese Prime Minister Implements Revolutiona...
80768    European Union Reveals Surprising Cybersecurit...
80769    International Monetary Fund Declares Major Shi...
80770    Apple Announces Breakthrough in Technology Usi...
Name: content, Length: 80771, dtype: object
0        1
1        1
2        1
3        1
4        1
        ..
80766    0
80767    0
80768    0
80769    0
80770    0
Name: label, Length: 80771, dtype: int64


In [59]:
 # Load stopwords once
port_stem = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [60]:
def stemming(content):
    content = re.sub('[^a-zA-Z]', ' ', content)
    content = content.lower()
    content = content.split()
    content = [port_stem.stem(word) for word in content if word not in stop_words]
    return ' '.join(content)

In [61]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [62]:
print(news_dataset['content'])

0        women odi cricket highlight grace road leicest...
1        trump struggl get xi phone talk trade cnn poli...
2        migrant flown el salvador alien enemi act must...
3        pleas get littl brother detain teen send messa...
4        hacker use modifi salesforc app trick employe ...
                               ...                        
80766    russian presid implement revolutionari climat ...
80767    japanes prime minist implement revolutionari e...
80768    european union reveal surpris cybersecur disco...
80769    intern monetari fund declar major shift polit ...
80770    appl announc breakthrough technolog use unconv...
Name: content, Length: 80771, dtype: object


In [63]:
# seperating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [64]:
print(X)

['women odi cricket highlight grace road leicest england women take west indi second three match one day intern seri'
 'trump struggl get xi phone talk trade cnn polit month presid donald trump suggest person relationship china xi jinp unlock world consequenti trade disput yet couldnt get xi telep char'
 'migrant flown el salvador alien enemi act must allow challeng remov feder judg rule group migrant trump administr sent mega prison el salvador earlier year must opportun challeng remov alien enemi act feder judg char'
 ...
 'european union reveal surpris cybersecur discoveri challeng scientif consensu west christina ukrain march groundbreak develop european union today announc unpreced initi field cybersecur find challeng everyth thought knew explain dr michael long open entir new possibl announc come month intens research collabor intern expert initi set begin implement june mark signific departur convent approach cybersecur expert around world express excit skeptic enter unchart ter

In [65]:
print(Y)

[1 1 1 ... 0 0 0]


In [66]:
Y.shape

(80771,)

In [67]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [68]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7087578 stored elements and shape (80771, 109384)>
  Coords	Values
  (0, 20118)	0.3248477919738707
  (0, 21903)	0.1101586304515079
  (0, 27965)	0.23506225646257878
  (0, 36426)	0.24219937780552633
  (0, 40275)	0.2050057285622621
  (0, 43581)	0.3248477919738707
  (0, 44249)	0.11539916308368935
  (0, 53456)	0.3813697983727953
  (0, 58318)	0.21367807389875831
  (0, 68067)	0.40781279870982634
  (0, 68800)	0.09083921062579822
  (0, 80742)	0.19486437292567765
  (0, 84788)	0.14931198056371087
  (0, 85377)	0.18166472780481463
  (0, 92946)	0.1073312489073924
  (0, 95152)	0.13015483338521003
  (0, 104149)	0.1677239795110257
  (0, 105542)	0.30336384527962046
  (1, 15770)	0.1839297617883111
  (1, 16389)	0.13351673279259205
  (1, 17772)	0.1345434431788361
  (1, 18898)	0.2604844631552325
  (1, 19582)	0.25981033736430537
  (1, 24162)	0.15349022795572057
  (1, 24779)	0.07722935932586193
  :	:
  (80770, 74379)	0.08318020589507755
  (80770, 7

splitting the dataset to training & test data

In [69]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

Training the model

In [70]:
model = LogisticRegression(max_iter=5000, solver='liblinear')

In [71]:
model.fit(X_train,Y_train)

In [72]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [73]:
print('accuracy of the training data : ', training_data_accuracy)

accuracy of the training data :  0.8976414510337997


In [74]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)

In [75]:
print('accuracy of the test data : ', test_data_accuracy)

accuracy of the test data :  0.8722995976477871


1 - Fake news ,
0 - Real news


Making a predictive system

In [97]:
X_new = X_test[99]
prediction = model.predict(X_new)
print(prediction)
#print("Predicted:", prediction[0])
#print("Actual:", Y_test[0])
if(prediction[0] == 0):
    print("The news is Real")
else:
    print("The news is Fake")

[0]
The news is Real


In [94]:
print(Y_test[99])

0
