In [16]:
import pandas as pd
import numpy as np

In [2]:
import nltk

In [20]:
# Load the two source CSVs, one per class; paths are relative to the
# notebook's working directory.
fake=pd.read_csv("Datasets/Fake.csv")
true=pd.read_csv("Datasets/True.csv")

In [21]:
# Tag each source frame with its class id: 0 for the fake set, 1 for the true set.
fake["label"] = 0
true["label"] = 1

# Stack both frames row-wise, shuffle every row with a fixed seed so the order
# is reproducible, and renumber the index from 0.
data = (
    pd.concat([fake, true], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [22]:
# Preview the first rows of the combined, shuffled frame.
data.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1


In [23]:
# Class balance: number of rows per label (0 = fake set, 1 = true set).
data['label'].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

In [24]:
## Preprocessing the data set

In [25]:
# Fetch the NLTK resources used by the cleaning step: the stop-word lists
# and the WordNet data that backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [31]:
# NLTK corpus file ids are lower-case. 'English' only resolves on
# case-insensitive filesystems (e.g. Windows) and fails elsewhere.
stop_word = set(stopwords.words('english'))

# clean_text() looks the lemmatizer up by the name `lemmatizer`; the original
# cell bound it only to `lemmatize`, which raises NameError on a fresh kernel.
# Bind both names so existing references to either keep working.
lemmatizer = WordNetLemmatizer()
lemmatize = lemmatizer

In [28]:
# Show a small sample of the stop words. Sorting first makes the sample
# reproducible: iteration order of a set of strings can change between runs.
print(sorted(stop_word)[:10])

['were', 'this', 'shouldn', 'did', 'so', 'or', 'all', 'in', 'those', 'into']


In [29]:
# defining a function for cleaning

In [34]:
import re

In [37]:
def clean_text(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    Processing steps, in order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. tokens present in the notebook-level ``stop_word`` set are dropped;
      4. remaining tokens are lemmatized with the notebook-level
         ``WordNetLemmatizer`` instance.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (may be empty).
    """
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    # The lemmatizer instance is bound to the name `lemmatize` in the setup
    # cell; the former `lemmatizer` reference was undefined on a fresh kernel.
    words = [lemmatize.lemmatize(w) for w in words if w not in stop_word]
    return " ".join(words)

In [38]:
# Clean every article body; the result goes into a new column so the raw
# text column stays available for comparison.
data['cleaned_text']=data['text'].apply(clean_text)

In [40]:
# Compare raw and cleaned text side by side for the first rows.
data[['text','cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"21st Century Wire says Ben Stein, reputable pr...",st century wire say ben stein reputable profes...
1,WASHINGTON (Reuters) - U.S. President Donald T...,washington reuters u president donald trump re...
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,reuters puerto rico governor ricardo rossello ...
3,"On Monday, Donald Trump once again embarrassed...",monday donald trump embarrassed country accide...
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",glasgow scotland reuters u presidential candid...


In [46]:
# Feature Engineering

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)

# fit_transform returns a SciPy sparse matrix. Keep it sparse: a dense
# 44898 x 5000 float64 array needs roughly 1.8 GB, and the split, fit and
# predict steps that consume X all accept sparse input.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so IDF weights are computed with the test documents
# included. Consider fitting on the training split only.
X = tfidf.fit_transform(data['cleaned_text'])
y = data['label']

In [42]:
# Sanity-check the feature matrix dimensions (documents x vocabulary terms).
print("TF-IDF shape:",X.shape)

TF-IDF shape: (44898, 5000)


In [45]:
#splitting

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#training

In [44]:
    # Fit a multinomial Naive Bayes classifier (constructed with no arguments,
    # i.e. library defaults) on the training split.
    # NOTE(review): every line of this cell carries a stray four-space indent.
    # IPython appears to tolerate a uniformly indented cell, but a plain-script
    # export would likely raise IndentationError; consider dedenting.
    from sklearn.naive_bayes import MultinomialNB
    model=MultinomialNB()
    model.fit(X_train,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [None]:
#evaluating

In [47]:
# Import the evaluation helpers and predict labels for the held-out split.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred=model.predict(X_test)

In [48]:
# Report overall accuracy, the confusion matrix, and the per-class
# classification report for the test-split predictions.
print("Accuracy:",accuracy_score(y_test,y_pred))
print("\nConfusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("\nClassification Report:\n",classification_report(y_test,y_pred))

Accuracy: 0.9288418708240535

Confusion Matrix:
 [[4399  311]
 [ 328 3942]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      4710
           1       0.93      0.92      0.93      4270

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980



In [67]:
#prediction

In [59]:
def prediction(news):
    """Classify one piece of news text as ``"Real"`` or ``"Fake"``.

    The text goes through the same ``clean_text`` preprocessing as the
    training data, is vectorised with the already-fitted ``tfidf``
    vectorizer, and is scored by the trained ``model``.

    Parameters
    ----------
    news : str
        Raw headline or article text.

    Returns
    -------
    str
        ``"Real"`` when the model predicts label 1, otherwise ``"Fake"``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([clean_text(news)])
    predicted = model.predict(features)
    if predicted[0] == 1:
        return "Real"
    return "Fake"

In [60]:
# Spot-check: run a hand-written headline through prediction().
news_1="India successfully launches its first manned space mission in 2025."
print(prediction(news_1))

Real


In [61]:
# Spot-check: an implausible hand-written headline.
news_2="Aliens land on Earth and sign trade agreement with humans."
print(prediction(news_2))

Fake


In [62]:
# Spot-check: a short hand-written headline.
news_3="Russia started a war with America today"
print(prediction(news_3))

Fake


In [63]:
# Spot-check: a regional-news style headline dated 2024.
news_4="Tamil Nadu government started a hostel facility for women in 9 places on 2024."
print(prediction(news_4))

Fake


In [64]:
# Spot-check: same headline as the previous cell with the year changed to 2025.
# Note that this rebinds `news_4`, replacing the 2024 variant.
news_4="Tamil Nadu government started a hostel facility for women in 9 places on 2025."
print(prediction(news_4))

Fake


In [65]:
# Spot-check: a short, informally written headline.
news_5="operation sindhoor - India attacked in Pakisthan"
print(prediction(news_5))

Real


In [66]:
# Spot-check: another short, informally written headline.
news_6="operation sindhorr was failure"
print(prediction(news_6))

Real
