<a href="https://colab.research.google.com/github/Anibrata-Ghatak/Fake_news_detection/blob/main/Fake_News_Detection_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Porter stemmer instance shared by the text-cleaning step.
ps = PorterStemmer()

# Fetch the NLTK stop-word corpus, then keep the English list as a set so
# membership checks during cleaning are cheap.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Folder that holds the two input CSVs. Change this single constant to run the
# notebook outside Colab / Google Drive.
# NOTE(review): the default assumes Drive is already mounted at /content/drive;
# no mount call is visible in this notebook - confirm before a fresh run.
DATA_DIR = "/content/drive/MyDrive"

# One frame per source file: fake articles and true articles.
fake_df = pd.read_csv(f"{DATA_DIR}/Fake.csv")
true_df = pd.read_csv(f"{DATA_DIR}/True.csv")

In [None]:
# Tag each source frame with its class: 1 = fake, 0 = true.
fake_df["label"], true_df["label"] = 1, 0

# Stack both frames, shuffle all rows with a fixed seed so the order is
# repeatable, and renumber the index from 0.
df = (
    pd.concat([fake_df, true_df], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows of the combined, shuffled frame.
df.head()

Unnamed: 0,title,text,subject,date,label
0,POPE SHAMES AMERICANS From Mexico For Anti-Imm...,"Pope Francis has joined Democrat legislators, ...",left-news,"Feb 18, 2016",1
1,SCAM ALERT! USDA GIVES OBAMABUCKS TO FARMERS F...,The USDA is spreading the wealth like crazy wi...,Government News,"Aug 10, 2015",1
2,New Afghan peace talks expected in Oman but Ta...,"KABUL/PESHAWAR, Pakistan (Reuters) - Represent...",worldnews,"October 11, 2017",0
3,"Merkel, party leaders meet to rev up German co...",BERLIN (Reuters) - Key allies of Chancellor An...,worldnews,"October 29, 2017",0
4,"FBI in turmoil over Comey firing, scramble on ...",WASHINGTON (Reuters) - The FBI was reeling aft...,politicsNews,"May 10, 2017",0


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,label
count,42024.0
mean,0.558752
std,0.496542
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42024 entries, 0 to 42023
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    42024 non-null  object
 1   text     42024 non-null  object
 2   subject  42024 non-null  object
 3   date     42023 non-null  object
 4   label    42024 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.6+ MB


In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,1
label,0


In [None]:
# Replace every missing value with an empty string.
df = df.fillna(value='')

In [None]:
# Re-count missing values per column to confirm none are left.
df.isna().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [None]:
# Drop the metadata columns; the modelling cells below only read `text` and
# `label`. Because this cell overwrites `df`, a second run used to raise
# KeyError (the columns were already gone) - errors='ignore' makes the cell
# safe to re-run.
df = df.drop(columns=['title', 'subject', 'date'], errors='ignore')

In [None]:
# Preview the frame after the column drop.
df.head()

Unnamed: 0,text,label
0,"Pope Francis has joined Democrat legislators, ...",1
1,The USDA is spreading the wealth like crazy wi...,1
2,"KABUL/PESHAWAR, Pakistan (Reuters) - Represent...",0
3,BERLIN (Reuters) - Key allies of Chancellor An...,0
4,WASHINGTON (Reuters) - The FBI was reeling aft...,0


In [None]:
def preprocess_text(text):
    """Normalise raw text into a string of space-separated, stemmed tokens.

    Every character that is not an ASCII letter (digits and punctuation
    included) is replaced by a space, the result is lower-cased and split on
    whitespace, tokens found in ``stop_words`` are discarded, and the
    remaining tokens are stemmed with ``ps``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", text).lower()

    kept = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept.append(ps.stem(token))

    return " ".join(kept)

In [None]:
# Clean every article body with preprocess_text and store the result back in
# the same column. Re-running this cell would process already-cleaned text again.
df["text"] = df["text"].map(preprocess_text)

In [None]:
# Features are the cleaned article text; targets are the 0/1 labels.
X, y = df["text"], df["label"]

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fit TF-IDF on the training split only (not the full dataset), so nothing
# from the held-out test rows influences the learned vocabulary or weights.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)  # Fit on X_train and vectorize it in one step

In [None]:
# Vectorize the test split with the already-fitted vectorizer (transform only,
# no refit).
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training TF-IDF matrix.
# fit() returns the estimator itself, so it can be bound in one expression.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the held-out split and report accuracy as a percentage.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 92.83%


In [None]:
# Persist both fitted objects with pickle, one file each: the TF-IDF
# vectorizer first, then the classifier.
for filename, artifact in (("vectorizer.pkl", vectorizer), ("model.pkl", model)):
    with open(filename, "wb") as handle:
        pickle.dump(artifact, handle)

In [None]:
# Reload the persisted artifacts. The context managers close each file as soon
# as it has been read; the previous bare open() calls never closed their
# handles explicitly.
# NOTE: pickle.load can execute arbitrary code - only load files you created
# yourself or otherwise trust.

# Load TF-IDF Vectorizer
with open("vectorizer.pkl", "rb") as file:
    vector_form = pickle.load(file)

# Load Model
with open("model.pkl", "rb") as file:
    load_model = pickle.load(file)

# Function to Predict Fake News
def fake_news(news):
    """Classify a piece of news text as fake or real.

    The text goes through the same ``preprocess_text`` cleaning used for
    training, is vectorized with the reloaded TF-IDF vectorizer
    (``vector_form``) and scored by the reloaded classifier (``load_model``).

    Args:
        news: Raw article text as a string.

    Returns:
        "Fake News" when the predicted label is 1, otherwise "Real News".
    """
    news = preprocess_text(news)
    # transform() expects an iterable of documents, hence the one-item list.
    vectorized_news = vector_form.transform([news])
    prediction = load_model.predict(vectorized_news)
    # predict() returns one label per input document; read the single label
    # explicitly instead of relying on the truth value of a one-element array.
    return "Fake News" if prediction[0] == 1 else "Real News"

# Try the classifier on a hand-written sample sentence and show its verdict.
new_article = "Virat Kohli is not a celebrity any more."
result = fake_news(new_article)
print("Prediction:", result)



Prediction: Fake News
