In [20]:
# DEVLEAPS AI/ML PROJECT
# FAKE NEWS DETECTION

# SOURCE CODE

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import nltk
from nltk.corpus import stopwords
import string

In [21]:
# NLTK stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\souma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# Datasets
fake = pd.read_csv("D:/Users/Souma/Desktop/gdg leaps project/fakenewsproject/dataset/Fake.csv")
true = pd.read_csv("D:/Users/Souma/Desktop/gdg leaps project/fakenewsproject/dataset/True.csv")

In [23]:
# Add labels
fake['label'] = 0  # Fake news
true['label'] = 1  # True news


In [24]:
# Combine datasets
data = pd.concat([fake, true], axis=0).reset_index(drop=True)

# Shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)

In [25]:
# Data cleaning
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

data['text'] = data['text'].apply(clean_text)

In [26]:
# Split the dataset
X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
# Vectorization (TF-IDF)
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [33]:
# Model training
model = DecisionTreeClassifier()
model.fit(X_train_tfidf, y_train)


In [34]:
# Model evaluation
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)


In [35]:
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9956570155902005
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4739
           1       1.00      1.00      1.00      4241

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

Confusion Matrix:
[[4720   19]
 [  20 4221]]


In [36]:
# Fake news prediction
def predict_news(news):
    cleaned_news = clean_text(news)
    vectorized_news = tfidf.transform([cleaned_news])
    prediction = model.predict(vectorized_news)
    return "Fake" if prediction[0] == 0 else "True"



In [37]:
# Test the prediction function
test_news = "The government announces new healthcare reforms today."
print("News:", test_news)
print("Prediction:", predict_news(test_news))

News: The government announces new healthcare reforms today.
Prediction: Fake
