In [1]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df_fake = pd.read_csv("Fake.csv")
df_real = pd.read_csv("True.csv")

df_fake["label"] = 0
df_real["label"] = 1

df = pd.concat([df_fake, df_real])
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = "".join([c for c in text if c not in string.punctuation])
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df["text"] = df["title"] + " " + df["text"]
df["text"] = df["text"].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Varun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Varun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["text"]).toarray()
y = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
model = LogisticRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

Accuracy: 0.9894209354120267
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4745
           1       0.99      0.99      0.99      4235

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [7]:
def predict_news(news_text):
    cleaned_text = clean_text(news_text)
    vectorized = tfidf.transform([cleaned_text])
    prediction = model.predict(vectorized)[0]
    label = "Real News" if prediction == 1 else "Fake News"
    print(f"\nPrediction: {label}")

In [8]:
sample_input = "Breaking: Scientists confirm a new planet has been discovered beyond Pluto with the help of NASA’s latest telescope."
predict_news(sample_input)


Prediction: Fake News


In [9]:
sample_input = "The United Nations has called for an immediate ceasefire in conflict zones around the world to help humanitarian efforts during the COVID-19 pandemic. The UN Secretary-General said that the global ceasefire would allow aid workers better access to vulnerable populations and ease the burden on healthcare systems."
predict_news(sample_input)


Prediction: Real News
