In [None]:
import pandas as pd
import numpy as np
import pickle
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Download the NLTK resources used below: the stopword corpus plus the Punkt
# tokenizer models that word_tokenize() depends on. Punkt ships as 'punkt' in
# older NLTK releases and as 'punkt_tab' in newer ones, so request both;
# nltk.download() skips packages that are already up to date.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load dataset (Fake and Real news combined).
# Ensure the dataset has 'text' and 'label' columns: both are read further
# down when cleaning the articles and splitting the data.
news_csv_path = 'C:/Users/hp/Desktop/news.csv'
df = pd.read_csv(news_csv_path)

_ENGLISH_STOPWORDS = None  # filled in lazily by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Calling stopwords.words() for every token rebuilds the word list
        # each time and forces a linear scan; a cached set gives O(1) lookups.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


# Text Preprocessing function
def clean_text(text):
    """Normalize a raw article into a space-joined string of kept tokens.

    Steps: replace every non-word character with a space, collapse runs of
    whitespace, lowercase, tokenize with ``word_tokenize`` and drop English
    stopwords.

    Args:
        text: The raw article text. Non-string values (e.g. ``None`` or the
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when the input is
        missing or contains no word characters.
    """
    if not isinstance(text, str):
        # Missing CSV cells arrive as NaN/None and would crash re.sub().
        return ""

    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()
    if not text.strip():
        # Nothing left to tokenize; same result as joining zero tokens.
        return ""

    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return " ".join(tokens)

# Run every article through clean_text and keep the result in a new column
cleaned_articles = df["text"].apply(clean_text)
df["clean_text"] = cleaned_articles

# Splitting Data: hold out 20% of the rows for testing, with a fixed seed so
# the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Model Pipeline (TF-IDF + Logistic Regression): the vectorizer is limited to
# 5000 features and its output feeds a default LogisticRegression
tfidf_step = ('tfidf', TfidfVectorizer(max_features=5000))
classifier_step = ('classifier', LogisticRegression())
model_pipeline = Pipeline([tfidf_step, classifier_step])

# Train Model on the training split only
model_pipeline.fit(X_train, y_train)

# Save Model: pickle the whole fitted pipeline (vectorizer + classifier)
with open("fake_news_model.pkl", "wb") as model_file:
    pickle.dump(model_pipeline, model_file)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
