# In [None]:
# ===============================
# Fake News Detector - Training
# ===============================

import pandas as pd
import numpy as np
import seaborn as sb
import re
import nltk
import pickle
import itertools
from matplotlib import pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import metrics

# -------------------------------
# Download NLTK resources
# -------------------------------
# Tokenizer data: older NLTK releases load 'punkt', while newer releases
# (3.8.2 and later) make nltk.word_tokenize load 'punkt_tab' instead.
# Both are requested so tokenization works with either. With its default
# arguments nltk.download() is expected to report an unknown package id
# rather than raise, so the extra request should be harmless on older
# releases.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word list and WordNet data used by the lemmatizer below.
nltk.download('stopwords')
nltk.download('wordnet')

# Module-level helpers shared by preprocess_text().
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# -------------------------------
# Load dataset
# -------------------------------
# Read the training CSV and show its dimensions plus the first rows.
train_df = pd.read_csv(r'/miniproject/dataset/train.csv')
print("Dataset loaded:", train_df.shape)
print(train_df.head())

# Discard the columns this script never reads afterwards.
train_df = train_df.drop(columns=["author", "title", "id"])

# -------------------------------
# Check distribution
# -------------------------------
def create_distribution(dataFile):
    """Draw a count plot of the ``label`` column.

    Args:
        dataFile: Data handed to seaborn as ``data``; it must provide a
            ``label`` column.

    Returns:
        The object returned by ``seaborn.countplot``.
    """
    plot_options = {'x': 'label', 'data': dataFile, 'palette': 'hls'}
    return sb.countplot(**plot_options)

# Plot how many rows carry each label value.
create_distribution(train_df)

# -------------------------------
# Check for missing values
# -------------------------------
# Per-column count of missing entries.
print(train_df.isna().sum())

# -------------------------------
# Preprocess text (clean + lemmatize + remove stopwords)
# -------------------------------
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Processing steps, in order:
      1. Remove every character that is not an ASCII letter or whitespace.
      2. Lowercase the result.
      3. Tokenize with ``nltk.word_tokenize``.
      4. Drop tokens found in the module-level ``stop_words`` set.
      5. Lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: The text to clean. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when ``text``
        is missing.
    """
    # A missing value would otherwise be stringified to "None"/"nan" and
    # survive cleaning as a spurious token in the feature vocabulary.
    # ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text)).lower()
    tokens = nltk.word_tokenize(letters_only)
    kept = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
    return ' '.join(kept)

# Replace every article body with its cleaned, lemmatized form.
train_df['text'] = train_df['text'].map(preprocess_text)

# -------------------------------
# Split dataset
# -------------------------------
# Features are the cleaned text; the target is the 'label' column, which
# must still be present in train_df at this point.
X, y = train_df['text'], train_df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -------------------------------
# TF-IDF Vectorizer
# -------------------------------
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split.
# max_df=0.7 ignores terms that appear in more than 70% of the documents.
tfidf_v = TfidfVectorizer(max_df=0.7, stop_words='english')
tfidf_X_train = tfidf_v.fit_transform(X_train)
tfidf_X_test = tfidf_v.transform(X_test)

# -------------------------------
# Train model
# -------------------------------
# The classifier shuffles the training data between epochs. Seeding it,
# like the train/test split, makes the reported accuracy and the saved
# model reproducible from run to run.
classifier = PassiveAggressiveClassifier(max_iter=50, random_state=42)
classifier.fit(tfidf_X_train, y_train)

# -------------------------------
# Evaluate model
# -------------------------------
# Predict on the held-out split and report accuracy as a percentage.
y_pred = classifier.predict(tfidf_X_test)
score = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {round(score*100, 2)}%")

# Confusion matrix of true versus predicted labels, plotted further below.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

def plot_confusion_matrix(cm, classes, title='Confusion Matrix'):
    """Draw a confusion matrix as an annotated heat map using pyplot.

    The plot is drawn through the ``plt`` state-machine interface; this
    function does not call ``plt.show()``.

    Args:
        cm: Two-dimensional matrix supporting ``.max()``, ``.shape`` and
            ``cm[i, j]`` indexing (for example a NumPy array).
        classes: Tick labels applied to both axes, in matrix order.
        title: Title shown above the plot.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each count into its cell. Counts above half of the largest cell
    # are drawn in white and the rest in black, for contrast with the
    # colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # tight_layout() only accounts for labels that already exist, so it runs
    # after the axis labels are set; otherwise they can be clipped.
    plt.tight_layout()

# Display names assume label 0 is real and label 1 is fake, matching the
# check in fake_news_det().
plot_confusion_matrix(cm, ['REAL', 'FAKE'])
plt.show()

# Per-class text report for the same predictions.
print(
    metrics.classification_report(
        y_test,
        y_pred,
        target_names=['REAL', 'FAKE'],
    )
)

# -------------------------------
# Save model and vectorizer
# -------------------------------
# Write the trained classifier to model.pkl (relative to the current
# working directory).
with open("model.pkl", "wb") as f:
    pickle.dump(classifier, f)

# The fitted vectorizer is saved as well: new text has to be transformed
# with this same object before it is passed to the classifier, as
# fake_news_det() does.
# NOTE: pickle files should only ever be loaded from a trusted source.
with open("vector.pkl", "wb") as f:
    pickle.dump(tfidf_v, f)

print("Model and vectorizer saved successfully!")

# -------------------------------
# Quick test function
# -------------------------------
def fake_news_det(news):
    """Classify one piece of news text and print the verdict.

    The text is cleaned with ``preprocess_text``, vectorized with the
    module-level ``tfidf_v`` and classified by the module-level
    ``classifier``. A predicted label of 1 prints the fake-news message;
    any other label prints the real-news message.

    Args:
        news: Raw news text to classify.

    Returns:
        None. The result is only printed.
    """
    features = tfidf_v.transform([preprocess_text(news)])
    predicted_label = classifier.predict(features)[0]
    verdict = "⚠ Fake News 📰" if predicted_label == 1 else "✅ Real News 📰"
    print(verdict)

# -------------------------------
# Try manual test
# -------------------------------
# Run one sample headline through the trained pipeline; the verdict is
# printed by fake_news_det().
news = "India successfully launched the Aditya-L1 solar mission from Sriharikota."
fake_news_det(news)
