In [6]:
### -*- coding: utf-8 -*-
"""
This script provides a command-line interface to train a fake news classifier
and predict whether a given news article is real or fake. It mirrors the
functionality of the original React component.
"""

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.exceptions import NotFittedError
import string
import warnings
import os

# Suppress convergence warnings from MLPClassifier for a cleaner output
warnings.filterwarnings("ignore", category=UserWarning)

class FakeNewsClassifier:
    """
    A classifier for detecting fake news using a combination of TF-IDF
    vectorization, simple feature engineering, and a Multi-layer Perceptron model.
    """
    def __init__(self):
        """Initializes the vectorizer and the neural network model."""
        # TF-IDF Vectorizer to convert text into numerical features
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2)
        )
        # MLPClassifier serves as our neural network
        self.model = MLPClassifier(
            hidden_layer_sizes=(128, 64),
            activation='relu',
            solver='adam',
            max_iter=1,          # Train one epoch at a time in a loop
            warm_start=True,     # Retain weights between 'fit' calls
            random_state=42,
            learning_rate_init=0.001,
            alpha=0.0001
        )
        self.is_trained = False
        self.validation_accuracy = 0.0

    def _feature_engineering(self, texts):
        """
        Creates additional numerical features from the text.
        Args:
            texts (list of str): A list of text documents.
        Returns:
            np.array: An array of engineered features for each text.
        """
        engineered_features = []
        for text in texts:
            text_len = len(text)
            # Avoid division by zero for empty strings
            if text_len == 0:
                punc_density = 0
            else:
                punc_count = sum(1 for char in text if char in string.punctuation)
                punc_density = punc_count / text_len
            
            engineered_features.append([text_len, punc_density])
            
        return np.array(engineered_features)

    def train(self, texts, labels, validation_split=0.2, epochs=20):
        """
        Trains the model on the provided data, preventing data leakage.
        Args:
            texts (list of str): The news article texts.
            labels (list of int): The corresponding labels (0 for real, 1 for fake).
            validation_split (float): The proportion of data to use for validation.
            epochs (int): The number of training epochs.
        """
        print("✓ Building neural network and preprocessing data...")

        # 1. Split data into training and validation sets FIRST to prevent leakage
        X_train_text, X_val_text, y_train, y_val = train_test_split(
            texts, labels, test_size=validation_split, random_state=42, stratify=labels
        )
        
        print(f"✓ Data split: {len(X_train_text)} training, {len(X_val_text)} validation samples.")

        # 2. Fit and transform the vectorizer ONLY on the training data
        X_train_tfidf = self.vectorizer.fit_transform(X_train_text).toarray()
        
        # 3. Transform the validation data using the FITTED vectorizer
        X_val_tfidf = self.vectorizer.transform(X_val_text).toarray()

        # 4. Create engineered features for both sets separately
        X_train_eng = self._feature_engineering(X_train_text)
        X_val_eng = self._feature_engineering(X_val_text)
        print(f"✓ Vectorized text and added {X_train_eng.shape[1]} engineered features.")

        # 5. Combine TF-IDF features with engineered features for both sets
        X_train_combined = np.hstack((X_train_tfidf, X_train_eng))
        X_val_combined = np.hstack((X_val_tfidf, X_val_eng))
        
        print("⚡ Starting training...")
        
        # 6. Training loop to simulate epoch-by-epoch logging
        for epoch in range(epochs):
            self.model.fit(X_train_combined, y_train)

            # Calculate metrics for logging
            train_pred = self.model.predict(X_train_combined)
            train_acc = accuracy_score(y_train, train_pred)
            val_pred = self.model.predict(X_val_combined)
            val_acc = accuracy_score(y_val, val_pred)
            loss = self.model.loss_

            print(
                f"Epoch {epoch + 1}/{epochs} - "
                f"Loss: {loss:.4f}, "
                f"Accuracy: {train_acc*100:.2f}%, "
                f"Val Accuracy: {val_acc*100:.2f}%"
            )

            if epoch == epochs - 1:
                self.validation_accuracy = val_acc

        self.is_trained = True
        print(f"\n✓ Training complete! Final Validation Accuracy: {self.validation_accuracy*100:.2f}%")
        
        # --- Detailed Validation Metrics ---
        print("\n" + "-"*15 + " Detailed Validation Metrics " + "-"*15)
        final_val_pred = self.model.predict(X_val_combined)

        # Confusion Matrix
        print("\nConfusion Matrix:")
        cm = confusion_matrix(y_val, final_val_pred)
        print(f"               Predicted Real  Predicted Fake")
        print(f"Actual Real    {cm[0][0]:<15} {cm[0][1]:<15}")
        print(f"Actual Fake    {cm[1][0]:<15} {cm[1][1]:<15}")
        
        # Classification Report (Precision, Recall, F1-score)
        print("\nClassification Report:")
        print(classification_report(y_val, final_val_pred, target_names=['Real (0)', 'Fake (1)']))
        print("-" * 55)


    def predict(self, text):
        """
        Predicts the label for a single piece of text.
        Args:
            text (str): The news article text to classify.
        Returns:
            dict: A dictionary containing the predicted 'label' and 'confidence'.
        """
        if not self.is_trained:
            raise NotFittedError("Model is not trained. Please train it before making predictions.")

        text_list = [text]
        
        # Apply the same transformations as in training
        engineered_features = self._feature_engineering(text_list)
        tfidf_features = self.vectorizer.transform(text_list).toarray()
        combined_features = np.hstack((tfidf_features, engineered_features))

        # Predict probabilities for [class 0, class 1]
        probabilities = self.model.predict_proba(combined_features)[0]
        
        # We mapped 'real' to 0 and 'fake' to 1
        confidence_fake = probabilities[1]
        
        if confidence_fake > 0.5:
            label = "Fake"
            confidence = confidence_fake
        else:
            label = "Real"
            confidence = 1 - confidence_fake
            
        return {"label": label, "confidence": confidence}

def main():
    """Main function to run the command-line interface."""
    filepath = '/Users/chandu/Downloads/fakeNews.csv'
    
    try:
        print(f"Attempting to load data from '{filepath}'...")
        df = pd.read_csv(filepath)
        df = df.dropna(subset=['Text', 'label']).reset_index(drop=True)
        print("✓ Data loaded successfully.")
    except FileNotFoundError:
        print(f"✗ Error: The file '{filepath}' was not found. Please check the path and try again.")
        return
    except Exception as e:
        print(f"✗ An error occurred while reading the CSV file: {e}")
        return

    # Use the entire loaded dataset
    df_sampled = df
    print(f"✓ Loaded all {len(df_sampled)} articles from the dataset.")

    texts = df_sampled['Text'].astype(str).tolist()
    labels = [1 if str(label).lower() == 'fake' else 0 for label in df_sampled['label']]

    classifier = FakeNewsClassifier()

    while True:
        print("\n" + "="*40)
        print("  Fake News Detector CLI")
        print("="*40)
        if classifier.is_trained:
             print(f"  Model Status: Trained (Val Acc: {classifier.validation_accuracy*100:.2f}%)")
        else:
             print("  Model Status: Not Trained")
        print("-" * 40)
        print("1. Train Model")
        print("2. Classify an Article")
        print("3. Exit")
        choice = input("Enter your choice (1-3): ")

        if choice == '1':
            print("\n--- Training Model ---")
            classifier.train(texts, labels, epochs=9) # Reduced epochs to optimal number
            print("----------------------")
        elif choice == '2':
            if not classifier.is_trained:
                print("\n[!] Model is not trained. Please train the model first (Option 1).")
                continue

            print("\n--- Classify Article ---")
            news_text = input("Paste the news article text here:\n> ")
            if not news_text.strip():
                print("\n[!] No text provided. Please enter an article.")
                continue

            print("\nAnalyzing...")
            try:
                result = classifier.predict(news_text)
                print("\n--- Prediction Result ---")
                print(f"  Label: {result['label']} News")
                print(f"  Confidence: {result['confidence']*100:.2f}%")
                print("-------------------------")
            except Exception as e:
                print(f"\nAn error occurred during prediction: {e}")

        elif choice == '3':
            print("Exiting application.")
            break
        else:
            print("\n[!] Invalid choice. Please enter a number between 1 and 3.")

if __name__ == "__main__":
    main()




Attempting to load data from '/Users/chandu/Downloads/fakeNews.csv'...
✓ Data loaded successfully.
✓ Loaded all 9900 articles from the dataset.

  Fake News Detector CLI
  Model Status: Not Trained
----------------------------------------
1. Train Model
2. Classify an Article
3. Exit


Enter your choice (1-3):  1



--- Training Model ---
✓ Building neural network and preprocessing data...
✓ Data split: 7920 training, 1980 validation samples.
✓ Vectorized text and added 2 engineered features.
⚡ Starting training...
Epoch 1/9 - Loss: 4.0312, Accuracy: 49.51%, Val Accuracy: 49.49%
Epoch 2/9 - Loss: 2.7350, Accuracy: 54.23%, Val Accuracy: 53.84%
Epoch 3/9 - Loss: 2.2934, Accuracy: 86.09%, Val Accuracy: 84.85%
Epoch 4/9 - Loss: 1.3486, Accuracy: 96.34%, Val Accuracy: 96.67%
Epoch 5/9 - Loss: 2.0458, Accuracy: 96.89%, Val Accuracy: 96.82%
Epoch 6/9 - Loss: 2.1652, Accuracy: 71.26%, Val Accuracy: 71.16%
Epoch 7/9 - Loss: 1.5745, Accuracy: 95.80%, Val Accuracy: 96.31%
Epoch 8/9 - Loss: 1.3587, Accuracy: 93.46%, Val Accuracy: 93.38%
Epoch 9/9 - Loss: 0.9784, Accuracy: 81.14%, Val Accuracy: 80.40%

✓ Training complete! Final Validation Accuracy: 80.40%

--------------- Detailed Validation Metrics ---------------

Confusion Matrix:
               Predicted Real  Predicted Fake
Actual Real    978           

Enter your choice (1-3):  2



--- Classify Article ---


Paste the news article text here:
>   BUSTED: Trump Supporter Used Poll Watcher Credentials To Force Early Voters To Leave Polling Place Clearly, there is no low Trump supporters won t stoop in the effort to get him elected president.Ever since the Republican nominee told his deplorable supporters that the election is  rigged  and that they should become poll watchers there has been multiple incidents of voter intimidation, voter suppression, and voter fraud by Trump supporters and even an act of violence against a polling place.And another Trump supporter just got caught trying to intimidate early voters into leaving an Arkansas polling place.Thus far, early voting has benefited Hillary Clinton and that s a big deal with only four days left until Election Day. So Jefferson County Election Commission Stu Soffer, who is a Trump supporter, used his poll watcher credentials and stood in a doorway to yell at early voters at the Pine Bluff polling location to leave. Shut up and go home,  So


Analyzing...

--- Prediction Result ---
  Label: Fake News
  Confidence: 65.52%
-------------------------

  Fake News Detector CLI
  Model Status: Trained (Val Acc: 80.40%)
----------------------------------------
1. Train Model
2. Classify an Article
3. Exit


Enter your choice (1-3):  3


Exiting application.
