In [1]:
import pandas as pd
import numpy as np
import re
import warnings
import string
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK imports for text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

# TextBlob for sentiment analysis
from textblob import TextBlob

# TensorFlow/Keras for deep learning model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# SpaCy for additional NLP tasks (if needed)
import spacy

# Scikit-learn for machine learning models and evaluation
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.impute import SimpleImputer

from collections import Counter

In [2]:

# Suppress warnings if necessary
warnings.filterwarnings('ignore')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet', download_dir='/contentnltk_data/')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /contentnltk_data/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] D

True

In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")
print("spaCy model loaded successfully!")


spaCy model loaded successfully!


In [7]:
class FakeNewsDetector:
    def __init__(self):
        self.models = {}
        self.vectorizer = None
        self.le = LabelEncoder()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.sia = SentimentIntensityAnalyzer()
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='mean')
        self.min_max_scaler = MinMaxScaler()
        self.n_text_features = None
        
    def preprocess_text(self, text):
        print("preprocessing the text...")
        """Clean and preprocess text data"""
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
        return ' '.join(tokens)
    
    def extract_linguistic_features(self, text):
        print("linguistic functionality...")
        features = {}
        try:
            features['word_count'] = len(text.split())
            features['char_count'] = len(text)
            words = text.split()
            features['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0
            features['unique_words'] = len(set(words))
            features['unique_words_ratio'] = features['unique_words'] / features['word_count'] if features['word_count'] > 0 else 0
            sentences = sent_tokenize(text)
            features['sentence_count'] = len(sentences)
            features['avg_sentence_length'] = np.mean([len(sent.split()) for sent in sentences]) if sentences else 0
            features['avg_sentence_char_length'] = np.mean([len(sent) for sent in sentences]) if sentences else 0
            punct_counts = Counter(c for c in text if c in string.punctuation)
            features['exclamation_count'] = punct_counts['!']
            features['question_count'] = punct_counts['?']
            features['punctuation_ratio'] = sum(punct_counts.values()) / len(text) if len(text) > 0 else 0
        except Exception as e:
            print(f"Error in extract_linguistic_features: {str(e)}")
            # Fill with default values if there's an error
            features = {k: 0 for k in [
                'word_count', 'char_count', 'avg_word_length', 'unique_words',
                'unique_words_ratio', 'sentence_count', 'avg_sentence_length',
                'avg_sentence_char_length', 'exclamation_count', 'question_count',
                'punctuation_ratio'
            ]}
        return features    
    def extract_sentiment_features(self, text):
        features = {}
        vader_scores = self.sia.polarity_scores(text)
        features.update({f'vader_{k}': v for k, v in vader_scores.items()})
        blob = TextBlob(text)
        features['textblob_polarity'] = blob.sentiment.polarity
        features['textblob_subjectivity'] = blob.sentiment.subjectivity
        return features
    
    def extract_structural_features(self, text):
        features = {}
        words = text.split()
        features['caps_count'] = sum(1 for word in words if word.isupper())
        features['caps_ratio'] = features['caps_count'] / len(words) if words else 0
        paragraphs = text.split('\n\n')
        features['paragraph_count'] = len(paragraphs)
        features['avg_paragraph_length'] = np.mean([len(p.split()) for p in paragraphs]) if paragraphs else 0
        return features
    
    def extract_pos_features(self, text):
        features = {}
        doc = nlp(text)
        pos_counts = Counter(token.pos_ for token in doc)
        for pos, count in pos_counts.items():
            features[f'pos_{pos.lower()}'] = count
        ner_counts = Counter(ent.label_ for ent in doc.ents)
        for ner, count in ner_counts.items():
            features[f'ner_{ner.lower()}'] = count
        total_tokens = len(doc)
        if total_tokens > 0:
            features['noun_ratio'] = pos_counts['NOUN'] / total_tokens if 'NOUN' in pos_counts else 0
            features['verb_ratio'] = pos_counts['VERB'] / total_tokens if 'VERB' in pos_counts else 0
            features['adj_ratio'] = pos_counts['ADJ'] / total_tokens if 'ADJ' in pos_counts else 0
        return features
    
    def extract_readability_features(self, text):
        features = {}
        words = text.split()
        sentences = sent_tokenize(text)
        word_count = len(words)
        sentence_count = len(sentences)
        if sentence_count > 0 and word_count > 0:
            features['avg_words_per_sentence'] = word_count / sentence_count
            syllable_count = sum([self.count_syllables(word) for word in words])
            features['flesch_reading_ease'] = 206.835 - 1.015 * (word_count / sentence_count) - 84.6 * (syllable_count / word_count)
            complex_words = sum(1 for word in words if self.count_syllables(word) >= 3)
            features['gunning_fog'] = 0.4 * ((word_count / sentence_count) + 100 * (complex_words / word_count))
        return features
    
    def count_syllables(self, word):
        word = word.lower()
        count = 0
        vowels = 'aeiouy'
        if word[0] in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith('e'):
            count -= 1
        if count == 0:
            count += 1
        return count
    
    def prepare_data(self, df, text_column, label_column):
        print("Starting data preparation and feature engineering...")
        
        # Clean and standardize label types
        df[label_column] = df[label_column].astype(str).str.strip()
        
        # Remove invalid labels
        invalid_labels = df[df[label_column].isna()].index
        if len(invalid_labels) > 0:
            print(f"Removing {len(invalid_labels)} rows with invalid labels")
            df = df.drop(invalid_labels)
        
        df['processed_text'] = df[text_column].apply(self.preprocess_text)
    
        # Extract features
        linguistic_features = pd.DataFrame(df['processed_text'].apply(self.extract_linguistic_features).tolist())
        sentiment_features = pd.DataFrame(df['processed_text'].apply(self.extract_sentiment_features).tolist())
        structural_features = pd.DataFrame(df['processed_text'].apply(self.extract_structural_features).tolist())
        pos_features = pd.DataFrame(df['processed_text'].apply(self.extract_pos_features).tolist())
        readability_features = pd.DataFrame(df['processed_text'].apply(self.extract_readability_features).tolist())
    
        # TF-IDF features
        if self.vectorizer is None:
            self.vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
            text_features = self.vectorizer.fit_transform(df['processed_text'])
        else:
            text_features = self.vectorizer.transform(df['processed_text'])
        
        # Store number of text features
        self.n_text_features = text_features.shape[1]
    
        # Combine all non-text features
        self.other_features = [
            linguistic_features,
            sentiment_features,
            structural_features,
            pos_features,
            readability_features
        ]
        all_features = pd.concat(self.other_features, axis=1)
    
        # Store feature names and their order
        self.feature_names = all_features.columns.tolist()
    
        # Handle NaN values and scale features
        all_features = pd.DataFrame(self.imputer.fit_transform(all_features), columns=all_features.columns)
        other_features_array = all_features.values
        scaled_features = self.scaler.fit_transform(other_features_array)
    
        # Combine all features
        self.feature_matrix = np.hstack((text_features.toarray(), scaled_features))
    
        # Transform labels
        labels = self.le.fit_transform(df[label_column])
        
        return self.feature_matrix, labels   
    def train_models(self, X_train, y_train, X_test):
        print("Starting model training...")
    
        # Split features into text and other features
        n_text_features = self.vectorizer.get_feature_names_out().shape[0]
        X_train_text = X_train[:, :n_text_features]
        X_train_other = X_train[:, n_text_features:]
    
        # Handle any potential NaN values in the features
        X_train_other = self.imputer.transform(X_train_other)
        
        # Scale the non-text features
        X_train_other_scaled = self.scaler.transform(X_train_other)
    
        # Combine features
        X_train_combined = np.hstack((X_train_text, X_train_other_scaled))
    
        # Check for any remaining NaN values
        if np.isnan(X_train_combined).any():
            print("Warning: NaN values still present after preprocessing")
            # Replace any remaining NaN values with 0
            X_train_combined = np.nan_to_num(X_train_combined)
    
        models = {
            'random_forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'logistic_regression': LogisticRegression(max_iter=1000, random_state=42),
            'svm': LinearSVC(random_state=42),
            'lstm': self._create_lstm_model(X_train.shape[1])
        }
    
        for name, model in models.items():
            print(f"Training {name}...")
            if name == 'lstm':
                # LSTM expects 3D input
                X_reshaped = X_train_combined.reshape((X_train_combined.shape[0], 1, X_train_combined.shape[1]))
                model.fit(X_reshaped, y_train, epochs=5, batch_size=32, verbose=1)
            else:
                # Other models can use the standard scaled features
                model.fit(X_train_combined, y_train)
    
            self.models[name] = model
        print("Model training completed.")
    
    def _create_lstm_model(self, input_dim):
        model = Sequential([
            LSTM(64, input_shape=(1, input_dim)),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def evaluate_models(self, X_test, y_test):
        print("evaluating the model...")
        results = {}
        for name, model in self.models.items():
            if name == 'lstm':
                X_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
                y_pred = (model.predict(X_reshaped) > 0.5).astype(int)
            else:
                y_pred = model.predict(X_test)
            results[name] = {
                'classification_report': classification_report(y_test, y_pred),
                'confusion_matrix': confusion_matrix(y_test, y_pred)
            }
        return results
    
    def predict(self, text):
        """Predict using the trained models with proper feature handling."""
        # Preprocess text
        processed_text = self.preprocess_text(text)
        
        # Extract TF-IDF features
        text_features = self.vectorizer.transform([processed_text]).toarray()
        
        # Extract other features in the same order as during training
        other_features = {}
        other_features.update(self.extract_linguistic_features(processed_text))
        other_features.update(self.extract_sentiment_features(processed_text))
        other_features.update(self.extract_structural_features(processed_text))
        other_features.update(self.extract_pos_features(processed_text))
        other_features.update(self.extract_readability_features(processed_text))
        
        # Convert to DataFrame with correct column order
        other_features_df = pd.DataFrame([other_features])
        other_features_df = other_features_df.reindex(columns=self.feature_names, fill_value=0)
        
        # Handle missing values and scale
        other_features_array = self.imputer.transform(other_features_df)
        scaled_features = self.scaler.transform(other_features_array)
        
        # Combine features
        all_features = np.hstack((text_features, scaled_features))
        
        # Make predictions
        predictions = {}
        for name, model in self.models.items():
            if name == 'lstm':
                # Reshape for LSTM
                reshaped_features = all_features.reshape(1, 1, -1)
                pred = model.predict(reshaped_features)
                predictions[name] = (pred > 0.5).astype(int).flatten()[0]
            else:
                predictions[name] = model.predict(all_features)[0]
        
        # Convert numeric predictions to labels
        labeled_predictions = {
            name: self.le.inverse_transform([pred])[0] 
            for name, pred in predictions.items()
        }
        
        return labeled_predictions

    def manual_testing(self):
        """Allow manual testing of the model with user input."""
        print("\n=== Fake News Detection Manual Testing ===")
        while True:
            user_input = input("\nPaste your news article (or type 'exit' to quit):\n")
            if user_input.lower() == 'exit':
                break
            
            try:
                predictions = self.predict(user_input)
                
                print("\nPredictions:")
                for model_name, prediction in predictions.items():
                    print(f"{model_name.replace('_', ' ').title()}: {prediction}")
                
                # Calculate consensus prediction
                prediction_values = list(predictions.values())
                most_common = max(set(prediction_values), key=prediction_values.count)
                consensus_ratio = prediction_values.count(most_common) / len(prediction_values)
                
                print(f"\nFinal Prediction: {most_common}")
                print(f"Confidence: {consensus_ratio:.1%}\n")
                
            except Exception as e:
                print(f"Error occurred: {str(e)}")
        

def main():
    try:
        # Load your dataset
        print("Loading dataset...")
        df = pd.read_csv('final_en.csv')
        print(f"Dataset loaded with {len(df)} rows")
        
        # Clean dataset
        print("Cleaning dataset by dropping unnecessary unnamed columns...")
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        
        # Initialize FakeNewsDetector
        print("Initializing FakeNewsDetector...")
        detector = FakeNewsDetector()

        # Prepare data
        print("Preparing data...")
        X, y = detector.prepare_data(df, 'text', 'label')
        print(f"Features shape: {X.shape}, Labels shape: {y.shape}")

        # Split data
        print("Splitting data into train and test sets...")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train models
        print("Training models...")
        detector.train_models(X_train, y_train, X_test)

        # Evaluate models
        print("\nEvaluating models...")
        results = detector.evaluate_models(X_test, y_test)
        
        # Print evaluation results
        for model_name, result in results.items():
            print(f"\n{model_name.upper()} Results:")
            print(result['classification_report'])
            print("\nConfusion Matrix:")
            print(result['confusion_matrix'])
            
        # Start manual testing
        print("\nStarting manual testing mode...")
        detector.manual_testing()
        
    except FileNotFoundError:
        print("Error: Could not find the news.csv file. Please ensure it exists in the correct directory.")
    except pd.errors.EmptyDataError:
        print("Error: The news.csv file is empty.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Loading dataset...
Dataset loaded with 10002 rows
Cleaning dataset by dropping unnecessary unnamed columns...
Initializing FakeNewsDetector...
Preparing data...
Starting data preparation and feature engineering...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
prepro


Paste your news article (or type 'exit' to quit):
 kenya is not country its an island


preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 1
Lstm: 1

Final Prediction: 1
Confidence: 75.0%




Paste your news article (or type 'exit' to quit):
 nairobi is a continent


preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 0
Lstm: 1

Final Prediction: 1
Confidence: 50.0%




Paste your news article (or type 'exit' to quit):
 Xi says he will work with Trump team as he meets Biden in Peru


preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 1
Lstm: 1

Final Prediction: 1
Confidence: 75.0%




Paste your news article (or type 'exit' to quit):
 Xi says he will not  work with Trump team as he meets Biden in Peru


preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 1
Lstm: 1

Final Prediction: 1
Confidence: 75.0%




Paste your news article (or type 'exit' to quit):
 Rumors are spreading that the government is secretly drafting a bill to ban social media platforms. Critics claim this is a tactic to suppress dissent ahead of the upcoming elections. Anonymous sources report the bill could be passed within weeks.


preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 0
Lstm: 1

Final Prediction: 1
Confidence: 50.0%




Paste your news article (or type 'exit' to quit):
 A viral post claims that drinking lemon water exactly at 3 AM can cure all types of cancer. Self-proclaimed experts have shared anecdotal evidence of miraculous recoveries, though no scientific studies back the claim.


preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 0
Lstm: 1

Final Prediction: 1
Confidence: 50.0%




Paste your news article (or type 'exit' to quit):
 In a shocking revelation, a leaked World Bank memo suggests plans to make Bitcoin the global legal tender by 2030. The memo allegedly outlines steps to phase out all fiat currencies, sparking widespread concern among economists.


preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 0
Lstm: 1

Final Prediction: 1
Confidence: 50.0%




Paste your news article (or type 'exit' to quit):
 A newly launched AI-powered app promises to predict the exact moment of your death using advanced algorithms. Critics argue it’s a scam preying on people’s fears, but the app's developers claim 99% accuracy in tests.


preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 0
Lstm: 1

Final Prediction: 1
Confidence: 50.0%




Paste your news article (or type 'exit' to quit):
 - Diplomatic tensions over global warming will take center stage at the G20 summit in Brazil this week, as negotiators at U.N. talks in Azerbaijan hit an impasse on climate finance that they hope leaders of the world's 20 major economies can break. Heads of state arriving in Rio de Janeiro on Sunday for the G20 summit will spend Monday and Tuesday addressing issues from poverty and hunger to the reform of global institutions. Still, the ongoing U.N. climate talks have thrown a spotlight on their efforts to tackle global warming. While the COP29 summit in Baku, Azerbaijan, is tasked with agreeing a goal to mobilize hundreds billions of dollars for the climate, leaders of the Group of 20 major economies half a world away in Rio are holding the purse strings. G20 countries account for 85% of the world's economy and are the largest contributors to multilateral development banks helping to steer climate finance. They are also responsible f

preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 1
Lstm: 1

Final Prediction: 1
Confidence: 75.0%




Paste your news article (or type 'exit' to quit):
 Diplomatic tensions over global warming have caused the abrupt cancellation of the G20 summit initially scheduled in Brazil this week. Reports claim that negotiators at U.N. talks in Azerbaijan walked out after secret documents revealed the summit’s agenda prioritized corporate interests over genuine climate action.  Heads of state were reportedly set to arrive in Rio de Janeiro on Sunday to discuss issues ranging from cryptocurrency regulation to the privatization of rainforests. However, leaked memos suggest that discussions on poverty, hunger, and climate finance were to be sidelined.  In Baku, Azerbaijan, where the COP29 summit is being held, protesters stormed the conference venue, accusing negotiators of colluding with fossil fuel companies. Claims that a "climate fund" will be used to develop oil extraction technologies rather than renewable energy have sparked global outrage.  G20 countries, responsible for nearly all greenhou

preprocessing the text...
linguistic functionality...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step

Predictions:
Random Forest: 0
Logistic Regression: 1
Svm: 0
Lstm: 1

Final Prediction: 1
Confidence: 50.0%




Paste your news article (or type 'exit' to quit):
 exit
