In [7]:
import pandas as pd
import numpy as np
import re
import warnings
import string
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK imports for text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

# TextBlob for sentiment analysis
from textblob import TextBlob

# TensorFlow/Keras for deep learning model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# SpaCy for additional NLP tasks (if needed)
import spacy

# Scikit-learn for machine learning models and evaluation
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.impute import SimpleImputer

from collections import Counter

In [19]:

# Suppress warnings if necessary
warnings.filterwarnings('ignore')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet', download_dir='/contentnltk_data/')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /contentnltk_data/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] D

True

In [20]:
# Load spaCy model
try:
    nlp = spacy.load('en_core_web_sm')
except:
    import subprocess
    subprocess.run(['python', '-m', 'spacy', 'download', 'en_core_web_sm'])
    nlp = spacy.load('en_core_web_sm')


NameError: name 'df' is not defined

In [None]:
class FakeNewsDetector:
    def __init__(self):
        self.models = {}
        self.vectorizer = None
        self.le = LabelEncoder()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.sia = SentimentIntensityAnalyzer()
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='mean')
        self.min_max_scaler = MinMaxScaler()
        
    def preprocess_text(self, text):
        print("preprocessing the text...")
        """Clean and preprocess text data"""
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
        return ' '.join(tokens)
    
    def extract_linguistic_features(self, text):
        print("linguistic fucntionality...")
        features = {}
        features['word_count'] = len(text.split())
        features['char_count'] = len(text)
        features['avg_word_length'] = np.mean([len(word) for word in text.split()])
        features['unique_words'] = len(set(text.split()))
        features['unique_words_ratio'] = features['unique_words'] / features['word_count'] if features['word_count'] > 0 else 0
        sentences = sent_tokenize(text)
        features['sentence_count'] = len(sentences)
        features['avg_sentence_length'] = np.mean([len(sent.split()) for sent in sentences]) if sentences else 0
        features['avg_sentence_char_length'] = np.mean([len(sent) for sent in sentences]) if sentences else 0
        punct_counts = Counter(c for c in text if c in string.punctuation)
        features['exclamation_count'] = punct_counts['!']
        features['question_count'] = punct_counts['?']
        features['punctuation_ratio'] = sum(punct_counts.values()) / len(text) if len(text) > 0 else 0
        return features
    
    def extract_sentiment_features(self, text):
        features = {}
        vader_scores = self.sia.polarity_scores(text)
        features.update({f'vader_{k}': v for k, v in vader_scores.items()})
        blob = TextBlob(text)
        features['textblob_polarity'] = blob.sentiment.polarity
        features['textblob_subjectivity'] = blob.sentiment.subjectivity
        return features
    
    def extract_structural_features(self, text):
        features = {}
        words = text.split()
        features['caps_count'] = sum(1 for word in words if word.isupper())
        features['caps_ratio'] = features['caps_count'] / len(words) if words else 0
        paragraphs = text.split('\n\n')
        features['paragraph_count'] = len(paragraphs)
        features['avg_paragraph_length'] = np.mean([len(p.split()) for p in paragraphs]) if paragraphs else 0
        return features
    
    def extract_pos_features(self, text):
        features = {}
        doc = nlp(text)
        pos_counts = Counter(token.pos_ for token in doc)
        for pos, count in pos_counts.items():
            features[f'pos_{pos.lower()}'] = count
        ner_counts = Counter(ent.label_ for ent in doc.ents)
        for ner, count in ner_counts.items():
            features[f'ner_{ner.lower()}'] = count
        total_tokens = len(doc)
        if total_tokens > 0:
            features['noun_ratio'] = pos_counts['NOUN'] / total_tokens if 'NOUN' in pos_counts else 0
            features['verb_ratio'] = pos_counts['VERB'] / total_tokens if 'VERB' in pos_counts else 0
            features['adj_ratio'] = pos_counts['ADJ'] / total_tokens if 'ADJ' in pos_counts else 0
        return features
    
    def extract_readability_features(self, text):
        features = {}
        words = text.split()
        sentences = sent_tokenize(text)
        word_count = len(words)
        sentence_count = len(sentences)
        if sentence_count > 0 and word_count > 0:
            features['avg_words_per_sentence'] = word_count / sentence_count
            syllable_count = sum([self.count_syllables(word) for word in words])
            features['flesch_reading_ease'] = 206.835 - 1.015 * (word_count / sentence_count) - 84.6 * (syllable_count / word_count)
            complex_words = sum(1 for word in words if self.count_syllables(word) >= 3)
            features['gunning_fog'] = 0.4 * ((word_count / sentence_count) + 100 * (complex_words / word_count))
        return features
    
    def count_syllables(self, word):
        word = word.lower()
        count = 0
        vowels = 'aeiouy'
        if word[0] in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith('e'):
            count -= 1
        if count == 0:
            count += 1
        return count
    
    def prepare_data(self, df, text_column, label_column):
        print("Starting data preparation and feature engineering...")
        df['processed_text'] = df[text_column].apply(self.preprocess_text)
    
        # Extract all features
        linguistic_features = pd.DataFrame(df['processed_text'].apply(self.extract_linguistic_features).tolist())
        sentiment_features = pd.DataFrame(df['processed_text'].apply(self.extract_sentiment_features).tolist())
        structural_features = pd.DataFrame(df['processed_text'].apply(self.extract_structural_features).tolist())
        pos_features = pd.DataFrame(df['processed_text'].apply(self.extract_pos_features).tolist())
        readability_features = pd.DataFrame(df['processed_text'].apply(self.extract_readability_features).tolist())
    
        # TF-IDF features
        if self.vectorizer is None:
            self.vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
            text_features = self.vectorizer.fit_transform(df['processed_text'])
        else:
            text_features = self.vectorizer.transform(df['processed_text'])
    
        # Combine all non-text features
        all_features = pd.concat([
            linguistic_features,
            sentiment_features,
            structural_features,
            pos_features,
            readability_features
        ], axis=1)
    
        # Store the feature names for later use
        self.feature_names = all_features.columns.tolist()
    
        # Convert to numpy arrays
        text_features_array = text_features.toarray()
        other_features_array = all_features.values
    
        # Scale features
        self.scaler.fit(other_features_array)
        scaled_features = self.scaler.transform(other_features_array)
    
        # Also fit MinMaxScaler for models that need non-negative values
        self.min_max_scaler.fit(other_features_array)
        minmax_features = self.min_max_scaler.transform(other_features_array)
    
        # Combine all features
        self.feature_matrix = np.hstack((text_features_array, scaled_features, minmax_features))
    
        # Transform labels
        labels = self.le.fit_transform(df[label_column])
        print("Data preparation completed.")
        return self.feature_matrix, labels
        
    def train_models(self, X_train, y_train, X_test):
        print("Starting model training...")
    
        # Split features into text, scaled, and MinMaxScaled
        n_text_features = self.vectorizer.get_feature_names_out().shape[0]
        n_other_features = X_train.shape[1] - n_text_features
        X_train_text = X_train[:, :n_text_features]
        X_train_other = X_train[:, n_text_features:]
    
        # Scale the non-text features
        self.scaler.fit(X_train_other)
        X_train_other_scaled = self.scaler.transform(X_train_other)
    
        # MinMax scale the non-text features
        self.min_max_scaler.fit(X_train_other)
        X_train_other_minmax = self.min_max_scaler.transform(X_train_other)
    
        
        models = {
            'random_forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'logistic_regression': LogisticRegression(max_iter=1000, random_state=42),
            'svm': LinearSVC(random_state=42),
            'lstm': self._create_lstm_model(X_train.shape[1])
        }
        
    
        for name, model in models.items():
            print(f"Training {name}...")
            if name == 'naive_bayes':
                # For Naive Bayes, use MinMaxScaled features
                X_train_combined = np.hstack((X_train_text, X_train_other_minmax))
                model.fit(X_train_combined, y_train)
            elif name == 'lstm':
                # LSTM expects 3D input
                X_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
                model.fit(X_reshaped, y_train, epochs=5, batch_size=32, verbose=1)
            else:
                # Other models can use the standard scaled features
                X_train_combined = np.hstack((X_train_text, X_train_other_scaled))
                model.fit(X_train_combined, y_train)
    
            self.models[name] = model
        print("Model training completed.")
    
    def _create_lstm_model(self, input_dim):
        model = Sequential([
            LSTM(64, input_shape=(1, input_dim)),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def evaluate_models(self, X_test, y_test):
        print("evaluating the model...")
        results = {}
        for name, model in self.models.items():
            if name == 'lstm':
                X_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
                y_pred = (model.predict(X_reshaped) > 0.5).astype(int)
            else:
                y_pred = model.predict(X_test)
            results[name] = {
                'classification_report': classification_report(y_test, y_pred),
                'confusion_matrix': confusion_matrix(y_test, y_pred)
            }
        return results
    
    def predict(self, text):
        processed_text = self.preprocess_text(text)
        features = []
        features.extend(self.extract_linguistic_features(processed_text).values())
        features.extend(self.extract_sentiment_features(processed_text).values())
        features.extend(self.extract_structural_features(processed_text).values())
        features.extend(self.extract_pos_features(processed_text).values())
        features.extend(self.extract_readability_features(processed_text).values())
        text_features = self.vectorizer.transform([processed_text]).toarray()
        all_features = np.concatenate([text_features, np.array(features).reshape(1, -1)], axis=1)
        all_features = self.scaler.transform(all_features)
        predictions = {name: (model.predict(all_features.reshape(1, 1, -1)) if name == 'lstm' else model.predict(all_features)) for name, model in self.models.items()}
        return predictions
def main():
    try:
        # Load your dataset
        print("Loading dataset...")
        # Assuming dataset has 'text' and 'label' columns
        df = pd.read_csv('final_en.csv')
        print(f"Dataset loaded with {len(df)} rows")

        # Initialize detector
        print("Initializing FakeNewsDetector...")
        detector = FakeNewsDetector()

        # Prepare data
        print("Preparing data...")
        X, y = detector.prepare_data(df, 'text', 'label')
        print(f"Features shape: {X.shape}, Labels shape: {y.shape}")

        # Split data
        print("Splitting data into train and test sets...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=0.2, 
            random_state=42,
            stratify=y  # Ensures balanced split
        )
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Train models
        print("Training models...")
        detector.train_models(X_train, y_train, X_test)

        # Evaluate models
        print("\nEvaluating models...")
        results = detector.evaluate_models(X_test, y_test)
        
        # Print results
        print("\nModel Evaluation Results:")
        for model_name, result in results.items():
            print(f"\n{model_name.upper()} Results:")
            print(result['classification_report'])
            print("\nConfusion Matrix:")
            print(result['confusion_matrix'])

        # Example prediction
        print("\nTesting with a sample article...")
        new_article = """
        Scientists have discovered a groundbreaking new treatment for cancer
        that shows promising results in clinical trials.
        """
        prediction = detector.predict(new_article)
        print("\nPredictions for sample article:")
        for model_name, pred in prediction.items():
            print(f"{model_name}: {pred}")

        # Optional: Save models
        # print("\nSaving models...")
        # detector.save_models('models')

    except FileNotFoundError:
        print("Error: Could not find the news.csv file. Please ensure it exists in the correct directory.")
    except pd.errors.EmptyDataError:
        print("Error: The news.csv file is empty.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Loading dataset...
Dataset loaded with 10000 rows
Initializing FakeNewsDetector...
Preparing data...
Starting data preparation and feature engineering...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing the text...
preprocessing 