## Voting system model

In [1]:
import os

# Change the working directory to the folder containing your notebook
os.chdir(r"c:\Users\ifahs\OneDrive\Documents\FYP_Interface\Fake_News_Detection-master (1)\Fake_News_Detection-master")

# Verify the change
print("Current working directory:", os.getcwd())


Current working directory: c:\Users\ifahs\OneDrive\Documents\FYP_Interface\Fake_News_Detection-master (1)\Fake_News_Detection-master


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB 
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Dropout
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Section 2: Enhanced Data Loading and Exploration
print("Loading and exploring dataset...")
data_path = r"C:\Users\ifahs\OneDrive\Documents\FYP_Interface\models\news_model.csv" 

# Load data with better error handling
try:
    df = pd.read_csv(data_path, encoding='ISO-8859-1')
    print(f"\nDataset successfully loaded: {data_path}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Basic dataset information
print("\nDataset Overview:")
print(f"Total samples: {len(df)}")
print(f"Total columns: {len(df.columns)}")
print("\nColumns in dataset:")
print(df.columns.tolist()[:10], "...") # Show first 10 columns

# Examine label distribution
print("\nLabel distribution (raw):")
print(df['label'].value_counts())
print("\nLabel distribution (percentage):")
print(df['label'].value_counts(normalize=True) * 100)

# Check for data quality issues
print("\nMissing values by column:")
missing_values = df.isnull().sum()
missing_pct = (missing_values / len(df)) * 100
missing_stats = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage Missing': missing_pct
})
print(missing_stats[missing_stats['Missing Values'] > 0])

# Text length analysis
df['text_length'] = df['text'].str.len()
print("\nText length statistics:")
print(df['text_length'].describe())

# Show samples of very short and very long articles
print("\nSample of shortest articles:")
print(df[df['text_length'] < 50]['text'].head())
print("\nSample of longest articles:")
print(df[df['text_length'] > df['text_length'].quantile(0.99)]['text'].head())

# Check for potential duplicates
print("\nDuplicate analysis:")
print(f"Total duplicates: {df.duplicated(subset='text').sum()}")

# Data quality summary
print("\nData Quality Summary:")
print(f"1. Number of unique labels: {df['label'].nunique()}")
print(f"2. Number of empty texts: {(df['text'].str.len() == 0).sum()}")
print(f"3. Number of very short texts (<50 chars): {(df['text_length'] < 50).sum()}")
print(f"4. Number of very long texts (>99th percentile): {(df['text_length'] > df['text_length'].quantile(0.99)).sum()}")

# Save initial data quality report
quality_report = {
    'total_samples': len(df),
    'missing_values': df.isnull().sum().to_dict(),
    'label_distribution': df['label'].value_counts().to_dict(),
    'text_length_stats': df['text_length'].describe().to_dict(),
    'duplicates': df.duplicated(subset='text').sum()
}

print("\nInitial data quality report saved. Ready for preprocessing phase.")

Loading and exploring dataset...



Dataset successfully loaded: C:\Users\ifahs\OneDrive\Documents\FYP_Interface\models\news_model.csv

Dataset Overview:
Total samples: 56291
Total columns: 6

Columns in dataset:
['id', 'title', 'text', 'subject', 'date', 'label'] ...

Label distribution (raw):
label
TRUE                                                                                                                                                                                                                                                                                                                                                       38871
Fake                                                                                                                                                                                                                                                                                                                                                       17400
fake                       

Total duplicates: 1

Data Quality Summary:
1. Number of unique labels: 12
2. Number of empty texts: 0
3. Number of very short texts (<50 chars): 102
4. Number of very long texts (>99th percentile): 563

Initial data quality report saved. Ready for preprocessing phase.


In [4]:
# Section 3: Enhanced Data Preprocessing with Better Label Handling
print("\nStarting enhanced preprocessing...")

# First, show original data distribution
print("\nOriginal label distribution:")
print(df['label'].value_counts())

# Standardize labels more carefully
df['label'] = df['label'].str.strip().str.lower()
df['label'] = df['label'].replace({
    'true': 'True',
    'fake': 'Fake',
    'true ': 'True',
    'fake ': 'Fake',
    'TRUE': 'True',
    'FALSE': 'Fake'
})

# Print intermediate distribution
print("\nAfter standardization:")
print(df['label'].value_counts())

# Clean data
df = df[~df['text'].isna()]  # Remove rows with no text
df = df[df['text'].str.len() > 10]  # Remove very short texts

# Keep only valid labels
valid_labels = ['True', 'Fake']
df = df[df['label'].isin(valid_labels)]

# Print final distribution
print("\nFinal label distribution:")
print(df['label'].value_counts())
print(df['label'].value_counts(normalize=True))

# Remove duplicate articles
df = df.drop_duplicates(subset='text', keep='first')
print(f"\nShape after removing duplicates: {df.shape}")


# Convert labels to numeric
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])
X = df['text']

# Split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# TF-IDF Vectorization
print("\nFitting TF-IDF vectorizer...")
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    max_df=0.7,
    min_df=2,  # Remove very rare words
    ngram_range=(1, 2)
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Vocabulary size: {len(tfidf.vocabulary_)}")
print(f"Training set shape after vectorization: {X_train_tfidf.shape}")

# Apply SMOTE for balance
print("\nApplying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
X_train_tfidf_balanced, y_train_balanced = smote.fit_resample(
    X_train_tfidf, y_train
)

# Print class distribution after SMOTE
unique, counts = np.unique(y_train_balanced, return_counts=True)
print("Class distribution after SMOTE:")
for label, count in zip(['Fake', 'True'], counts):
    print(f"{label}: {count}")

print("\nPreprocessing completed successfully.")


Starting enhanced preprocessing...

Original label distribution:
label
TRUE                                                                                                                                                                                                                                                                                                                                                       38871
Fake                                                                                                                                                                                                                                                                                                                                                       17400
fake                                                                                                                                                                                                                          


Final label distribution:
label
True    38872
Fake    17390
Name: count, dtype: int64
label
True    0.69091
Fake    0.30909
Name: proportion, dtype: float64



Shape after removing duplicates: (56262, 7)

Training set size: 45009
Testing set size: 11253

Fitting TF-IDF vectorizer...


Vocabulary size: 5000
Training set shape after vectorization: (45009, 5000)

Applying SMOTE to balance classes...


Class distribution after SMOTE:
Fake: 31097
True: 31097

Preprocessing completed successfully.


In [5]:
# Section 4: Model Training and Evaluation Function
def train_evaluate_model(model, name, X_train, y_train, X_test, y_test):
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(report)
    
    return {
        'model': model,
        'accuracy': accuracy,
        'f1': f1,
        'conf_matrix': conf_matrix,
        'report': report
    }

In [6]:
# Section 5: Train Individual Models
results = {}

# PassiveAggressive
pa_model = PassiveAggressiveClassifier(max_iter=1000)
results['PassiveAggressive'] = train_evaluate_model(
    pa_model, 'PassiveAggressive',
    X_train_tfidf_balanced, y_train_balanced,
    X_test_tfidf, y_test
)

# Naive Bayes
nb_model = MultinomialNB()
results['NaiveBayes'] = train_evaluate_model(
    nb_model, 'NaiveBayes',
    X_train_tfidf_balanced, y_train_balanced,
    X_test_tfidf, y_test
)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
results['RandomForest'] = train_evaluate_model(
    rf_model, 'RandomForest',
    X_train_tfidf_balanced, y_train_balanced,
    X_test_tfidf, y_test
)




Training PassiveAggressive...


PassiveAggressive Results:
Accuracy: 0.9961
F1 Score: 0.9972

Confusion Matrix:
[[3459   19]
 [  25 7750]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3478
           1       1.00      1.00      1.00      7775

    accuracy                           1.00     11253
   macro avg       1.00      1.00      1.00     11253
weighted avg       1.00      1.00      1.00     11253


Training NaiveBayes...
NaiveBayes Results:
Accuracy: 0.9794
F1 Score: 0.9849

Confusion Matrix:
[[3433   45]
 [ 187 7588]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      3478
           1       0.99      0.98      0.98      7775

    accuracy                           0.98     11253
   macro avg       0.97      0.98      0.98     11253
weighted avg       0.98      0.98      0.98     11253


Training RandomForest...


RandomForest Results:
Accuracy: 0.9921
F1 Score: 0.9943

Confusion Matrix:
[[3414   64]
 [  25 7750]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3478
           1       0.99      1.00      0.99      7775

    accuracy                           0.99     11253
   macro avg       0.99      0.99      0.99     11253
weighted avg       0.99      0.99      0.99     11253



In [7]:
# Section 6: Train Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('pa', PassiveAggressiveClassifier()),
        ('nb', MultinomialNB()),
        ('rf', RandomForestClassifier())
    ],
    voting='hard'
)

results['Voting'] = train_evaluate_model(
    voting_clf, 'Voting',
    X_train_tfidf_balanced, y_train_balanced,
    X_test_tfidf, y_test
)



Training Voting...


Voting Results:
Accuracy: 0.9953
F1 Score: 0.9966

Confusion Matrix:
[[3452   26]
 [  27 7748]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3478
           1       1.00      1.00      1.00      7775

    accuracy                           1.00     11253
   macro avg       0.99      0.99      0.99     11253
weighted avg       1.00      1.00      1.00     11253



In [8]:
# Train the model
voting_clf.fit(X_train_tfidf_balanced, y_train_balanced)

# Make predictions
y_pred = voting_clf.predict(X_test_tfidf)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nVoting Classifier Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(report)



Voting Classifier Results:
Accuracy: 0.9950
F1 Score: 0.9964

Confusion Matrix:
[[3449   29]
 [  27 7748]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3478
           1       1.00      1.00      1.00      7775

    accuracy                           1.00     11253
   macro avg       0.99      0.99      0.99     11253
weighted avg       1.00      1.00      1.00     11253



In [9]:
save_path = r"C:\Users\ifahs\OneDrive\Documents\FYP_Interface\models\best_news_model.pkl"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, 'wb') as f:
    pickle.dump((voting_clf, tfidf), f)

print(f"\nModel saved to: {save_path}")


Model saved to: C:\Users\ifahs\OneDrive\Documents\FYP_Interface\models\best_news_model.pkl
