In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import emoji
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import RegexpTokenizer, TweetTokenizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import time

[nltk_data] Downloading package punkt to
[nltk_data]     /home/mt0/24CS60R08/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mt0/24CS60R08/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mt0/24CS60R08/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Initialize stopwords, lemmatizer, and tokenizer
stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
lemmatizer = nltk.WordNetLemmatizer()
tokenizer = TweetTokenizer()
punct_set = set(string.punctuation + '''…'"`’”“''' + '️')

def preprocess_text(text):
    """
    Cleans and preprocesses social media text by:
    - Lowercasing
    - Tokenizing using a TweetTokenizer
    - Replacing emojis, hashtags, user mentions, and URLs with special tokens
    - Removing punctuation and stopwords
    - Lemmatizing tokens
    """
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    updated_tokens = []
    for t in tokens:
        if emoji.is_emoji(t):
            updated_tokens.append(f"<emoji> {emoji.demojize(t, delimiters=('', ''))} </emoji>")
        elif t.startswith('#'):
            updated_tokens.append(f'<hashtag> {t[1:]} </hashtag>')
        elif t.startswith('@'):
            updated_tokens.append('<user>')
        elif t.startswith('http'):
            updated_tokens.append('<url>')
        elif t in punct_set:
            pass
        elif t and t not in stop_words:
            updated_tokens.append(t)

    updated_tokens = [lemmatizer.lemmatize(word) for word in updated_tokens]
    return " ".join(updated_tokens)

def to_corpus(X):
    """Applies preprocessing to an array of text."""
    vfunc = np.vectorize(preprocess_text)
    return vfunc(X)

In [4]:
# Define class mappings and initialize the vectorizer
class_map = {"real": 1, "fake": 0}
vectorizer = TfidfVectorizer()

def load_data(filepath):
    """Loads, preprocesses, and vectorizes data from a CSV file."""
    df = pd.read_csv(filepath)
    # Ensure the first column is treated as the text data
    X = df.iloc[:, 0].values 
    y = df.iloc[:, -1].map(class_map).values
    
    print("Preprocessing text data...")
    X_corpus = to_corpus(X)
    
    print("Vectorizing text data with TF-IDF...")
    X_vectorized = vectorizer.fit_transform(X_corpus)
    
    return X_vectorized, y


In [5]:
# Load the dataset
filepath = "./CL-II-MisinformationData - Sheet1.csv"
X, y = load_data(filepath)
print(f"\nData loaded successfully. Shape of X: {X.shape}, Shape of y: {y.shape}")

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, shuffle=True)

print(f"Shape of X_train: {X_train.shape}, Shape of y_train: {y_train.shape}")
print(f"Shape of X_test:  {X_test.shape}, Shape of y_test:  {y_test.shape}")
print(f"Shape of X_val:   {X_val.shape}, Shape of y_val:   {y_val.shape}\n")

# Dictionary to store model performance
model_performance = {}

Preprocessing text data...
Vectorizing text data with TF-IDF...

Data loaded successfully. Shape of X: (10600, 15763), Shape of y: (10600,)
Shape of X_train: (8480, 15763), Shape of y_train: (8480,)
Shape of X_test:  (1060, 15763), Shape of y_test:  (1060,)
Shape of X_val:   (1060, 15763), Shape of y_val:   (1060,)



In [6]:
# ---------------------- 1️. K-Nearest Neighbors (KNN) ----------------------
print("--- Training K-Nearest Neighbors (KNN) ---")
start_time = time.time()
knn_params = {"n_neighbors": [3, 5, 7], "metric": ["euclidean", "manhattan"]}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, n_jobs=-1)
knn_grid.fit(X_train, y_train)
y_pred_knn = knn_grid.best_estimator_.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
model_performance["KNN"] = knn_accuracy
end_time = time.time()

print(f"KNN training completed in {end_time - start_time:.2f} seconds.")
print(f"Best parameters for KNN: {knn_grid.best_params_}")
print(f"KNN Accuracy: {knn_accuracy:.4f}")
print("Classification Report for KNN:")
print(classification_report(y_test, y_pred_knn))
print("-" * 50)

--- Training K-Nearest Neighbors (KNN) ---


KNN training completed in 4.87 seconds.
Best parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 3}
KNN Accuracy: 0.8858
Classification Report for KNN:
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       530
           1       0.85      0.93      0.89       530

    accuracy                           0.89      1060
   macro avg       0.89      0.89      0.89      1060
weighted avg       0.89      0.89      0.89      1060

--------------------------------------------------


In [7]:
# ---------------------- 2. Logistic Regression ----------------------
print("\n--- Training Logistic Regression ---")
start_time = time.time()
log_params = {"C": [0.1, 1, 10], "solver": ["liblinear"]}
log_reg_grid = GridSearchCV(LogisticRegression(random_state=42), log_params, cv=5, n_jobs=-1)
log_reg_grid.fit(X_train, y_train)
y_pred_log = log_reg_grid.best_estimator_.predict(X_test)
log_accuracy = accuracy_score(y_test, y_pred_log)
model_performance["Logistic Regression"] = log_accuracy
end_time = time.time()

print(f"Logistic Regression training completed in {end_time - start_time:.2f} seconds.")
print(f"Best parameters for Logistic Regression: {log_reg_grid.best_params_}")
print(f"Logistic Regression Accuracy: {log_accuracy:.4f}")
print("Classification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_log))
print("-" * 50)


--- Training Logistic Regression ---
Logistic Regression training completed in 2.34 seconds.
Best parameters for Logistic Regression: {'C': 10, 'solver': 'liblinear'}
Logistic Regression Accuracy: 0.9491
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       530
           1       0.94      0.95      0.95       530

    accuracy                           0.95      1060
   macro avg       0.95      0.95      0.95      1060
weighted avg       0.95      0.95      0.95      1060

--------------------------------------------------


In [8]:
# ---------------------- 3️. Support Vector Machine (SVM) ----------------------
print("\n--- Training Support Vector Machine (SVM) ---")
start_time = time.time()
svm_params = {"C": [0.1, 1, 10]}
svm_grid = GridSearchCV(SVC(kernel="linear", random_state=42), svm_params, cv=5, n_jobs=-1)
svm_grid.fit(X_train, y_train)
y_pred_svm = svm_grid.best_estimator_.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
model_performance["SVM"] = svm_accuracy
end_time = time.time()

print(f"SVM training completed in {end_time - start_time:.2f} seconds.")
print(f"Best parameters for SVM: {svm_grid.best_params_}")
print(f"SVM Accuracy: {svm_accuracy:.4f}")
print("Classification Report for SVM:")
print(classification_report(y_test, y_pred_svm))
print("-" * 50)


--- Training Support Vector Machine (SVM) ---
SVM training completed in 11.13 seconds.
Best parameters for SVM: {'C': 1}
SVM Accuracy: 0.9500
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       530
           1       0.95      0.95      0.95       530

    accuracy                           0.95      1060
   macro avg       0.95      0.95      0.95      1060
weighted avg       0.95      0.95      0.95      1060

--------------------------------------------------


In [9]:
# ---------------------- 4️. K-Means Clustering (Unsupervised) ----------------------
print("\n--- Training K-Means Clustering ---")
start_time = time.time()
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_train)
y_pred_kmeans_raw = kmeans.predict(X_test)

accuracy_mapping1 = accuracy_score(y_test, y_pred_kmeans_raw)
accuracy_mapping2 = accuracy_score(y_test, 1 - y_pred_kmeans_raw)

if accuracy_mapping1 > accuracy_mapping2:
    kmeans_accuracy = accuracy_mapping1
    y_pred_kmeans = y_pred_kmeans_raw
    print("K-Means cluster 0 mapped to class 0, cluster 1 to class 1.")
else:
    kmeans_accuracy = accuracy_mapping2
    y_pred_kmeans = 1 - y_pred_kmeans_raw
    print("K-Means cluster 0 mapped to class 1, cluster 1 to class 0.")

model_performance["K-Means"] = kmeans_accuracy
end_time = time.time()

print(f"K-Means training completed in {end_time - start_time:.2f} seconds.")
print(f"K-Means Accuracy: {kmeans_accuracy:.4f}")
print("Classification Report for K-Means:")
print(classification_report(y_test, y_pred_kmeans))
print("-" * 50)


--- Training K-Means Clustering ---
K-Means cluster 0 mapped to class 0, cluster 1 to class 1.
K-Means training completed in 0.40 seconds.
K-Means Accuracy: 0.5896
Classification Report for K-Means:
              precision    recall  f1-score   support

           0       0.56      0.86      0.68       530
           1       0.70      0.32      0.44       530

    accuracy                           0.59      1060
   macro avg       0.63      0.59      0.56      1060
weighted avg       0.63      0.59      0.56      1060

--------------------------------------------------


In [10]:
# ---------------------- 6️. Gradient Boosting ----------------------
print("\n--- Training Gradient Boosting ---")
start_time = time.time()
gb_params = {"n_estimators": [100, 200], "learning_rate": [0.01, 0.1], "max_depth": [3, 5]}
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=5, n_jobs=-1)
gb_grid.fit(X_train, y_train)
y_pred_gb = gb_grid.best_estimator_.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
model_performance["Gradient Boosting"] = gb_accuracy
end_time = time.time()

print(f"Gradient Boosting training completed in {end_time - start_time:.2f} seconds.")
print(f"Best parameters for Gradient Boosting: {gb_grid.best_params_}")
print(f"Gradient Boosting Accuracy: {gb_accuracy:.4f}")
print("Classification Report for Gradient Boosting:")
print(classification_report(y_test, y_pred_gb))
print("-" * 50)


--- Training Gradient Boosting ---
Gradient Boosting training completed in 80.94 seconds.
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Gradient Boosting Accuracy: 0.9453
Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       530
           1       0.95      0.94      0.94       530

    accuracy                           0.95      1060
   macro avg       0.95      0.95      0.95      1060
weighted avg       0.95      0.95      0.95      1060

--------------------------------------------------


In [11]:
# ---------------------- 5️. Neural Network (MLP Classifier) ----------------------
print("\n--- Training Neural Network (MLP) ---")
start_time = time.time()
# Increased max_iter to 300 to ensure convergence
mlp_params = {"hidden_layer_sizes": [(50,), (100,)], "activation": ["relu", "tanh"], "alpha": [0.0001, 0.001]}
mlp_grid = GridSearchCV(MLPClassifier(max_iter=300, random_state=42), mlp_params, cv=5, n_jobs=-1)
mlp_grid.fit(X_train, y_train)
y_pred_mlp = mlp_grid.best_estimator_.predict(X_test)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
model_performance["Neural Network"] = mlp_accuracy
end_time = time.time()

print(f"MLP training completed in {end_time - start_time:.2f} seconds.")
print(f"Best parameters for MLP: {mlp_grid.best_params_}")
print(f"MLP Accuracy: {mlp_accuracy:.4f}")
print("Classification Report for MLP:")
print(classification_report(y_test, y_pred_mlp))
print("-" * 50)


--- Training Neural Network (MLP) ---
MLP training completed in 469.39 seconds.
Best parameters for MLP: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100,)}
MLP Accuracy: 0.9481
Classification Report for MLP:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       530
           1       0.94      0.95      0.95       530

    accuracy                           0.95      1060
   macro avg       0.95      0.95      0.95      1060
weighted avg       0.95      0.95      0.95      1060

--------------------------------------------------


In [12]:
# ---------------------- Final Performance Summary ----------------------
print("Final Model Performance (Accuracy Scores):\n")
# Sort models by accuracy in descending order
sorted_performance = sorted(model_performance.items(), key=lambda item: item[1], reverse=True)

for model, acc in sorted_performance:
    print(f"{model:<20}: {acc:.4f}")

Final Model Performance (Accuracy Scores):

SVM                 : 0.9500
Logistic Regression : 0.9491
Neural Network      : 0.9481
Gradient Boosting   : 0.9453
KNN                 : 0.8858
K-Means             : 0.5896
