In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Tensorflow Libraries
from tensorflow import keras
from tensorflow.keras import layers,models
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import Callback, EarlyStopping,ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import Model


# System libraries
from pathlib import Path
import os.path
import random
import matplotlib.cm as cm
import cv2

# Metrics
from sklearn.metrics import classification_report, confusion_matrix
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [16]:
df = pd.read_csv(r"C:\Users\chira\Downloads\spam.csv", encoding='latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [17]:
df=df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.columns = ['label', 'message']
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [None]:
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Text cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    stop_words = set(stopwords.words('english'))
    text = " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Apply cleaning
df['cleaned_text'] = df['message'].apply(clean_text)

In [39]:
# Function to train models
def train_models(X, y, title):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    models = {
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    }
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"{title} - {name}: {acc:.4f}")

In [42]:
# Feature extraction
vectorizers = {
    "BoW": CountVectorizer(),
    "TF-IDF": TfidfVectorizer()
}

y = df['label']

for vec_name, vectorizer in vectorizers.items():
    print(f"\nTraining with {vec_name}")
    X_raw = vectorizer.fit_transform(df['message'])
    X_clean = vectorizer.fit_transform(df['cleaned_text'])
    
    train_models(X_raw, y, f"{vec_name} (Raw Text)")
    train_models(X_clean, y, f"{vec_name} (Cleaned Text)")

# Ensemble (Voting Classifier)
voting_clf = VotingClassifier(estimators=[
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier( eval_metric='logloss'))
], voting='hard')

X_final = vectorizers['TF-IDF'].fit_transform(df['cleaned_text'])
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
print(f"\nEnsemble Model Accuracy: {accuracy_score(y_test, y_pred):.4f}")



Training with BoW
BoW (Raw Text) - Naive Bayes: 0.9803
BoW (Raw Text) - Random Forest: 0.9740


Parameters: { "use_label_encoder" } are not used.



BoW (Raw Text) - XGBoost: 0.9794
BoW (Cleaned Text) - Naive Bayes: 0.9704
BoW (Cleaned Text) - Random Forest: 0.9731


Parameters: { "use_label_encoder" } are not used.



BoW (Cleaned Text) - XGBoost: 0.9776

Training with TF-IDF
TF-IDF (Raw Text) - Naive Bayes: 0.9623
TF-IDF (Raw Text) - Random Forest: 0.9749


Parameters: { "use_label_encoder" } are not used.



TF-IDF (Raw Text) - XGBoost: 0.9803
TF-IDF (Cleaned Text) - Naive Bayes: 0.9686
TF-IDF (Cleaned Text) - Random Forest: 0.9713


Parameters: { "use_label_encoder" } are not used.



TF-IDF (Cleaned Text) - XGBoost: 0.9803

Ensemble Model Accuracy: 0.9731
