In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import re

# Load the dataset
df = pd.read_csv('dataset/spam_detection_dataset.csv', encoding='latin1')

# Replace missing comments with the most frequent comment so the text-cleaning
# step always receives a string. The filled Series is assigned back to the
# column instead of using inplace=True: fillna returns None in that mode, so
# binding `df` to its result would throw the whole frame away.
# NOTE(review): 'comment' is assumed to be the column that needs filling because
# it is the text column the later cells read — confirm against the CSV.
df['comment'] = df['comment'].fillna(df['comment'].mode()[0])

In [23]:
# Define cleaning functions
def remove_urls(text):
    """Return `text` with URL-like tokens deleted.

    A token is removed when it starts with ``http``, ``https`` or ``www`` and
    runs up to the next whitespace character.
    """
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    return url_pattern.sub('', text)

def remove_non_ascii(text):
    """Return `text` keeping only characters whose code point is below 128."""
    ascii_chars = filter(lambda ch: ord(ch) < 128, text)
    return ''.join(ascii_chars)

def remove_digits(text):
    """Return `text` with every run of decimal digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_special_characters(text):
    """Return `text` without punctuation and other symbol characters.

    Only word characters (letters, digits and the underscore) and whitespace
    are kept; everything else is deleted.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def normalize_case(text):
    """Return a lowercase copy of `text`."""
    lowered = text.lower()
    return lowered

def clean_text(text):
    """Run the complete cleaning sequence on a single piece of text.

    The steps run in a fixed order: URL removal, non-ASCII removal, digit
    removal, special-character removal, lowercasing, and finally collapsing
    all whitespace runs into single spaces.
    """
    cleaning_steps = (
        remove_urls,
        remove_non_ascii,
        remove_digits,
        remove_special_characters,
        normalize_case,
    )
    for step in cleaning_steps:
        text = step(text)

    # split() with no arguments discards leading/trailing whitespace and treats
    # any whitespace run as one separator, so re-joining yields single spaces.
    words = text.split()
    return ' '.join(words)

# Clean every entry of the 'comment' column, overwriting the raw text
df['comment'] = df['comment'].apply(clean_text)

# Features are the cleaned comments; the target is the 'spam' column
# (assumed to hold binary labels: 1 for spam, 0 for non-spam).
X, y = df['comment'], df['spam']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Define a function to train, evaluate models, and print the classification report
def train_and_evaluate_model(pipeline, model_name):
    """Fit `pipeline` on the training split and print its test-set metrics.

    The data is not passed in: X_train, y_train, X_test and y_test are read
    from the notebook's global scope, so the split cell must have run first.

    Args:
        pipeline: object exposing fit(X, y) and predict(X).
        model_name: label used as the prefix of the printed lines.

    Returns:
        None. The accuracy, the classification report and a separator line
        are written to stdout.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    # Overall accuracy on the held-out set
    accuracy = accuracy_score(y_test, predictions)
    print(f'{model_name} Accuracy: {accuracy}')

    # Per-class precision, recall and F1-score.
    # NOTE(review): the display names assume the labels sort as
    # not spam first, spam second — confirm against the 'spam' column.
    print(f'{model_name} Classification Report:\n')
    report = classification_report(
        y_test,
        predictions,
        target_names=['not spam', 'spam'],
    )
    print(report)
    print('-' * 60)

In [25]:
# 1. SVM Model
# TF-IDF features (English stop words dropped) feeding a linear-kernel SVM.
pipeline_svm = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('svc', SVC(kernel='linear')),
    ]
)
train_and_evaluate_model(pipeline_svm, 'SVM')

SVM Accuracy: 0.9164078674948241
SVM Classification Report:

              precision    recall  f1-score   support

    not spam       0.92      0.96      0.94      2543
        spam       0.91      0.84      0.87      1321

    accuracy                           0.92      3864
   macro avg       0.91      0.90      0.91      3864
weighted avg       0.92      0.92      0.92      3864

------------------------------------------------------------


In [26]:
# 2. Logistic Regression Model
# TF-IDF features (English stop words dropped) feeding logistic regression
# with C=0.5 and an iteration cap of 1000.
pipeline_lr = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('lr', LogisticRegression(max_iter=1000, C=0.5)),
    ]
)
train_and_evaluate_model(pipeline_lr, 'Logistic Regression')

Logistic Regression Accuracy: 0.8874223602484472
Logistic Regression Classification Report:

              precision    recall  f1-score   support

    not spam       0.87      0.97      0.92      2543
        spam       0.92      0.73      0.82      1321

    accuracy                           0.89      3864
   macro avg       0.90      0.85      0.87      3864
weighted avg       0.89      0.89      0.88      3864

------------------------------------------------------------


In [27]:
# 3. Random Forest Model
# TF-IDF features (English stop words dropped) feeding a forest of 100 trees,
# each limited to depth 10.
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    # random_state pins the forest's bootstrap sampling and feature selection,
    # so the printed metrics are the same on every Restart & Run All; 42 matches
    # the seed used for the train/test split.
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
])
train_and_evaluate_model(pipeline_rf, 'Random Forest')


Random Forest Accuracy: 0.7204968944099379
Random Forest Classification Report:

              precision    recall  f1-score   support

    not spam       0.70      0.99      0.82      2543
        spam       0.93      0.20      0.33      1321

    accuracy                           0.72      3864
   macro avg       0.82      0.59      0.57      3864
weighted avg       0.78      0.72      0.65      3864

------------------------------------------------------------


In [28]:
# 4. Naive Bayes Model
# TF-IDF features (English stop words dropped) feeding multinomial Naive Bayes
# with its default settings.
pipeline_nb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nb', MultinomialNB()),
    ]
)
train_and_evaluate_model(pipeline_nb, 'Naive Bayes')

Naive Bayes Accuracy: 0.900103519668737
Naive Bayes Classification Report:

              precision    recall  f1-score   support

    not spam       0.90      0.96      0.93      2543
        spam       0.90      0.79      0.84      1321

    accuracy                           0.90      3864
   macro avg       0.90      0.87      0.89      3864
weighted avg       0.90      0.90      0.90      3864

------------------------------------------------------------
