# Libraries

In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC



# Data

In [4]:
# Load the raw dataset; later cells read its 'Category' and 'Message' columns.
# NOTE(review): the path is relative to the working directory, and the saved
# output of this cell is a FileNotFoundError — confirm 'spam.csv' sits next to
# the notebook before running.
data=pd.read_csv('spam.csv')
data

FileNotFoundError: [Errno 2] No such file or directory: 'spam.csv'

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Encode the target as an integer flag: 1 where Category == 'spam', 0 for
# every other value. A vectorized comparison replaces the per-row lambda.
data['Spam'] = (data['Category'] == 'spam').astype('int64')
data.head(5)

In [None]:
# Hold out 25% of the messages for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['Spam'],
    test_size=0.25,
    random_state=42,
)


## Handling Class Imbalance

In [None]:
# Undersample the majority class. Only the training split is resampled; the
# test split is left untouched.
# fit_resample is given a one-column frame rather than the Series — presumably
# because the sampler expects 2-D input (TODO confirm) — and the 'Message'
# column is pulled back out afterwards.
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train.to_frame(), y_train)
X_train_res = X_train_res['Message']

# Oversample the minority class, again on the training split only and with the
# same frame -> 'Message' column round-trip.
ros = RandomOverSampler(random_state=42)
X_train_overs, y_train_overs = ros.fit_resample(X_train.to_frame(), y_train)
X_train_overs = X_train_overs['Message']

# Training The Model


In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        The estimator (or pipeline) to train; it is fitted in place.
    X_train, y_train
        Training inputs and labels passed to ``model.fit``.
    X_test, y_test
        Held-out inputs and labels used for scoring.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, training_time)``.
        ``training_time`` is the duration of the ``fit`` call in seconds;
        prediction time is not included.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the system clock and can jump if that
    # clock is adjusted mid-fit.
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_started

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f1, training_time

## Original Data


### Naive Bayes Model



In [None]:
# Token-count features feeding a multinomial Naive Bayes classifier.
clf_nb = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

### Random Forest Model

In [None]:
# Token-count features feeding a 100-tree random forest (seeded for repeatability).
clf_rf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

### SVM Model


In [None]:
# Token-count features feeding a linear-kernel support vector classifier.
clf_svm = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('svm', SVC(kernel='linear', random_state=42)),
])

### kNN Model


In [None]:
# Token-count features feeding a 5-nearest-neighbours classifier.
clf_knn = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])


### Evaluate Models

In [None]:
# Fit every pipeline on the unmodified training split and score it on the test split.
nb_metrics = evaluate_model(clf_nb, X_train, y_train, X_test, y_test)
rf_metrics = evaluate_model(clf_rf, X_train, y_train, X_test, y_test)
svm_metrics = evaluate_model(clf_svm, X_train, y_train, X_test, y_test)
knn_metrics = evaluate_model(clf_knn, X_train, y_train, X_test, y_test)

print("Original Data")
# One report line per model, in the same order the models were evaluated.
for report_line, scores in (
    ("Naive Bayes - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", nb_metrics),
    ("Random Forest - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", rf_metrics),
    ("SVM - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", svm_metrics),
    ("kNN - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", knn_metrics),
):
    print(report_line.format(*scores))

Naive Bayes - Accuracy: 0.9849, Precision: 0.9503, Recall: 0.9348, F1 Score: 0.9425, Training Time: 0.2315 seconds
Random Forest - Accuracy: 0.9835, Precision: 1.0000, Recall: 0.8750, F1 Score: 0.9333, Training Time: 2.6781 seconds
SVM - Accuracy: 0.9892, Precision: 0.9884, Recall: 0.9293, F1 Score: 0.9580, Training Time: 0.9108 seconds
kNN - Accuracy: 0.9196, Precision: 1.0000, Recall: 0.3913, F1 Score: 0.5625, Training Time: 0.2379 seconds


## Undersampled Data


### Naive Bayes Model


In [None]:
# Naive Bayes pipeline reserved for the undersampled training set.
clf_nb_res = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

### Random Forest Model


In [None]:
# Random forest pipeline reserved for the undersampled training set.
clf_rf_res = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

### SVM Model

In [None]:
# Linear SVM pipeline reserved for the undersampled training set.
clf_svm_res = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('svm', SVC(kernel='linear', random_state=42)),
])

### kNN Model

In [None]:
# kNN pipeline reserved for the undersampled training set.
clf_knn_res = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

### Evaluate Models


In [None]:
# Fit every pipeline on the undersampled training set; scoring still uses the original test split.
nb_metrics_res = evaluate_model(clf_nb_res, X_train_res, y_train_res, X_test, y_test)
rf_metrics_res = evaluate_model(clf_rf_res, X_train_res, y_train_res, X_test, y_test)
svm_metrics_res = evaluate_model(clf_svm_res, X_train_res, y_train_res, X_test, y_test)
knn_metrics_res = evaluate_model(clf_knn_res, X_train_res, y_train_res, X_test, y_test)

print("\nUndersampled Data")
# One report line per model, in the same order the models were evaluated.
for report_line, scores in (
    ("Naive Bayes - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", nb_metrics_res),
    ("Random Forest - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", rf_metrics_res),
    ("SVM - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", svm_metrics_res),
    ("kNN - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", knn_metrics_res),
):
    print(report_line.format(*scores))

## Oversampled Data

### Naive Bayes Model

In [None]:
# Naive Bayes pipeline reserved for the oversampled training set.
clf_nb_overs = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

### Random Forest Model

In [None]:
# Random forest pipeline reserved for the oversampled training set.
clf_rf_overs = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

### SVM Model

In [None]:
# Linear SVM pipeline reserved for the oversampled training set.
clf_svm_overs = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('svm', SVC(kernel='linear', random_state=42)),
])

### kNN Model

In [None]:
# kNN pipeline reserved for the oversampled training set.
clf_knn_overs = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

### Evaluate Models

In [None]:
# Fit every pipeline on the oversampled training set; scoring still uses the original test split.
nb_metrics_overs = evaluate_model(clf_nb_overs, X_train_overs, y_train_overs, X_test, y_test)
rf_metrics_overs = evaluate_model(clf_rf_overs, X_train_overs, y_train_overs, X_test, y_test)
svm_metrics_overs = evaluate_model(clf_svm_overs, X_train_overs, y_train_overs, X_test, y_test)
knn_metrics_overs = evaluate_model(clf_knn_overs, X_train_overs, y_train_overs, X_test, y_test)

print("\nOversampled Data")
# One report line per model, in the same order the models were evaluated.
for report_line, scores in (
    ("Naive Bayes - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", nb_metrics_overs),
    ("Random Forest - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", rf_metrics_overs),
    ("SVM - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", svm_metrics_overs),
    ("kNN - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}, Training Time: {:.4f} seconds", knn_metrics_overs),
):
    print(report_line.format(*scores))

# Graphic Analysis

In [None]:
# Compare the class counts of the three training label sets side by side.
fig, ax = plt.subplots(1, 3, figsize=(18, 6))

# (labels, panel title) for each subplot, left to right.
panels = (
    (y_train, 'Original Data'),
    (y_train_res, 'Undersampled Data'),
    (y_train_overs, 'Oversampled Data'),
)
for axis, (labels, title) in zip(ax, panels):
    sns.countplot(x=labels, ax=axis)
    axis.set_title(title)
    axis.set_xlabel('Class')
    axis.set_ylabel('Count')

plt.tight_layout()
plt.show()

# Examples


In [None]:
# Hand-written sample messages used to spot-check a fitted pipeline.
# These are raw strings; vectorization is left to the pipeline they are passed to.
emails=[
    'Sounds great! Are you home now?',
    'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES',
    'Hello, how r u??',
    'You are the lucky winner of a $500.00 gift card to spend at any online store. To claim your prize, simply follow these easy steps'
]

**Predict Email**

In [None]:
# Classify the sample messages with the SVM pipeline fitted on the original
# training split. Labels follow the 'Spam' column encoding: 1 = spam, 0 = not spam.
# NOTE(review): the saved output lists five predictions but `emails` holds four
# messages — the output is stale; re-run this cell to refresh it.
clf_svm.predict(emails)

array([0, 1, 0, 0, 1])