In [7]:
import pandas as pd

# Load the dataset with proper encoding
try:
    data = pd.read_csv('spam.csv', encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv('spam.csv', encoding='latin1')

# Inspect the first few rows of the dataset
print(data.head())
print(data.columns)


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [8]:
# Rename the first two columns appropriately
data = data.iloc[:, :2]
data.columns = ['label', 'message']

# Map labels to binary values
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Separate features and target
X = data['message']
y = data['label']


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Use TF-IDF to convert text data into numerical features
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Initialize the models
naive_bayes = MultinomialNB()
log_reg = LogisticRegression(max_iter=1000)
svc = SVC()

# Train the models
naive_bayes.fit(X_train, y_train)
log_reg.fit(X_train, y_train)
svc.fit(X_train, y_train)

# Predict and evaluate Naive Bayes
y_pred_nb = naive_bayes.predict(X_test)
print("Naive Bayes")
print(classification_report(y_test, y_pred_nb))
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("ROC AUC:", roc_auc_score(y_test, y_pred_nb))

# Predict and evaluate Logistic Regression
y_pred_log_reg = log_reg.predict(X_test)
print("\nLogistic Regression")
print(classification_report(y_test, y_pred_log_reg))
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("ROC AUC:", roc_auc_score(y_test, y_pred_log_reg))

# Predict and evaluate SVM
y_pred_svc = svc.predict(X_test)
print("\nSVM")
print(classification_report(y_test, y_pred_svc))
print("Accuracy:", accuracy_score(y_test, y_pred_svc))
print("ROC AUC:", roc_auc_score(y_test, y_pred_svc))


Naive Bayes
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy: 0.968609865470852
ROC AUC: 0.8833333333333333

Logistic Regression
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       0.97      0.61      0.75       150

    accuracy                           0.94      1115
   macro avg       0.96      0.80      0.86      1115
weighted avg       0.95      0.94      0.94      1115

Accuracy: 0.9443946188340807
ROC AUC: 0.8017789291882556

SVM
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.99      0.80      0.89       150

    accuracy                   