Importing the Dependencies


In [1]:
import pandas as pd

Loading the Dataset

In [2]:
# Read the spam CSV into a DataFrame. The path is absolute (a Colab-style
# /content location), so the file must already exist there.
data = pd.read_csv(filepath_or_buffer='/content/spam.csv')

Displaying the First Few Rows

In [3]:
# Preview the top of the frame (5 rows is the pandas default for head()).
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Checking for Missing Values

In [4]:
# Count missing entries per column; isna() is the canonical name for isnull().
data.isna().sum()

Unnamed: 0,0
Category,0
Message,0


Importing the Dependencies

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

Preprocessing the Data

In [6]:
# Convert labels to binary (spam: 1, ham: 0).
# A plain .map(dict) turns every value that is not a key into NaN, so running
# this cell a second time would wipe out the already-encoded column. Looking
# the label up with a fallback leaves 0/1 untouched and keeps the cell
# safe to re-run.
label_map = {'spam': 1, 'ham': 0}
data['Category'] = data['Category'].map(lambda label: label_map.get(label, label))

# Fail early with a clear message instead of letting an unknown or missing
# label surface later as an obscure error inside model.fit().
if not data['Category'].isin([0, 1]).all():
    raise ValueError("Column 'Category' must contain only 'spam'/'ham' (or 1/0) labels")

y = data['Category']

# Split the raw messages into training and testing sets BEFORE vectorizing.
# The split depends only on the number of rows and random_state, so the same
# rows land in each half as when an already-vectorized matrix is split.
messages_train, messages_test, y_train, y_test = train_test_split(
    data['Message'], y, test_size=0.5, random_state=42
)

# Convert text to feature vectors.
# The vocabulary is learned from the training messages only; fitting on the
# full corpus would leak information about the test set into the features.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(messages_train)
X_test = vectorizer.transform(messages_test)

# Full-corpus matrix in the training vocabulary, kept so that any other cell
# referring to X continues to work.
X = vectorizer.transform(data['Message'])

Importing the Dependencies

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


Initializing the Models

In [8]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Random Forest and AdaBoost are stochastic: without a fixed random_state their
# scores change from run to run, so they are seeded (42, the same seed used
# for the train/test split) to make the reported metrics reproducible.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier()
}


Training and Evaluating the Models

In [9]:
# Fit every candidate on the training split, then report its accuracy and the
# per-class precision/recall/F1 on the held-out test split.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.9834888729361091
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2416
           1       0.98      0.89      0.94       370

    accuracy                           0.98      2786
   macro avg       0.98      0.95      0.96      2786
weighted avg       0.98      0.98      0.98      2786

Random Forest Accuracy: 0.9737975592246949
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2416
           1       0.99      0.81      0.89       370

    accuracy                           0.97      2786
   macro avg       0.98      0.90      0.94      2786
weighted avg       0.97      0.97      0.97      2786

AdaBoost Accuracy: 0.9662598707824839
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2416
           1       0.90      0.84      0.87       370

    accuracy                           0.97      278