Spam Detection

In [1]:
import numpy as np 
import pandas as pd 
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split , cross_val_score , cross_validate 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score , classification_report , make_scorer
from sklearn.pipeline import Pipeline

Data Preprocessing

In [2]:
# Load the raw e-mail dataset from the CSV file sitting next to the notebook.
data = pd.read_csv(filepath_or_buffer="spam_ham_dataset.csv")

In [3]:
# Collapse Windows-style line breaks into single spaces so that each message
# becomes one continuous line of text.
data['text'] = data['text'].map(lambda message: message.replace('\r\n', ' '))

In [4]:
# Features are the message bodies; the target is the numeric label column.
X, y = data['text'], data['label_num']


In [5]:
# Build `corpus`: one cleaned string per message in X, in the same order.
# Cleaning steps: lower-case -> delete punctuation -> split on whitespace ->
# drop English stop words -> Porter-stem -> re-join with single spaces.
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
# It is the same for every message, so it is built once up front.
punctuation_remover = str.maketrans('', '', string.punctuation)

# NOTE(review): punctuation is removed before the stop-word check, so any
# stop word that itself contains an apostrophe can presumably never match
# here - confirm whether that is intended.
corpus = []
for message in X:
    tokens = message.lower().translate(punctuation_remover).split()
    stems = [stemmer.stem(token) for token in tokens if token not in stopwords_set]
    corpus.append(' '.join(stems))


In [6]:
# Disabled: manual 80/20 hold-out split with a CountVectorizer fitted on the
# training part only. Nothing in this cell runs.
#X_train  , X_test, y_train , y_test = train_test_split(corpus, y , test_size = 0.2 , random_state =42)
#vectorizer = CountVectorizer()
#X_train= vectorizer.fit_transform(X_train)
#X_test = vectorizer.transform(X_test)


Models compared: KNN, SVM (linear and RBF kernels), Logistic Regression, Decision Tree

SVM LINEAR

In [7]:
# Linear-kernel SVM on bag-of-words counts.
# The vectorizer is a pipeline step, so it is fitted together with the
# classifier every time the pipeline is fitted during cross-validation.
svm = SVC(kernel='linear')
pip_svm = Pipeline(steps=[('vectorizer', CountVectorizer()), ('svm', svm)])

# Accuracy uses the built-in scorer name; precision, recall and F1 are
# macro-averaged across the classes.
scoring = {'accuracy': 'accuracy'}
scoring.update(
    precision=make_scorer(precision_score, average='macro'),
    recall=make_scorer(recall_score, average='macro'),
    f1=make_scorer(f1_score, average='macro'),
)

# 5-fold cross-validation over the cleaned corpus.
results = cross_validate(estimator=pip_svm, X=corpus, y=y, scoring=scoring, cv=5)

# Report the mean of each metric over the folds.
print(f"Accuracy: {results['test_accuracy'].mean():.3f}")
print(f"Precision: {results['test_precision'].mean():.3f}")
print(f"Recall: {results['test_recall'].mean():.3f}")
print(f"F1-score: {results['test_f1'].mean():.3f}")

Accuracy: 0.966
Precision: 0.959
Recall: 0.959
F1-score: 0.959


SVM RBF

In [8]:
# RBF-kernel SVM on the same bag-of-words features.
# Note: this rebinds the notebook-level name `svm` to the RBF estimator.
svm = SVC(kernel='rbf')
rbf_steps = [('vectorizer', CountVectorizer()), ('svm', svm)]
pip_svm_rbf = Pipeline(rbf_steps)

# Accuracy by scorer name, plus macro-averaged precision / recall / F1.
macro_metrics = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}
scoring = {'accuracy': 'accuracy'}
scoring.update({
    metric_name: make_scorer(metric_fn, average='macro')
    for metric_name, metric_fn in macro_metrics.items()
})

# 5-fold cross-validation over the cleaned corpus.
results = cross_validate(estimator=pip_svm_rbf, X=corpus, y=y, scoring=scoring, cv=5)

# Report the mean of each metric over the folds.
print(f"Accuracy: {results['test_accuracy'].mean():.3f}")
print(f"Precision: {results['test_precision'].mean():.3f}")
print(f"Recall: {results['test_recall'].mean():.3f}")
print(f"F1-score: {results['test_f1'].mean():.3f}")

Accuracy: 0.964
Precision: 0.948
Recall: 0.968
F1-score: 0.957


Logistic Regression 

In [9]:
# Logistic regression on bag-of-words counts, with the solver's iteration
# limit set to 1000.
log = LogisticRegression(max_iter=1000)
pipline_reg = Pipeline(steps=[('vectorizer', CountVectorizer()), ('log', log)])

# Accuracy by scorer name, plus macro-averaged precision / recall / F1.
scoring = {'accuracy': 'accuracy'}
for metric_name, metric_fn in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    scoring[metric_name] = make_scorer(metric_fn, average='macro')

# 5-fold cross-validation over the cleaned corpus.
results = cross_validate(estimator=pipline_reg, X=corpus, y=y, scoring=scoring, cv=5)

# Report the mean of each metric over the folds.
print(f"Accuracy: {results['test_accuracy'].mean():.3f}")
print(f"Precision: {results['test_precision'].mean():.3f}")
print(f"Recall: {results['test_recall'].mean():.3f}")
print(f"F1-score: {results['test_f1'].mean():.3f}")

Accuracy: 0.979
Precision: 0.971
Recall: 0.979
F1-score: 0.975


KNN

In [10]:
# k-nearest-neighbours classifier (k = 5) on bag-of-words counts.
knn = KNeighborsClassifier(n_neighbors=5)
knn_steps = [('vectorizer', CountVectorizer()), ('knn', knn)]
pipline_knn = Pipeline(steps=knn_steps)

# Accuracy by scorer name, plus macro-averaged precision / recall / F1.
scoring = dict(
    accuracy='accuracy',
    precision=make_scorer(precision_score, average='macro'),
    recall=make_scorer(recall_score, average='macro'),
    f1=make_scorer(f1_score, average='macro'),
)

# 5-fold cross-validation over the cleaned corpus.
results = cross_validate(estimator=pipline_knn, X=corpus, y=y, scoring=scoring, cv=5)

# Report the mean of each metric over the folds.
print(f"Accuracy: {results['test_accuracy'].mean():.3f}")
print(f"Precision: {results['test_precision'].mean():.3f}")
print(f"Recall: {results['test_recall'].mean():.3f}")
print(f"F1-score: {results['test_f1'].mean():.3f}")

Accuracy: 0.825
Precision: 0.804
Recall: 0.863
F1-score: 0.811


Decision Tree

In [11]:
# Decision tree (Gini criterion, depth capped at 3, fixed random seed) on
# bag-of-words counts.
clf = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=42)
tree_steps = [('vectorizer', CountVectorizer()), ('klf', clf)]
pipline_clf = Pipeline(steps=tree_steps)

# Accuracy by scorer name, plus macro-averaged precision / recall / F1.
metric_names = ('precision', 'recall', 'f1')
metric_funcs = (precision_score, recall_score, f1_score)
scoring = {
    'accuracy': 'accuracy',
    **{
        metric_name: make_scorer(metric_fn, average='macro')
        for metric_name, metric_fn in zip(metric_names, metric_funcs)
    },
}

# 5-fold cross-validation over the cleaned corpus.
results = cross_validate(estimator=pipline_clf, X=corpus, y=y, scoring=scoring, cv=5)

# Report the mean of each metric over the folds.
print(f"Accuracy: {results['test_accuracy'].mean():.3f}")
print(f"Precision: {results['test_precision'].mean():.3f}")
print(f"Recall: {results['test_recall'].mean():.3f}")
print(f"F1-score: {results['test_f1'].mean():.3f}")


Accuracy: 0.790
Precision: 0.833
Recall: 0.650
F1-score: 0.667
