In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("spam.csv", encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
print(f"The Shape of the is: {data.shape}\n")

data.info()

The Shape of the is: (5572, 5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
data["Target"] = data["v1"].apply(lambda x: 1 if x == "spam" else 0)
data.drop("v1", axis=1, inplace=True)
data.head(10)

Unnamed: 0,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,Target
0,"Go until jurong point, crazy.. Available only ...",,,,0
1,Ok lar... Joking wif u oni...,,,,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,U dun say so early hor... U c already then say...,,,,0
4,"Nah I don't think he goes to usf, he lives aro...",,,,0
5,FreeMsg Hey there darling it's been 3 week's n...,,,,1
6,Even my brother is not like to speak with me. ...,,,,0
7,As per your request 'Melle Melle (Oru Minnamin...,,,,0
8,WINNER!! As a valued network customer you have...,,,,1
9,Had your mobile 11 months or more? U R entitle...,,,,1


In [5]:
data.isna().sum()
# data.describe().T

v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
Target           0
dtype: int64

In [6]:
data.drop_duplicates(inplace=True)

In [7]:
# code for getting the unique values in dataset
threshold = 0.9
data = data.loc[:, data.isnull().mean() < threshold]
data = data.dropna()

In [8]:
data.head()

Unnamed: 0,v2,Target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
target = data["Target"]

In [10]:
params = {
    'n_estimators': [50,10,150,200],
    'criterion': ['gini','entropy'],
    'max_depth': [5,10,15,20],
    'min_samples_split': [5,10,15,20],
    'min_samples_leaf': [2,6,10,14],
    'class_weight': ['balanced'],
    'max_features': ['sqrt'],
}

accuracy_scores = []

tfidf = TfidfVectorizer(strip_accents='unicode', lowercase=True, stop_words='english')
vectorized = tfidf.fit_transform(data["v2"])

train_x, test_x, train_y, test_y = train_test_split(vectorized, target, test_size=0.2, random_state=42, shuffle=True)



# Random Forest Classifier

In [11]:
random_forest = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(random_forest, param_distributions=params, n_iter=5, cv=5, n_jobs=-1, verbose=1, random_state=42)

random_search.fit(train_x, train_y)
best_rf= random_search.best_estimator_
print(f"The Best Parameters are: {best_rf}\n")

y_pred = best_rf.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)
print(f"The Accuracy of the model is: {accuracy}\n")
accuracy_scores.append(accuracy)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
The Best Parameters are: RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=10, min_samples_split=10,
                       n_estimators=150, random_state=42)

The Accuracy of the model is: 0.9729206963249516



# LGBM Classifier

In [12]:
params_lgbm = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [5, 10, 15, 20],
    'num_leaves': [31, 40, 50, 60],
    'min_child_samples': [10, 20, 30, 40],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'class_weight': ['balanced']
}


lgbm = LGBMClassifier(random_state=42)
random_search_lgbm = RandomizedSearchCV(estimator=lgbm, param_distributions=params_lgbm, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

random_search_lgbm.fit(train_x, train_y)
best_lgbm = random_search_lgbm.best_estimator_
print(f"The Best Parameters are: {best_lgbm}\n")

y_pred = best_lgbm.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)
print(f"The Accuracy of the model is: {accuracy}\n")
accuracy_scores.append(accuracy)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Number of positive: 508, number of negative: 3627
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7086
[LightGBM] [Info] Number of data points in the train set: 4135, number of used features: 634
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
The Best Parameters are: LGBMClassifier(class_weight='balanced', colsample_bytree=0.6, learning_rate=0.2,
               max_depth=20, min_child_samples=10, n_estimators=150,
               random_state=42)

The Accuracy of the model is: 0.9796905222437138



# SVC

In [13]:
params_svc = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced']
}

svc = SVC(random_state=42)
random_search_svc = RandomizedSearchCV(estimator=svc, param_distributions=params_svc, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

random_search_svc.fit(train_x, train_y)
best_svc = random_search_svc.best_estimator_
print(f"The Best Parameters are: {best_svc}\n")

y_pred = best_svc.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)
print(f"The Accuracy of the model is: {accuracy}\n")
accuracy_scores.append(accuracy)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
The Best Parameters are: SVC(C=1, class_weight='balanced', degree=2, kernel='sigmoid', random_state=42)

The Accuracy of the model is: 0.9787234042553191



# Logistic Classifier

In [14]:
params_logreg = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300],
    'class_weight': ['balanced']
}

logreg = LogisticRegression(random_state=42)
random_search_logreg = RandomizedSearchCV(estimator=logreg, param_distributions=params_logreg, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

random_search_logreg.fit(train_x, train_y)
best_logreg = random_search_logreg.best_estimator_
print(f"The Best Parameters are: {best_svc}\n")

y_pred = best_logreg.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)
print(f"The Accuracy of the model is: {accuracy}\n")
accuracy_scores.append(accuracy)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The Best Parameters are: SVC(C=1, class_weight='balanced', degree=2, kernel='sigmoid', random_state=42)

The Accuracy of the model is: 0.9796905222437138



In [19]:
models = [RandomForestClassifier(), LGBMClassifier(), SVC(), LogisticRegression()]

max_accuracy_index = np.argmax(accuracy_scores)

print(f"The Best Model is: {models[max_accuracy_index]} with accuracy of {max_accuracy_index * 100}% \n")

The Best Model is: LGBMClassifier() with accuracy of 100% 

