In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("spam.csv", encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
print(f"The Shape of the is: {data.shape}\n")

data.info()

The Shape of the is: (5572, 5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
data["Target"] = data["v1"].apply(lambda x: 1 if x == "spam" else 0)
data.drop("v1", axis=1, inplace=True)
data.head(10)

Unnamed: 0,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,Target
0,"Go until jurong point, crazy.. Available only ...",,,,0
1,Ok lar... Joking wif u oni...,,,,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,U dun say so early hor... U c already then say...,,,,0
4,"Nah I don't think he goes to usf, he lives aro...",,,,0
5,FreeMsg Hey there darling it's been 3 week's n...,,,,1
6,Even my brother is not like to speak with me. ...,,,,0
7,As per your request 'Melle Melle (Oru Minnamin...,,,,0
8,WINNER!! As a valued network customer you have...,,,,1
9,Had your mobile 11 months or more? U R entitle...,,,,1


In [5]:
data.isna().sum()
# data.describe().T

v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
Target           0
dtype: int64

In [6]:
data.drop_duplicates(inplace=True)

In [7]:
# code for getting the unique values in dataset
threshold = 0.9
data = data.loc[:, data.isnull().mean() < threshold]
data = data.dropna()

In [8]:
data.head()

Unnamed: 0,v2,Target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
target = data["Target"]

## Using TFIDF Vectors

In [10]:
accuracy_scores = []

tfidf = TfidfVectorizer(strip_accents='unicode', lowercase=True, stop_words='english')
vectorized = tfidf.fit_transform(data["v2"])

train_x, test_x, train_y, test_y = train_test_split(vectorized, target, test_size=0.2, random_state=42, shuffle=True)

## K-Nearest Neighbour


In [22]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 30, 50]
}

knn = KNeighborsClassifier()

random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_grid, n_iter=10, cv=5,  n_jobs=-1,  verbose=1,random_state=42)
random_search.fit(train_x, train_y)

best_knn = random_search.best_estimator_

y_pred = best_knn.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)

print(f"The Accuracy of the model is: {accuracy}\n")

print("Correct and Wrong Predictions for the first 10 rows:")

print("")


for i in range(min(20, len(test_x.toarray()))):
    correct = test_y.iloc[i]
    prediction = y_pred[i]
    if correct == prediction:
        print(f"Row {i}: Correctly predicted as {prediction}")
    else:
        print(f"Row {i}: Incorrectly predicted as {prediction}, Actual: {correct}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
The Accuracy of the model is: 0.9177949709864603

Correct and Wrong Predictions for the first 10 rows:

Row 0: Correctly predicted as 0
Row 1: Correctly predicted as 0
Row 2: Correctly predicted as 0
Row 3: Correctly predicted as 0
Row 4: Correctly predicted as 0
Row 5: Correctly predicted as 1
Row 6: Correctly predicted as 0
Row 7: Incorrectly predicted as 0, Actual: 1
Row 8: Correctly predicted as 0
Row 9: Correctly predicted as 0
Row 10: Correctly predicted as 0
Row 11: Correctly predicted as 0
Row 12: Incorrectly predicted as 0, Actual: 1
Row 13: Correctly predicted as 0
Row 14: Correctly predicted as 0
Row 15: Correctly predicted as 0
Row 16: Correctly predicted as 0
Row 17: Correctly predicted as 0
Row 18: Correctly predicted as 0
Row 19: Correctly predicted as 0


# Random Forest Classifier

In [11]:
from sklearn.metrics import recall_score
params = {
    'n_estimators': [50,10,150,200],
    'criterion': ['gini','entropy'],
    'max_depth': [5,10,15,20],
    'min_samples_split': [5,10,15,20],
    'min_samples_leaf': [2,6,10,14],
    'class_weight': ['balanced'],
    'max_features': ['sqrt'],
}


random_forest = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(random_forest, param_distributions=params, n_iter=5, cv=5, n_jobs=-1, verbose=1, random_state=42)

random_search.fit(train_x, train_y)
best_rf= random_search.best_estimator_
print(f"The Best Parameters are: {best_rf}\n")

y_pred = best_rf.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)
print(f"The Accuracy of the model is: {accuracy}\n")
accuracy_scores.append(accuracy)

recall = recall_score(test_y, y_pred)
print("recall score: ", recall)
confusion = confusion_matrix(test_y, y_pred)
print("confusion score: ", confusion)
presicion = precision_score(test_y, y_pred)
print("presicion score: ", presicion)
f1 = f1_score(test_y, y_pred)
print("f1 score: ", f1)
roc_auc = roc_auc_score(test_y, y_pred)
print("roc_auc score: ", roc_auc)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
The Best Parameters are: RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=10, min_samples_split=10,
                       n_estimators=150, random_state=42)

The Accuracy of the model is: 0.9729206963249516

recall score:  0.8620689655172413
confusion score:  [[881   8]
 [ 20 125]]
presicion score:  0.9398496240601504
f1 score:  0.8992805755395683
roc_auc score:  0.926535045188317


# LGBM Classifier

In [12]:
params_lgbm = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [5, 10, 15, 20],
    'num_leaves': [31, 40, 50, 60],
    'min_child_samples': [10, 20, 30, 40],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'class_weight': ['balanced']
}


lgbm = LGBMClassifier(random_state=42)
random_search_lgbm = RandomizedSearchCV(estimator=lgbm, param_distributions=params_lgbm, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

random_search_lgbm.fit(train_x, train_y)
best_lgbm = random_search_lgbm.best_estimator_
print(f"The Best Parameters are: {best_lgbm}\n")

y_pred = best_lgbm.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)
print(f"The Accuracy of the model is: {accuracy}\n")
accuracy_scores.append(accuracy)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Number of positive: 508, number of negative: 3627
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7086
[LightGBM] [Info] Number of data points in the train set: 4135, number of used features: 634
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
The Best Parameters are: LGBMClassifier(class_weight='balanced', colsample_bytree=0.6, learning_rate=0.2,
               max_depth=20, min_child_samples=10, n_estimators=150,
               random_state=42)

The Accuracy of the model is: 0.9796905222437138



# SVC

In [13]:
params_svc = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced']
}

svc = SVC(random_state=42)
random_search_svc = RandomizedSearchCV(estimator=svc, param_distributions=params_svc, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

random_search_svc.fit(train_x, train_y)
best_svc = random_search_svc.best_estimator_
print(f"The Best Parameters are: {best_svc}\n")

y_pred = best_svc.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)
print(f"The Accuracy of the model is: {accuracy}\n")
accuracy_scores.append(accuracy)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
The Best Parameters are: SVC(C=1, class_weight='balanced', degree=2, kernel='sigmoid', random_state=42)

The Accuracy of the model is: 0.9787234042553191



# Logistic Classifier

In [14]:
params_logreg = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300],
    'class_weight': ['balanced']
}

logreg = LogisticRegression(random_state=42)
random_search_logreg = RandomizedSearchCV(estimator=logreg, param_distributions=params_logreg, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

random_search_logreg.fit(train_x, train_y)
best_logreg = random_search_logreg.best_estimator_
print(f"The Best Parameters are: {best_svc}\n")

y_pred = best_logreg.predict(test_x)

accuracy = accuracy_score(test_y, y_pred)
print(f"The Accuracy of the model is: {accuracy}\n")
accuracy_scores.append(accuracy)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The Best Parameters are: SVC(C=1, class_weight='balanced', degree=2, kernel='sigmoid', random_state=42)

The Accuracy of the model is: 0.9796905222437138



In [19]:
models = [RandomForestClassifier(), LGBMClassifier(), SVC(), LogisticRegression()]

max_accuracy_index = np.argmax(accuracy_scores)

print(f"The Best Model is: {models[max_accuracy_index]} with accuracy of {max_accuracy_index * 100}% \n")

The Best Model is: LGBMClassifier() with accuracy of 100% 



## Using the Bag of Words (BOW)

We need to first split the data then apply the BOW or TFIDF as it prevetns the data lekage

In [1]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [4]:
messages = pd.read_csv("spam.csv", encoding='latin-1')
messages.rename(columns={'v2': 'sms'}, inplace=True)
messages.head()

Unnamed: 0,v1,sms,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
messages["Target"] = messages["v1"].apply(lambda x: 1 if x == "spam" else 0)
messages.drop("v1", axis=1, inplace=True)


threshold = 0.9
messages = messages.loc[:, messages.isnull().mean() < threshold]
messages = messages.dropna()
messages.head(10)

Unnamed: 0,sms,Target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [6]:
ss = SnowballStemmer('english')

In [7]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['sms'][i])  # remove the regular expressions 
    review = review.lower()
    review = review.split()
    review = [ss.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [8]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkts st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breath

In [9]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(corpus, messages['Target'], test_size=0.20, random_state=0)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, ngram_range=(1, 2))

In [11]:
train_x = cv.fit_transform(train_x).toarray()
test_x = cv.transform(test_x).toarray()

In [12]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: "%.3g" % x))
train_x

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [13]:
cv.vocabulary_   # returns the key value pair where key is the word and value is the index of the word in the array

{'kalli': 1063,
 'home': 956,
 'town': 2177,
 'escap': 634,
 'theatr': 2109,
 'go': 810,
 'watch': 2351,
 'minut': 1310,
 'go watch': 825,
 'walk': 2329,
 'mom': 1343,
 'right': 1755,
 'pass': 1535,
 'left': 1133,
 'hill': 938,
 'address': 20,
 'lt': 1217,
 'gt': 867,
 'lt gt': 1219,
 'dunno': 591,
 'close': 359,
 'oredi': 1510,
 'yo': 2490,
 'im': 1001,
 'work': 2436,
 'ur': 2246,
 'luck': 1223,
 'love': 1202,
 'someon': 1928,
 'one': 1489,
 'love someon': 1210,
 'also': 51,
 'know': 1086,
 'lunch': 1227,
 'menu': 1287,
 'da': 481,
 'oh': 1470,
 'yeah': 2479,
 'diet': 540,
 'nah': 1386,
 'straight': 2012,
 'bring': 217,
 'bud': 226,
 'drink': 580,
 'someth': 1930,
 'actual': 16,
 'littl': 1165,
 'use': 2276,
 'cash': 302,
 'choos': 342,
 'gr': 856,
 'tone': 2160,
 'wk': 2423,
 'servic': 1850,
 'week': 2367,
 'cost': 436,
 'credit': 455,
 'kick': 1074,
 'back': 135,
 'enjoy': 622,
 'fight': 694,
 'world': 2439,
 'easi': 598,
 'either': 608,
 'win': 2405,
 'lose': 1195,
 'bt': 222,
 'st

In [14]:
# from sklearn.model_selection import train_test_split
# train_x, test_x, train_y, test_y = train_test_split(x, messages['Target'], test_size=0.20, random_state=0)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
import tensorflow as tf

with tf.device('/CPU:0'):
    params = {
        'n_estimators': [50,100,],
        'criterion': ['gini','entropy'],
        'max_depth': [5,10,15],
        'min_samples_split': [5,10,15,20,30,40,50],
        'min_samples_leaf': [2,6,10,14, 16, 20,],
        'class_weight': ['balanced'],
        'max_features': ['sqrt'],
    }


    random_forest = RandomForestClassifier(random_state=42)
    random_search = RandomizedSearchCV(random_forest, param_distributions=params, n_iter=5, cv=5, n_jobs=-1, verbose=1, random_state=42)

    random_search.fit(train_x, train_y)
    best_rf= random_search.best_estimator_
    print(f"The Best Parameters are: {best_rf}\n")

    y_pred = best_rf.predict(test_x)

    accuracy = accuracy_score(test_y, y_pred)
    print(f"The Accuracy of the model is: {accuracy}\n")
    # accuracy_scores.append(accuracy)

    recall = recall_score(test_y, y_pred)
    print("recall score: ", recall)
    confusion = confusion_matrix(test_y, y_pred)
    print("confusion score: ", confusion)
    presicion = precision_score(test_y, y_pred)
    print("presicion score: ", presicion)
    f1 = f1_score(test_y, y_pred)
    print("f1 score: ", f1)
    roc_auc = roc_auc_score(test_y, y_pred)
    print("roc_auc score: ", roc_auc)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
The Best Parameters are: RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=15, min_samples_leaf=6, min_samples_split=5,
                       random_state=42)

The Accuracy of the model is: 0.968609865470852

recall score:  0.8734939759036144
confusion score:  [[935  14]
 [ 21 145]]
presicion score:  0.9119496855345912
f1 score:  0.8923076923076924
roc_auc score:  0.9293708024934298


In [16]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(random_forest, train_x, train_y, cv=5)
print("Cross-validation scores:", scores)


Cross-validation scores: [0.973 0.983 0.982 0.978 0.979]


## Using Word2Vec and AvgWord2Vec

In [17]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip
