In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix#sadness (0), 


In [2]:
spam_data = pd.read_csv('mail_data.csv')

In [3]:
spam_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
spam_data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
spam_data.isna().sum()

Category    0
Message     0
dtype: int64

In [6]:
spam_data.replace({'Category':{'ham':0,'spam':1}}, inplace = True)

In [7]:
spam_data['Category'].value_counts()

0    4825
1     747
Name: Category, dtype: int64

##### Splitting labels and features

In [8]:
X = spam_data.drop(columns='Category', axis = 1)
Y = spam_data['Category']
print(X,Y)

                                                Message
0     Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...
2     Free entry in 2 a wkly comp to win FA Cup fina...
3     U dun say so early hor... U c already then say...
4     Nah I don't think he goes to usf, he lives aro...
...                                                 ...
5567  This is the 2nd time we have tried 2 contact u...
5568               Will ü b going to esplanade fr home?
5569  Pity, * was in mood for that. So...any other s...
5570  The guy did some bitching but I acted like i'd...
5571                         Rofl. Its true to its name

[5572 rows x 1 columns] 0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64


##### Stemming the data

In [9]:
port_stem = PorterStemmer()

In [10]:
def stemming(Message):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', Message) #this regex is looking for words from a-z only. no numbers. commas and fullstops are replaced with a space as indicated by ' '
    stemmed_content = re.sub(r'http\S+', '', Message) #checks for hyperlinks
    stemmed_content = stemmed_content.lower()#convert everything to lowercase letters
    stemmed_content = stemmed_content.split()#convert everything in text to a list
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')] #reducing words to their root word - but for loop is removing all stop words
    stemmed_content = ' '.join(stemmed_content)#joining all words
    return stemmed_content

In [11]:
spam_data['Message'] = spam_data['Message'].apply(stemming)

In [12]:
X = spam_data['Message'].values
Y = spam_data['Category'].values
print(X,Y)

['go jurong point, crazy.. avail bugi n great world la e buffet... cine got amor wat...'
 'ok lar... joke wif u oni...'
 "free entri 2 wkli comp win fa cup final tkt 21st may 2005. text fa 87121 receiv entri question(std txt rate)t&c' appli 08452810075over18'"
 ... 'pity, * mood that. so...ani suggestions?'
 "guy bitch act like i'd interest buy someth els next week gave us free"
 'rofl. true name'] [0 0 1 ... 0 0 0]


In [13]:
#converting text data to numbers
#main point of vectorizer is that it will create feature columns that it deems is important
vectorizer = TfidfVectorizer() #basically finds words that are repeating the most to assign a value to it. similarly its inversely doing the opposite where if certain words are showing up and it doesnt have a value, it doesnt provide it value
vectorizer.fit(X) #only fitting X bc Y already is all numbers (0,1)

X = vectorizer.transform(X) #convert all values to respective features

In [14]:
print(X)

  (0, 8050)	0.22962269749428962
  (0, 7842)	0.1892146376670875
  (0, 5648)	0.25827574458086533
  (0, 4277)	0.28937206929337134
  (0, 4160)	0.34253158750893153
  (0, 3480)	0.1892146376670875
  (0, 3442)	0.16058011098012456
  (0, 3397)	0.13794804480121792
  (0, 2250)	0.30739505718901877
  (0, 1990)	0.28937206929337134
  (0, 1713)	0.30040366604489843
  (0, 1711)	0.32698342515267853
  (0, 1293)	0.26526713572498567
  (0, 1066)	0.34253158750893153
  (1, 7959)	0.43651585199297055
  (1, 5275)	0.556006883677782
  (1, 5245)	0.2765613566485246
  (1, 4314)	0.41533459133332773
  (1, 4124)	0.5013195084100336
  (2, 8009)	0.1912428307935117
  (2, 7971)	0.14602126349253328
  (2, 7522)	0.12364246685411001
  (2, 7343)	0.22161471585273418
  (2, 7198)	0.11922152476490841
  (2, 6858)	0.19741178626772932
  :	:
  (5568, 3397)	0.2900009528925707
  (5568, 3200)	0.5576572124830113
  (5568, 2859)	0.6874001368485954
  (5569, 7224)	0.25114395890631247
  (5569, 6987)	0.4874939876356764
  (5569, 6673)	0.3253834421202

##### Standardizing the data

In [15]:
scaler = StandardScaler(with_mean = False)
scaler.fit(X)
standardized_data = scaler.transform(X)
print(standardized_data)

  (0, 8050)	10.610294654784285
  (0, 7842)	3.8773009678473875
  (0, 5648)	11.786587357750989
  (0, 4277)	20.196702346052152
  (0, 4160)	74.65252962559933
  (0, 3480)	4.384491807051848
  (0, 3442)	3.2345142595031966
  (0, 3397)	2.324027425051536
  (0, 2250)	32.20026757080644
  (0, 1990)	28.707498530069987
  (0, 1713)	24.380954375790882
  (0, 1711)	37.422578295735015
  (0, 1293)	15.002187984133633
  (0, 1066)	74.65252962559933
  (1, 7959)	18.37311842570421
  (1, 5275)	44.52172833850037
  (1, 5245)	3.0468679928408937
  (1, 4314)	14.129169916144727
  (1, 4124)	33.69351581745086
  (2, 8009)	16.777438519048328
  (2, 7971)	5.95022462954512
  (2, 7522)	4.118789105665062
  (2, 7343)	29.53240197830258
  (2, 7198)	2.6702044427649785
  (2, 6858)	19.30407686030629
  :	:
  (5568, 3397)	4.8856811909483655
  (5568, 3200)	29.61367231494946
  (5568, 2859)	52.025039687012416
  (5569, 7224)	5.496461146316655
  (5569, 6987)	74.65252962559933
  (5569, 6673)	13.077156776862903
  (5569, 5583)	74.6525296255993

In [16]:
X = standardized_data
Y = spam_data['Category'].values
print(X,Y)

  (0, 8050)	10.610294654784285
  (0, 7842)	3.8773009678473875
  (0, 5648)	11.786587357750989
  (0, 4277)	20.196702346052152
  (0, 4160)	74.65252962559933
  (0, 3480)	4.384491807051848
  (0, 3442)	3.2345142595031966
  (0, 3397)	2.324027425051536
  (0, 2250)	32.20026757080644
  (0, 1990)	28.707498530069987
  (0, 1713)	24.380954375790882
  (0, 1711)	37.422578295735015
  (0, 1293)	15.002187984133633
  (0, 1066)	74.65252962559933
  (1, 7959)	18.37311842570421
  (1, 5275)	44.52172833850037
  (1, 5245)	3.0468679928408937
  (1, 4314)	14.129169916144727
  (1, 4124)	33.69351581745086
  (2, 8009)	16.777438519048328
  (2, 7971)	5.95022462954512
  (2, 7522)	4.118789105665062
  (2, 7343)	29.53240197830258
  (2, 7198)	2.6702044427649785
  (2, 6858)	19.30407686030629
  :	:
  (5568, 3397)	4.8856811909483655
  (5568, 3200)	29.61367231494946
  (5568, 2859)	52.025039687012416
  (5569, 7224)	5.496461146316655
  (5569, 6987)	74.65252962559933
  (5569, 6673)	13.077156776862903
  (5569, 5583)	74.6525296255993

##### Train test split

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = .2, stratify = Y)
print(X.shape, X_train.shape, X_test.shape)

(5572, 8217) (4457, 8217) (1115, 8217)


In [19]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix 
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define parameter grids for randomized search (coarse search)
logistic_param_grid_coarse = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

xgboost_param_grid_coarse = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

svm_param_grid_coarse = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf']
}

# RandomizedSearchCV for each model (coarse search)
logistic_random_search_coarse = RandomizedSearchCV(LogisticRegression(), logistic_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
xgboost_random_search_coarse = RandomizedSearchCV(XGBClassifier(objective='binary:logistic'), xgboost_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
svm_random_search_coarse = RandomizedSearchCV(svm.SVC(), svm_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)

# Fit models using RandomizedSearchCV (coarse search)
logistic_random_search_coarse.fit(X_train, Y_train)
xgboost_random_search_coarse.fit(X_train, Y_train)
svm_random_search_coarse.fit(X_train, Y_train)

# Get best hyperparameters from RandomizedSearchCV (coarse search)
best_logistic_params_coarse = logistic_random_search_coarse.best_params_
best_xgboost_params_coarse = xgboost_random_search_coarse.best_params_
best_svm_params_coarse = svm_random_search_coarse.best_params_

# Define parameter grids for GridSearchCV (fine search)
logistic_param_grid_fine = {
    'penalty': [best_logistic_params_coarse['penalty']],
    'C': [best_logistic_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'solver': [best_logistic_params_coarse['solver']]
}

xgboost_param_grid_fine = {
    'learning_rate': [best_xgboost_params_coarse['learning_rate'] * i for i in [0.5, 1, 2]],
    'n_estimators': [best_xgboost_params_coarse['n_estimators']],
    'max_depth': [best_xgboost_params_coarse['max_depth']],
    'min_child_weight': [best_xgboost_params_coarse['min_child_weight']],
    'subsample': [best_xgboost_params_coarse['subsample']],
    'colsample_bytree': [best_xgboost_params_coarse['colsample_bytree']]
}

svm_param_grid_fine = {
    'C': [best_svm_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'gamma': [best_svm_params_coarse['gamma'] * i for i in [0.1, 1, 10]],
    'kernel': [best_svm_params_coarse['kernel']]
}

# GridSearchCV for each model (fine search)
logistic_grid_search_fine = GridSearchCV(LogisticRegression(), param_grid=logistic_param_grid_fine, cv=5, n_jobs=-1)
xgboost_grid_search_fine = GridSearchCV(XGBClassifier(objective='binary:logistic'), param_grid=xgboost_param_grid_fine, cv=5, n_jobs=-1)
svm_grid_search_fine = GridSearchCV(svm.SVC(), param_grid=svm_param_grid_fine, cv=5, n_jobs=-1)

# Fit models using GridSearchCV (fine search)
logistic_grid_search_fine.fit(X_train, Y_train)
xgboost_grid_search_fine.fit(X_train, Y_train)
svm_grid_search_fine.fit(X_train, Y_train)

# Print best hyperparameters from GridSearchCV (fine search)
print("Logistic Regression Best Parameters (Fine Search):", logistic_grid_search_fine.best_params_)
print("XGBoost Best Parameters (Fine Search):", xgboost_grid_search_fine.best_params_)
print("SVM Best Parameters (Fine Search):", svm_grid_search_fine.best_params_)

# Compare cross-validated scores of each model
logistic_cv_score_fine = logistic_grid_search_fine.best_score_
xgboost_cv_score_fine = xgboost_grid_search_fine.best_score_
svm_cv_score_fine = svm_grid_search_fine.best_score_

# Select the best model based on cross-validated scores
best_model_fine = None
if logistic_cv_score_fine >= xgboost_cv_score_fine and logistic_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = logistic_grid_search_fine.best_estimator_
elif xgboost_cv_score_fine >= logistic_cv_score_fine and xgboost_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = xgboost_grid_search_fine.best_estimator_
else:
    best_model_fine = svm_grid_search_fine.best_estimator_

# Evaluate the best model on the test set
train_accuracy_fine = best_model_fine.score(X_train, Y_train)
print("Best Model Train Accuracy (Fine Search):", train_accuracy_fine)
test_accuracy_fine = best_model_fine.score(X_test, Y_test)
print("Best Model Test Accuracy (Fine Search):", test_accuracy_fine)






Logistic Regression Best Parameters (Fine Search): {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
XGBoost Best Parameters (Fine Search): {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1.0}
SVM Best Parameters (Fine Search): {'C': 0.010000000000000002, 'gamma': 0.001, 'kernel': 'linear'}
Best Model Train Accuracy (Fine Search): 0.991249719542293
Best Model Test Accuracy (Fine Search): 0.9757847533632287


In [20]:
y_pred = best_model_fine.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
roc_auc = roc_auc_score(Y_test, best_model_fine.predict_proba(X_test)[:, 1])
conf_matrix = confusion_matrix(Y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9757847533632287
Precision: 0.9765625
Recall: 0.8389261744966443
F1 Score: 0.9025270758122744
ROC AUC Score: 0.9887378937568607
Confusion Matrix:
[[963   3]
 [ 24 125]]


##### Random Forest Classifier

In [21]:
random_forest_classifier = RandomForestClassifier()
random_forest_classifier.fit(X_train, Y_train)

RandomForestClassifier()

In [22]:
#Accuracy score
X_test_prediction = random_forest_classifier.predict(X_test)
test_data_accuracy_random_forest = accuracy_score(X_test_prediction, Y_test)
print('Accuracy:', test_data_accuracy_random_forest)

Accuracy: 0.97847533632287


##### Making a predictive system

In [23]:
input_data = ["Your free ringtone is waiting to be collected. Simply text the password ""MIX"" to 85069 to verify. Get Usher and Britney. FML, PO Box 5249, MK17 92H. 450Ppw 16"]

input_data_features = vectorizer.transform(input_data) #you use the vectorizer to transform the data based on the vectorizer you created earlier


prediction = best_model_fine.predict(input_data_features)
print(prediction)
if prediction == 0:
    print("Not Spam")
else:
    print("Spam")

[1]
Spam
