In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
import numpy as np

In [23]:
data = pd.read_csv(r'C:\Users\goura\OneDrive\Desktop\projects\email spam\spam (or) ham.csv')

In [24]:
data.head(3)

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [25]:
data.info

<bound method DataFrame.info of      Class                                                sms
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name
5572  spam  This is the 2nd time we have tried 2 contact u...

[5573 rows x 2 columns]>

In [26]:


data.shape

(5573, 2)

data cleaning and preprocessng

In [27]:
import re

def remove_special_characters(text):
    # Remove special characters and symbols
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

In [28]:
def convert_to_lowercase(text):
    # Convert text to lowercase
    lowercased_text = text.lower()
    return lowercased_text

In [29]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stop_words(text):
    # Tokenize the text into individual words
    words = text.split()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Reconstruct the text without stop words
    filtered_text = ' '.join(filtered_words)
    return filtered_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Apply data cleaning to the 'sms' column
data['cleaned_sms'] = data['sms'].apply(remove_special_characters)
data['cleaned_sms'] = data['cleaned_sms'].apply(convert_to_lowercase)
data['cleaned_sms'] = data['cleaned_sms'].apply(remove_stop_words)

# Display the updated data
print(data.head())

  Class                                                sms  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                         cleaned_sms  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts st ...  
3                u dun say early hor u c already say  
4        nah dont think goes usf lives around though  


In [31]:
# Feature extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['cleaned_sms'])
y = data['Class']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive bayes multinomialNB

In [33]:
# Train Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)

# Hyperparameter tuning for SVC using GridSearchCV

In [34]:

param_grid_svc = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1]
}

svc_model = SVC()
grid_search_svc = GridSearchCV(svc_model, param_grid_svc, scoring='accuracy', cv=5)
grid_search_svc.fit(X_train, y_train)
best_params_svc = grid_search_svc.best_params_

# Train the best SVC model

In [35]:

svc_model_best = SVC(**best_params_svc)
svc_model_best.fit(X_train, y_train)
svc_predictions = svc_model_best.predict(X_test)

# Hyperparameter tuning for RandomForestClassifier using GridSearchCV

In [21]:

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, scoring='accuracy', cv=5)
grid_search_rf.fit(X_train, y_train)
best_params_rf = grid_search_rf.best_params_

# Train the best RandomForestClassifier model

In [36]:

rf_model_best = RandomForestClassifier(**best_params_rf, random_state=42)
rf_model_best.fit(X_train, y_train)
rf_predictions = rf_model_best.predict(X_test)


# Hyperparameter tuning for GradientBoostingClassifier using GridSearchCV

In [37]:

param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

gb_model = GradientBoostingClassifier(random_state=42)
grid_search_gb = GridSearchCV(gb_model, param_grid_gb, scoring='accuracy', cv=5)
grid_search_gb.fit(X_train, y_train)
best_params_gb = grid_search_gb.best_params_

# Train the best GradientBoostingClassifier model

In [38]:

gb_model_best = GradientBoostingClassifier(**best_params_gb, random_state=42)
gb_model_best.fit(X_train, y_train)
gb_predictions = gb_model_best.predict(X_test)

# Evaluate Multinomial Naive Bayes model

In [39]:

nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, pos_label='spam')
nb_recall = recall_score(y_test, nb_predictions, pos_label='spam')
nb_f1 = f1_score(y_test, nb_predictions, pos_label='spam')

# Evaluate SVC model

In [40]:

svc_accuracy = accuracy_score(y_test, svc_predictions)
svc_precision = precision_score(y_test, svc_predictions, pos_label='spam')
svc_recall = recall_score(y_test, svc_predictions, pos_label='spam')
svc_f1 = f1_score(y_test, svc_predictions, pos_label='spam')

# Evaluate RandomForestClassifier model

In [41]:

rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, pos_label='spam')
rf_recall = recall_score(y_test, rf_predictions, pos_label='spam')
rf_f1 = f1_score(y_test, rf_predictions, pos_label='spam')

# Evaluate GradientBoostingClassifier model

In [42]:

gb_accuracy = accuracy_score(y_test, gb_predictions)
gb_precision = precision_score(y_test, gb_predictions, pos_label='spam')
gb_recall = recall_score(y_test, gb_predictions, pos_label='spam')
gb_f1 = f1_score(y_test, gb_predictions, pos_label='spam')

In [43]:
print(f'Multinomial Naive Bayes Accuracy: {nb_accuracy}')
print(f'Multinomial Naive Bayes Precision: {nb_precision}')
print(f'Multinomial Naive Bayes Recall: {nb_recall}')
print(f'Multinomial Naive Bayes F1 Score: {nb_f1}')

print(f'SVC Accuracy: {svc_accuracy}')
print(f'SVC Precision: {svc_precision}')
print(f'SVC Recall: {svc_recall}')
print(f'SVC F1 Score: {svc_f1}')

print(f'Random Forest Accuracy: {rf_accuracy}')
print(f'Random Forest Precision: {rf_precision}')
print(f'Random Forest Recall: {rf_recall}')
print(f'Random Forest F1 Score: {rf_f1}')

print(f'Gradient Boosting Accuracy: {gb_accuracy}')
print(f'Gradient Boosting Precision: {gb_precision}')
print(f'Gradient Boosting Recall: {gb_recall}')
print(f'Gradient Boosting F1 Score: {gb_f1}')

Multinomial Naive Bayes Accuracy: 0.9632286995515695
Multinomial Naive Bayes Precision: 0.8222222222222222
Multinomial Naive Bayes Recall: 0.9426751592356688
Multinomial Naive Bayes F1 Score: 0.8783382789317508
SVC Accuracy: 0.979372197309417
SVC Precision: 0.9785714285714285
SVC Recall: 0.8726114649681529
SVC F1 Score: 0.9225589225589226
Random Forest Accuracy: 0.97847533632287
Random Forest Precision: 1.0
Random Forest Recall: 0.8471337579617835
Random Forest F1 Score: 0.9172413793103448
Gradient Boosting Accuracy: 0.9748878923766816
Gradient Boosting Precision: 0.9708029197080292
Gradient Boosting Recall: 0.8471337579617835
Gradient Boosting F1 Score: 0.9047619047619048


# Choose the best model based on accuracy and other metrics

In [44]:

best_model = None
best_model_name = ""
best_model_accuracy = 0

models = [
    ("Multinomial Naive Bayes", nb_accuracy, nb_model),
    ("SVC", svc_accuracy, svc_model_best),
    ("Random Forest", rf_accuracy, rf_model_best),
    ("Gradient Boosting", gb_accuracy, gb_model_best)
]

for name, accuracy, model in models:
    if accuracy > best_model_accuracy:
        best_model_accuracy = accuracy
        best_model = model
        best_model_name = name

print(f'The best model is: {best_model_name}')

The best model is: SVC


# Save the best model and vectorizer using pickle

In [45]:

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open(f'{best_model_name.lower().replace(" ", "_")}.pkl', 'wb') as f:
    pickle.dump(best_model, f)