> ## **Machine Learning Task:** Email Spam Detection ##

We will start by importing all the necessary libraries and the dataset.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
import chardet

with open('Emails.csv', 'rb') as file:
    result = chardet.detect(file.read())
    encoding = result['encoding']

print(f"Detected encoding: {encoding}")


Detected encoding: ascii


In [3]:

data = pd.read_csv('Emails.csv', usecols=['v1', 'v2'], encoding=encoding)

Now, lets check for any null values and any duplicates. To clean our data. 

In [4]:
data.isnull().sum()

v2    0
v1    2
dtype: int64

In [5]:
data.dropna(inplace=True, axis=0)
data.head()

Unnamed: 0,v2,v1
0,naturally irresistible your corporate identity...,spam
1,the stock trading gunslinger fanny is merrill...,spam
2,unbelievable new homes made easy im wanting t...,spam
3,4 color printing special request additional i...,spam
4,"do not have money , get software cds from here...",spam


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5728 entries, 0 to 5729
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v2      5728 non-null   object
 1   v1      5728 non-null   object
dtypes: object(2)
memory usage: 134.2+ KB


In [7]:
data.describe()

Unnamed: 0,v2,v1
count,5728,5728
unique,5695,4
top,"re : contact info glenn , please , contact r...",ham
freq,2,4358


In [8]:
data.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
its termination would not have such a phenomenal impact on the power situation . however,1,1,e dpc contributed only 0 . 7 per cent of the ...,1
mr suresh prabhu,1,1,lf against undeserved claims in the event of e...,1
ham,4358,4325,"re : eprm 2001 houston layla , my associate ...",2
spam,1368,1368,naturally irresistible your corporate identity...,1


In [9]:
data.duplicated().sum()

33

In [10]:
data.drop_duplicates(inplace=True)

In [11]:
data.duplicated().sum()

0

Now lets move onto the data preprocessing. So that we can make our data suitable for our model. We will start by cleaning our text and removing any special characters from our text.

In [12]:
# Step 1: Text Cleaning
def clean_text(text):
    # Remove special characters and numbers
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

In [13]:
data['v2'] = data['v2'].apply(clean_text)

# Display the first few rows of the cleaned dataset
print(data.head())

                                                  v2    v1
0  naturally irresistible your corporate identity...  spam
1  the stock trading gunslinger  fanny is merrill...  spam
2  unbelievable new homes made easy  im wanting t...  spam
3   color printing special  request additional in...  spam
4  do not have money  get software cds from here ...  spam


Now that we are done with that. It better to convert our text into lowercase so that there are no discrepancies or uneven cases in our text. This can cause problems in our model.

In [14]:
# Step 1: Text Clearning (Continued)
def lowercase_text(text):
    # Convert text to lowercase
    lowercase_text = text.lower()
    return lowercase_text

In [15]:
data['v2'] = data['v2'].apply(lowercase_text)

# Display the first few rows of the dataset with lowercase text
print(data.head())

                                                  v2    v1
0  naturally irresistible your corporate identity...  spam
1  the stock trading gunslinger  fanny is merrill...  spam
2  unbelievable new homes made easy  im wanting t...  spam
3   color printing special  request additional in...  spam
4  do not have money  get software cds from here ...  spam


Now, just in case. I am going to download some necessary packages and update them if required. 

In [16]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

This part of the process is to divide the text into small tokens. What this means is that each word of our dataset is a seperate element. That will be reviewed by our model. 

In [17]:
# Step 1: Text Cleaning (Continued)
def tokenize_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    return tokens

In [18]:
data['v2'] = data['v2'].apply(tokenize_text)

# Display the first few rows of the dataset with tokenized text
print(data.head())

                                                  v2    v1
0  [naturally, irresistible, your, corporate, ide...  spam
1  [the, stock, trading, gunslinger, fanny, is, m...  spam
2  [unbelievable, new, homes, made, easy, im, wan...  spam
3  [color, printing, special, request, additional...  spam
4  [do, not, have, money, get, software, cds, fro...  spam


After making our tokens. We are going to remove any stopwords. Stopwords are words that are not important for our model. They are words like 'the', 'a', 'an', 'is', 'are', etc. These words are not important for our model. So, we are going to remove them.

In [19]:
# Step 1: Text Cleaning (Continued)
def remove_stopwords(tokens):
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

In [20]:
data['v2'] = data['v2'].apply(remove_stopwords)

# Display the first few rows of the dataset with filtered text
print(data.head())

                                                  v2    v1
0  [naturally, irresistible, corporate, identity,...  spam
1  [stock, trading, gunslinger, fanny, merrill, m...  spam
2  [unbelievable, new, homes, made, easy, im, wan...  spam
3  [color, printing, special, request, additional...  spam
4  [money, get, software, cds, software, compatib...  spam


This is the final step of our data preprocessing. We are going to stem our words. Stemming is the process of reducing a word to its root form. For example, the word 'running' will be reduced to 'run'. This is done so that our model can understand the words better. We are going to use the PorterStemmer for this task. Moreover, we have also provided an option to use lemmatization, if that might be better. Although a very small difference is observed between the two. However, stemming is faster than lemmatization. So, we are going to use stemming and also has a slight edge in accuracy.

In [21]:
# Step 1: Text Cleaning (Continued)
def apply_stemming(tokens):
    # Apply stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

def apply_lemmatization(tokens):
    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens


In [22]:
# Choose either stemming or lemmatization
data['v2'] = data['v2'].apply(apply_stemming)
# or
#data['v2'] = data['v2'].apply(apply_lemmatization)

In [23]:
# Display the first few rows of the dataset with processed text
print(data.head())

                                                  v2    v1
0  [natur, irresist, corpor, ident, lt, realli, h...  spam
1  [stock, trade, gunsling, fanni, merril, muzo, ...  spam
2  [unbeliev, new, home, made, easi, im, want, sh...  spam
3  [color, print, special, request, addit, inform...  spam
4  [money, get, softwar, cd, softwar, compat, gre...  spam


Now we are going to perform vectorization our data. This is done so that our model can understand our data better. We are going to use the TF-IDF vectorizer for this task. Additionally, this is also done so that all the words are given equal importance and the data can be tested as a whole. Hence, removing any bias and making it easier for our model to understand the data. Our features are reduced so that the models works faster and more efficiently.

In [24]:
# Step 2: Feature Extraction (TF-IDF)
corpus = [' '.join(tokens) for tokens in data['v2']]
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(corpus)

In [25]:
# Display the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X.shape)

TF-IDF Matrix Shape: (5695, 25568)


Just for the layman, TF-IDF is a measure of originality of a word by comparing the number of times a word appears in a document with the number of documents the word appears in. Hence, giving us a measure of how important a word is to a document in a collection or corpus.

Also, I have seperately made the label variable so it would be easier for anyone reading to understand the code. The column v1 are the labels and the column v2 are the text. 

In [26]:
labels = data['v1']

This part of the process is fairly common but extremely necessary part of our machine learning process. We are going to split our data into training and testing data. This is done so that we can train our model on the training data and then test it on the testing data. This is done so that we can check the accuracy of our model. We are going to use the train_test_split function from the sklearn library. We are going to use 20% of our data for testing and 80% for training.

In [27]:
# Assuming you have 'X' as your TF-IDF matrix and 'labels' as corresponding labels
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

> ## **Machine Learning Model:** Naive Bayes ##

We will start by finding the parameters that are best for our model. I have used the GridSearchCV function from the sklearn library. This function will help us find the best parameters for our model. We are going to use the Multinomial Naive Bayes model for this task. Alpha is from range 0.1 to 10.0, with seperations of 0.1 for each value of alpha. Just wide range is taken so that the best results can be achieved.

In [28]:
# Hyperparameter tuning for Naive Bayes
param_grid_nb = {'alpha': [0.1 * i for i in range(1, 101)]}
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
grid_search_nb.fit(X_train, y_train)
best_nb_classifier = grid_search_nb.best_estimator_

# Deploy the best Naive Bayes model
new_sms_nb = ["Get a free gift now!", "Hi, how are you?"]
new_sms_corpus_nb = [' '.join(tokenize_text(clean_text(sms.lower()))) for sms in new_sms_nb]
new_sms_tfidf_nb = tfidf_vectorizer.transform(new_sms_corpus_nb)
predicted_labels_nb = best_nb_classifier.predict(new_sms_tfidf_nb)

for sms, label in zip(new_sms_nb, predicted_labels_nb):
    print(f"Email: {sms} - Predicted Label: {label}")
    
print(f"Best Score: {grid_search_nb.best_score_}")
print(f"Best Estimator: {grid_search_nb.best_estimator_}")
print(f"Best Value of Alpha: {grid_search_nb.best_params_}")



Email: Get a free gift now! - Predicted Label: spam
Email: Hi, how are you? - Predicted Label: spam
Best Score: 0.9855141852737525
Best Estimator: MultinomialNB(alpha=0.1)
Best Value of Alpha: {'alpha': 0.1}


We are now going to use the Naive Bayes model for our task. We are going to use the Multinomial Naive Bayes model. This is because our data is discrete and not continuous. We are going to use the MultinomialNB function from the sklearn library. A very important thing to note is that we are going to use the fit function on our training data. This is done so that our model can learn from our training data. We will now create a model variable.

In [29]:
# Step 3: Model Training (Naive Bayes)
nb_classifier = best_nb_classifier
nb_classifier.fit(X_train, y_train)

Now that we have trained our model. We would use that model to predict the labels of our testing data. We are going to use the predict function on our testing data. We will now create a predictions variable. Additionally, we would also check the accuracy of our model.

In [30]:
# Step 4: Model Evaluation
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9806848112379281


Lastly, we are going to create a classification report and a confusion matrix. This is done so that we can check the accuracy of our model. We are going to use the classification_report and the confusion_matrix function from the sklearn library. 

In [31]:
# Display classification report and confusion matrix
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
                                                                                               precision    recall  f1-score   support

 its termination would not  have such a phenomenal impact on the power situation .  however        0.00      0.00      0.00         0
                                                                           mr suresh prabhu        0.00      0.00      0.00         1
                                                                                         ham       0.99      0.99      0.99       842
                                                                                        spam       0.98      0.96      0.97       296

                                                                                    accuracy                           0.98      1139
                                                                                   macro avg       0.49      0.49      0.49      1139
                                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


> ## **Machine Learning Model:** Logistic Regression ##

We would use the same methods as the Naive Bayes Algorithm.

In [32]:
# Hyperparameter tuning for Logistic Regression
param_grid_logreg = {'C': [0.1 * i for i in range(1, 101)]}
grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_logreg, cv=3)
grid_search_logreg.fit(X_train, y_train)
best_logreg_classifier = grid_search_logreg.best_estimator_

# Deploy the best Logistic Regression model
new_sms_logreg = ["Get a free gift now!", "Hi, how are you?"]
new_sms_corpus_logreg = [' '.join(tokenize_text(clean_text(sms.lower()))) for sms in new_sms_logreg]
new_sms_tfidf_logreg = tfidf_vectorizer.transform(new_sms_corpus_logreg)
predicted_labels_logreg = best_logreg_classifier.predict(new_sms_tfidf_logreg)

for sms, label in zip(new_sms_logreg, predicted_labels_logreg):
    print(f"Email: {sms} - Predicted Label: {label}")

print(f"Best Score: {grid_search_logreg.best_score_}")
print(f"Best Estimator: {grid_search_logreg.best_estimator_}")
print(f"Best Value of Alpha: {grid_search_logreg.best_params_}")



Email: Get a free gift now! - Predicted Label: spam
Email: Hi, how are you? - Predicted Label: ham
Best Score: 0.9903426456221488
Best Estimator: LogisticRegression(C=9.3, max_iter=1000)
Best Value of Alpha: {'C': 9.3}


In [33]:
# Step 3: Model Training (Logistic Regression)
logreg_classifier = best_logreg_classifier
logreg_classifier.fit(X_train, y_train)

In [34]:
# Step 4: Model Evaluation
y_pred_logreg = logreg_classifier.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)

Logistic Regression Accuracy: 0.9894644424934153


In [35]:
# Display classification report and confusion matrix for Logistic Regression
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))
print("\nLogistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))


Logistic Regression Classification Report:
                     precision    recall  f1-score   support

 mr suresh prabhu        0.00      0.00      0.00         1
               ham       0.99      1.00      0.99       842
              spam       0.99      0.97      0.98       296

          accuracy                           0.99      1139
         macro avg       0.66      0.66      0.66      1139
      weighted avg       0.99      0.99      0.99      1139


Logistic Regression Confusion Matrix:
 [[  0   0   1]
 [  0 839   3]
 [  0   8 288]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
