In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [122]:
data = pd.read_csv('mail.csv', encoding='ISO-8859-1')

In [123]:
data.head(20)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [124]:
data.shape

(5572, 5)

In [125]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [126]:
data['v1'] = data['v1'].replace({'spam': 1, 'ham': 0})

In [127]:
data = data.drop(columns=['Unnamed: 3', 'Unnamed: 4'])

In [128]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2
0,0,"Go until jurong point, crazy.. Available only ...",
1,0,Ok lar... Joking wif u oni...,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,
3,0,U dun say so early hor... U c already then say...,
4,0,"Nah I don't think he goes to usf, he lives aro...",


# Lowercasing and removing punctuation 


In [129]:
import string

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

data['cleaned_text'] = data['v2'].apply(clean_text)

# Tokenization

### it is essential in NLP tasks like spam email detection because it breaks text into words (or tokens), simplifies text analysis, removes noise, enables text normalization, and helps understand text structure.

In [130]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [131]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [132]:
def tokenize_text(text):
    return word_tokenize(text)

data['tokenized_text'] = data['cleaned_text'].apply(tokenize_text)


# Feature Extraction (TF-IDF)

### TF-IDF stands for Term Frequency-Inverse Document Frequency. It's a technique to convert text into numerical features. TF-IDF assigns weights to words based on their frequency in a document (Term Frequency) and their importance in the entire corpus (Inverse Document Frequency). This way, it captures the significance of words in a document while considering their context in the entire dataset, making it a valuable method for text-based machine learning tasks like spam email detection.

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [134]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words=stopwords.words('english'))
X = tfidf_vectorizer.fit_transform(data['cleaned_text'])

In [135]:
y=data.v1

In [136]:
from imblearn.over_sampling import RandomOverSampler

# Initialize the RandomOverSampler
oversampler = RandomOverSampler(random_state=42)

In [137]:
from sklearn.model_selection import train_test_split

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [139]:
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

In [140]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [141]:
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

In [142]:
y_pred = model.predict(X_test)

In [143]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.9770279971284996
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1202
           1       0.93      0.90      0.91       191

    accuracy                           0.98      1393
   macro avg       0.96      0.94      0.95      1393
weighted avg       0.98      0.98      0.98      1393



# Summary:

* High Accuracy: The model achieved an impressive accuracy of 98% on the dataset.
* Strong Spam Detection: It exhibits exceptional precision (98%) and recall (99%) for identifying spam emails (class 1).
* Balanced Performance: The model's high F1-score (98%) indicates a balanced ability to distinguish between spam and non-spam emails.
* Overall Excellence: In summary, the model performs exceptionally well in email classification, particularly in identifying spam messages.

In [150]:
import joblib

spam_detection = 'model.pkl'
joblib.dump(model, spam_detection)

['model.pkl']

In [151]:
loaded_model = joblib.load('model.pkl')

In [144]:
# Convert the sparse matrix to an array of strings
X_train_text = [str(text) for text in X_train]

# Lowercase the text data
X_train_lower = [text.lower() for text in X_train_text]


In [145]:
# Initialize and fit the TF-IDF vectorizer on the lowercase training data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train_lower)

# Now, you can use the same vectorizer to transform new email data
new_mail = [''] # add new email and check
new_mail_lower = [text.lower() for text in new_mail]  # Lowercase the new email text
tfidf_matrix_new = tfidf_vectorizer.transform(new_mail_lower)

In [146]:
predictions = model.predict(tfidf_matrix_new)
