In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix

In [12]:
# Load the labelled messages; the path is relative to the working directory
data = pd.read_csv('mail_data.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
data.shape

(5572, 2)

In [6]:
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [7]:
data.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [None]:
# spam -> spam mail
# ham -> Not a Spam Mail
# Label encoding spam as 1 and ham as 0

In [17]:
# data.loc[data.Category=='spam','Category']

In [18]:
# Encode the 'spam' label as 1
data.loc[data['Category'] == 'spam', 'Category'] = 1

In [19]:
# Encode the 'ham' label as 0
data.loc[data['Category'] == 'ham', 'Category'] = 0

In [20]:
data.Category.value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

In [21]:
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,0,"Sorry, I'll call later"
freq,4825,30


In [25]:
# Separate the message text (model input) from the encoded label (target)
x, y = data['Message'], data['Category']

In [30]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: object

### Train-Test Split

In [26]:
# Hold out 20% of the messages for testing. Stratifying on the label keeps the
# ham/spam proportions alike in both splits; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

In [27]:
x.shape,x_test.shape,x_train.shape

((5572,), (1115,), (4457,))

### Feature Extraction

In [28]:
# Build a TF-IDF vectorizer that lowercases the text and drops English stop words
feature = TfidfVectorizer(lowercase=True, stop_words='english')

In [29]:
feature

In [31]:
# Fit the vectorizer on the training messages only and vectorize them
# in the same step
x_train_feat = feature.fit_transform(x_train)

# Vectorize the test messages with the already-fitted vectorizer (no refitting)
x_test_feat = feature.transform(x_test)

# The labels are stored with object dtype; cast both label series to integers
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [34]:
x_train_feat

<4457x7543 sparse matrix of type '<class 'numpy.float64'>'
	with 34950 stored elements in Compressed Sparse Row format>

In [35]:
y_train,y_test

(2787    0
 5473    0
 1546    0
 1962    0
 4394    1
        ..
 1658    0
 1386    0
 1271    0
 2989    0
 2658    0
 Name: Category, Length: 4457, dtype: int32,
 624     0
 3557    0
 1595    0
 2745    0
 5015    0
        ..
 2049    0
 3897    1
 5289    0
 2808    1
 265     0
 Name: Category, Length: 1115, dtype: int32)

### Model

In [36]:
# Logistic regression classifier created with its default settings
model = LogisticRegression()

In [37]:
# Fit the classifier on the TF-IDF training features and their labels
model.fit(X=x_train_feat, y=y_train)

### Model Evaluation

In [48]:
# Score the model on the same data it was fitted on
train_pred = model.predict(x_train_feat)
train_acc, train_conf = (
    accuracy_score(y_train, train_pred),
    confusion_matrix(y_train, train_pred),
)
print(f"Training Accuracy: {train_acc}\nTraining Confusion Matrix: \n{train_conf}")

Training Accuracy: 0.9672425398249944
Training Confusion Matrix: 
[[3855    4]
 [ 142  456]]


In [46]:
# Score the model on the held-out test split
y_pred = model.predict(x_test_feat)
acc, conf = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
)
print(f"Accuracy on Test Data: {acc}\nConfusion Matrix of Test Data: \n{conf}")

Accuracy on Test Data: 0.9605381165919282
Confusion Matrix of Test Data: 
[[966   0]
 [ 44 105]]


In [None]:
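# The model has trained well: accuracy is about 96% on both the training and test data, so there is no sign of overfitting.
# Accuracy alone is optimistic here because ham dominates the data; the test confusion matrix shows 44 of 149 spam mails were missed.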

In [49]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [50]:
f1_score(y_test, y_pred)

0.8267716535433071

In [51]:
precision_score(y_test, y_pred)

1.0

In [52]:
recall_score(y_test, y_pred)

0.7046979865771812

### Predictive Model

In [54]:
# Sample message (labelled spam in the dataset), kept inside a list so it is
# passed on as a collection of documents
mail = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. "
    "Text FA to 87121 to receive entry question(std txt rate)T&C's apply "
    "08452810075over18's"
]
def spam_detection(mail):
    """Classify a message as spam or not spam with the fitted notebook model.

    Relies on the module-level ``feature`` vectorizer and ``model`` classifier,
    both of which must already be fitted.

    Parameters
    ----------
    mail : str or iterable of str
        Raw message text. A single string is wrapped in a one-element list
        before vectorizing. If several messages are supplied, only the
        prediction for the first one is reported.

    Returns
    -------
    str
        "This is a Spam Mail " when the predicted label is 1, otherwise
        "This is not a Spam Mail".
    """
    # The vectorizer works on an iterable of documents, so a bare string
    # has to be wrapped rather than passed through as-is.
    if isinstance(mail, str):
        mail = [mail]

    # Convert the text to the feature space learned during training
    mail_features = feature.transform(mail)

    # Predict the label; 1 is the spam class, anything else is reported as ham
    prediction = model.predict(mail_features)
    if prediction[0] == 1:
        return "This is a Spam Mail "
    return "This is not a Spam Mail"


# Classify the sample message and print the verdict
print(spam_detection(mail))

This is a Spam Mail 


In [None]:
# Sample rows from the dataset (label,message) used to try out spam_detection:
# spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
# ham,U dun say so early hor... U c already then say...


In [55]:
# Try the classifier on a message that is labelled ham in the dataset
ns = ["U dun say so early hor... U c already then say..."]
print(spam_detection(ns))

This is not a Spam Mail
