In [3]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [4]:
# Load the labelled messages from the CSV file sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer=r'mail_data.csv')

In [5]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Category    0
Message     0
dtype: int64

In [11]:
# Show the frame's dimensions as (rows, columns).
df.shape

(5572, 2)

In [13]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### LABEL ENCODING

In [16]:
# Encode the text labels as integers for the classifier: spam -> 0, ham -> 1.
label_codes = {'spam': 0, 'ham': 1}

# Only encode while the column still holds the raw labels. Series.map turns
# every value that is missing from the dict into NaN, so re-running this cell
# on the already-encoded column would otherwise wipe out every label.
if not pd.api.types.is_integer_dtype(df['Category']):
    encoded = df['Category'].map(label_codes)
    unknown = df.loc[encoded.isna(), 'Category'].unique()
    if len(unknown) > 0:
        # Fail loudly here instead of letting NaN labels reach model training.
        raise ValueError(f"Unexpected Category values: {list(unknown)}")
    df['Category'] = encoded.astype('int64')

In [18]:
# Preview the first five rows again to check the encoded Category column.
df.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Message text is the model input; the Category column is the target.
X, y = df['Message'], df['Category']

In [22]:
# Peek at the first five input messages.
X.head(n=5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [24]:
# Peek at the last five target labels.
y.tail(n=5)

5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, dtype: int64

In [26]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [28]:
# Sizes of the training and test inputs.
(X_train.shape, X_test.shape)

((4457,), (1115,))

In [30]:
# Sizes of the training and test targets.
(y_train.shape, y_test.shape)

((4457,), (1115,))

In [32]:
# Convert message text to TF-IDF vectors, dropping English stop words.
feature_extraction = TfidfVectorizer(
    stop_words="english",
    binary=True,
    min_df=1,
)

# Fit on the training messages only; the test messages are transformed with
# the vectorizer that was fitted on the training data.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [33]:
# Print the sparse training matrix: each output line is a
# (message index, feature index) pair followed by its TF-IDF weight.
print(X_train_features)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

In [36]:
# Print the sparse test matrix: each output line is a
# (message index, feature index) pair followed by its TF-IDF weight.
print(X_test_features)

  (0, 4942)	0.27552235188443686
  (0, 4100)	0.3392428284838497
  (0, 3955)	0.3774291665065587
  (0, 3395)	0.402169324846608
  (0, 3225)	0.402169324846608
  (0, 2173)	0.30145841567028486
  (0, 2065)	0.36113324080559445
  (0, 1751)	0.34896165336060586
  (1, 7158)	0.3981347747267476
  (1, 6986)	0.2493471978387002
  (1, 6642)	0.326271353777915
  (1, 6544)	0.2204999931204713
  (1, 5430)	0.387052012561607
  (1, 4044)	0.3234324946551934
  (1, 3443)	0.3234324946551934
  (1, 1975)	0.3578586983359201
  (1, 1361)	0.37034060973735533
  (2, 6570)	0.3042743325149729
  (2, 5597)	0.43828336765880876
  (2, 4369)	0.4230992819157864
  (2, 3510)	0.4016985150384895
  (2, 3084)	0.21988546741069176
  (2, 3067)	0.21988546741069176
  (2, 2377)	0.4230992819157864
  (2, 1292)	0.3150204452887917
  :	:
  (1110, 6142)	0.22937745257301317
  (1110, 5204)	0.2537606265072484
  (1110, 4806)	0.26149679947415966
  (1110, 4497)	0.2874866271650959
  (1110, 4105)	0.23914254153997352
  (1110, 3938)	0.24167410415901527
  (1110

In [38]:
# Make sure both target vectors are integer typed before fitting.
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [40]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with default settings on the
# TF-IDF training features.
model = LogisticRegression()
model.fit(X=X_train_features, y=y_train)

In [41]:
# Predict labels for the messages the model was trained on.
prediction_train = model.predict(X=X_train_features)

In [44]:
# Display the predicted labels for the training messages.
prediction_train

array([0, 1, 1, ..., 1, 1, 1])

In [46]:
# Fraction of training messages whose predicted label matches the true label.
accuracy_train = accuracy_score(y_true=y_train, y_pred=prediction_train)

In [48]:
# Display the training accuracy.
accuracy_train

0.9649988781691721

In [50]:
# Score the model on the held-out messages.
prediction_test = model.predict(X=X_test_features)
accuracy_test = accuracy_score(y_true=y_test, y_pred=prediction_test)

In [52]:
# Display the accuracy on the held-out test messages.
accuracy_test

0.9659192825112107

In [54]:
# One example message, wrapped in a list because the vectorizer expects an
# iterable of documents.
input_user_mail = ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]

# Vectorise with the already-fitted TF-IDF vectorizer, then classify.
input_data_features = feature_extraction.transform(input_user_mail)
prediction = model.predict(input_data_features)

# A predicted label of 1 is reported as ham; anything else is reported as spam.
print("This is a ham mail" if prediction[0] == 1 else "This is a spam mail")

This is a spam mail


In [56]:
import pickle

# Persist the fitted classifier and the fitted TF-IDF vectorizer so both can
# be reloaded together later. The `with` blocks guarantee each file is flushed
# and closed even if pickling fails; a bare open() passed to pickle.dump
# leaves the handle open.
with open("logistic_regression.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("feature_extraction.pkl", "wb") as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)
pickle.dump(feature_extraction, open("feature_extraction.pkl", "wb"))