In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the labelled messages from the CSV file and preview the first five rows.
df = pd.read_csv("mail_data.csv")
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Checking null and duplicated values

In [3]:
# Missing values per column.
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [4]:
# Number of rows that repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

np.int64(415)

In [5]:
# Keep only the first occurrence of every repeated row.
df = df.drop_duplicates(subset=None, keep='first')

In [6]:
# Re-check after de-duplication: no repeated rows should remain.
df.duplicated(keep='first').sum()

np.int64(0)

In [7]:
# Column dtypes, non-null counts and memory usage of the de-duplicated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5157 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5157 non-null   object
 1   Message   5157 non-null   object
dtypes: object(2)
memory usage: 120.9+ KB


In [8]:
# Summary of both text columns: count, number of unique values, most frequent value and its frequency.
df.describe()

Unnamed: 0,Category,Message
count,5157,5157
unique,2,5157
top,ham,Rofl. Its true to its name
freq,4516,1


In [9]:
# Class balance: how many messages carry each label.
df['Category'].value_counts()

Category
ham     4516
spam     641
Name: count, dtype: int64

# Encoding the categorical values

In [10]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# Values that are already encoded pass through unchanged, so re-running this
# cell is harmless. The final cast gives the column a real integer dtype
# (instead of `object` holding Python ints) and raises here, at the encoding
# step, if the column contains any label other than 'ham' or 'spam'.
label_codes = {'ham': 0, 'spam': 1}
df['Category'] = (
    df['Category']
    .map(lambda label: label_codes.get(label, label))
    .astype('int')
)

In [11]:
# Preview the frame to confirm that Category now holds 0/1 instead of ham/spam.
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Dependent and Independent Variables

In [12]:
# Features are the raw message text; the target is the encoded label.
X, y = df['Message'], df['Category']

# Train-Test split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Number of messages in the training split.
X_train.shape

(4125,)

In [16]:
# Number of messages in the held-out test split.
X_test.shape

(1032,)

# Feature Extraction

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer: lower-cases the text, drops English stop words and keeps
# every term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [18]:
# Make sure both label vectors are integer typed before they reach the classifier.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# Fit the vectorizer on the training messages only, then reuse that fitted
# vocabulary to transform the held-out messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [19]:
# Print the raw training messages (pandas elides the middle rows in the output).
print(X_train)

2598    Got fujitsu, ibm, hp, toshiba... Got a lot of ...
5418    So how are you really. What are you up to. How...
99                        I see a cup of coffee animation
2321        This pain couldn't have come at a worse time.
2388                               Also where's the piece
                              ...                        
4750    Thanx u darlin!im cool thanx. A few bday drink...
474     Want 2 get laid tonight? Want real Dogging loc...
3273    MOON has come to color your dreams, STARS to m...
4022                We have to pick rayan macleran there.
882     see, i knew giving you a break a few times wou...
Name: Message, Length: 4125, dtype: object


In [20]:
# Print the sparse TF-IDF matrix: one (row, column) coordinate and its weight per stored entry.
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 31429 stored elements and shape (4125, 7345)>
  Coords	Values
  (0, 3056)	0.3768666543151668
  (0, 2902)	0.37814533528523747
  (0, 3412)	0.3967833520562993
  (0, 3364)	0.3967833520562993
  (0, 6635)	0.3967833520562993
  (0, 4017)	0.27386074693036566
  (0, 4337)	0.33305955181924346
  (0, 5618)	0.23819775627179068
  (1, 5313)	0.55295106396087
  (1, 4170)	0.8332137306025039
  (2, 2034)	0.5724594631408114
  (2, 1800)	0.5474949662354052
  (2, 920)	0.6103600781566784
  (3, 4782)	0.4512908807267238
  (3, 1950)	0.5492646153275634
  (3, 1827)	0.3080655289837874
  (3, 7209)	0.5492646153275634
  (3, 6545)	0.31312766649238255
  (4, 4934)	1.0
  (5, 3461)	0.745570338992923
  (5, 2264)	0.6664269424430392
  (6, 3091)	0.16573762354288965
  (6, 4519)	0.16305124020026582
  (6, 4644)	0.20594833628441883
  (6, 2326)	0.47029719292723343
  :	:
  (4122, 4431)	0.3593480537173941
  (4123, 4926)	0.4041188590986093
  (4123, 5287)	0.646795156027331
  (4

# Model Training

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# The classes are imbalanced (4516 ham vs 641 spam after de-duplication),
# so the classifier is created with class_weight='balanced'.
lr_model = LogisticRegression(class_weight='balanced')

In [23]:
# Train the classifier on the TF-IDF features of the training messages.
lr_model.fit(X=X_train_features, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,100


In [24]:
# Predicted labels for the messages the model was trained on.
pred_train_data = lr_model.predict(X=X_train_features)

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
# Fraction of training messages whose predicted label matches the true label.
acs_train = accuracy_score(y_true=y_train, y_pred=pred_train_data)
acs_train

0.9944242424242424

In [27]:
# Predicted labels for the held-out messages.
pred_test_data = lr_model.predict(X=X_test_features)

In [28]:
# Fraction of held-out messages whose predicted label matches the true label.
acs_test = accuracy_score(y_true=y_test, y_pred=pred_test_data)
acs_test

0.9738372093023255

In [29]:
# Smoke test: run one hand-written message through the fitted vectorizer and model.
input_mail = ["Hello I am bardan KC"]
input_feature_extraction = feature_extraction.transform(input_mail)
prediction = lr_model.predict(input_feature_extraction)

In [30]:
# Label 0 was assigned to ham during encoding; any other prediction is reported as spam.
print("Ham Mail" if prediction[0] == 0 else "Spam mail")

Ham Mail


# Model Saving

In [31]:
import joblib
from sklearn.pipeline import Pipeline

In [32]:
# Bundle the already-fitted vectorizer and classifier into one pipeline,
# then write that pipeline to disk.
pipeline = Pipeline(steps=[
    ('tfidf', feature_extraction),
    ('clf', lr_model),
])
joblib.dump(value=pipeline, filename='lr_model.pkl')

['lr_model.pkl']