In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings

In [None]:
# Load the raw mail dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Keep every non-null cell as-is and substitute an empty string for null cells.
mail_data = df.where(df.notna(), other='')

In [None]:
# Per-column summary statistics of the null-filled frame.
mail_data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [None]:
# (rows, columns) of the null-filled frame.
mail_data.shape

(5572, 2)

Label encoding (spam → 0, ham → 1)


In [None]:
# Encode the target in place: 'spam' rows become 0, 'ham' rows become 1.
mail_data.loc[mail_data['Category'].eq('spam'), 'Category'] = 0
mail_data.loc[mail_data['Category'].eq('ham'), 'Category'] = 1

In [None]:
# Preview the first five rows after label encoding.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Features are the message text; the target is the encoded category.
x, y = mail_data['Message'], mail_data['Category']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Show the sizes of the full, training, and test message sets side by side.
print(*(series.shape for series in (x, x_train, x_test)))

(5572,) (4457,) (1115,)


In [None]:
# TF-IDF vectorizer: lowercases text and drops English stop words.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary from the training messages only, then reuse it for the
# test messages so no information from the test set leaks into the features.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# The encoded labels are cast to an integer dtype before being used as targets.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [None]:
# Text rendering of the sparse training matrix (coordinates and TF-IDF weights).
print(str(x_train_features))

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34775 stored elements and shape (4457, 7431)>
  Coords	Values
  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537024
  (1, 2758)	0.3226407885943799
  (1, 1839)	0.2784903590561455
  (1, 918)	0.22871581159877646
  (1, 2746)	0.3398297002864083
  (1, 2957)	0.3398297002864083
  (1, 3325)	0.31610586766078863
  (1, 3185)	0.29694482957694585
  (1, 4080)	0.18880584110891163
  (2, 6601)	0.6056811524587518
  (2, 2404)	0.45287711070606745
  (2, 3156)	0.4107239318312698
  (2, 407)	0.509272536051008
  (3, 7414)	0.8100020912469564
  (3, 2870)	0.5864269879324768
  (4, 2870)	0.41872147309323743
  (4, 487)	0.2899118421746198
  :	:
  (4454, 2855)	0.47210665083641806
  (4454, 2246)	0.47210665083641806
  (4455, 4456)	0.24

In [None]:
# Logistic-regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [None]:
# Fit the classifier on the TF-IDF training features and integer labels.
model.fit(X=x_train_features, y=y_train)

In [None]:
# Score the fitted model on the same rows it was trained on.
prediction_train_data = model.predict(X=x_train_features)
accuracy_train_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_train_data,
)


In [None]:
# Report training accuracy as a percentage with two decimals.
print("Accuracy on training data:  {:.2f}".format(accuracy_train_data * 100))

Accuracy on training data:  96.77


In [None]:
# Score the fitted model on the held-out test rows.
prediction_test_data = model.predict(X=x_test_features)
accuracy_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_test_data,
)

In [None]:
# Report test accuracy as a percentage with two decimals.
print("Accuracy on test data: {:.2f}".format(accuracy_test_data * 100))

Accuracy on test data: 96.68


In [None]:
# Read one message from the user; the vectorizer expects an iterable of
# documents, so the single string is wrapped in a list.
input_mail = [input("Enter mail: ")]

# Reuse the already-fitted vectorizer so the features line up with training.
input_data_features = feature_extraction.transform(input_mail)

prediction = model.predict(input_data_features)
print(prediction)

# Label 1 was assigned to ham and 0 to spam during encoding.
print("Ham mail" if prediction[0] == 1 else "Spam mail")

Enter mail: hey
[1]
Ham mail


## Deploying to Streamlit


In [None]:
import pickle

In [None]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer so another
# process can reload both and score new messages without retraining.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes (a bare `open(...)` left it open).
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted location.
filename = 'final_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

filename = 'feature_extraction.pkl'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)