Importing the required dependencies

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [65]:
# Read the labelled messages from the CSV file into a DataFrame
mail_data = pd.read_csv(filepath_or_buffer='./mail_data.csv')

In [66]:
# Swap every missing (null/NaN) entry for an empty string
mail_data = mail_data.fillna('')

In [67]:
# Preview the first five rows of the dataset
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [68]:
# Dimensions of the dataset as a (rows, columns) tuple
mail_data.shape

(5572, 2)

### Label Encoding
#### Spam = 1, Ham = 0

In [69]:
# Encode the category labels numerically: spam -> 1, ham -> 0
is_spam = mail_data['Category'] == 'spam'
is_ham = mail_data['Category'] == 'ham'
mail_data.loc[is_spam, 'Category'] = 1
mail_data.loc[is_ham, 'Category'] = 0

In [70]:
# Separate the message text (X) from the encoded labels (Y)
X, Y = mail_data['Message'], mail_data['Category']

In [71]:
# Show the message texts followed by their labels
print(X, Y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: object


Preparing Test & Train Data

In [72]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [73]:
# Sizes of the train/test partitions: features first, then labels
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep='\n')

(4457,)
(1115,)
(4457,)
(1115,)


Feature Extraction

In [74]:
# Convert the raw message text into TF-IDF feature vectors.
# NOTE: `lowercase` must be a real boolean. The previous string 'True' only
# worked because any non-empty string is truthy, and scikit-learn releases
# that validate constructor parameters reject it.
extract_features = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vocabulary and IDF weights on the training split only, then reuse the
# fitted vectorizer for the test split so both share the same feature columns.
X_train_features = extract_features.fit_transform(X_train)
X_test_features = extract_features.transform(X_test)

# The labels were written into an object-dtype column, so cast them to integers
# before handing them to the classifier.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [75]:
# Print the sparse TF-IDF training matrix; each stored entry is listed as
# (row, column) followed by its weight
print(X_train_features)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

Model Training

In [76]:
# Fit a logistic regression classifier on the TF-IDF training features
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)


Model Evaluation

In [77]:
# Evaluate on the TRAINING data: predict, then score against the true labels
training_data_prediction = model.predict(X=X_train_features)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=training_data_prediction)

print('Accuracy on training data: ', training_data_accuracy)

Accuracy on training data:  0.9661207089970832


In [78]:
# Evaluate on the held-out TEST data: predict, then score against the true labels
test_data_prediction = model.predict(X=X_test_features)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=test_data_prediction)

print('Accuracy on test data: ', test_data_accuracy)

Accuracy on test data:  0.967713004484305


Prediction for Fresh email

In [80]:
# Classify a previously unseen message
mail = ["sports fans - get the latest sports news str* 2 ur mobile 1 wk FREE PLUS a FREE TONE Txt SPORT ON to 8007 www.getzed.co.uk 0870141701216+ norm 4txt/120p"]

# Vectorise it with the already-fitted TF-IDF extractor, then predict its label
mail_features = extract_features.transform(mail)
prediction = model.predict(mail_features)

# Label 1 means spam; anything else is reported as ham
print('Spam mail' if prediction[0] == 1 else 'Ham mail')

Spam mail


ONNX Model Conversion

In [82]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType


# Declare the ONNX graph input (name + tensor type/shape).
# The classifier was fitted on TF-IDF vectors, so the input must have one column
# per TF-IDF feature -- not a single column. `None` leaves the batch dimension
# dynamic. Only the classifier is exported here: raw text still has to be
# vectorised with `extract_features` before it is fed to the ONNX model.
n_features = X_train_features.shape[1]
initial_type = [
    ('input_mail', FloatTensorType([None, n_features]))
]

# Convert the fitted classifier and write the serialized ONNX model to disk
converted_model = convert_sklearn(model, initial_types=initial_type)
with open("sklearn_model.onnx", "wb") as f:
    f.write(converted_model.SerializeToString())

In [84]:
# Shell out to onnxruntime's conversion tool to turn the exported ONNX file
# into ORT-format model(s)
!python3 -m onnxruntime.tools.convert_onnx_models_to_ort sklearn_model.onnx

Converting models with optimization style 'Fixed' and level 'all'
Converting optimized ONNX model /media/a2v10/New Volume/Workspace/Ai/Spam Email Detction/sklearn_model.onnx to ORT format model /media/a2v10/New Volume/Workspace/Ai/Spam Email Detction/sklearn_model.ort
Converted 1/1 models successfully.
Generating config file from ORT format models with optimization style 'Fixed' and level 'all'
2022-09-22 22:20:28,198 ort_format_model.utils [INFO] - Created config in /media/a2v10/New Volume/Workspace/Ai/Spam Email Detction/sklearn_model.required_operators.config
Converting models with optimization style 'Runtime' and level 'all'
Converting optimized ONNX model /media/a2v10/New Volume/Workspace/Ai/Spam Email Detction/sklearn_model.onnx to ORT format model /media/a2v10/New Volume/Workspace/Ai/Spam Email Detction/sklearn_model.with_runtime_opt.ort
Converted 1/1 models successfully.
Converting models again without runtime optimizations to generate a complete config file. These converted mo