In [None]:
# Importing Dependencies:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

Reading the Data

In [None]:
# Read the mail dataset into a DataFrame.
# NOTE(review): '/content/...' is an absolute, Colab-style path; adjust it when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Check for missing values in the dataset

In [None]:
# Number of missing entries in each column (isna is the canonical spelling of isnull).
df.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(5572, 2)

Replace Categorical Values with Numerical Values


*   ham
*   spam



In [None]:
# Encoder that maps each distinct class label to an integer code.
encoder = LabelEncoder()

In [None]:
# Overwrite the text labels in place with their integer codes
# (the preview that follows shows 'ham' -> 0 and 'spam' -> 1).
df['Category'] = encoder.fit_transform(df['Category'])

In [None]:
# Preview the frame again to confirm that Category now holds integer codes.
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."



*   ham has been replaced with 0
*   spam has been replaced with 1



Split the data into features and target

In [None]:
# x: raw message text (model input); y: encoded class label (model target).
x, y = df['Message'], df['Category']

In [None]:
# Number of messages in the feature series.
x.shape

(5572,)

In [None]:
# Number of labels in the target series; should match the number of messages.
y.shape

(5572,)

In [None]:
# Count of rows per encoded class (0 = ham, 1 = spam).
y.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
0,4825
1,747


Split the data into training and testing data

In [None]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified even though the classes are imbalanced;
# consider whether stratify=y is wanted.
X_train, X_test, Y_train, Y_test = train_test_split(
    x.values,
    y.values,
    test_size=0.2,
    random_state=3,
)

In [None]:
# TF-IDF vectorizer: lower-cases the text, drops English stop words, and keeps
# every term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [None]:
# Fit the vectorizer on the training messages only and convert them to features.
x_train_features = feature_extraction.fit_transform(X_train)
# Apply the already-fitted vectorizer to the test messages (transform only, no refit),
# so nothing from the test split influences the learned features.
x_test_features = feature_extraction.transform(X_test)

In [None]:
# dtype of the training labels (integer codes after label encoding).
Y_train.dtype

dtype('int64')

In [None]:
# Print the training feature matrix; each output line reads "(row, column)  value"
# for one stored entry.
print(x_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [None]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
clf = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF features of the training split.
clf.fit(X=x_train_features, y=Y_train)

In [None]:
# Predictions on the data the model was trained on (used for the training accuracy).
y_hat_train = clf.predict(X=x_train_features)

In [None]:
# Fraction of training messages whose predicted label matches the true label.
print(
    "Accuracy score of training data: ",
    accuracy_score(y_true=Y_train, y_pred=y_hat_train),
)

Accuracy score of training data:  0.9670181736594121


In [None]:
# Shape of the test feature matrix: (test messages, features).
x_test_features.shape

(1115, 7431)

In [None]:
# Predictions on the held-out test split.
y_hat_test = clf.predict(X=x_test_features)

In [None]:
# Fraction of held-out test messages whose predicted label matches the true label.
print(
    "Accuracy score of testing data: ",
    accuracy_score(y_true=Y_test, y_pred=y_hat_test),
)

Accuracy score of testing data:  0.9659192825112107


Predictive System

In [None]:
# Classify one hand-written message with the fitted vectorizer and model.
# The message contains literal double quotes around MIX, so the literal is
# single-quoted. The previous CSV-style "" escaping was parsed by Python as
# three adjacent string literals, which silently dropped the quote characters.
input_data = ['Your free ringtone is waiting to be collected. Simply text the password "MIX" to 85069 to verify. Get Usher and Britney. FML, PO Box 5249, MK17 92H. 450Ppw 16']

# Reuse the vectorizer fitted on the training split (transform only, no refit).
input_data_text = feature_extraction.transform(input_data)
prediction = clf.predict(input_data_text)

# Label 1 is spam and 0 is ham, per the encoding applied to the Category column.
if prediction[0] == 1:
  print("It is a spam mail")
else:
  print("It is not a spam mail")

It is a spam mail
