In [1]:
# importing dependencies
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled messages from the CSV file and preview the first rows
mail_data = pd.read_csv('mail_data.csv')
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [3]:
# Count the missing (null) values in each column
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [4]:
# Swap any missing values for empty strings so every cell holds usable text
mail_data = mail_data.fillna('')

In [5]:
# Preview the first rows again after the null replacement
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [6]:
# (rows, columns) of the dataset
mail_data.shape

(5572, 2)

## Label Encoding

In [8]:
# Encode the Category labels as integers:
# 0 ---> ham
# 1 ---> spam
#
# The cast to int64 is explicit on purpose. After replace() the column started
# life as object dtype, and newer pandas releases deprecate the silent
# object -> integer downcasting that replace() used to perform. Without the
# cast the labels may stay object dtype, which downstream estimators can
# reject. The cell stays safe to re-run: on an already-encoded column
# replace() finds nothing to change and the cast is a no-op.
mail_data = (
    mail_data
    .replace({"Category": {'ham': 0, "spam": 1}})
    .astype({"Category": "int64"})
)

In [9]:
# Preview the first rows to confirm Category now holds 0/1 codes
mail_data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives around here though"


In [None]:
### Separate the data into target and features

In [10]:
# Message text is the feature column; the encoded category is the target
X, Y = mail_data['Message'], mail_data['Category']

# Show both series, one after the other
print(X, Y, sep='\n')

# Feature Extraction

In [37]:
# TF-IDF vectorizer with scikit-learn's default settings (no arguments passed)
vectorozior = TfidfVectorizer()

In [39]:
# Learn the vocabulary and IDF weights from every message in X.
# NOTE(review): this runs before the train/test split further down, so the
# held-out messages also contribute to the vocabulary/IDF statistics.
# Consider splitting the raw text first and fitting on the training part only.
vectorozior.fit(X)

In [40]:
# Build the TF-IDF feature matrix from the untouched text column rather than
# from X itself. The previous `X = vectorozior.transform(X)` only worked on the
# first execution: a re-run would hand the already-vectorized sparse matrix
# back to the vectorizer instead of raw strings.
X = vectorozior.transform(mail_data['Message'])

In [41]:
# Print the sparse TF-IDF matrix as "(row, column)  value" entries
print(X)

  (0, 8544)	0.22081883351949952
  (0, 8320)	0.18240101628302693
  (0, 8080)	0.2300034410835773
  (0, 7690)	0.15550627816331297
  (0, 5954)	0.25533539230157154
  (0, 5567)	0.156366230319576
  (0, 4497)	0.2757872678027423
  (0, 4370)	0.32645117023873077
  (0, 4110)	0.10707657674366398
  (0, 3651)	0.1803318063070826
  (0, 3611)	0.15304155020494287
  (0, 3567)	0.14786475068980162
  (0, 2334)	0.25281395947472957
  (0, 2057)	0.2757872678027423
  (0, 1763)	0.2757872678027423
  (0, 1761)	0.31163292870610654
  (0, 1313)	0.24417482890859654
  (0, 1079)	0.32645117023873077
  (1, 8446)	0.43162957585464123
  (1, 5563)	0.5466243141314314
  (1, 5534)	0.2718944069420321
  (1, 4533)	0.4083258549263009
  (1, 4338)	0.5236804332035243
  (2, 8502)	0.18684640809270722
  (2, 8459)	0.14485563808749205
  :	:
  (5570, 7095)	0.2053833705586015
  (5570, 7085)	0.1843030791950225
  (5570, 5363)	0.21007771328115263
  (5570, 4638)	0.15952002662071876
  (5570, 4241)	0.12205359146631063
  (5570, 4184)	0.282975003903785

## Split the data into train & test

In [47]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [48]:
# Report full / train / test shapes: first for the features, then for the labels
for whole, train_part, test_part in ((X, X_train, X_test), (Y, Y_train, Y_test)):
    print(whole.shape, train_part.shape, test_part.shape)

(5572, 8709) (4457, 8709) (1115, 8709)
(5572,) (4457,) (1115,)


# Train the model

## Logistic Regression

In [49]:
# Logistic regression classifier with scikit-learn's default hyperparameters
model = LogisticRegression()

In [51]:
# Fit the classifier on the training split
model.fit(X_train,Y_train)

### Evaluating the trained model

In [52]:
# Accuracy on the training data: fraction of training messages whose
# predicted label matches the true label
accuracy_score(Y_train, model.predict(X_train))

0.974646623289208

In [53]:
# Accuracy on the held-out test data: fraction of test messages whose
# predicted label matches the true label
accuracy_score(Y_test, model.predict(X_test))

0.9551569506726457

In [67]:
# Message to classify, kept in a list because it is passed to transform() as a batch
input_data = [
    "URGENT, IMPORTANT INFORMATION FOR O2 USER. TODAY IS YOUR LUCKY DAY! 2 FIND OUT WHY LOG ONTO HTTP://WWW.URAWINNER.COM THERE IS A FANTASTIC SURPRISE AWAITING FOR YOU",
]

# Turn the raw text into TF-IDF features with the already-fitted vectorizer
input_data_vectrozior = vectorozior.transform(input_data)

# Predict the label and show the raw prediction array
prediction = model.predict(input_data_vectrozior)
print(prediction)

# 0 is reported as ham; any other value is reported as spam
print('Ham Mail' if prediction[0] == 0 else 'Spam Mail')

[1]
Spam Mail
