In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer #importing this to convert text data to numbers
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
mail=pd.read_csv("/content/mail_data.csv")

In [3]:
mail.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
#replacing null values with null strings
data=mail.where((pd.notnull(mail)),'')

In [8]:
mail.shape

(5572, 2)

In [9]:
#replacing ham with 0 and spam as 1
#label encoding
mail.loc[mail['Category']=='spam','Category',]=1

In [10]:
mail.loc[mail['Category']=='ham','Category',]=0

In [11]:
mail.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
input=mail['Message']
outcome=mail['Category']

In [13]:
print(input)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [14]:
print(outcome)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: object


In [15]:
X_train,X_test,Y_train,Y_test=train_test_split(input,outcome,test_size=0.2,random_state=2)

Feature extraction

In [16]:
#transform text data to numerical values so that logistic regression can be used
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

Training the model


In [17]:
model=LogisticRegression()

In [18]:
model.fit(X_train_features,Y_train)

LogisticRegression()

In [20]:
modelResults=model.predict(X_train_features)
accuracy=accuracy_score(Y_train,modelResults)
print(accuracy)

0.9683643706529056


In [21]:
modelResults=model.predict(X_test_features)
accuracy=accuracy_score(Y_test,modelResults)
print(accuracy)

0.9524663677130045


Building a predictive system

In [24]:
input_mail = ["You have won 10 crores! Grab on to this offer"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[0]
Spam mail
