In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
# Read the mail dataset from the CSV file into a pandas DataFrame
raw_mail_data = pd.read_csv(filepath_or_buffer='./data/mail_data.csv')

In [5]:
# Preview the first five rows of the raw data as plain text
print(raw_mail_data.head(n=5))

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
# Keep every non-null cell as it is and substitute an empty string for each null cell.
# The result is a new frame; raw_mail_data itself is left untouched.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [10]:
# Show the first five rows of the cleaned data
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Check the dimensions of the cleaned data as a (rows, columns) tuple
mail_data.shape

(5572, 2)

# Label encoding

In [23]:
# Encode the text labels in place as numbers: 'spam' -> 0 and 'ham' -> 1.
# Rows whose Category is neither value are left unchanged.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

In [35]:
# Separate the data into the feature column (message text) and the target column (label)
X = mail_data['Message']
Y = mail_data['Category']

# Splitting Data into Test and Train

In [42]:
# Hold out 20% of the messages for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [43]:
# Show how many messages ended up in the training set and in the test set
print(X_train.shape, X_test.shape)

(4457,) (1115,)


# Feature Extraction

In [47]:
# Transform the text data into TF-IDF feature vectors used as input to the ML model.
# Each word is weighted by how often it occurs in a message, scaled down for words
# that occur in many messages.
# min_df=1 is a document-frequency cutoff: a word is kept if it appears in at least
# one document, so nothing is dropped for being rare (a larger value would drop rare words).
# lowercase must be the boolean True; the string 'True' only worked by being truthy and
# is rejected by the parameter validation in newer scikit-learn releases.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Learn the vocabulary and IDF weights from the training data only, then vectorize it
X_train_features = feature_extraction.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the test data; it must not be refitted here
X_test_features = feature_extraction.transform(X_test)

# Cast the encoded labels to integers so the classifier receives a numeric target
Y_train = Y_train.astype("int")
Y_test = Y_test.astype("int")

In [50]:
# Dimensions of the training feature matrix: (number of messages, number of vocabulary terms)
print(X_train_features.shape)

(4457, 7431)


# Training

In [51]:
# Create a logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()
# Fit the classifier on the TF-IDF training features and their integer labels
model.fit(X=X_train_features, y=Y_train)

# Evaluation of Model

In [60]:
# Predict on the same data the model was trained on and measure the fraction classified correctly
prediction_on_train_data = model.predict(X_train_features)
accuracy_on_train_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_train_data,
)

In [61]:
# Accuracy on the training set
print(accuracy_on_train_data)

0.9670181736594121


In [62]:
# Predict on the held-out test data and measure the fraction classified correctly
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [63]:
# Accuracy on the held-out test set
print(accuracy_on_test_data)

0.9659192825112107


# Building a predictive system

In [66]:
# Classify one example message. The vectorizer expects an iterable of documents,
# so the single message is wrapped in a list.
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_feature_mail = feature_extraction.transform(input_mail)
# The printed label follows the encoding used for training: 0 = spam, 1 = ham
prediction = model.predict(input_feature_mail)
print(prediction)

[0]
