# 1. IMPORTING THE DEPENDENCIES


In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 2. DATA COLLECTION AND PREPROCESSING


In [5]:
# Loading data from csv to a dataframe

raw_data = pd.read_csv('C:/Users/artur/Documents/Proyectos ML/Email Classification/mail_data.csv')

In [6]:
print(raw_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name
5572     spam  Get Rich Quick! Dear Friend, Do you want to ma...

[5573 rows x 2 columns]


In [7]:
# Replace none values with null

mail_data = raw_data.where((pd.notnull(raw_data)),'')

In [8]:
# Print mail_data

mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Number of rows and columns

mail_data.shape

(5573, 2)

In [10]:
# Changing category column

mail_data.loc[mail_data['Category'] == 'spam', 'Category',] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category',] = 1
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Separating data as texts and label

X = mail_data['Message']
Y = mail_data['Category']

In [12]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
5572    Get Rich Quick! Dear Friend, Do you want to ma...
Name: Message, Length: 5573, dtype: object


In [13]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5568    1
5569    1
5570    1
5571    1
5572    0
Name: Category, Length: 5573, dtype: object


# 3. TRAIN AND TEST DATA SPLITTING

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [15]:
print(X_test.shape)
print(X_train.shape)

(1115,)
(4458,)


# 4.FEATURE EXTRACTION

In [16]:
# Transform the text to feature vectors to feed the Logistic Regression Model

feature_extraction = TfidfVectorizer(min_df= 1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [17]:
print(X_train_features)

  (0, 4965)	0.35220359469390916
  (0, 6895)	0.4361087948626689
  (0, 5462)	0.4851901905768957
  (0, 4186)	0.30941615643448916
  (0, 2348)	0.2803506193722415
  (0, 1148)	0.39415619477828906
  (0, 1699)	0.3473590251726336
  (1, 1308)	0.7209587932301342
  (1, 6594)	0.5184874138571819
  (1, 4186)	0.45977083436842753
  (2, 2640)	0.7129453814664571
  (2, 6611)	0.4741023508003651
  (2, 4734)	0.5166583435997288
  (3, 3871)	0.26059048379398303
  (3, 6265)	0.279243602779787
  (3, 64)	0.26059048379398303
  (3, 3284)	0.16368875311207728
  (3, 7258)	0.18384682568758393
  (3, 6336)	0.26059048379398303
  (3, 604)	0.26059048379398303
  (3, 6824)	0.269126025971473
  (3, 6024)	0.26059048379398303
  (3, 812)	0.22771984421282773
  (3, 1577)	0.37248343030295744
  (3, 2884)	0.12645839493186312
  :	:
  (4455, 3161)	0.3210079348827513
  (4455, 4614)	0.2670844787478871
  (4456, 2270)	0.37039638825745896
  (4456, 5659)	0.3322719488843572
  (4456, 6817)	0.3018858737530533
  (4456, 2488)	0.35911018364386516
  (44

# 5. TRAINING AND EVALUATING THE MODEL

In [18]:
model = LogisticRegression()

In [19]:
# Training the model
model.fit(X_train_features, Y_train)

In [20]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [21]:
print('Accuracy on traning data: ', accuracy_on_training_data)

Accuracy on traning data:  0.9679228353521758


In [22]:
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [23]:
print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.9668161434977578


# 6. BUILDING A PREDICTIVE SYSTEM

In [47]:
input_mail = ["This msg is for your mobile content order It has been resent as previous attempt failed due to network error Queries to customersqueries@netvision.uk.com"]

# Covert text to feature vector

input_data_features = feature_extraction.transform(input_mail)

# Making prediction

Prediction = model.predict(input_data_features)
print(Prediction)

if (Prediction[0]==1):
    print('Ham mail')

else:
    print('Spam mail')

[0]
Spam mail
