### Importing the dependencies

In [1]:
# Importing all libraries
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Data Collection and Preprocessing

In [2]:
# Read the raw dataset from a CSV file (path is relative to the working directory)
DATA_PATH = 'mail_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Swap every missing (null/NaN) entry for an empty string.
# `where` returns a new frame, so the raw `df` is left untouched.
is_present = df.notna()
mail_data = df.where(is_present, other='')

In [4]:
# Preview the first 5 rows of mail_data (head() defaults to 5 rows)
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Check the size of the dataset as a (rows, columns) tuple
mail_data.shape

(5572, 2)

### Label Encoding

In [6]:
# Label encoding: spam mail -> 0, ham mail -> 1.
#
# The column is rebuilt through a lookup instead of writing integers into the
# existing text column with `.loc[mask, 'Category'] = 0`. Element-wise assignment
# of an int only works while the column has `object` dtype; when pandas stores the
# text with a dedicated string dtype that assignment is rejected. Replacing the
# whole column works for either dtype.
#
# Values that are neither 'spam' nor 'ham' (including 0/1 left by an earlier run
# of this cell) pass through unchanged, so re-running the cell is harmless.
label_codes = {'spam': 0, 'ham': 1}
mail_data['Category'] = (
    mail_data['Category']
    .map(lambda label: label_codes.get(label, label))
    .astype(object)  # keep an object column; labels are cast to int after the split
)

In [7]:
# Separate the data into texts (X) and labels (Y)
X, Y = mail_data['Message'], mail_data['Category']

In [8]:
# Display X (the 'Message' column); pandas truncates long Series in the output
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [9]:
# Display Y (the encoded 'Category' column: 0 = spam, 1 = ham)
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

### Splitting the data into Training data and Test data

In [10]:
# Hold out 20% of the messages as a test set.
# A fixed random_state makes the shuffle (and therefore the split) repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [11]:
# Display the dimensions of the full message Series (a 1-tuple: number of messages)
X.shape

(5572,)

In [12]:
# Display the dimensions of the training split
X_train.shape

(4457,)

In [13]:
# Display the dimensions of the test split
X_test.shape

(1115,)

### Feature Extraction

In [14]:
# Turn the message text into TF-IDF feature vectors.
# The vocabulary and IDF weights are learned from the training split only and then
# reused as-is on the test split, so nothing from the test set shapes the features.
feature_extraction = TfidfVectorizer(
    min_df=1,              # keep terms that appear in at least one document
    stop_words='english',  # use the built-in English stop-word list
    lowercase=True,        # lowercase the text before tokenising
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [15]:
# Cast both label Series to integers
# (the label column is still object dtype after encoding)
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [16]:
# Print the training feature matrix: a sparse matrix, shown as a summary line
# followed by (row, column) coordinates and their TF-IDF weights
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34775 stored elements and shape (4457, 7431)>
  Coords	Values
  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537024
  (1, 2758)	0.3226407885943799
  (1, 1839)	0.2784903590561455
  (1, 918)	0.22871581159877646
  (1, 2746)	0.3398297002864083
  (1, 2957)	0.3398297002864083
  (1, 3325)	0.31610586766078863
  (1, 3185)	0.29694482957694585
  (1, 4080)	0.18880584110891163
  (2, 6601)	0.6056811524587518
  (2, 2404)	0.45287711070606745
  (2, 3156)	0.4107239318312698
  (2, 407)	0.509272536051008
  (3, 7414)	0.8100020912469564
  (3, 2870)	0.5864269879324768
  (4, 2870)	0.41872147309323743
  (4, 487)	0.2899118421746198
  :	:
  (4454, 2855)	0.47210665083641806
  (4454, 2246)	0.47210665083641806
  (4455, 4456)	0.24

In [17]:
# Print the test feature matrix: a sparse matrix, shown as a summary line
# followed by (row, column) coordinates and their TF-IDF weights
print(X_test_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7687 stored elements and shape (1115, 7431)>
  Coords	Values
  (0, 1)	0.2381316303003606
  (0, 9)	0.2852706805264544
  (0, 14)	0.26797874471323896
  (0, 20)	0.30668032384591537
  (0, 306)	0.23975986557206702
  (0, 405)	0.2381316303003606
  (0, 1041)	0.28016206931555726
  (0, 1082)	0.2451068436245027
  (0, 1361)	0.25132445289897426
  (0, 1405)	0.3176863938914351
  (0, 1549)	0.2646498848307188
  (0, 4386)	0.18353336340308998
  (0, 5213)	0.1988547357502182
  (0, 5373)	0.2365698724638063
  (0, 6920)	0.20571591693537986
  (0, 7271)	0.1940327008179069
  (1, 3491)	0.496093956101028
  (1, 4418)	0.3457696891316818
  (1, 4729)	0.22965776503163893
  (1, 6214)	0.3621564482127515
  (1, 6507)	0.26731535902873493
  (1, 6588)	0.3298937975962767
  (1, 6732)	0.42473488678029325
  (1, 7368)	0.29957800964520975
  (2, 201)	0.2824102268489399
  :	:
  (1110, 6591)	0.7327660015422193
  (1111, 2440)	0.4137350055985486
  (1111, 3227)	0.44384935772735

### Training the Model

In [18]:
# Create a logistic-regression classifier with scikit-learn's default hyperparameters
model = LogisticRegression()

In [19]:
# Train the model on the TF-IDF training features and their integer labels
model.fit(X_train_features,Y_train)

### Evaluating the model

In [20]:
# Predict on the data the model was trained on and measure the fraction it labels
# correctly. This is an optimistic figure; compare it with the test accuracy.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [21]:
# Report the training accuracy (a fraction between 0 and 1)
print("Accuracy on Training Data : ", accuracy_on_training_data)

Accuracy on Training Data :  0.9676912721561588


In [22]:
# Predict on the held-out test split and measure the fraction labelled correctly
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [23]:
# Report the test accuracy (a fraction between 0 and 1)
print("Accuracy on Test Data : ", accuracy_on_test_data)

Accuracy on Test Data :  0.9668161434977578


### Building a Predictive System

In [24]:
# Ask the user for the message to classify.
# NOTE: input() blocks until text is entered, so an unattended "Run All" pauses at this cell.
mail = input("Enter mail text : ")

In [25]:
# Vectorise the entered text with the already-fitted TF-IDF vectoriser.
# The text is wrapped in a list because the vectoriser takes a collection of
# documents rather than a single string.
input_mail = [mail]
input_data_features = feature_extraction.transform(raw_documents=input_mail)

In [26]:
# Classify the vectorised message and show the raw predicted label array
prediction = model.predict(input_data_features)
print(prediction)

# Label convention from the encoding step: 1 = ham (not spam), anything else = spam
verdict = "Mail is not spam" if prediction[0] == 1 else "Spam Mail"
print(verdict)

[0]
Spam Mail
