#### Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Data Collection

In [2]:
# Loading the dataset into a Pandas DataFrame
mail_dataset = pd.read_csv("spam.csv", encoding='ISO-8859-1')

In [3]:
# Getting the first five rows of the dataset
mail_dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keeping the relevant columns
mail_dataset = mail_dataset[["v1", "v2"]]

In [5]:
# Renaming the columns
mail_dataset.columns = ["Category", "Message"]

In [6]:
# Printing the dataset
mail_dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Getting some basic info about the dataset
mail_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# Checking for missing values
mail_dataset.isnull().sum()

Category    0
Message     0
dtype: int64

In [9]:
# Checking the number of rows and columns in the dataset
mail_dataset.shape

(5572, 2)

#### Label Encoding

In [10]:
# Getting the number of values in the "Category" column
mail_dataset["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [11]:
# Performing Label Encoding
mail_dataset["Category"] = mail_dataset["Category"].map({"ham": 0, "spam": 1})
mail_dataset.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


#### Training and Test data

In [12]:
# Separating the data into feature and label
X = mail_dataset["Message"]
Y = mail_dataset["Category"]

In [13]:
# Using the Tfidf Vectorizer
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english")

In [14]:
# Transforming the feature data into feature vectors to be used as an input to the Logistic Regression model
X = feature_extraction.fit_transform(X)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 43478 stored elements and shape (5572, 8404)>
  Coords	Values
  (0, 4224)	0.3509649021061901
  (0, 5741)	0.2745089285415426
  (0, 2271)	0.27179815735762314
  (0, 1271)	0.2625103008882829
  (0, 1703)	0.2964965675440533
  (0, 3534)	0.19387320529717864
  (0, 8227)	0.23740046706740073
  (0, 4349)	0.2964965675440533
  (0, 1701)	0.33503393550839805
  (0, 1994)	0.2964965675440533
  (0, 3494)	0.16470488207184114
  (0, 1051)	0.3509649021061901
  (0, 8026)	0.19609779550499865
  (1, 5343)	0.27211951321382544
  (1, 4385)	0.4082988561907181
  (1, 4192)	0.5236458071582338
  (1, 8134)	0.4316010362639011
  (1, 5369)	0.5465881710238072
  (2, 3265)	0.11668290849577327
  (2, 2875)	0.36416076721341406
  (2, 8185)	0.19275202100567362
  (2, 2110)	0.1967393609735806
  (2, 8146)	0.14982929655580285
  (2, 3005)	0.475194304498231
  (2, 2329)	0.2069855878698051
  :	:
  (5567, 309)	0.2429849422825127
  (5567, 700)	0.24985589469349212
  (5567, 5801)	0.2

In [15]:
print(Y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64


In [16]:
# Splitting the features into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [17]:
print(X.shape, X_train.shape, X_test.shape)

(5572, 8404) (4457, 8404) (1115, 8404)


#### Training the model

In [18]:
model = LogisticRegression()

# Training the model
model.fit(X_train, Y_train)

#### Model Evaluation

In [19]:
# Prediction on the training data
train_pred = model.predict(X_train)

In [20]:
# Accuracy score
train_accuracy = metrics.accuracy_score(Y_train, train_pred)
print(f"Accuracy score of training data: {train_accuracy}")

# R squared error
train_r2score = metrics.r2_score(Y_train, train_pred)
print(f"R squared error of training data: {train_r2score}")

# Mean squared error
train_mse = metrics.mean_squared_error(Y_train, train_pred)
print(f"Mean squared error of training data: {train_mse}")

Accuracy score of training data: 0.9715054969710568
R squared error of training data: 0.7515467999150225
Mean squared error of training data: 0.028494503028943234


In [21]:
# Prediction on the test data
test_pred = model.predict(X_test)

In [22]:
# Accuracy score
test_accuracy = metrics.accuracy_score(Y_test, test_pred)
print(f"Accuracy score of test data: {test_accuracy}")

# R squared error
test_r2score = metrics.r2_score(Y_test, test_pred)
print(f"R squared error of test data: {test_r2score}")

# Mean squared error
test_mse = metrics.mean_squared_error(Y_test, test_pred)
print(f"Mean squared error of test data: {test_mse}")

Accuracy score of test data: 0.9390134529147982
R squared error of test data: 0.49856487176434805
Mean squared error of test data: 0.06098654708520179


#### Making a Predictive System

In [23]:
# Define input data
input_data = ['''Yup... Ok i go home look at the timings then i msg Ì_ again... Xuhui going to learn on 2nd may too but her lesson is at 8am,,,''']

# Convert text to feature vectors
features = feature_extraction.transform(input_data)

# Making predictions
prediction = model.predict(features)
print(f"Prediction: {prediction}")

# Condition for good prediction
if prediction[0] == 1:
    print("Spam mail")
else:
    print("Ham mail")

Prediction: [0]
Ham mail
