In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy,math

The last column of this dataset denotes whether the e-mail was 
considered spam (1) or not (0).  
Most of the attributes indicate whether a particular word or
character was frequently occuring in the e-mail

We will be using this dataset to train a model to make predictions if an email is consided spam or not when presennted with these atributes.

In [2]:
df=pd.read_csv('spambase.data')
df.head()

Unnamed: 0,0,0.64,0.64.1,0.1,0.32,0.2,0.3,0.4,0.5,0.6,...,0.41,0.42,0.43,0.778,0.44,0.45,3.756,61,278,1
0,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
1,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
2,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,...,0.0,0.223,0.0,0.0,0.0,0.0,3.0,15,54,1


# Defining values

In [3]:
X_train=df.drop(columns=['1'])
y_train=df['1']
y_train

0       1
1       1
2       1
3       1
4       1
       ..
4595    0
4596    0
4597    0
4598    0
4599    0
Name: 1, Length: 4600, dtype: int64

In [4]:
X_train=np.array(X_train)
print(y_train.shape)
print(X_train.shape)

(4600,)
(4600, 57)


# Functions

In [5]:
#sigmoid function for computing cost for logistic regression
def sigmoid(z): 
    z=np.clip(z,-500,500)
    g=1/(1+np.exp(-z))
    return g

In [6]:
# Feature scaling function to make sure gradient decent runs smoothly
def feature_scaling(x):
    mu=np.mean(x,axis=0)
    sigma=np.std(x,axis=0)
    x_scaled=(x-mu)/sigma
    return x_scaled,mu,sigma
    

In [7]:
#Computing cost for logistic regression using vectorization
def compute_cost(x,y,w,b):
    m=x.shape[0]
    z=x @ w+b
    fwb=sigmoid(z)
    fwb=np.clip(fwb,1e-15,1-1e-15)
    cost=(-1/m)*np.sum(y*np.log(fwb)+(1-y)*np.log(1-fwb))
    return cost

In [8]:
# Computing gradient for logistic regression
def compute_gradient(x,y,w,b):
    m=x.shape[0]
    z=x @ w+b
    fwb=sigmoid(z)
    fwb=np.clip(fwb,1e-15,1-1e-15)
    err=fwb-y
    dj_dw=(1/m)*x.T @ err 
    dj_db=(1/m)*np.sum(err)

    return dj_dw,dj_db

In [9]:
#Final Gradient descent function for finding value of w and b
def gradient_decent(x,y,w_in,b_in,alpha,itr,compute_cost,compute_gradient):
    w=np.copy(w_in)
    b=b_in
    J_hist=[compute_cost(x,y,w,b)] #list for storing cost at each iteration
    print(f'Iteration :    0: Cost: {J_hist[-1]:8.4f}')

    for i in range (itr):
        #main formula for gradient descent
        dj_dw,dj_db=compute_gradient(x,y,w,b)
        w-=alpha*dj_dw
        b-=alpha*dj_db

        if i<10000: #appending J_hist 
            J_hist.append(compute_cost(x,y,w,b))

        #Printing iteration and cost at 10% iteration
        if (i+1)% max(1,itr//10)==0:
            print(f'Iteration : {i+1:4d}: Cost: {J_hist[-1]:8.4f}')

    return w,b,J_hist

# Running Gradient Descent for logistic regression

In [10]:
#Specifying required values for gradient decent
n=X_train.shape[1]
alpha=10
itr=5000
X_scaled,mu,sigma=feature_scaling(X_train)
w_in=np.zeros((n,))
b_in=0.0

#running gradient descent
w_final,b_final,J_hist=gradient_decent(X_scaled,y_train,w_in,b_in,alpha,itr,compute_cost,compute_gradient)

Iteration :    0: Cost:   0.6931
Iteration :  500: Cost:   0.2029
Iteration : 1000: Cost:   0.2015
Iteration : 1500: Cost:   0.2007
Iteration : 2000: Cost:   0.2002
Iteration : 2500: Cost:   0.1998
Iteration : 3000: Cost:   0.1995
Iteration : 3500: Cost:   0.1993
Iteration : 4000: Cost:   0.1991
Iteration : 4500: Cost:   0.1989
Iteration : 5000: Cost:   0.1988


# Model evaluation

In [11]:
#function for testing model predictions
def predict(x,w_final,b_final):
    z=x@w_final+b_final
    fwb=sigmoid(z)
    predictions=(fwb>=0.51).astype(int)  #stores prediction values as 0 or 1
    labels=np.where(predictions==1,'Spam','Not Spam') #labels store Spam or Not spam

    return predictions,labels


In [12]:
# testing model accuracy
preds,lab=predict(X_scaled,w_final,b_final)
accuracy = np.mean(preds == y_train) * 100
print(f'Model acucuracy= {accuracy}%')

Model acucuracy= 93.21739130434783%


In [18]:
# Comparing some model predictions with actual vlaue in dataset

X_test=(X_train[0:3000:100,:]-mu)/sigma # Getting test data for model evaluation while feature scaling it
y_test=y_train[0:3000:100]


predictions,labels=predict(X_test,w_final,b_final)
y_labels=np.where(y_test==1,'Spam','Not Spam')

indexes=np.random.choice(len(labels),10,replace=False)
for i in indexes:
    print(f'Model prediction: {labels[i]:<10} : Actual value of y: {y_labels[i]:<10}')

Model prediction: Not Spam   : Actual value of y: Not Spam  
Model prediction: Not Spam   : Actual value of y: Not Spam  
Model prediction: Spam       : Actual value of y: Spam      
Model prediction: Spam       : Actual value of y: Spam      
Model prediction: Spam       : Actual value of y: Spam      
Model prediction: Spam       : Actual value of y: Spam      
Model prediction: Not Spam   : Actual value of y: Not Spam  
Model prediction: Spam       : Actual value of y: Spam      
Model prediction: Not Spam   : Actual value of y: Not Spam  
Model prediction: Not Spam   : Actual value of y: Not Spam  
