
## 1: Transform the data


In [145]:
#import statements
import numpy as np
import pandas as pd
import copy
import math

#create a path to the file

file_path = "shopping.csv"

#read the csv file a pandas dataframe
df = pd.read_csv(file_path)

#print the dataframe column data types
df.dtypes


Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

In [146]:
 #show first 10 data lines
df.head(10)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,3,142.5,0,0.0,48,1052.255952,0.004348,0.013043,0.0,0.0,Nov,1,8,6,11,Returning_Visitor,False,False
1,6,437.391304,2,235.55,83,2503.881781,0.002198,0.004916,2.086218,0.0,Mar,2,2,3,2,Returning_Visitor,False,True
2,1,41.125,0,0.0,126,4310.004668,0.000688,0.012823,3.451072,0.0,Nov,2,2,2,2,Returning_Visitor,False,False
3,2,141.0,0,0.0,10,606.666667,0.008333,0.026389,36.672294,0.0,Aug,2,5,7,4,Returning_Visitor,False,False
4,18,608.14,6,733.8,168,4948.398759,0.006632,0.013528,10.150644,0.0,Aug,2,2,3,1,Returning_Visitor,True,False
5,1,22.0,0,0.0,9,415.25,0.033333,0.048148,0.0,0.0,Mar,3,3,1,1,Returning_Visitor,False,False
6,0,0.0,0,0.0,14,186.933333,0.042857,0.071429,0.0,0.0,May,2,2,3,4,Returning_Visitor,False,False
7,0,0.0,0,0.0,12,198.0,0.016667,0.075,0.0,0.0,Mar,2,2,3,2,Returning_Visitor,False,False
8,8,149.5,0,0.0,55,2598.991667,0.003279,0.008197,48.729956,0.0,May,2,4,8,2,Returning_Visitor,True,False
9,6,140.333333,0,0.0,9,88.95,0.0,0.004762,0.0,0.0,May,3,2,2,3,New_Visitor,False,False


In [147]:
#define a mapping scheme for the values not a number
bool_mapping={
    "TRUE":1,
    "FALSE":0
}

visitor_mapping={
    "Returning_Visitor":1,
    "New_Visitor":2,
    "Other":3
}

month_mapping={
    "Jan":1,
    "Feb":2,
    "Mar":3,
    "Apr":4,
    "May":5,
    "June":6,
    "Jul":7,
    "Aug":8,
    "Sep":9,
    "Oct":10,
    "Nov":11,
    "Dec":12
}

In [148]:

df['Revenue']=df['Revenue'].astype(str)
df['Weekend']=df['Weekend'].astype(str)
df['Revenue'] = df['Revenue'].str.upper()
df['Weekend'] = df['Weekend'].str.upper()

In [150]:
df['Month'] = df['Month'].replace(month_mapping)
df['VisitorType'] = df['VisitorType'].replace(visitor_mapping)
df['Revenue'] = df['Revenue'].replace(bool_mapping)  
df['Weekend'] = df['Weekend'].replace(bool_mapping)


In [151]:
def load_data(df):
    # Ensure the DataFrame is already transformed (with all floats and ints)
    
    # Split the data into features (X) and target (y)
    X = df.iloc[:, :-1].values  # All columns except the last
    y = df.iloc[:, -1].values    # The last column
    
    return X, y

## 2: Split The data

In [152]:
#Load the data in and save the mappings we did
X_train, y_train = load_data(df)
print(type(X_train))

print(X_train.shape)
print(y_train.shape)

<class 'numpy.ndarray'>
(5000, 17)
(5000,)


## 3: Implementation of Scalers


In [153]:
#MinMax Scaling
def min_max_scaling(data):
    scaled_data = data.copy()
    
    if len(data.shape) == 1:  # 1D array (like y_train)
        min_value = data.min()
        max_value = data.max()
        scaled_data = (data - min_value) / (max_value - min_value)
    
    else:  # 2D array (like X_train)
        for i in range(data.shape[1]):
            min_value = data[:, i].min()
            max_value = data[:, i].max()
            scaled_data[:, i] = (data[:, i] - min_value) / (max_value - min_value)
    
    return scaled_data


In [154]:
#Mean Normalization Scaling
def mean_normalization(data):
    scaled_data = data.copy()
    
    if len(data.shape) == 1:  # Handle 1D array like y_train
        mean_value = data.mean()
        min_value = data.min()
        max_value = data.max()
        scaled_data = (data - mean_value) / (max_value - min_value)
    
    else:  # Handle 2D array like X_train
        for i in range(data.shape[1]):  # Loop through columns using NumPy's indexing
            mean_value = data[:, i].mean()
            min_value = data[:, i].min()
            max_value = data[:, i].max()
            scaled_data[:, i] = (data[:, i] - mean_value) / (max_value - min_value)
    
    return scaled_data


In [155]:
#Z-score normalization
def z_score_normalization(data):
    scaled_data = data.copy()
    
    if len(data.shape) == 1:  # Handle 1D array like y_train
        mean_value = data.mean()
        std_value = data.std()
        scaled_data = (data - mean_value) / std_value
    
    else:  # Handle 2D array like X_train
        for i in range(data.shape[1]):  # Loop through columns using NumPy's indexing
            mean_value = data[:, i].mean()
            std_value = data[:, i].std()
            scaled_data[:, i] = (data[:, i] - mean_value) / std_value
    
    return scaled_data


## 3.1: Initalize the Scalers to Data

In [156]:
#implement the Min-Max scale
X_train_minmax = min_max_scaling(X_train)
y_train_minmax = min_max_scaling(y_train)


In [157]:
#implement the normalization scale
X_train_normal = mean_normalization(X_train)
y_train_normal = mean_normalization(y_train)

In [158]:
#implement z_score_normal
X_train_Z = z_score_normalization(X_train)
y_train_Z = z_score_normalization(y_train)

## 4: Logistic Regression Model

In [159]:
#Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [160]:
#Compute Cost Function
def compute_cost(X, y, w, b, *argv):
    """
    Computes the cost over all examples
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (ndarray Shape (m,)) target value 
      w : (ndarray Shape (n,)) values of parameters of the model      
      b : (scalar) value of bias parameter of the model
      *argv : unused, for compatibility with regularized version below
    Returns:
      total_cost : (scalar) cost 
    """
    m, n = X.shape
    z = np.dot(X, w) + b
    h = sigmoid(z)
    
    # Compute cost
    cost = -(1/m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
    
    return cost


In [161]:
m,n = X_train.shape

initial_w = np.zeros(X_train.shape[1])  # Initialize weights for the number of features
initial_b = 0.
cost = compute_cost(X_train,y_train, initial_w, initial_b)
print('cost at initial w and b (zeros) {:.3f}'.format(cost))

cost at initial w and b (zeros) 0.693


In [162]:
def compute_gradient(X, y, w, b, *argv):
    """
    Computes the gradient for logistic regression 
 
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (ndarray Shape (m,))  target value 
      w : (ndarray Shape (n,))  values of parameters of the model      
      b : (scalar)              value of bias parameter of the model
      *argv : unused, for compatibility with regularized version below
    Returns
      dj_dw : (ndarray Shape (n,)) The gradient of the cost w.r.t. the parameters w. 
      dj_db : (scalar)             The gradient of the cost w.r.t. the parameter b. 
    """
    m, n = X.shape
    z = np.dot(X, w) + b
    h = sigmoid(z)
    
    dj_dw = (1/m) * np.dot(X.T, (h - y))
    dj_db = (1/m) * np.sum(h - y)
    
    return dj_dw, dj_db

In [163]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_):
    """
    Performs batch gradient descent to learn theta. Updates theta by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X :    (ndarray Shape (m, n) data, m examples by n features
      y :    (ndarray Shape (m,))  target value 
      w_in : (ndarray Shape (n,))  Initial values of parameters of the model
      b_in : (scalar)              Initial value of parameter of the model
      cost_function :              function to compute cost
      gradient_function :          function to compute gradient
      alpha : (float)              Learning rate
      num_iters : (int)            number of iterations to run gradient descent
      lambda_ : (scalar, float)    regularization constant
      
    Returns:
      w : (ndarray Shape (n,)) Updated values of parameters of the model after
          running gradient descent
      b : (scalar)                Updated value of parameter of the model after
          running gradient descent
    """    
    m = len(X)
    J_history = []
    w_history = []
    
    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(X, y, w_in, b_in, lambda_)
        w_in = w_in - alpha * dj_dw
        b_in = b_in - alpha * dj_db
        
        if i < 100000:
            cost = cost_function(X, y, w_in, b_in, lambda_)
            J_history.append(cost)
        
        if i % math.ceil(num_iters / 10) == 0 or i == (num_iters - 1):
            w_history.append(w_in)
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}")
    
    return w_in, b_in, J_history, w_history

## 5: Training The Data

In [164]:
#Training with NO Scaling
initial_w = np.zeros(X_train.shape[1])  # or use small random values
initial_b = 0.0

iterations = 15000
alpha = 0.0001

w, b, J_history, _ = gradient_descent(X_train, y_train, initial_w, initial_b, compute_cost, compute_gradient, alpha, iterations, 0)

  return 1 / (1 + np.exp(-z))
  cost = -(1/m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  cost = -(1/m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))


Iteration    0: Cost      nan
Iteration 1500: Cost      nan
Iteration 3000: Cost      nan
Iteration 4500: Cost      nan
Iteration 6000: Cost      nan
Iteration 7500: Cost      nan
Iteration 9000: Cost      nan
Iteration 10500: Cost      nan
Iteration 12000: Cost      nan
Iteration 13500: Cost      nan
Iteration 14999: Cost      nan


In [165]:
#Training with MINMAX
initial_w = np.zeros(X_train_minmax.shape[1])  # or use small random values
w_random = np.random.randn(X_train_minmax.shape[1]) * 0.01

initial_b = 0.0

iterations = 30000
alpha = 0.01

w, b, J_history, _ = gradient_descent(X_train_minmax, y_train_minmax,initial_w, initial_b, compute_cost, compute_gradient, alpha, iterations, 0)

Iteration    0: Cost     0.69
Iteration 3000: Cost     0.41
Iteration 6000: Cost     0.40
Iteration 9000: Cost     0.39
Iteration 12000: Cost     0.38
Iteration 15000: Cost     0.38
Iteration 18000: Cost     0.38
Iteration 21000: Cost     0.37
Iteration 24000: Cost     0.37
Iteration 27000: Cost     0.37
Iteration 29999: Cost     0.36


In [166]:
#trainng with normalization
#Training with MINMAX
initial_w = np.zeros(X_train_normal.shape[1])  # or use small random values
initial_b = 0.0

iterations = 30000
alpha = 0.001

w, b, J_history, _ = gradient_descent(X_train_normal, y_train_normal,initial_w, initial_b, compute_cost, compute_gradient, alpha, iterations, 0)

Iteration    0: Cost     0.69
Iteration 3000: Cost     0.29
Iteration 6000: Cost     0.17
Iteration 9000: Cost     0.11
Iteration 12000: Cost     0.08
Iteration 15000: Cost     0.06
Iteration 18000: Cost     0.04
Iteration 21000: Cost     0.03
Iteration 24000: Cost     0.02
Iteration 27000: Cost     0.01
Iteration 29999: Cost     0.01


In [167]:
#Training with z-score normal
initial_w = np.zeros(X_train_Z.shape[1])
initial_b = 0.0

iterations = 30000
alpha = 0.0002

w, b, J_history, _ = gradient_descent(X_train_Z, y_train_Z,initial_w, initial_b, compute_cost, compute_gradient, alpha, iterations, 0)

Iteration    0: Cost     0.69
Iteration 3000: Cost     0.37
Iteration 6000: Cost     0.14
Iteration 9000: Cost    -0.04
Iteration 12000: Cost    -0.20
Iteration 15000: Cost    -0.33
Iteration 18000: Cost    -0.45
Iteration 21000: Cost    -0.57
Iteration 24000: Cost    -0.67
Iteration 27000: Cost    -0.78
Iteration 29999: Cost    -0.87


## 6: Get Predictions and Accuracys

In [168]:

def predict(X, w, b): 
    """
    Predict whether the label is 0 or 1 using learned logistic
    regression parameters w
    
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      w : (ndarray Shape (n,))  values of parameters of the model      
      b : (scalar)              value of bias parameter of the model

    Returns:
      p : (ndarray (m,)) The predictions for X using a threshold at 0.5
    """
    # number of training examples
    m, n = X.shape   
    p = np.zeros(m)
   
    ### START CODE HERE ### 
    # Loop over each example
    for i in range(m):   
        z_wb = np.dot(X[i],w) + b
        
        # Calculate the prediction for this example
        f_wb = sigmoid(z_wb)

        # Apply the threshold
        if f_wb >= 0.5:
            p[i] = 1
        else:
            p[i] = 0
        
        
    ### END CODE HERE ### 
    return p

In [169]:
#MINMAX ACCURACY
p = predict(X_train_minmax, w,b)
print('Train Accuracy: %f'%(np.mean(p == y_train) * 100))

Train Accuracy: 84.680000


In [170]:
#NORMALIZED ACCURACY
p = predict(X_train_normal, w,b)
print('Train Accuracy: %f'%(np.mean(p == y_train) * 100))

Train Accuracy: 84.640000


In [171]:
#Z score accuracy
p = predict(X_train_Z, w,b)
print('Train Accuracy: %f'%(np.mean(p == y_train) * 100))

Train Accuracy: 87.500000


In [172]:
#No scaler
p = predict(X_train, w,b)
print('Train Accuracy: %f'%(np.mean(p == y_train) * 100))

Train Accuracy: 17.060000


# Regularized Logistical Regression

In [173]:
def compute_cost_reg(X, y, w, b, lambda_ = 1):
    """
    Computes the cost over all examples
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (ndarray Shape (m,))  target value
      w : (ndarray Shape (n,))  values of parameters of the model
      b : (scalar)              value of bias parameter of the model
      lambda_ : (scalar, float) Controls amount of regularization
    Returns:
      total_cost : (scalar)     cost 
    """
    m, n = X.shape
    z = np.dot(X, w) + b
    h = sigmoid(z)
    
    cost = -(1/m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
    regularization = (lambda_ / (2*m)) * np.sum(w**2)
    
    return cost + regularization

In [174]:
#gradient for regularized
def compute_gradient_reg(X, y, w, b, lambda_):
    """
    Computes the gradient for logistic regression with regularization
 
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (ndarray Shape (m,))  target value 
      w : (ndarray Shape (n,))  values of parameters of the model      
      b : (scalar)              value of bias parameter of the model
      lambda_ : (scalar,float)  regularization constant
    Returns
      dj_db : (scalar)             The gradient of the cost w.r.t. the parameter b. 
      dj_dw : (ndarray Shape (n,)) The gradient of the cost w.r.t. the parameters w. 

    """
    m, n = X.shape
    z = np.dot(X, w) + b
    h = sigmoid(z)
    
    dj_dw = (1/m) * np.dot(X.T, (h - y)) + (lambda_ / m) * w
    dj_db = (1/m) * np.sum(h - y)
    
    return dj_dw, dj_db

In [175]:
def gradient_descent_reg(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_):
    """
    Performs batch gradient descent to learn theta. Updates theta by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X :    (ndarray Shape (m, n) data, m examples by n features
      y :    (ndarray Shape (m,))  target value 
      w_in : (ndarray Shape (n,))  Initial values of parameters of the model
      b_in : (scalar)              Initial value of parameter of the model
      cost_function :              function to compute cost
      gradient_function :          function to compute gradient
      alpha : (float)              Learning rate
      num_iters : (int)            number of iterations to run gradient descent
      lambda_ : (scalar, float)    regularization constant
      
    Returns:
      w : (ndarray Shape (n,)) Updated values of parameters of the model after
          running gradient descent
      b : (scalar)                Updated value of parameter of the model after
          running gradient descent
    """
    
    
    w = w_in
    b = b_in
    J_history = []
    
    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(X, y, w, b, lambda_)
        
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        
        cost = cost_function(X, y, w, b, lambda_)
        J_history.append(cost)
        
        if i % 1000 == 0 or i == num_iters - 1:
            print(f"Iteration {i:5d}: Cost {cost:.6f}")
    
    return w, b, J_history

In [176]:
initial_w = np.zeros(X_train_Z.shape[1])
initial_b = 0.0
iterations = 40000
alpha = 0.008  # Slightly higher than original
lambda_ = 0.1  # Decreased from 0.5
w2, b2, J_history2 = gradient_descent_reg(X_train_Z, y_train, initial_w, initial_b, compute_cost_reg, compute_gradient_reg, alpha, iterations, lambda_)


Iteration     0: Cost 0.691743
Iteration  1000: Cost 0.343076
Iteration  2000: Cost 0.310951
Iteration  3000: Cost 0.302496
Iteration  4000: Cost 0.299318
Iteration  5000: Cost 0.297896
Iteration  6000: Cost 0.297189
Iteration  7000: Cost 0.296811
Iteration  8000: Cost 0.296596
Iteration  9000: Cost 0.296466
Iteration 10000: Cost 0.296385
Iteration 11000: Cost 0.296332
Iteration 12000: Cost 0.296296
Iteration 13000: Cost 0.296270
Iteration 14000: Cost 0.296252
Iteration 15000: Cost 0.296238
Iteration 16000: Cost 0.296228
Iteration 17000: Cost 0.296220
Iteration 18000: Cost 0.296214
Iteration 19000: Cost 0.296209
Iteration 20000: Cost 0.296205
Iteration 21000: Cost 0.296203
Iteration 22000: Cost 0.296200
Iteration 23000: Cost 0.296198
Iteration 24000: Cost 0.296197
Iteration 25000: Cost 0.296196
Iteration 26000: Cost 0.296195
Iteration 27000: Cost 0.296194
Iteration 28000: Cost 0.296194
Iteration 29000: Cost 0.296193
Iteration 30000: Cost 0.296193
Iteration 31000: Cost 0.296192
Iteratio

In [177]:
p = predict(X_train_Z, w, b)

print('Train Accuracy: %f'%(np.mean(p == y_train) * 100))

Train Accuracy: 87.500000
