In [50]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import category_encoders as ce

In [56]:
'''Data Cleaning'''
# NOTE(review): the CSV path below is absolute and machine-specific; the
# notebook only runs where this exact location exists.
df = (
    pd.read_csv(r"D:\Northeastern Semester 1\Projects\ml_project\Data_Set\Vehicle_Coupon.csv")
    # Remove three columns that are not used by the rest of the notebook.
    .drop(columns=['car', 'direction_same', 'toCoupon_GEQ5min'])
    # Store temperature as text -- presumably so a later pd.get_dummies call
    # treats it as a categorical feature; confirm.
    .assign(temperature=lambda frame: frame['temperature'].astype(str))
    # Fill the gaps of every column with that column's most frequent value.
    .apply(lambda col: col.fillna(col.value_counts().index[0]))
)

In [57]:
'''Create dummies and split data'''
# One-hot encode the cleaned frame, then separate the target column 'Y'
# from the feature columns.
df_ohe = pd.get_dummies(df)
y = df_ohe['Y']
X = df_ohe.drop(columns=['Y'])

# First hold out 30% of the rows for testing, then take 20% of what is left
# for validation.  Both splits are seeded so they are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, random_state=42, test_size=0.2
)

## Logistic regression trained with stochastic gradient descent

In [76]:

class LogisticRegression:
    """Binary logistic regression fitted by gradient descent.

    The instance stores the complete feature matrix and label vector and
    carves train / validation / test partitions out of them on demand (see
    ``splitData``).  The weight vector lives in ``self.w`` once ``fit`` has
    run.  No intercept column is added automatically: ``predict`` evaluates
    ``sigmoid(X.dot(w))`` on the features exactly as given (``add_x0`` is
    available for callers that want an explicit bias column).

    Parameters
    ----------
    X : DataFrame or 2-D array
        Feature matrix, one row per sample.
    y : Series or 1-D array
        Labels; the value 1 is treated as the positive class.
    batchSize : int
        Number of rows used for each weight update in ``stochasticGD``.
    learningRate : float
        Step size applied to the summed (not averaged) gradient.
    tolerance : float
        Early-stopping threshold on the change in cost between iterations.
    maxIteration : int
        Upper bound on the number of weight updates.
    """

    def __init__(self, X, y, batchSize=32, learningRate=0.001, tolerance=0.00005, maxIteration=2000):
        self.X = X
        self.y = y
        self.tolerance = tolerance
        self.maxIteration = maxIteration
        self.learningRate = learningRate
        self.batchSize = batchSize
        # Cost recorded after every update of the most recent training run.
        self.errors = []

    def splitData(self):
        """Partition the stored data into train / validation / test sets.

        30% of the rows are held out for testing and 20% of the remainder
        for validation (56% / 14% / 30% overall).  Both splits use
        ``random_state=42``, so repeated calls return the same partitions.

        Returns
        -------
        tuple
            ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``
        """
        # Split the data handed to the constructor, not whatever `X` / `y`
        # happen to exist in the surrounding notebook namespace.
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.3, random_state=42
        )
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )
        return X_train, y_train, X_valid, y_valid, X_test, y_test

    def add_x0(self, X):
        """Return ``X`` with a leading column of ones (bias feature)."""
        return np.column_stack([np.ones([X.shape[0], 1]), X])

    def sigmoid(self, z):
        """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow."""
        # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))).  logaddexp computes the
        # inner log stably, so large negative z no longer overflows exp().
        return np.exp(-np.logaddexp(0, -z))

    def costFunction(self, X, y):
        """Negative log-likelihood of ``y`` under the current weights.

        Computes ``sum_i [ log(1 + exp(z_i)) - y_i * z_i ]`` with
        ``z = X.dot(w)``.

        Returns
        -------
        float
            The summed (not averaged) cost over all rows of ``X``.
        """
        z = np.asarray(X.dot(self.w), dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)
        # The y*z term must be applied per sample; subtracting the scalar
        # dot product from every element would count it n times.
        return np.sum(np.logaddexp(0, z) - y * z)

    def gradient(self, X, y):
        """Gradient of ``costFunction`` with respect to the weights.

        Returns
        -------
        numpy.ndarray
            ``X.T @ (sigmoid(X w) - y)``, one entry per feature.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)
        residual = self.sigmoid(X.dot(self.w)) - y
        return residual.dot(X)

    def gradientDescent(self, X, y):
        """Full-batch gradient descent on ``(X, y)``, updating ``self.w``.

        Runs at most ``maxIteration`` updates and stops early once the cost
        improves by less than ``tolerance`` (this also stops the run if the
        cost goes up).  The per-iteration cost is kept in ``self.errors``.
        ``self.w`` must be initialised by the caller.
        """
        errors = []
        last = float('inf')

        for _ in range(self.maxIteration):
            self.w = self.w - self.learningRate * self.gradient(X, y)
            curr = self.costFunction(X, y)
            errors.append(curr)

            diff = last - curr
            # Remember this cost so the next iteration measures real progress.
            last = curr

            if diff < self.tolerance:
                print("The model stopped Learning")
                break

        self.errors = errors

    def stochasticGD(self, X, y):
        """Mini-batch stochastic gradient descent on ``(X, y)``.

        Every iteration shuffles the rows, takes the first ``batchSize`` of
        them as the mini-batch, updates ``self.w`` from that batch and then
        measures the cost on the full data.  Training stops early when the
        absolute change in cost falls below ``tolerance``.  The learned
        weights are printed at the end and the cost history is kept in
        ``self.errors``.  ``self.w`` must be initialised by the caller.

        Note: this seeds NumPy's global random generator with 2022.
        """
        X, y = np.array(X, dtype=np.float64), np.array(y, dtype=np.float64)
        # Features and labels are glued together so one shuffle keeps the
        # rows aligned.
        XY = np.c_[X.reshape(X.shape[0], X.shape[1]), y.reshape(X.shape[0], 1)]

        # Set seed
        np.random.seed(2022)
        errors = []
        # Initialised once, outside the loop; resetting it every iteration
        # would make the tolerance check impossible to satisfy.
        last_error = float('inf')

        for _ in tqdm(range(self.maxIteration)):
            # Shuffle x and y together, then use the leading rows as the batch.
            np.random.shuffle(XY)
            X_batch = XY[:self.batchSize, :-1]
            y_batch = XY[:self.batchSize, -1]

            self.w = self.w - self.learningRate * self.gradient(X_batch, y_batch)
            current_error = self.costFunction(X, y)
            errors.append(current_error)

            diff = last_error - current_error
            last_error = current_error

            if np.abs(diff) < self.tolerance:
                print('Model stopped learning')
                break

        self.errors = errors
        print(self.w)
        #self.plot_rmse(errors)

    def predict(self, X):
        """Return hard 0/1 predictions for the rows of ``X``.

        Probabilities are rounded with ``np.around``, which rounds halves to
        the nearest even value, so a probability of exactly 0.5 maps to 0.
        """
        pred = self.sigmoid(X.dot(self.w))
        return np.around(pred)

    def evaluate(self, y, y_hat):
        """Print and return recall, precision and accuracy.

        Parameters
        ----------
        y : array-like
            True labels; entries equal to 1 are the positive class.
        y_hat : array-like
            Predicted labels, same length and order as ``y``.

        Returns
        -------
        tuple
            ``(recall, precision, accuracy)``.  Precision is reported as 0.0
            when nothing was predicted positive, and recall as 0.0 when
            there are no actual positives, instead of dividing by zero.
        """
        y = np.asarray(y) == 1
        y_hat = np.asarray(y_hat) == 1

        true_pos = (y & y_hat).sum()
        predicted_pos = y_hat.sum()
        actual_pos = y.sum()

        accuracy = (y == y_hat).sum() / y.size
        precision = true_pos / predicted_pos if predicted_pos else 0.0
        recall = true_pos / actual_pos if actual_pos else 0.0

        print("Accuracy is", accuracy)
        print('Recall is', recall)
        print('precision is ', precision)

        return recall, precision, accuracy

    def fit(self):
        """Train on the training split and print training-set metrics."""
        X_train, y_train, X_valid, y_valid, X_test, y_test = self.splitData()
        # Start from the all-zero weight vector.
        self.w = np.zeros(X_train.shape[1], dtype=np.float64)
        self.stochasticGD(X_train, y_train)

        y_hat_train = self.predict(X_train)
        recall, precision, accuracy = self.evaluate(y_train, y_hat_train)

    def validation(self):
        """Print metrics of the fitted weights on the validation split.

        Requires ``fit`` to have been called first so that ``self.w`` exists.
        """
        X_train, y_train, X_valid, y_valid, X_test, y_test = self.splitData()
        y_hat_valid = self.predict(X_valid)
        recall, precision, accuracy = self.evaluate(y_valid, y_hat_valid)

    def test(self):
        """Print metrics of the fitted weights on the test split.

        Requires ``fit`` to have been called first so that ``self.w`` exists.
        """
        X_train, y_train, X_valid, y_valid, X_test, y_test = self.splitData()
        y_hat_test = self.predict(X_test)
        recall, precision, accuracy = self.evaluate(y_test, y_hat_test)



In [77]:
# Build the model on the full feature matrix / labels using the constructor's
# default hyper-parameters (batchSize=32, learningRate=0.001,
# tolerance=0.00005, maxIteration=2000).
lr = LogisticRegression(X,y)

In [78]:
# Train on the training split; prints the learned weights followed by the
# training-set accuracy / recall / precision.
lr.fit()

100%|██████████| 2000/2000 [00:38<00:00, 51.35it/s]

[ 0.01054954 -0.11060517 -0.03036936 -0.29551037 -0.16954511  0.31263985
 -0.07204962 -0.03989872  0.24805562 -0.24250005  0.10538828 -0.08570464
 -0.16990467  0.32665444 -0.0200231   0.07660875  0.01445947  0.04078251
 -0.11777394  0.08329929  0.13678689 -0.07204962 -0.65995702  0.73624118
 -0.28345799 -0.3492558   0.62747477  0.42252774 -0.35148262 -0.02627101
  0.09731614  0.02690143  0.07295672 -0.0756326  -0.04977462  0.10317686
  0.0298203  -0.07993337  0.04353041  0.01467862 -0.04544174  0.13739543
 -0.07106623  0.03547905  0.05255477 -0.01822515 -0.26365609  0.1167498
  0.07236842  0.11125337  0.08200711 -0.11885353 -0.00255782  0.0259271
  0.00098972 -0.01269256  0.08887318 -0.11848854  0.00202637  0.02661387
  0.16690958  0.08679587 -0.01728303 -0.17192339  0.04517278  0.03805574
  0.05599256 -0.01653694 -0.01785484  0.04760401 -0.06088314 -0.00947224
  0.02344633  0.01739519 -0.09021824  0.04364294 -0.00097737  0.14164018
  0.07911369  0.16026867 -0.07671192 -0.15467804 -0.1




In [79]:
# Score the fitted weights on the validation split (requires lr.fit() first).
lr.validation()

Accuracy is 0.7015765765765766
Recall is 0.7916666666666666
precision is  0.7137745974955277


In [80]:
# Score the fitted weights on the held-out test split (requires lr.fit() first).
lr.test()

Accuracy is 0.6875985286389911
Recall is 0.7770050996754752
precision is  0.7030201342281879
