In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import category_encoders as ce
from sklearn.model_selection import GridSearchCV

In [4]:
'''Data Cleaning'''
df = pd.read_csv(r"D:\Northeastern Semester 1\Projects\ml_project\Data_Set\Vehicle_Coupon.csv")
df.drop(['car', 'direction_same', 'toCoupon_GEQ5min'], axis=1, inplace=True)
df['temperature'] = df['temperature'].astype(str)
df = df.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [5]:
'''Create dummies and split data'''
df_ohe = pd.get_dummies(df)
X, y = df_ohe.drop(['Y'], axis=1), df_ohe['Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [38]:
class LogisticRegression:
    
    def __init__(self,X,y, batchSize = 64,learningRate = 0.001, tolerance = 0.00005, maxIteration = 2000):
        self.X = X
        self.y =y
        self.tolerance = tolerance
        self.maxIteration = maxIteration
        self.learningRate = learningRate
        self.batchSize = batchSize

    def splitData(self):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

        return X_train, y_train, X_valid, y_valid, X_test, y_test 

    def add_x0(self, X):
        return np.column_stack([np.ones([X.shape[0], 1]), X])
        
    def sigmoid(self,z):
        sig = 1/(1+np.exp(-z))
        return sig
    
    def costFunction(self, X, y):
        pred_ =np.log(np.ones(X.shape[0])+np.exp(X.dot(self.w))) - X.dot(self.w).dot(y)
        cost = pred_.sum( )
        return cost
    
    def gradient(self,X,y):
        sigmoid = self.sigmoid(X.dot(self.w))
        grad = (sigmoid -y ).dot(X)
        return grad
    
    def gradientDescent(self, X, y):
        errors = []
        last = float('inf')
        
        for i in range(self.maxIteration):
            self.w = self.w - self.learningRate*self.gradient(X,y)
            curr = self.costFunction(X,y)
            
            diff = last - curr
            last - curr
            
            errors.append(curr)
            
            if diff < self.tolerance:
                print("The model stopped Learning")
                break
        # self.plot_cost(errors)


    def stochasticGD(self, X, y):
        X, y = np.array(X, dtype=np.float64), np.array(y, dtype=np.float64)
        XY = np.c_[X.reshape(X.shape[0], X.shape[1]), y.reshape(X.shape[0], 1)]
        
        
        # Set seed
        np.random.seed(2022)
        errors = []

        for i in tqdm(range(self.maxIteration)):
        # Shuffle x and y
          np.random.shuffle(XY)

          start = 0
          stop = start + self.batchSize
          X_batch, y_batch = XY[start:stop, :-1], XY[start:stop, -1]

          
          last_error = float('inf')

          # Recalculating the difference
          self.w = self.w - self.learningRate * self.gradient(X_batch, y_batch)
          current_error = self.costFunction(X, y)
         
          diff = last_error - current_error
          last_error = current_error

          errors.append(current_error)
          if np.abs(diff) < self.tolerance:
              print('Model stopped learning')
              break
        print(self.w)
        #self.plot_rmse(errors)
        
    def predict(self,X):
        pred = self.sigmoid(X.dot(self.w))
        return np.around(pred)
        
    def evaluate(self, y, y_hat):
        
        y = (y == 1)
        y_hat = (y_hat == 1)
        
        accuracy = (y == y_hat).sum() / y.size
        precision = (y & y_hat).sum() / y_hat.sum()
        recall = (y & y_hat).sum() / y.sum()

        print("Accuracy is", accuracy)
        print('Recall is', recall)
        print('precision is ', precision)
        
        return recall, precision, accuracy
    
    def fit(self):

        X_train, y_train, X_valid, y_valid, X_test, y_test = self.splitData()
        self.w = np.ones(X_train.shape[1], dtype = np.float64)*0
        self.stochasticGD(X_train, y_train)
        
        #print(self.w)
        
        y_hat_train = self.predict(X_train)
        recall, precision, accuracy = self.evaluate(y_train,y_hat_train)


    def validation(self):
        X_train, y_train, X_valid, y_valid, X_test, y_test = self.splitData()
        y_hat_valid = self.predict(X_valid)
        recall, precision, accuracy  = self.evaluate(y_valid, y_hat_valid)

    def test(self):
        X_train, y_train, X_valid, y_valid, X_test, y_test = self.splitData()
        y_hat_test = self.predict(X_test)
        recall, precision, accuracy  = self.evaluate(y_test, y_hat_test)



In [39]:
lr = LogisticRegression(X, y)

In [40]:
lr.fit()

100%|██████████| 2000/2000 [00:42<00:00, 47.07it/s]

[ 5.11303687e-02 -9.59628800e-02  5.34497184e-02 -4.08328138e-01
 -2.29481804e-01  3.77762890e-01 -6.80413413e-02 -2.73047103e-02
  2.52714925e-01 -3.54592994e-01  2.09422523e-01 -1.05693916e-01
 -2.12896358e-01  3.98830018e-01 -3.58392717e-02  9.89713515e-02
  1.71076643e-02  6.99607861e-02 -1.33033623e-01  2.66760638e-02
  1.84677859e-01 -6.80413413e-02 -7.92193557e-01  8.46653964e-01
 -2.76047996e-01 -4.01080792e-01  7.02908125e-01  4.50565183e-01
 -3.70325439e-01 -1.89096305e-02  9.91493746e-02  2.02968276e-02
  3.44017489e-02 -7.85514279e-02 -3.08281370e-02  1.34178545e-01
  4.29027891e-02 -9.90222582e-02  5.68616570e-02 -3.33897195e-02
 -1.11485225e-02  1.55402288e-01 -1.21305901e-01  9.06815996e-02
  6.40183157e-02 -4.37704889e-02 -3.30752551e-01  1.84622376e-01
  1.30622000e-01  7.55000923e-02  1.61037246e-01 -1.68002739e-01
 -4.45404388e-03  2.23941205e-02  2.07309946e-02 -3.39699627e-03
  1.71756306e-01 -1.42882154e-01  7.31875075e-03  6.20712756e-02
  2.75214449e-01  1.45224




In [41]:
lr.validation()

Accuracy is 0.7027027027027027
Recall is 0.7837301587301587
precision is  0.7181818181818181


In [33]:
lr.test()

Accuracy is 0.6786652653704677
Recall is 0.818266110338433
precision is  0.6798921417565486


In [18]:
from mlxtend.evaluate import bias_variance_decomp

x_train = X_train.values
x_test = X_test.values
Y_train = y_train.values
Y_test = y_test.values

mse, bias, var = bias_variance_decomp(lr, x_train, Y_train, x_test, Y_test, loss='mse', num_rounds=200, random_seed=1)

TypeError: fit() takes 1 positional argument but 3 were given