In [2]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import category_encoders as ce

In [4]:
'''Data Cleaning

Load the Vehicle_Coupon CSV, drop three columns, cast ``temperature`` to str
(presumably so the later get_dummies call one-hot encodes it -- confirm), and
fill every missing value with the most frequent value of its column.
'''
df = (
    pd.read_csv("/Users/anushkahegde/Desktop/NEU/IE_7374_Machine_Learning/ml_project/Data_Set/Vehicle_Coupon.csv")
    .drop(columns=['car', 'direction_same', 'toCoupon_GEQ5min'])
    .assign(temperature=lambda frame: frame['temperature'].astype(str))
    # value_counts() sorts by frequency, so index[0] is the column's mode.
    .apply(lambda col: col.fillna(col.value_counts().index[0]))
)

In [23]:
'''Create dummies and split data

One-hot encode the cleaned frame, separate the target column ``Y`` from the
features, then hold out 30% as a test set and 20% of the remainder as a
validation set (both splits seeded with random_state=42).
'''
df_ohe = pd.get_dummies(df)
y = df_ohe['Y']
X = df_ohe.drop(columns=['Y'])

# First split: train+validation vs. test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Second split: carve the validation set out of the training portion.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [55]:
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    The instance keeps the full feature matrix and label vector, splits them
    into train / validation / test subsets on demand (fixed seed, so every
    call yields the same partition) and learns a weight vector ``w`` by
    minimising the negative log-likelihood.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (NumPy array or pandas DataFrame).
    y : array-like of shape (n_samples,)
        Labels; the metrics treat the value 1 as the positive class.
    learningRate : float
        Step size used for each gradient-descent update.
    tolerance : float
        Training stops once the cost improves by less than this amount
        between two consecutive iterations.
    maxIteration : int
        Upper bound on the number of gradient-descent updates.

    Notes
    -----
    ``fit`` does not call ``add_x0``, so no intercept column is added
    automatically; use ``add_x0`` on the inputs if a bias term is needed.
    """

    def __init__(self, X, y, learningRate=0.00001, tolerance=0.00005, maxIteration=5000):
        self.X = X
        self.y = y
        self.tolerance = tolerance
        self.maxIteration = maxIteration
        self.learningRate = learningRate

    def splitData(self):
        """Split the stored data into train, validation and test subsets.

        30% of the rows are held out for testing; 20% of the remaining rows
        form the validation set. Both splits use ``random_state=42``.

        Returns
        -------
        tuple
            ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
        """
        # Use the data handed to the constructor rather than notebook globals,
        # so the model works regardless of what ``X``/``y`` mean in the kernel.
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.3, random_state=42
        )
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        return X_train, y_train, X_valid, y_valid, X_test, y_test

    def add_x0(self, X):
        """Return ``X`` with a leading column of ones (intercept feature)."""
        return np.column_stack([np.ones([X.shape[0], 1]), X])

    def sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``."""
        sig = 1 / (1 + np.exp(-z))
        return sig

    def costFunction(self, X, y):
        """Negative log-likelihood of ``y`` given ``X`` under the current weights.

        Computes ``sum_i [ log(1 + exp(z_i)) - y_i * z_i ]`` with
        ``z = X.dot(w)``.
        """
        z = X.dot(self.w)
        # logaddexp(0, z) == log(1 + exp(z)) without overflowing for large z.
        # The label term is element-wise (y_i * z_i); using a full dot product
        # here would subtract the same scalar from every row before summing.
        cost = (np.logaddexp(0, z) - y * z).sum()
        return cost

    def gradient(self, X, y):
        """Gradient of the cost with respect to the weights: ``(sigmoid(Xw) - y) . X``."""
        sigmoid = self.sigmoid(X.dot(self.w))
        grad = (sigmoid - y).dot(X)
        return grad

    def gradientDescent(self, X, y):
        """Run batch gradient descent on ``self.w``.

        Iterates at most ``maxIteration`` times and stops early as soon as
        the cost decreases by less than ``tolerance`` (which includes the
        cost going up). The per-iteration costs are kept in ``self.errors``.
        """
        errors = []
        last = float('inf')

        for _ in range(self.maxIteration):
            self.w = self.w - self.learningRate * self.gradient(X, y)
            curr = self.costFunction(X, y)

            diff = last - curr
            # Remember this cost so the next iteration measures real progress.
            last = curr

            errors.append(curr)

            if diff < self.tolerance:
                print("The model stopped Learning")
                break

        self.errors = errors
        # self.plot_cost(errors)

    def predict(self, X):
        """Return hard 0/1 predictions by rounding the predicted probabilities."""
        pred = self.sigmoid(X.dot(self.w))
        return np.around(pred)

    def evaluate(self, y, y_hat):
        """Print and return recall, precision and accuracy for the positive class (1).

        Precision is reported as 0.0 when nothing was predicted positive and
        recall as 0.0 when there are no actual positives, instead of NaN.

        Returns
        -------
        tuple
            ``(recall, precision, accuracy)``.
        """
        y = (y == 1)
        y_hat = (y_hat == 1)

        true_positives = (y & y_hat).sum()
        predicted_positives = y_hat.sum()
        actual_positives = y.sum()

        accuracy = (y == y_hat).sum() / y.size
        precision = true_positives / predicted_positives if predicted_positives else 0.0
        recall = true_positives / actual_positives if actual_positives else 0.0

        print("Accuracy is", accuracy)
        print('Recall is', recall)
        print('precision is ', precision)

        return recall, precision, accuracy

    def fit(self):
        """Train on the training split and print the training-set metrics."""
        X_train, y_train, _, _, _, _ = self.splitData()
        # Start from all-zero weights, one per feature column.
        self.w = np.zeros(X_train.shape[1], dtype=np.float64)
        self.gradientDescent(X_train, y_train)

        y_hat_train = self.predict(X_train)
        self.evaluate(y_train, y_hat_train)

    def validation(self):
        """Print the metrics of the fitted model on the validation split."""
        _, _, X_valid, y_valid, _, _ = self.splitData()
        y_hat_valid = self.predict(X_valid)
        self.evaluate(y_valid, y_hat_valid)

    def test(self):
        """Print the metrics of the fitted model on the test split."""
        _, _, _, _, X_test, y_test = self.splitData()
        y_hat_test = self.predict(X_test)
        self.evaluate(y_test, y_hat_test)



In [60]:
# Build the model on the full one-hot feature matrix; the train/validation/test
# split is performed inside the class.
lr = LogisticRegression(
    X=X,
    y=y,
    learningRate=0.0001,
    tolerance=0.00005,
    maxIteration=1000,
)

In [61]:
# Train on the training split; prints accuracy, recall and precision on that split.
lr.fit()

Accuracy is 0.6910729372007886
Recall is 0.7747836835599505
precision is  0.7095313561240661


In [62]:
# Score the fitted weights on the validation split (requires lr.fit() to have run).
lr.validation()

Accuracy is 0.6981981981981982
Recall is 0.7896825396825397
precision is  0.7107142857142857


In [64]:
# Score the fitted weights on the held-out test split (requires lr.fit() to have run).
lr.test()

Accuracy is 0.6915396741986337
Recall is 0.7816411682892906
precision is  0.7057346169945584


In [None]:
# Inspect the gradient at the fitted weights. gradient() returns a single
# vector, so it is bound to one name instead of being unpacked into two.
grad = lr.gradient(X_train, y_train)
grad.shape



ValueError: Unable to coerce to DataFrame, shape must be (72, 8878): given (8878, 8879)

In [261]:
# Apply the model's sigmoid to the linear scores of the training rows and
# display the shape of the result.
sig = lr.sigmoid(X_train @ lr.w.T)
sig.shape

(8878,)

(array([[1., 1., 1., ..., 0., 1., 0.],
        [1., 1., 1., ..., 0., 1., 0.],
        [1., 0., 1., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 1., 0.],
        [1., 0., 0., ..., 0., 1., 0.],
        [1., 1., 1., ..., 0., 1., 0.]]),
 array([[1., 0., 1., ..., 0., 1., 0.],
        [1., 0., 0., ..., 0., 1., 0.],
        [1., 0., 1., ..., 0., 1., 0.],
        ...,
        [1., 0., 1., ..., 0., 1., 0.],
        [1., 0., 0., ..., 0., 1., 0.],
        [1., 0., 1., ..., 0., 1., 0.]]),
 array([[1., 0., 0., ..., 0., 1., 0.],
        [1., 1., 0., ..., 0., 1., 0.],
        [1., 1., 1., ..., 0., 1., 0.],
        ...,
        [1., 1., 1., ..., 0., 1., 0.],
        [1., 1., 1., ..., 0., 1., 0.],
        [1., 0., 1., ..., 0., 1., 0.]]),
 2767     0
 1028     1
 2363     0
 12521    0
 7235     0
         ..
 11079    0
 3212     1
 11752    1
 1919     1
 11567    0
 Name: Y, Length: 7102, dtype: int64,
 7043     1
 9217     0
 8405     1
 2670     1
 783      1
         ..
 412      0