In [14]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import category_encoders as ce

In [19]:
class LogisitcRegression:

    def __init__(self, tolerance, maxIteration, alpha, df)-> None:
        self.alpha = alpha
        self.maxIteration = maxIteration
        self.tolerance = tolerance
        self.df = df


    def load_dataset(self):

        X = self.df.iloc[:,:-1]
        y = self.df.iloc[:,-1]  

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 42)
        ordinal_columns = ['age', 'education', 'income', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']
        oe = ce.OrdinalEncoder(cols=ordinal_columns, return_df=True)
        Xtrain_ordinal = oe.fit_transform(X_train)
        Xtest_ordinal = oe.transform(X_test)
        return Xtrain_ordinal, Xtest_ordinal, y_train, y_test

    def add_column(self, X):

        return np.column_stack([np.ones(X.shape[0],1),X])

    def sigmoid(self, z):
        y_val = 1/(1+np.exp(-z))
        return y_val

    def cost(self, X, y):

        sig = self.sigmoid(np.dot(X,self.w))
        loss = y*np.log(y_val) +(1-y)*np.log(1-y_val)
        cost = -loss.sum()

        return cost

    
    def gradient(self, X, y):

        y_val = self.sigmoid(np.dot(X,self.w))
        grad = np.dot((y_val -y), X)
        return grad

    def gradientdescent(self, X, y):

        errors = []
        currE = float('inf')

        for i in tqdm(range(self.maxIteration)):

            self.w = self.w = self.alpha*self.gradient(X,y) #weights
            err = self.cost(X,y) #current error
            diff = currE - err #error difference
            currE = err

            errors.append(err)

            if diff < self.tolerance:
                print('Model stops')
                break

            return

    def predict(self, X):

        sig = self.sigmoid(np.dot(X,self.w))
        return np.round(sig)


    def evaluate(self, y, y_hat):
        y = (y == 1) # convert into t/f vector
        y_hat = (y_hat == 1)

        accuracy = np.sum(y == y_hat) / y.size
        precision = np.sum(y & y_hat) / y_hat.sum()
        recall = np.sum(y & y_hat) / y.sum() 

        return accuracy, precision, recall
        

    def fit(self):

        Xtrain_ordinal, Xtest_ordinal, y_train, y_test = self.load_dataset()

        print('Solving using gradient descent')
        self.w = np.ones(Xtrain_ordinal.shape[1], dtype = np.float64) * 0
        self.gradientdescent(Xtrain_ordinal, y_train)
        print(self.w)

        y_hat_train = self.predict(Xtrain_ordinal)
        accuracy, precision, recall = self.evaluate(y_train, y_hat_train)
        print("Accuracy is {}".format(accuracy))
        print("Precision is {}".format(precision))
        print("Recall is {}".format(recall))
                



In [16]:
data = pd.read_csv('Vehicle_Coupon.csv')

In [17]:
lr = LogisitcRegression(alpha = 0.01, tolerance = 0.00005, maxIteration = 1000, df = data )

In [18]:
lr.fit()

Solving using gradient descent


  0%|          | 0/1000 [00:00<?, ?it/s]


TypeError: can't multiply sequence by non-int of type 'float'