# ML Assignment 2 | Logistic Regression
### Anirudh Agrawal: 2018A7PS0099H | Aviral Agarwal: 2018A7PS0192H | Vikramjeet Das: 2018A7PS0280H

Let's import some standard libraries and load the dataset first.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

%matplotlib inline
np.random.seed(2022)

In [2]:
# Read the header-less CSV; columns 0-7 are used as features and column 8 as the label.
data = pd.read_csv('data.csv', header=None)
X = data[list(range(8))]
y = data[8]

First, we'll write some helper functions to shuffle, normalize, and split the data into train and test sets.

In [3]:
def shuffle(X, y):
    '''
        Shuffles the rows of X and y using one shared random permutation,
        so every feature row stays paired with its label

            Parameters:
                X : Features (numpy array, pandas DataFrame/Series or sequence)
                y : Labels, with the same number of rows as X

            Returns:
                (X_shuffled, y_shuffled) : Row-permuted copies of the inputs

            Raises:
                ValueError : If X and y do not have the same number of rows
    '''
    if len(X) != len(y):
        raise ValueError(
            f'X and y must have the same number of rows, got {len(X)} and {len(y)}'
        )

    def take_rows(rows, order):
        # pandas objects must be indexed positionally; plain [] would look up
        # labels (Series) or columns (DataFrame) instead of rows.
        if hasattr(rows, 'iloc'):
            return rows.iloc[order]
        if isinstance(rows, np.ndarray):
            return rows[order]
        return np.asarray(rows)[order]

    # Draws from numpy's global RNG, so np.random.seed controls the result.
    permute = np.random.permutation(len(X))
    return take_rows(X, permute), take_rows(y, permute)

In [4]:
class MinMaxScaler():
    '''
        Rescales every column to the [0, 1] range observed during fit

        Accepts numpy arrays and pandas dataframes. Columns that were
        constant during fit are mapped to 0 instead of producing NaN/inf.
    '''
    def fit(self, X):
        '''
            Records the per-column minimum and maximum of X

                Parameters:
                    X : Data to learn the column ranges from
        '''
        # Reference to the fitting data, kept for backward compatibility.
        self.X = X
        self.min = X.min(axis = 0)
        self.max = X.max(axis = 0)

    def fit_transform(self, X):
        '''
            Fits the scaler on X and returns the scaled X
        '''
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        '''
            Scales X with the minimum/maximum learned in fit

                Parameters:
                    X : Data with the same columns as the data used in fit

                Returns:
                    (X - min) / (max - min), computed column-wise
        '''
        span = self.max - self.min
        # Booleans add as 0/1, so this bumps a zero-width range to 1 and
        # avoids a 0/0 division for constant columns.
        span = span + (span == 0)
        if isinstance(X, np.ndarray):
            return (X - np.array(self.min)) / np.array(span)
        return (X - self.min) / span

    def get_params(self):
        '''
            Returns the fitted (min, max) pair
        '''
        return self.min, self.max

In [5]:
def train_test_split(X, y, test_size=0.2):
    '''
        Shuffles the data and randomly partitions it into train and test sets

        Each row is sent to the test set independently with probability
        test_size, so the resulting split sizes are approximate, not exact.

            Parameters:
                X : Features
                y : Labels
                test_size : Probability of a row landing in the test set

            Returns:
                (X_train, y_train, X_test, y_test) : numpy arrays of the split
    '''
    X = X if isinstance(X, np.ndarray) else np.array(X)
    y = y if isinstance(y, np.ndarray) else np.array(y)

    X, y = shuffle(X, y)

    # One uniform draw per row decides which side of the split it goes to.
    in_test = np.random.rand(X.shape[0]) < test_size
    in_train = ~in_test
    return X[in_train], y[in_train], X[in_test], y[in_test]

We split the data into train and test sets and then normalize it, fitting the scaler on the training set only, before processing it further.

In [6]:
# Split first, then learn the scaling from the training rows only and apply
# that same scaling to the test rows.
scaler = MinMaxScaler()
X_train, y_train, X_test, y_test = train_test_split(X, y)
scaler.fit(X_train)
X_train_scaled = np.array(scaler.transform(X_train))
X_test_scaled = np.array(scaler.transform(X_test))

In [7]:
class LogisticRegressionGD():
    '''
        Binary logistic regression trained with full-batch gradient descent

            Parameters:
                lamda : Regularization strength used when fit is called with
                        regularization='l1' or 'l2'
                include_bias : If True, a column of ones is prepended to X so
                               that coeffs[0] acts as the intercept
    '''

    def __init__(self, lamda, include_bias=True):
        self.losses = []
        self.epoch_losses = []
        self.lamda = lamda
        self.include_bias = include_bias

    def fit(self, X, y, alpha, max_epochs=500000, regularization=None, show_loss=False, tol=1e-7):
        '''
            Learns the coefficients by gradient descent on the cross-entropy loss

                Parameters:
                    X : Features, one row per sample
                    y : Binary labels (0/1), one per sample
                    alpha : Learning rate
                    max_epochs : Maximum number of full-batch updates
                    regularization : None, 'l1' or 'l2'
                    show_loss : If True, prints the loss every 50 epochs
                    tol : Training stops once two consecutive losses differ
                          by less than this value

                Raises:
                    ValueError : On an unknown regularization or empty X
        '''
        if regularization not in (None, 'l1', 'l2'):
            raise ValueError(
                f"regularization must be None, 'l1' or 'l2', got {regularization!r}"
            )
        X = self.__pad_ones(X) if self.include_bias else np.asarray(X)
        # Flatten so that column-vector labels broadcast correctly below.
        y = np.asarray(y, dtype=float).reshape(-1)
        m = X.shape[0]
        if m == 0:
            raise ValueError('Cannot fit on an empty dataset')

        self.coeffs = np.zeros((X.shape[1],))
        self.losses = []
        epoch = 0
        # The context manager closes the progress bar even if training raises.
        with tqdm(total=max_epochs) as pbar:
            while epoch < max_epochs:
                preds = self.__predict(X)
                loss = -(1 / m) * np.sum(y * np.log(preds) + (1 - y) * np.log(1 - preds))
                if regularization == 'l2':
                    loss += (self.lamda / m) * np.sum(np.square(self.coeffs))
                elif regularization == 'l1':
                    loss += (self.lamda / m) * np.linalg.norm(self.coeffs, ord=1)
                self.losses.append(loss)

                # Gradient of the mean cross-entropy: X^T (p - y) / m.
                self.coeffs -= (alpha / m) * (X.T @ (preds - y))
                # The penalty step is applied after (and on top of) the data step.
                if regularization == 'l2':
                    self.coeffs -= (self.lamda * alpha / m) * (2 * self.coeffs)
                elif regularization == 'l1':
                    self.coeffs -= (self.lamda * alpha / m) * np.sign(self.coeffs)

                if show_loss and epoch % 50 == 0:
                    print(f'Loss at epoch {epoch}: {self.losses[-1]}')
                if len(self.losses) > 2 and abs(self.losses[-1] - self.losses[-2]) < tol:
                    print('Converged, stopping early.')
                    break
                epoch += 1
                pbar.update(1)

    def __predict(self, X):
        '''
            Returns sigmoid(X @ coeffs), clipped away from exactly 0 and 1
        '''
        z = X @ self.coeffs
        eps = 1e-7
        # Clipping keeps both log(p) and log(1 - p) finite in the CE loss.
        return np.clip(1 / (1 + np.exp(-z)), eps, 1 - eps)

    def predict_proba(self, X):
        '''
            Returns the predicted probability of class 1 for every row of X
        '''
        if self.include_bias:
            X = self.__pad_ones(X)
        return self.__predict(X)

    def predict(self, X):
        '''
            Returns hard 0/1 predictions (probability >= 0.5 maps to 1)
        '''
        return (self.predict_proba(X) >= 0.5).astype(float)

    def evaluate(self, X, y):
        '''
            Returns the fraction of rows of X whose prediction equals y
        '''
        y = np.asarray(y).reshape(-1)
        return np.mean(self.predict(X) == y)

    def __pad_ones(self, X):
        '''
            Prepends a column of ones (the bias feature) to X
        '''
        pad_ones = np.ones((X.shape[0], 1))
        return np.concatenate((pad_ones, X), axis=1)

    def get_coeffs(self):
        '''
            Returns the learned coefficient vector
        '''
        return self.coeffs

In [8]:
# Train the batch gradient-descent model with an L2 penalty.
logregGD = LogisticRegressionGD(lamda=0.001)
logregGD.fit(
    X_train_scaled,
    y_train,
    alpha=0.001,
    max_epochs=100000,
    regularization='l2',
)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [9]:
# Fraction of the scaled test rows that the batch-GD model classifies correctly.
logregGD.evaluate(X_test_scaled, y_test)

0.7423312883435583

In [18]:
class LogisticRegressionSGD():
    '''
        Binary logistic regression trained with stochastic gradient descent
        (one coefficient update per sample)

            Parameters:
                lamda : Regularization strength used when fit is called with
                        regularization='l1' or 'l2'
                include_bias : If True, a column of ones is prepended to X so
                               that coeffs[0] acts as the intercept
    '''

    def __init__(self, lamda, include_bias=True):
        self.losses = []
        self.epoch_losses = []
        self.lamda = lamda
        self.include_bias = include_bias

    def fit(self, X, y, alpha, max_iters=500000, regularization=None, show_loss=False, tol=1e-6):
        '''
            Learns the coefficients with per-sample gradient updates

            Samples are visited in the order given, pass after pass, until
            max_iters updates were made or the stopping test fires.

                Parameters:
                    X : Features, one row per sample
                    y : Binary labels (0/1), one per sample
                    alpha : Learning rate
                    max_iters : Maximum number of single-sample updates
                    regularization : None, 'l1' or 'l2'
                    show_loss : If True, prints the loss every 50 iterations
                    tol : After each pass, training stops if the last two
                          recorded per-sample losses differ by less than this

                Raises:
                    ValueError : On an unknown regularization or empty X
        '''
        if regularization not in (None, 'l1', 'l2'):
            raise ValueError(
                f"regularization must be None, 'l1' or 'l2', got {regularization!r}"
            )
        X = self.__pad_ones(X) if self.include_bias else np.asarray(X)
        y = np.asarray(y, dtype=float).reshape(-1)
        if X.shape[0] == 0:
            # Without samples the pass below would never advance `it`.
            raise ValueError('Cannot fit on an empty dataset')

        self.coeffs = np.zeros((X.shape[1],))
        self.losses = []
        # Reset together with losses so repeated fits do not accumulate history.
        self.epoch_losses = []
        it = 0
        # The context manager closes the progress bar even if training raises.
        with tqdm(total=max_iters) as pbar:
            while it < max_iters:
                for x, yi in zip(X, y):
                    if it >= max_iters:
                        # Honour the budget exactly instead of finishing the pass.
                        break
                    preds = self.__predict(x)
                    loss = -((yi * np.log(preds)) + (1 - yi) * np.log(1 - preds))
                    if regularization == 'l2':
                        loss += self.lamda * np.sum(np.square(self.coeffs))
                    elif regularization == 'l1':
                        loss += self.lamda * np.linalg.norm(self.coeffs, ord=1)
                    self.losses.append(loss)

                    self.coeffs -= alpha * ((preds - yi) * x)
                    # The penalty step is applied after (and on top of) the data step.
                    if regularization == 'l2':
                        self.coeffs -= (self.lamda * alpha) * (2 * self.coeffs)
                    elif regularization == 'l1':
                        self.coeffs -= (self.lamda * alpha) * np.sign(self.coeffs)

                    it += 1
                    pbar.update(1)
                    if show_loss and it % 50 == 0:
                        print(f'Loss at iteration {it}: {self.losses[-1]}')
                # epoch_losses keeps the loss of the last sample seen in each pass.
                self.epoch_losses.append(self.losses[-1])
                # NOTE(review): this compares the losses of two different
                # samples, which is a noisy convergence signal; kept as-is so
                # existing training runs stop at the same point.
                if len(self.losses) > 2 and abs(self.losses[-1] - self.losses[-2]) < tol:
                    print('Converged, stopping early.')
                    break

    def __predict(self, X):
        '''
            Returns sigmoid(X @ coeffs), clipped away from exactly 0 and 1
        '''
        z = X @ self.coeffs
        eps = 1e-7
        # Clipping keeps both log(p) and log(1 - p) finite in the CE loss.
        return np.clip(1 / (1 + np.exp(-z)), eps, 1 - eps)

    def predict_proba(self, X):
        '''
            Returns the predicted probability of class 1 for every row of X
        '''
        if self.include_bias:
            X = self.__pad_ones(X)
        return self.__predict(X)

    def predict(self, X):
        '''
            Returns hard 0/1 predictions (probability >= 0.5 maps to 1)
        '''
        return (self.predict_proba(X) >= 0.5).astype(float)

    def evaluate(self, X, y):
        '''
            Returns the fraction of rows of X whose prediction equals y
        '''
        y = np.asarray(y).reshape(-1)
        return np.mean(self.predict(X) == y)

    def __pad_ones(self, X):
        '''
            Prepends a column of ones (the bias feature) to X
        '''
        pad_ones = np.ones((X.shape[0], 1))
        return np.concatenate((pad_ones, X), axis=1)

    def get_coeffs(self):
        '''
            Returns the learned coefficient vector
        '''
        return self.coeffs

In [19]:
logregSGD = LogisticRegressionSGD(lamda=0.001)
# Train the SGD model with an L2 penalty; the budget counts single-sample updates.
logregSGD.fit(
    X_train_scaled,
    y_train,
    alpha=0.001,
    max_iters=1000000,
    regularization='l2',
)

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [20]:
# Fraction of the scaled test rows that the SGD model classifies correctly.
logregSGD.evaluate(X_test_scaled, y_test)

0.7484662576687117

array([0.50457109, 0.51484687, 0.16923336, 0.20486932, 0.34973232,
       0.29628721, 0.21662917, 0.26451657, 0.78512165, 0.16244505,
       0.54453582, 0.45453692, 0.23471628, 0.53867169, 0.45003924,
       0.50711384, 0.1883919 , 0.43639213, 0.51625324, 0.15530319,
       0.2656695 , 0.12648487, 0.17785294, 0.24425778, 0.18079508,
       0.23979862, 0.14545621, 0.27219954, 0.23817707, 0.6113029 ,
       0.3882926 , 0.18053401, 0.31710695, 0.6939735 , 0.31630407,
       0.15411377, 0.25013472, 0.26353891, 0.69065688, 0.50401184,
       0.37162438, 0.26504006, 0.78092093, 0.45695796, 0.30641407,
       0.30895923, 0.32870873, 0.26676613, 0.1591128 , 0.22567806,
       0.42924111, 0.48595954, 0.87144251, 0.23241973, 0.11123736,
       0.27027783, 0.15625501, 0.23453546, 0.5646229 , 0.232059  ,
       0.52715442, 0.2048965 , 0.34350823, 0.77751086, 0.33347888,
       0.32768392, 0.6261788 , 0.60305293, 0.52445525, 0.19415313,
       0.21515229, 0.46325752, 0.83549776, 0.4471203 , 0.61634