# Logistic Regression from Scratch

## 1. Import Libraries

In [10]:
import pandas as pd
import numpy as np
import plotly.express as px
from copy import deepcopy

## 2. Read `titanic` Dataset

In [11]:
train = pd.read_csv('Dataset/train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
test = pd.read_csv('Dataset/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## 3. PreProcessing

### 3.1 Check columns `dtype`

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Categorical features : {Name, Sex, Cabin, Embarked, Ticket}

Numerical features : {PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare}

### 3.2 Drop columns that are not useful

In [14]:
train.drop(['PassengerId', 'Ticket', 'Name'], axis=1, inplace=True)
test.drop(['PassengerId', 'Ticket', 'Name'], axis=1, inplace=True)

In [15]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [16]:
print(f'{train.Cabin.isnull().sum() * 100/len(train)}% of Cabin is NaN')

77.10437710437711% of Cabin is NaN


#### More than 77% of Records in `Cabin` column are null. So we can drop it. 

In [17]:
train.drop('Cabin', axis=1, inplace=True)

In [18]:
test.drop('Cabin', axis=1, inplace=True)

### 3.3 Impute missing values

#### `Age` Column

In [19]:
age_mean = train.Age.mean()
train.Age.fillna(age_mean, inplace=True)

#### `Embarked` Column

In [20]:
emb_mode = train.Embarked.mode()
train.Embarked.fillna(emb_mode[0], inplace=True)

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


#### Missing values in the Test dataset

In [22]:
test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

#### We are going to impute `Age` and `Fare` columns with mean of these features in train.

In [23]:
fare_mean = train.Fare.mean()
test.Age.fillna(age_mean, inplace=True)
test.Fare.fillna(fare_mean, inplace=True)

In [24]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


### 3.4 One Hot Encoding

#### We are going to use `One Hot Encoding` for `Sex` and `Embarked`

In [25]:
train = pd.get_dummies(train, columns=['Sex', 'Embarked'])
test = pd.get_dummies(test, columns=['Sex', 'Embarked'])

In [26]:
train

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,7.2500,0,1,0,0,1
1,1,1,38.000000,1,0,71.2833,1,0,1,0,0
2,1,3,26.000000,0,0,7.9250,1,0,0,0,1
3,1,1,35.000000,1,0,53.1000,1,0,0,0,1
4,0,3,35.000000,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,0,1,0,0,1
887,1,1,19.000000,0,0,30.0000,1,0,0,0,1
888,0,3,29.699118,1,2,23.4500,1,0,0,0,1
889,1,1,26.000000,0,0,30.0000,0,1,1,0,0


In [27]:
test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.500000,0,0,7.8292,0,1,0,1,0
1,3,47.000000,1,0,7.0000,1,0,0,0,1
2,2,62.000000,0,0,9.6875,0,1,0,1,0
3,3,27.000000,0,0,8.6625,0,1,0,0,1
4,3,22.000000,1,1,12.2875,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,3,29.699118,0,0,8.0500,0,1,0,0,1
414,1,39.000000,0,0,108.9000,1,0,1,0,0
415,3,38.500000,0,0,7.2500,0,1,0,0,1
416,3,29.699118,0,0,8.0500,0,1,0,0,1


### 3.5 Seperate Features and the Target variable.

In [28]:
X = train.drop('Survived', axis=1).values
y = train.Survived.values

## 4. Implement `Logistic Regression` Model

In [29]:
class LogisticRegression:
    
    @staticmethod
    def sigmoid(z):
        return 1/(1+np.exp(-z))
    
    @staticmethod
    def add_bias(X):
        bias = np.ones((X.shape[0], 1))
        return np.concatenate([X, bias] , axis=1)
    
    def __init__(self, X, y, mu=0, std=1) -> None:
        self.X = self.add_bias(X)
        self.y = y
        self.w = np.random.normal(mu, std, size=(self.X.shape[1], 1))
        
    def get_L2(self, landa):
        return (landa/(2 * self.X.shape[0])) * np.sum(self.w ** 2)
        
    def cost(self, y_pred, landa, y_true=None):
        if y_true is not None:
            y = y_true
        else:
            y = self.y
        loss = ((y * np.log(y_pred)) + ((1 - y) * np.log(1 - y_pred)))
        return -1 * (np.mean(loss) + self.get_L2(landa))
    
    def predict(self, X):
        y_pred = np.dot(X, self.w).reshape((-1,))
        return self.sigmoid(y_pred)
    
    def predict_label(self, X):
        X = self.add_bias(X)
        y_pred = np.dot(X, self.w).reshape((-1,))
        return self.sigmoid(y_pred) >= 0.5
    
    def get_gradients(self, y_hat, landa):
        diff = (y_hat - self.y).reshape((-1, 1))
        reg_part = (landa/self.X.shape[0]) * self.w
        return np.mean(diff * self.X, axis=0).reshape((-1,1)) + reg_part
    
    def update_weights(self, grad, lr):
        self.w -= lr * grad
        
    def evaluate(self, landa, X, y=None):
        X = self.add_bias(X)
        y_pred = self.predict(X)
        return self.cost(y_pred, landa, y)
    
    def fit(self, lr, landa, iter, print_cost=False):
        for it in range(iter):
            y_pred = self.predict(self.X)
            cost_it = self.cost(y_pred, landa)
            grad = self.get_gradients(y_pred, landa)
            self.update_weights(grad, lr)
            if print_cost:
                print(f'cost in iteration {it} : {cost_it}')
        

## 5. Implement Standard Scaler

In [30]:
class Scaler:
    
    def __init__(self):
        self.mean = None
        self.std = None
      
    
    def set_mean(self, X):
        self.mean = np.mean(X, axis=0)
        
    def set_std(self, X):
        self.std = np.std(X, axis=0)
        
    def transform(self, X):
        return (X - self.mean)/self.std
    
    def fit(self, X):
        self.set_mean(X)
        self.set_std(X)
        

In [31]:
scaler = Scaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

## 6. Train the Model

In [32]:
np.random.seed(42)
model = LogisticRegression(X_scaled, y, 0, 1)
model.fit(0.02,0.5,1000, True)

cost in iteration 0 : 1.6962080593906723
cost in iteration 1 : 1.6848095802262475
cost in iteration 2 : 1.6735103639336453
cost in iteration 3 : 1.662309764522589
cost in iteration 4 : 1.651207124104248
cost in iteration 5 : 1.6402017739451864
cost in iteration 6 : 1.6292930355023576
cost in iteration 7 : 1.6184802214358172
cost in iteration 8 : 1.607762636596368
cost in iteration 9 : 1.597139578985663
cost in iteration 10 : 1.5866103406863323
cost in iteration 11 : 1.5761742087604662
cost in iteration 12 : 1.5658304661146325
cost in iteration 13 : 1.5555783923302697
cost in iteration 14 : 1.5454172644582478
cost in iteration 15 : 1.535346357776996
cost in iteration 16 : 1.5253649465135177
cost in iteration 17 : 1.5154723045270702
cost in iteration 18 : 1.5056677059553556
cost in iteration 19 : 1.4959504258233802
cost in iteration 20 : 1.486319740615163
cost in iteration 21 : 1.476774928808784
cost in iteration 22 : 1.4673152713752906
cost in iteration 23 : 1.4579400522421364
cost in i

## 7. Cross Validation

In [24]:
def kfold(k, data):
    length = len(data) // k
    folds = []
    for fold_num in range(k):
        row_start, row_end = fold_num * length , (fold_num+1) * length
        if fold_num == k - 1:
            row_end += len(data)%k
        fold_data = data[row_start:row_end, :]
        X_fold, y_fold = fold_data[: , :-1], fold_data[:, -1]
        folds.append((X_fold, y_fold))
    return folds

In [33]:
def get_train_from_folds(folds, idx):
    folds.pop(idx)
    X_folds = [fold[0] for fold in folds]
    y_folds = [fold[1] for fold in folds]
    X_train = np.concatenate(X_folds, axis=0)
    y_train = np.concatenate(y_folds, axis=0)
    return X_train, y_train

def cross_validation(k, model, data, params, repeat=10):
    train_history, valid_history = [], []
    for r in range(repeat):
        np.random.shuffle(data)
        folds = kfold(k, data)
        train_losses, valid_losses = [], []
        for valid_ind, (X_valid, y_valid) in enumerate(folds):
            X_train, y_train = get_train_from_folds(deepcopy(folds), valid_ind)
            m = model(X_train, y_train, 0, 1)
            m.fit(**params)
            train_loss = m.evaluate(params['landa'], X_train)
            valid_loss = m.evaluate(params['landa'], X_valid, y_valid)
            train_losses.append(train_loss)
            valid_losses.append(valid_loss)
        train_history.extend(train_losses)
        valid_history.extend(valid_losses)
    return train_history, valid_history
            

In [36]:
def searchCV(k, model, data, params, iter=500):
    train_dict, valid_dict = {}, {}
    for lr_val in params['lr']:
        for lamda_val in params['landa']:
            curr_params = {'lr':lr_val, 'landa':lamda_val, 'iter':iter}
            train_his, valid_his = cross_validation(k, model, data, curr_params)
            train_dict[f'lr:{lr_val},landa:{lamda_val}'] = train_his
            valid_dict[f'lr:{lr_val},landa:{lamda_val}'] = valid_his
    return train_dict, valid_dict     

#### In the cell below, a range of possible values are defined for each parameter, cross validation is performed for each combination of parameters.

In [37]:
dt = np.concatenate([X_scaled, y.reshape((-1,1))], axis=1)
params = {
    'lr':[0.01, 0.03, 0.1, 0.3, 0.5, 0.8],
    'landa':[0.01, 0.03, 0.1, 0.3, 0.5, 0.8, 1]
}
train_cv, valid_cv = searchCV(5, LogisticRegression, dt, params)

In [38]:
px.box(pd.DataFrame(train_cv), title='Loss function of LogisticRegression for various hyperparameters in train set')

In [39]:
px.box(pd.DataFrame(valid_cv), title='Loss function of LogisticRegression for various hyperparameters in validation set')

## 8. Train Best Model with whole Data

In [41]:
best_model = LogisticRegression(X_scaled, y)
best_model.fit(lr=0.3, landa=0.01, iter=500, print_cost=True)
y_pred = best_model.predict_label(test)

cost in iteration 0 : 1.1762492192304377
cost in iteration 1 : 1.0759123243139272
cost in iteration 2 : 0.9933989365250231
cost in iteration 3 : 0.9254703289279437
cost in iteration 4 : 0.8692155888936557
cost in iteration 5 : 0.8221886052623846
cost in iteration 6 : 0.7824122310987749
cost in iteration 7 : 0.7483268784811192
cost in iteration 8 : 0.7187251532577761
cost in iteration 9 : 0.692688019658718
cost in iteration 10 : 0.6695265068078966
cost in iteration 11 : 0.6487304116546101
cost in iteration 12 : 0.6299249480235738
cost in iteration 13 : 0.6128354241463891
cost in iteration 14 : 0.597259189843776
cost in iteration 15 : 0.5830437469753806
cost in iteration 16 : 0.5700699919840105
cost in iteration 17 : 0.5582397887930409
cost in iteration 18 : 0.5474672497792987
cost in iteration 19 : 0.537673156564039
cost in iteration 20 : 0.5287819098011555
cost in iteration 21 : 0.5207203375521772
cost in iteration 22 : 0.5134176884714411
cost in iteration 23 : 0.5068062182549609
cost 

## 9. Predict Test Data

In [133]:
out = pd.DataFrame({'prediction': y_pred})
out.to_csv('output.csv', index=False)

## Logistic Regression with `torch`

In [30]:
import torch

In [38]:
class LogisticRegression:
    
    @staticmethod
    def sigmoid(z):
        return 1/(1+torch.exp(-z))
    
    @staticmethod
    def add_bias(X):
        bias = torch.ones((X.shape[0], 1))
        return torch.cat((X, bias) , dim=1)
    
    def __init__(self, X, y, mu=0, std=1) -> None:
        self.X = self.add_bias(X)
        self.y = y
        self.w = torch.normal(mu, std, size=(self.X.shape[1], 1), requires_grad=True)
        
    def get_L2(self, landa):
        return (landa/(2 * self.X.shape[0])) * torch.sum(self.w ** 2)
        
    def cost(self, y_pred, landa, y_true=None):
        if y_true is not None:
            y = y_true
        else:
            y = self.y
        loss = ((y * torch.log(y_pred)) + ((1 - y) * torch.log(1 - y_pred)))
        result = -1 * (torch.mean(loss) + self.get_L2(landa))
        return result
    
    def predict(self, X):
        y_pred = torch.matmul(X, self.w).reshape((-1,))
        return self.sigmoid(y_pred)
    
    def predict_label(self, X):
        X = self.add_bias(X)
        y_pred = self.predict(X)
        return y_pred >= 0.5
    
    
    def update_weights(self, lr):
        with torch.no_grad():
            self.w -= lr * self.w.grad
        
    def evaluate(self, landa, X, y=None):
        X = self.add_bias(X)
        y_pred = self.predict(X)
        return self.cost(y_pred, landa, y)
    
    def fit(self, lr, landa, iter, print_cost=False):
        for it in range(iter):
            y_pred = self.predict(self.X)
            cost_it = self.cost(y_pred, landa)
            cost_it.backward()
            self.update_weights(lr)
            if print_cost:
                print(f'cost in iteration {it} : {cost_it}')
            self.w.grad.data.zero_()
        

In [39]:
torch.manual_seed(42)
X_scaled = torch.tensor(X_scaled).float()
y = torch.tensor(y).float()
model = LogisticRegression(X_scaled, y, 0, 1)
model.fit(0.02,0.5,1000, True)

cost in iteration 0 : 2.488128185272217
cost in iteration 1 : 2.4660496711730957
cost in iteration 2 : 2.444061517715454
cost in iteration 3 : 2.422165870666504
cost in iteration 4 : 2.4003632068634033
cost in iteration 5 : 2.3786556720733643
cost in iteration 6 : 2.3570446968078613
cost in iteration 7 : 2.335531234741211
cost in iteration 8 : 2.3141167163848877
cost in iteration 9 : 2.2928030490875244
cost in iteration 10 : 2.2715909481048584
cost in iteration 11 : 2.2504827976226807
cost in iteration 12 : 2.2294793128967285
cost in iteration 13 : 2.2085821628570557
cost in iteration 14 : 2.1877927780151367
cost in iteration 15 : 2.1671130657196045
cost in iteration 16 : 2.1465439796447754
cost in iteration 17 : 2.126087188720703
cost in iteration 18 : 2.105743885040283
cost in iteration 19 : 2.0855159759521484
cost in iteration 20 : 2.0654051303863525
cost in iteration 21 : 2.045412063598633
cost in iteration 22 : 2.025538921356201
cost in iteration 23 : 2.005786895751953
cost in ite

  X_scaled = torch.tensor(X_scaled).float()
  y = torch.tensor(y).float()
