# Impors

In [48]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score, confusion_matrix

import numpy as np

# Load data

In [49]:
df_train = pd.read_csv('data/ready/train.csv')
print(df_train.shape)
df_train.head()

(27838, 18)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,native.country_is_NA,workclass_is_NA,occupation_is_NA
0,31,3,124827,8,11,2,11,0,4,1,0,0,40,38,>50K,False,False,False
1,27,4,89718,15,10,5,11,3,4,0,0,0,40,38,<=50K,False,False,False
2,37,3,224947,8,11,4,7,1,4,1,0,0,40,38,<=50K,False,False,False
3,40,6,269733,8,11,4,2,1,4,1,0,0,40,38,<=50K,False,False,False
4,57,4,125000,14,15,2,9,0,4,1,0,0,35,38,>50K,False,False,False


In [50]:
df_val = pd.read_csv('data/ready/val.csv')
print(df_val.shape)
df_val.head()

(3094, 18)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,native.country_is_NA,workclass_is_NA,occupation_is_NA
0,58,3,128162,11,9,0,11,1,4,0,0,0,24,38,<=50K,False,False,False
1,25,3,570821,15,10,2,6,0,4,1,0,0,40,38,<=50K,False,False,False
2,50,3,43764,10,16,2,9,0,4,1,15024,0,50,38,>50K,False,False,False
3,21,7,207988,11,9,2,14,2,4,0,0,0,35,38,<=50K,False,True,True
4,41,5,112362,11,9,2,4,0,4,1,0,0,38,38,<=50K,False,False,False


# Prepare dataset

## Prep target

In [51]:
target_dict = {'<=50K': 0
              , '>50K': 1}

df_val['income'].replace(target_dict, inplace=True)
df_val['income'].value_counts()

0    2356
1     738
Name: income, dtype: int64

In [52]:
df_train['income'].replace(target_dict, inplace=True)
df_train['income'].value_counts()

0    21123
1     6715
Name: income, dtype: int64

## Prep X

In [53]:
scaler = StandardScaler()

cols_x = set(df_train) - set(['income'])

df_train.loc[:, cols_x] = scaler.fit_transform(df_train[cols_x])
df_val.loc[:, cols_x] = scaler.transform(df_val[cols_x])

  df_train.loc[:, cols_x] = scaler.fit_transform(df_train[cols_x])
  df_train.loc[:, cols_x] = scaler.fit_transform(df_train[cols_x])
  df_val.loc[:, cols_x] = scaler.transform(df_val[cols_x])
  df_val.loc[:, cols_x] = scaler.transform(df_val[cols_x])


# sklearn model

In [54]:
df_train['income']

0        1
1        0
2        0
3        0
4        1
        ..
27833    0
27834    0
27835    0
27836    1
27837    1
Name: income, Length: 27838, dtype: int64

In [55]:
lr = LogisticRegression()

lr.fit(df_train.drop(columns='income'), df_train['income'])

y_pred_train = lr.predict(df_train.drop(columns='income'))
y_pred_val = lr.predict(df_val.drop(columns='income'))

print('Train Balanced accuracy:', balanced_accuracy_score(df_train['income'], y_pred_train))
print('Validation Balanced accuracy:', balanced_accuracy_score(df_val['income'], y_pred_val))

display(confusion_matrix(df_val['income'], y_pred_val))

Train Balanced accuracy: 0.7010816305545624
Validation Balanced accuracy: 0.699237028448383


array([[2203,  153],
       [ 396,  342]])

# lr from scratch

In [64]:
class MyLogisticRegression:
    def __init__(self):
        self.n = None
        self.m = None
        self.w = None
        self.b = None
        self.X_train = None
        self.y_train = None
    
    def init_params(self, X, y, reset=True):
        self.X_train = self.to_numpy_transpose(X)
        self.y_train = self.to_numpy_transpose(y)
        self.n = self.X_train.shape[0]
        self.m = self.X_train.shape[1]
        if reset:
            self.w = np.zeros((self.n, 1))
            self.b = np.zeros((1, 1))
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def step(self, x):
        return (x > 0.5).astype(int)
    
    def to_numpy_transpose(self, x):
        return np.array(x).reshape(-1, x.shape[0])
    
    def predict(self, x):
        z = self.predict_proba(x)
        return self.step(z)
    # ----------
    def predict_proba(self, X):
        A = self.sigmoid(self.w.T @ X + self.b)
        return A
    
    def fit(self, X, y, n_epoch=100, lr=0.01, print_every=10, reset=True):
        self.init_params(X, y, reset=reset)
        for i in range(n_epoch):
            # calculate output of function
            probs = self.predict_proba(self.X_train)
            # calculate cost
            cost = self.calculate_cost(self.y_train, probs)
            # calculate gradients
            grads = self.calculate_gradients(self.X_train, self.y_train, probs)
            # update params
            self.w = self.w - lr * grads['dw']
            self.b = self.b - lr * grads['db']
            # print cost
            if i % print_every == 0:
                print(cost)
        
    
    def calculate_cost(self, y, probs):
        cost = -(1. / self.m) * np.sum(y * np.log(probs) + (1 - y) * np.log(1 - probs))
        return cost
    
    def calculate_gradients(self, X, y, probs):
        dz = probs - y
        dw = (X @ dz.T) / self.m
        db = np.sum(dz) / self.m
        grads = {'dw': dw, 'db': db}
        return grads
    # ----------
    
    def get_params(self):
        pass
        
    def set_params(self):
        pass

In [65]:
l = MyLogisticRegression()

l.fit(df_train.drop(columns='income'), df_train['income'], lr=1e-1)


0.6931471805599453
0.6397721039906498
0.6074172111983632
0.5875286762492731
0.5751025738373576
0.5672159755740575
0.5621397463002353
0.558832661192271
0.5566559920150475
0.5552110096474735


In [66]:
l.fit(df_train.drop(columns='income'), df_train['income'], lr=1e0, reset=False)

0.554244873244044
0.5522446163757838
0.5522120747833296
0.5522114848171068
0.5522114737896131
0.5522114735792402
0.5522114735751488
0.5522114735750676
0.5522114735750661
0.5522114735750661
