# Gradient Descent Practice

In this repository, you are expecting to see the following analyses from scratch:

* Model 1: use a stat package in python & regularization <br/>
* Model 2: write gradient descent from scratch  <br/>
* Model 3: add regularization in model 2 (later)  <br/>
* Model 4: stochastic gradient descent (later)  <br/>
* Model 5: mini-batch gradient descent (later)  <br/>

### 1. Select the dataset: iris

In [80]:
# import packages
import random
import math
import pandas as pd
import numpy as np
from sklearn import datasets

In [81]:
# read in the sample dataset from sklearn
iris = datasets.load_iris()
dat = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])
# for simplicity, we only examine binary classification:
# map to whether not 1
value_map = {0. : 1, 
             1. : 0, 
             2. : 1} 
# replace the value in target & change column names
dat['target'] = dat['target'].map(value_map)
dat.columns = [var.replace(' (cm)', '') for var in dat.columns if '(cm)' in var] + ['target']
dat.head(3)

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1


### Model 1: Use statistical packages

In [3]:
# Package 1: statsmodels.api
import statsmodels.api as sm
dat1 = dat.copy()
dat1['intercept'] = 1 # Note, if we don't have the intercept, statsmodels will by default has not intercept
model = sm.Logit(dat1['target'], dat1.loc[:,dat1.columns!='target'])
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.485883
         Iterations 6


0,1,2,3
Dep. Variable:,target,No. Observations:,150.0
Model:,Logit,Df Residuals:,145.0
Method:,MLE,Df Model:,4.0
Date:,"Sun, 01 Jul 2018",Pseudo R-squ.:,0.2367
Time:,16:09:28,Log-Likelihood:,-72.882
converged:,True,LL-Null:,-95.477
,,LLR p-value:,3.631e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
sepal length,0.2527,0.649,0.389,0.697,-1.020,1.526
sepal width,2.7794,0.786,3.537,0.000,1.239,4.320
petal length,-1.2993,0.682,-1.904,0.057,-2.637,0.038
petal width,2.7043,1.163,2.326,0.020,0.426,4.983
intercept,-7.3229,2.498,-2.932,0.003,-12.219,-2.427


In [4]:
# Package 2: sklearn
from sklearn.linear_model import LogisticRegression
dat2 = dat.copy()
lr = LogisticRegression(random_state=0, C=1e5) # lower C means higher penalty, use convention in SVM
lr.fit(dat2.loc[:,dat2.columns!='target'], dat2['target'])
print('intercept: ', lr.intercept_)
dict(zip(dat2.columns[dat2.columns!='target'], lr.coef_[0]))

intercept:  [-7.32023848]


{'sepal length': 0.25233130342654525,
 'sepal width': 2.779020300705264,
 'petal length': -1.298869859155948,
 'petal width': 2.7035482616441704}

We can see the result is slightly different, probably because one requires smaller different between iterations to determine the convergence. Therefore, we believe everything is correct here. 

### Model 2: Build the logistic regression from scratch

In [5]:
# review the dataset
dat.describe()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,0.666667
std,0.828066,0.433594,1.76442,0.763161,0.472984
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,1.0
max,7.9,4.4,6.9,2.5,1.0


In [82]:
# build sigmoid functions
def sigmoid(x):
    return(1/(1+math.exp(-x))) # we should raise exception for different x data type

In [83]:
# build loss function 
def lr_loss(y=1, p=0.5):
    if (y in [0,1]) and (0<p<1):
        return(-y*math.log(p)-(1-y)*math.log(1-p))
    else:
        raise ValueError('input y or p is out of bound.')

In [84]:
# let's train the model
def lr_train(data = dat, fit_intercept = True, random_state = 0, alpha = 0.01, tol = 1e-4, target = 'target', varList = []):
    # assume there is no column called 'intercept'
    if fit_intercept:
        dat['intercept'] = 1
        varList.append('intercept')
    # initiate beta based on random_state:
    random.seed(random_state)
    init_beta = [0]*5
    new_beta = [0.26,  2.779, -1.2988, 2.703, -7.320]#[random.random() for i in range(len(varList))]
    # add two columns: predicted prob 
    data['pred'] = data.apply(lambda row: sigmoid(np.dot(row[varList], init_beta)), axis = 1)
    data['loss'] = data.apply(lambda row: lr_loss(y=row['target'], p=row['pred']), axis=1)
    # loop through
    while max(abs(np.array(new_beta) - np.array(init_beta)))>1e-5:
        for i in range(len(varList)):
            init_beta[i] = new_beta[i]
            new_beta[i] = init_beta[i] - alpha * (1/data.shape[0]) *sum((data['pred']-data['target'])*data[varList[i]]) #(*)?
            data['pred'] = data.apply(lambda row: sigmoid(np.dot(row[varList], new_beta)), axis = 1)
    return(new_beta)

In [86]:
print(lr_train(data=dat, varList = ['sepal length', 'sepal width', 'petal length', 'petal width']))

[0.2601115010607848, 2.7741458682048834, -1.305178458118098, 2.7010866692333706, -7.321589864367134]


There is something wrong with this chunk of code:
    1. it could be parallel computed
    2. the running time is really volatile -> could be something wrong with the code here (*)

In [66]:
# we write scripts so that it makes things easier to check and then put them in the above function
dat['intercept'] = 1
varList = ['sepal length', 'sepal width', 'petal length', 'petal width']
varList.append('intercept')
random.seed(1)
init_beta = [random.rando m() for i in range(len(varList))]

In [68]:
# add two columns: predicted prob 
dat['pred'] = dat.apply(lambda row: sigmoid(np.dot(row[varList], init_beta)), axis = 1)
dat['loss'] = dat.apply(lambda row: lr_loss(y=row['target'], p=row['pred']), axis=1)

In [69]:
# update beta values
alpha = 0.01
new_beta = [0.26,  2.779, -1.2988, 2.703, -7.320]
init_beta = [0,0,0,0,0]
while max(abs(np.array(new_beta) - np.array(init_beta)))>1e-5:
    for i in range(len(varList)):
        init_beta[i] = new_beta[i]
        new_beta[i] = init_beta[i] - alpha * (1/dat.shape[0]) *sum((dat['pred']-dat['target'])*dat[varList[i]])
        dat['pred'] = dat.apply(lambda row: sigmoid(np.dot(row[varList], new_beta)), axis = 1)
    print(new_beta)
print("=== finally the answer ===")    
print(new_beta)    

[0.24032791257612726, 2.779335505892976, -1.2983154652456175, 2.703151200594591, -7.319890177498307]
[0.2409881687323892, 2.7796371011489787, -1.297880947684358, 2.703286667023048, -7.319791451576524]
[0.24158135298035813, 2.779908285291391, -1.2974913015186553, 2.7034080204102486, -7.319702680446149]
[0.242114367531563, 2.7801521901339763, -1.297141918018386, 2.7035167122575756, -7.319622841646]
[0.24259339330805832, 2.780371619594205, -1.2968286680724581, 2.7036140426720703, -7.319551019242445]
[0.2430239672971272, 2.7805690849894025, -1.2965478511235158, 2.7037011765530394, -7.319486392459998]
[0.2434110512656727, 2.7807468363622054, -1.2962961497392833, 2.703779157978459, -7.319428225573095]
[0.24375909285528918, 2.780906890310733, -1.29607058916621, 2.7038489230028717, -7.31937585891118]
[0.2440720799504442, 2.7810510547376155, -1.2958685012915179, 2.7039113110519004, -7.319328700847808]
[0.2443535891006224, 2.78118095087918, -1.2956874925098776, 2.703967075075438, -7.319286220660

In [78]:
[0]*5

[0, 0, 0, 0, 0]