https://thelaziestprogrammer.com/sharrington/math-of-machine-learning/solving-logreg-newtons-method

https://rstudio-pubs-static.s3.amazonaws.com/160015_b192ca9855e84b57814e785ebd034a5e.html

https://www.dotnetlovers.com/Article/225/logistic-regression-explained

https://www.kaggle.com/code/elyas19/implement-logistic-regression-from-scratch/notebook

https://sonsnotation.blogspot.com/2020/11/2-logistic-regression.html

https://lee-jaejoon.github.io/stat-logistic/

In [1]:
import numpy as np
import pandas as pd
import random

# Data Generating

In [127]:
sample_size = 10000
x = np.random.normal(0,1,sample_size)
pi = 3 + (10 * x)
p = 1 / (1 + np.exp(-pi))
y = np.random.binomial(1,p,sample_size)
index_list=list(range(0,10000))
df = pd.DataFrame(y,x,columns=['y'])
df = df.reset_index().rename(columns={"index": "x"})
df['intercept'] = 1
df = df[['intercept','x','y']]

In [128]:
df

Unnamed: 0,intercept,x,y
0,1,1.510639,1
1,1,0.441886,1
2,1,0.457521,1
3,1,0.796044,1
4,1,0.201056,1
...,...,...,...
9995,1,0.781607,1
9996,1,1.543853,1
9997,1,-1.378448,0
9998,1,0.490335,1


# Newton's Method

In [131]:
# x = np.concatenate((df['x'].values.reshape(-1,1),df['intercept'].values.reshape(-1,1)), axis=1)
x = df[['x','intercept']].values
y = df['y'].values.reshape(-1,1)
weight_vec = np.random.normal(0,1,[2,1])
weight = [weight_vec]

while True:
    pi = 1/(1+np.exp(-x@weight_vec))
    grad = (1/len(df))*x.T@(pi-y)
    
    # np.dot, @ 매우 느림 .dot이 빠름
    H = (1/len(df))*x.T.dot(np.diag(pi.reshape(len(df)))).dot(np.diag((1-pi).reshape(len(df)))).dot(x)
    weight_vec = weight_vec - np.linalg.inv(H).T@grad # np.linalg.pinv
    weight.append(weight_vec)
    print(np.round(weight_vec,2))
    
    if all(np.round(weight[-2],10) == np.round(weight_vec,10)): # 이전 가중치와 소수점 10자리까지 같으면 중지
        break

[[1.59]
 [0.98]]
[[3.  ]
 [1.11]]
[[4.7]
 [1.5]]
[[6.69]
 [2.03]]
[[8.58]
 [2.57]]
[[9.7]
 [2.9]]
[[9.96]
 [2.98]]
[[9.97]
 [2.98]]
[[9.97]
 [2.98]]
[[9.97]
 [2.98]]


# Predict

In [132]:
df['p'] = 1/(1+np.exp(-(weight_vec[0]*df['x'] + weight_vec[1])))
df['p'].fillna(0,inplace=True)

# cutoff
df.loc[df['p'] >= 0.5, 'result'] = 1
df.loc[df['p'] < 0.5, 'result'] = 0

In [133]:
df

Unnamed: 0,intercept,x,y,p,result
0,1,1.510639,1,1.000000,1.0
1,1,0.441886,1,0.999379,1.0
2,1,0.457521,1,0.999469,1.0
3,1,0.796044,1,0.999982,1.0
4,1,0.201056,1,0.993198,1.0
...,...,...,...,...,...
9995,1,0.781607,1,0.999979,1.0
9996,1,1.543853,1,1.000000,1.0
9997,1,-1.378448,0,0.000021,0.0
9998,1,0.490335,1,0.999617,1.0


# Library

In [134]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [135]:
model = LogisticRegression()
model.fit(df['x'].values.reshape(-1,1), df["y"])

LogisticRegression()

# Comparison

In [136]:
# Coefficient
print("Implementation:",weight[-1].T)
print("sklearn library:",model.coef_, model.intercept_)

Implementation: [[9.96822816 2.97947763]]
sklearn library: [[9.24925939]] [2.77128429]


In [137]:
# Score
pred = model.predict(df['x'].values.reshape(-1,1))
print("Implementation:",sum(df.result.values == df.y.values)/100)
print("sklearn library:",np.round(accuracy_score(df['y'],pred)*100,2))

Implementation: 95.02
sklearn library: 95.01


# Application

In [110]:
import pandas as pd
from sklearn import datasets

data = datasets.load_breast_cancer()
ex = pd.DataFrame(data.data, columns = data.feature_names)
ex = ex[['mean radius', 'mean texture', 'mean area', 'mean symmetry']]
ex['target'] = data.target
ex

Unnamed: 0,mean radius,mean texture,mean area,mean symmetry,target
0,17.99,10.38,1001.0,0.2419,0
1,20.57,17.77,1326.0,0.1812,0
2,19.69,21.25,1203.0,0.2069,0
3,11.42,20.38,386.1,0.2597,0
4,20.29,14.34,1297.0,0.1809,0
...,...,...,...,...,...
564,21.56,22.39,1479.0,0.1726,0
565,20.13,28.25,1261.0,0.1752,0
566,16.60,28.08,858.1,0.1590,0
567,20.60,29.33,1265.0,0.2397,0


In [111]:
ex['intercept'] = 1
ex = ex[['intercept','mean radius','mean texture','mean area','mean symmetry','target']]

In [112]:
ex

Unnamed: 0,intercept,mean radius,mean texture,mean area,mean symmetry,target
0,1,17.99,10.38,1001.0,0.2419,0
1,1,20.57,17.77,1326.0,0.1812,0
2,1,19.69,21.25,1203.0,0.2069,0
3,1,11.42,20.38,386.1,0.2597,0
4,1,20.29,14.34,1297.0,0.1809,0
...,...,...,...,...,...,...
564,1,21.56,22.39,1479.0,0.1726,0
565,1,20.13,28.25,1261.0,0.1752,0
566,1,16.60,28.08,858.1,0.1590,0
567,1,20.60,29.33,1265.0,0.2397,0


In [138]:
ex_x =  ex[['intercept','mean radius','mean texture','mean area','mean symmetry']].values
ex_y = ex['target'].values.reshape(-1,1)
weight_vec = np.random.normal(0,1,[5,1])
weight=[weight_vec]


In [116]:
while True:
    pi = 1/(1+np.exp(-ex_x@weight_vec))
    grad = (1/len(ex))*ex_x.T@(pi-ex_y)
    
    # np.dot, @ 매우 느림 .dot이 빠름
    H = (1/len(ex))*ex_x.T.dot(np.diag(pi.reshape(len(ex)))).dot(np.diag((1-pi).reshape(len(ex)))).dot(ex_x)
    weight_vec = weight_vec - np.linalg.pinv(H).T@grad # np.linalg.pinv
    weight.append(weight_vec)
    print(np.round(weight_vec,2))
    
#    if all(np.round(weight[-2],10) == np.round(weight_vec,10)): # 이전 가중치와 소수점 10자리까지 같으면 중지
#        break
    if all(np.round(weight[-2],20) == np.round(weight_vec,20)): # 이전 가중치와 소수점 10자리까지 같으면 중지
        break

[[-0.34]
 [-0.64]
 [ 1.11]
 [ 0.19]
 [-0.15]]


In [None]:
x = np.concatenate((df['x'].values.reshape(-1,1),df['intercept'].values.reshape(-1,1)), axis=1)
y = df['y'].values.reshape(-1,1)
weight_vec = np.random.normal(0,1,[2,1])
weight=[weight_vec]

while True:
    pi = 1/(1+np.exp(-x@weight_vec))
    grad = (1/len(df))*x.T@(pi-y)
    
    # np.dot, @ 매우 느림 .dot이 빠름
    H = (1/len(df))*x.T.dot(np.diag(pi.reshape(len(df)))).dot(np.diag((1-pi).reshape(len(df)))).dot(x)
    weight_vec = weight_vec - np.linalg.inv(H).T@grad # np.linalg.pinv
    weight.append(weight_vec)
    print(np.round(weight_vec,2))
    
    if all(np.round(weight[-2],10) == np.round(weight_vec,10)): # 이전 가중치와 소수점 10자리까지 같으면 중지
        break

In [94]:
ex_model = LogisticRegression()
ex_model.fit(ex_x, ex_y)

  return f(*args, **kwargs)


LogisticRegression()

In [95]:
ex_model.coef_, ex_model.intercept_

(array([[ 0.10694209,  0.19408292, -0.21000379, -0.01413528, -1.31210423]]),
 array([10.94680214]))