In [2]:
import numpy as np
from sklearn import datasets

#GD를 활용한 LogisticRegression
class LogisticRegression:
    def __init__(self, learning_rate=0.01, threshold=0.01, max_iterations=100000, fit_intercept=True, verbose=False):
        self._learning_rate = learning_rate  # 학습 계수
        self._max_iterations = max_iterations  # 반복 횟수
        self._threshold = threshold  # 학습 중단 계수
        self._fit_intercept = fit_intercept  # 절편 사용 여부를 결정
        self._verbose = verbose  # 중간 진행사항 출력 여부

    # theta(W) 계수들 return
    def get_coeff(self):
        return self._W

    # 절편 추가
    def add_intercept(self, x_data):
        intercept = np.ones((x_data.shape[0], 1))
        return np.concatenate((intercept, x_data), axis=1)

    # 시그모이드 함수(로지스틱 함수)
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    # h: hypothesis, y: y_data_train
    # 사건 1이 일어날 확률과 사건 1이 일어나지 않을 확률을 
    def cost(self, h, y):##어떤뜻인지 파악하기(cost function에 대한 이해필요(GD,SGD 찾아보기))
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    
    def fit(self, x_data, y_data):
        num_examples, num_features = np.shape(x_data)

        if self._fit_intercept:
            x_data = self.add_intercept(x_data)

        
        self._W = np.zeros(x_data.shape[1])

        for i in range(self._max_iterations):
            z = np.dot(x_data, self._W)
            hypothesis = self.sigmoid(z)
       
            

            #실제값과 예측값의 차이
            diff = hypothesis - y_data

            cost = self.cost(hypothesis, y_data)

            ##어떤 과정인지 설명하기
            # 내적(x_data^T 한 것과 diff를) 그리고 그 값을 훈련샘플 수 만큼 나누기.
            # = cost function의 편미분을 의미함.
            gradient = np.dot(x_data.transpose(), diff) / num_examples
            
            
            self._W -= self._learning_rate * gradient

           
            if cost < self._threshold:
                return False

           
            if (self._verbose == True and i % 100 == 0):
                print('cost :', cost)

    def predict_prob(self, x_data):
        if self._fit_intercept:
            x_data = self.add_intercept(x_data)

        return self.sigmoid(np.dot(x_data, self._W))

    def predict(self, x_data):
        
        return self.predict_prob(x_data).round()##왜 라운드 함수를 쓰는지 : round가 반올림 함수라면, 0 아니면 1로 나누기 위해서

In [3]:
import pandas as pd

In [4]:
df=pd.read_csv('Social_Network_Ads.csv')
df.head(3)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0


In [5]:
df.dtypes

User ID             int64
Gender             object
Age                 int64
EstimatedSalary     int64
Purchased           int64
dtype: object

In [6]:
df['Gender'] = df.Gender.map({'Male':1, 'Female':2})
print(df.dtypes)
df.head(3)

User ID            int64
Gender             int64
Age                int64
EstimatedSalary    int64
Purchased          int64
dtype: object


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,2,26,43000,0


In [7]:
# train test 나누기 방법1
from sklearn.preprocessing import MinMaxScaler

scaler_X = MinMaxScaler()
scaler_Y = MinMaxScaler()


x_data_train = (df.iloc[:int(len(df)*0.8),1:4])
#x_data_train = np.array(x_data_train)
scaler_X.fit(x_data_train)
x_data_train = pd.DataFrame(scaler_X.transform(x_data_train), index=x_data_train.index, columns=x_data_train.columns)

y_data_train = df.iloc[:int(len(df)*0.8),4]
#scaler_Y.fit_transform(y_data_train)
#y_data_train = np.array(y_data_train)




x_data_test = df.iloc[int(len(df)*0.8):, 1:4]
#x_data_test = np.array(x_data_test)
scaler_X.fit(x_data_test)
x_data_test = pd.DataFrame(scaler_X.transform(x_data_test), index=x_data_test.index, columns=x_data_test.columns)

y_data_test = df.iloc[int(len(df)*0.8):, 4]
#y_data_test = np.array(y_data_test)

In [8]:
# train test 나누기 방법2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X=df.iloc[:,1:4]
y=df.iloc[:,4]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler_X = MinMaxScaler()
scaler_Y = MinMaxScaler()

scaler_X.fit(X_train)
X_train = pd.DataFrame(scaler_X.transform(X_train), index=X_train.index, columns=X_train.columns)

scaler_X.fit(X_test)
X_test = pd.DataFrame(scaler_X.transform(X_test), index=X_test.index, columns=X_test.columns)

In [9]:
lr=LogisticRegression(learning_rate=0.5, max_iterations=10000, verbose=True)

In [10]:
lr.fit(x_data_train, y_data_train)

cost : 0.6931471805599467
cost : 0.4633566700481224
cost : 0.40396606829909104
cost : 0.37650664424866365
cost : 0.3614334055318898
cost : 0.35223733131945195
cost : 0.34622349037678307
cost : 0.34209513989286167
cost : 0.3391584567440826
cost : 0.33701210521213887
cost : 0.33540974350736136
cost : 0.3341929729936327
cost : 0.33325607710110267
cost : 0.33252631956994894
cost : 0.33195237583622694
cost : 0.33149725303332944
cost : 0.3311338027348283
cost : 0.3308417905331231
cost : 0.33060593215054135
cost : 0.3304145472621572
cost : 0.3302586182669557
cost : 0.3301311205594506
cost : 0.3300265385039335
cost : 0.3299405107167562
cost : 0.3298695668457842
cost : 0.329810930037923
cost : 0.3297623671893944
cost : 0.329722074371794
cost : 0.3296885884367787
cost : 0.32966071829843513
cost : 0.3296374911423109
cost : 0.32961811005234753
cost : 0.32960192043921605
cost : 0.3295883833013611
cost : 0.3295770538251343
cost : 0.3295675641820875
cost : 0.3295596096440886
cost : 0.329552937334655


In [11]:
lr_2=LogisticRegression(learning_rate=0.5, max_iterations=10000, verbose=True)
lr_2.fit(X_train, y_train)

cost : 0.6931471805599467
cost : 0.51296747678464
cost : 0.453257809790976
cost : 0.4245025446263518
cost : 0.40828052497324857
cost : 0.3981739726224388
cost : 0.39144609649976775
cost : 0.38675257063793306
cost : 0.38336278957509606
cost : 0.380848720361131
cost : 0.37894471034833654
cost : 0.37747821221410943
cost : 0.3763329640351631
cost : 0.3754282321638834
cost : 0.37470652769751184
cost : 0.3741260360373769
cost : 0.37365578751070183
cost : 0.3732724840421635
cost : 0.3729583597947435
cost : 0.3726997061475761
cost : 0.37248583440641087
cost : 0.37230833344164493
cost : 0.3721605300261309
cost : 0.3720370909896098
cost : 0.3719337261984349
cost : 0.371846964265927
cost : 0.37177398142384055
cost : 0.3717124697210907
cost : 0.37166053463762194
cost : 0.3716166149226762
cost : 0.3715794193810726
cost : 0.37154787669477696
cost : 0.37152109534982636
cost : 0.37149833145464806
cost : 0.37147896276269116
cost : 0.3714624676036665
cost : 0.3714484077209733
cost : 0.37143641423447626


In [12]:
preds=lr.predict(x_data_test)
preds

array([1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
       1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [13]:
preds_2=lr_2.predict(X_test)
preds_2

array([0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0.])

In [14]:
print(list(y_data_test))

[1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1]


In [19]:
print(list(y_test))

[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1]


In [15]:
print('앞 80% 분량을 train data로, 나머지 뒤 20% 분량을 test data로 지정하고 시행시 정확도 : ', (preds == y_data_test).mean())

앞 80% 분량을 train data로, 나머지 뒤 20% 분량을 test data로 지정하고 시행시 정확도 :  0.6625


In [16]:
print('random으로 train/test dataset 분류시 정확도 : ', (preds_2 == y_test).mean())

random으로 train/test dataset 분류시 정확도 :  0.8875


In [17]:
# iris dataset 활용 예제.
# 출처: https://github.com/yoonkt200/ml-theory-python/blob/master/01-regression/logistic-regression/logistic-regression.py

iris = datasets.load_iris()

X = iris.data[:, :]

y = (iris.target != 0) * 1



# 학습 implementation

model = LogisticRegression(learning_rate=0.1, threshold=0.01, max_iterations=10000, verbose=True)

model.fit(X, y)

preds = model.predict(X)

print((preds == y).mean())

print(model.get_coeff())

cost : 0.6931471805599454
cost : 0.05794011269381071
cost : 0.03120070412962307
cost : 0.02166715205967918
cost : 0.01672282428356398
cost : 0.013678670257084902
cost : 0.011607922362094413
cost : 0.010104037475698113
1.0
[-0.3140981  -0.49604387 -1.66996011  2.62273613  1.19189337]


In [18]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3