In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [2]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print("오차 행렬")
    print(confusion)
    print("정확도 : {0}, 정밀도 : {1}, 재현율 : {2}".format(accuracy, precision, recall))

In [3]:
# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    
    return df

In [4]:
# 불필요한 속성 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    
    return df

In [5]:
# 문자 형태 -> 숫자 형태
from sklearn.preprocessing import LabelEncoder
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
        
    return df

In [6]:
# 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    
    return df

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [8]:
titanic_df = pd.read_csv('./train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

In [9]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)

오차 행렬
[[104  14]
 [ 13  48]]
정확도 : 0.8491620111731844, 정밀도 : 0.7741935483870968, 재현율 : 0.7868852459016393


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)
pred_proba.shape

(179, 2)

In [11]:
import numpy as np
pred_proba_result = np.concatenate([pred_proba, pred.reshape(-1, 1)], axis=1)
pred_proba_result[:3]

array([[0.46170212, 0.53829788, 1.        ],
       [0.87864222, 0.12135778, 0.        ],
       [0.87728507, 0.12271493, 0.        ]])

In [12]:
from sklearn.preprocessing import Binarizer

In [13]:
x = [[1, -1, 2], [2, 0, 0], [0, 1.1, 1.2]]
# x의 개별 원소들이 threshold값보다 같거나 작으면 0을, 크면 1을 반환
binarizer = Binarizer(threshold=1.1)
print(binarizer.fit_transform(x))

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


## 분류 결정 임곗값 조정

In [17]:
c_threshold = 0.6
pred_1 = pred_proba[:,1].reshape(-1, 1)
binarizer = Binarizer(threshold=c_threshold).fit(pred_1)
c_pred = binarizer.transform(pred_1)

get_clf_eval(y_test, c_pred)

오차 행렬
[[112   6]
 [ 16  45]]
정확도 : 0.8770949720670391, 정밀도 : 0.8823529411764706, 재현율 : 0.7377049180327869


In [18]:
thresholds = [0.4, 0.45, 0.5, 0.55, 0.6]

def get_eval_by_threshold(y_test, pred_1, thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_1)
        custom_predict = binarizer.transform(pred_1)
        print("임곗값 : ", custom_threshold)
        get_clf_eval(y_test, custom_predict)
        
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1, 1), thresholds)

임곗값 :  0.4
오차 행렬
[[99 19]
 [10 51]]
정확도 : 0.8379888268156425, 정밀도 : 0.7285714285714285, 재현율 : 0.8360655737704918
임곗값 :  0.45
오차 행렬
[[103  15]
 [ 12  49]]
정확도 : 0.8491620111731844, 정밀도 : 0.765625, 재현율 : 0.8032786885245902
임곗값 :  0.5
오차 행렬
[[104  14]
 [ 13  48]]
정확도 : 0.8491620111731844, 정밀도 : 0.7741935483870968, 재현율 : 0.7868852459016393
임곗값 :  0.55
오차 행렬
[[109   9]
 [ 15  46]]
정확도 : 0.8659217877094972, 정밀도 : 0.8363636363636363, 재현율 : 0.7540983606557377
임곗값 :  0.6
오차 행렬
[[112   6]
 [ 16  45]]
정확도 : 0.8770949720670391, 정밀도 : 0.8823529411764706, 재현율 : 0.7377049180327869


## F1 스코어

In [19]:
from sklearn.metrics import f1_score

In [20]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print("오차 행렬")
    print(confusion)
    print("정확도 : {0}, 정밀도 : {1}, 재현율 : {2}, F1 스코어 : {3}".format(accuracy, precision, recall, f1))

In [21]:
thresholds = [0.4, 0.45, 0.5, 0.55, 0.6]

def get_eval_by_threshold(y_test, pred_1, thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_1)
        custom_predict = binarizer.transform(pred_1)
        print("임곗값 : ", custom_threshold)
        get_clf_eval(y_test, custom_predict)
        
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1, 1), thresholds)

임곗값 :  0.4
오차 행렬
[[99 19]
 [10 51]]
정확도 : 0.8379888268156425, 정밀도 : 0.7285714285714285, 재현율 : 0.8360655737704918, F1 스코어 : 0.7786259541984734
임곗값 :  0.45
오차 행렬
[[103  15]
 [ 12  49]]
정확도 : 0.8491620111731844, 정밀도 : 0.765625, 재현율 : 0.8032786885245902, F1 스코어 : 0.784
임곗값 :  0.5
오차 행렬
[[104  14]
 [ 13  48]]
정확도 : 0.8491620111731844, 정밀도 : 0.7741935483870968, 재현율 : 0.7868852459016393, F1 스코어 : 0.7804878048780488
임곗값 :  0.55
오차 행렬
[[109   9]
 [ 15  46]]
정확도 : 0.8659217877094972, 정밀도 : 0.8363636363636363, 재현율 : 0.7540983606557377, F1 스코어 : 0.793103448275862
임곗값 :  0.6
오차 행렬
[[112   6]
 [ 16  45]]
정확도 : 0.8770949720670391, 정밀도 : 0.8823529411764706, 재현율 : 0.7377049180327869, F1 스코어 : 0.8035714285714285


## ROC 곡선, AUC 값

In [22]:
from sklearn.metrics import roc_auc_score

In [29]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, pred)
    print("오차 행렬")
    print(confusion)
    print("정확도 : {0}, 정밀도 : {1}, 재현율 : {2},\n F1 스코어 : {3}, AUC 값 : {4}"
          .format(accuracy, precision, recall, f1, auc))

In [30]:
thresholds = [0.4, 0.45, 0.5, 0.55, 0.6]

def get_eval_by_threshold(y_test, pred_1, thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_1)
        custom_predict = binarizer.transform(pred_1)
        print("임곗값 : ", custom_threshold)
        get_clf_eval(y_test, custom_predict)
        
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1, 1), thresholds)

임곗값 :  0.4
오차 행렬
[[99 19]
 [10 51]]
정확도 : 0.8379888268156425, 정밀도 : 0.7285714285714285, 재현율 : 0.8360655737704918,
 F1 스코어 : 0.7786259541984734, AUC 값 : 0.8375243123089747
임곗값 :  0.45
오차 행렬
[[103  15]
 [ 12  49]]
정확도 : 0.8491620111731844, 정밀도 : 0.765625, 재현율 : 0.8032786885245902,
 F1 스코어 : 0.784, AUC 값 : 0.8380800222283968
임곗값 :  0.5
오차 행렬
[[104  14]
 [ 13  48]]
정확도 : 0.8491620111731844, 정밀도 : 0.7741935483870968, 재현율 : 0.7868852459016393,
 F1 스코어 : 0.7804878048780488, AUC 값 : 0.8341205890525146
임곗값 :  0.55
오차 행렬
[[109   9]
 [ 15  46]]
정확도 : 0.8659217877094972, 정밀도 : 0.8363636363636363, 재현율 : 0.7540983606557377,
 F1 스코어 : 0.793103448275862, AUC 값 : 0.8389135871075298
임곗값 :  0.6
오차 행렬
[[112   6]
 [ 16  45]]
정확도 : 0.8770949720670391, 정밀도 : 0.8823529411764706, 재현율 : 0.7377049180327869,
 F1 스코어 : 0.8035714285714285, AUC 값 : 0.8434287302028343
