# 분류기 만들기

타이타닉 데이터의 생존여부 분류
- 규칙: 성별(sex) = 1 생존하지 않은 것으로 분류

In [24]:
from sklearn.base import BaseEstimator
import numpy as np

class MyDummyClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    def predict(self, X):
        pred = np.zeros((X.shape[0], 1))
        for i in range(X.shape[0]):
            if X['Sex'].iloc[i] == 1:
                pred[i]=0
            else:
                pred[i]=1
        return pred
                

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [26]:
titanic_df = pd.read_csv('./data/titanic.csv')
titanic_df.head(2)
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)

In [27]:
from sklearn.preprocessing import LabelEncoder

#Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

#머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

#레이블 인코딩 수행 함수
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

#앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df) 
    df = drop_features(df)
    df = format_features(df)
    return df

In [28]:
X_titanic_df = transform_features(X_titanic_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [75]:
# 데이터셋 분할
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

In [76]:
my_clf = MyDummyClassifier()
my_clf.fit(X_train, y_train)

In [77]:
my_pred = my_clf.predict(X_test)
accuracy_score(y_test, my_pred)

0.8324022346368715

In [78]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, my_pred)

array([[103,  15],
       [ 15,  46]])

In [79]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, my_pred), recall_score(y_test, my_pred)

(np.float64(0.7540983606557377), np.float64(0.7540983606557377))

# 로지스틱회귀, 랜덤포레스트, KNN의 정밀도, 재현율 비교하기

In [80]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(confusion)
    print(''*20)
    print(accuracy, precision, recall)

In [82]:
# 랜덤포레스트 분류모델 생성
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)

# 정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[104  14]
 [ 15  46]]

0.8379888268156425 0.7666666666666667 0.7540983606557377


In [83]:
# KNN 분류모델 생성
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
pred = knn_clf.predict(X_test)

# 정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[101  17]
 [ 28  33]]

0.7486033519553073 0.66 0.5409836065573771


In [84]:
# 로지스틱회귀 분류모델 생성
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=2000)
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

# 정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)


[[104  14]
 [ 13  48]]

0.8491620111731844 0.7741935483870968 0.7868852459016393


In [97]:
pred_proba = lr_clf.predict_proba(X_test)
pos_proba = pred_proba[:,1] # 양성클래스일 확률

threshold = 0.6 # 임계치
custom_proba = (pos_proba>=threshold).astype(int) # 임계치보다 크면 1
custom_proba
confusion_matrix(y_test, custom_proba)
get_clf_eval(y_test, custom_proba)

[[112   6]
 [ 16  45]]

0.8770949720670391 0.8823529411764706 0.7377049180327869


# 정밀도와 재현율의 변화

정밀도와 재현율의 불균형이 심할 때,
혹은 비지니스의 요구사항이 있을 때
임계치를 조정해야 한다.

임계치를 낮추면 정밀도는 낮아지고, 재현율은 올라간다.

# 평가 결과 확인하기

In [102]:
from sklearn.metrics import f1_score, classification_report
f1_score(y_test, pred) #정밀도와 재현율의 평균

np.float64(0.7804878048780488)

In [103]:
print(classification_report(y_test, pred))  #평가보고서

              precision    recall  f1-score   support

           0       0.89      0.88      0.89       118
           1       0.77      0.79      0.78        61

    accuracy                           0.85       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.85      0.85      0.85       179



In [104]:
import pandas as pd
pd.Series(lr_clf.coef_[0]).sort_values() # 피처의 중요도는 계수

1   -2.499594
0   -0.897322
3   -0.278152
7   -0.109094
4   -0.090091
6   -0.089830
2   -0.034793
5    0.000574
dtype: float64