타이타닉 생존자 예측 분류기 평가

In [2]:
# Libraries imported for the analysis.
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas deprecation and
# chained-assignment warnings raised by the preprocessing cells; consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the 'titanic' example dataset through seaborn and preview the
# first five rows.
df = sns.load_dataset(name='titanic')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


1. 데이터 전처리
- Feature selection: 중복 필드 제거

In [4]:
# Keep the target plus one representative of each redundant column group
# (in the preview, `class` mirrors `pclass` and `alive` mirrors `survived`).
# `who` and `deck` are still kept here; the cleaning cells below use or drop them.
# .copy() yields an independent frame, so the column assignments that follow
# do not operate on a slice of the loaded dataset (SettingWithCopyWarning).
selected_columns = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
                    'fare', 'embarked', 'who', 'deck']
df = df[selected_columns].copy()
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck
886,0,2,male,27.0,0,0,13.0,S,man,
887,1,1,female,19.0,0,0,30.0,S,woman,B
888,0,3,female,,1,2,23.45,S,woman,
889,1,1,male,26.0,0,0,30.0,C,man,C
890,0,3,male,32.0,0,0,7.75,Q,man,


- 결측치 처리

In [5]:
# Check which columns contain missing values (count per column)
df.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
who           0
deck        688
dtype: int64

In [6]:
# Frequency of each category in the `who` column
df['who'].value_counts()

who
man      537
woman    271
child     83
Name: count, dtype: int64

In [7]:
# Fill missing ages with the mean age of adult passengers (who == man/woman),
# rounded to one decimal place.
adult_mean_age = df.loc[df['who'].isin(['man', 'woman']), 'age'].mean().round(1)
# Assign the result back to the column instead of calling
# `df.age.fillna(..., inplace=True)`: an inplace method on a selected column
# is chained assignment, which pandas deprecates and which leaves `df`
# unchanged when Copy-on-Write is enabled.
df['age'] = df['age'].fillna(adult_mean_age)

In [8]:
# Number of ages still missing after imputation
df['age'].isna().sum()

0

In [9]:
# Fill the missing embarkation ports with 'S' (the code shown for
# Southampton rows in the dataset preview).
# Explicit column assignment replaces the chained
# `df.embarked.fillna(..., inplace=True)`, which pandas deprecates and which
# does not modify `df` when Copy-on-Write is enabled.
df['embarked'] = df['embarked'].fillna('S')

In [10]:
# Drop `deck` (688 missing values per the count above) and `who`
# (only needed for the age imputation).
# Rebind the result rather than using inplace=True, so the frame is not
# mutated in place and no slice-related warnings are triggered.
df = df.drop(columns=['deck', 'who'])

In [11]:
# Final check: total number of missing values across the whole frame
missing_per_column = df.isna().sum()
missing_per_column.sum()

0

- 카테고리형 데이터를 숫자로 변환

In [13]:
# Preview the first three rows before encoding the categorical columns
df.iloc[:3]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


In [14]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used by the next cell to turn the string categories
# in `sex` and `embarked` into integer codes.
le = LabelEncoder()

In [15]:
# Encode the string categories as integer codes.
# Each column gets its own encoder: refitting one shared encoder would
# overwrite the learned classes of the previous column, making it impossible
# to map the `sex` codes back to their labels later.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# OrdinalEncoder/OneHotEncoder are the feature-side equivalents.
encoders = {}
for column in ('sex', 'embarked'):
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Keep `le` in the same end state as before (fitted on `embarked`).
le = encoders['embarked']
df.tail(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
888,0,3,0,32.8,1,2,23.45,2
889,1,1,1,26.0,0,0,30.0,0
890,0,3,1,32.0,0,0,7.75,1


2. 훈련/테스트 데이터셋 분리

In [16]:
# Separate the target (first column, `survived`) from the feature columns
# and convert both to NumPy arrays.
target_column = df.columns[0]
X = df.drop(columns=target_column).to_numpy()
y = df[target_column].to_numpy()

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on the target so both
# splits keep the same class ratio, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
    stratify=y,
)

3. 학습

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed; display its hyperparameters to see
# what is available for tuning.
rfc = RandomForestClassifier(random_state=2023)
rfc.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2023,
 'verbose': 0,
 'warm_start': False}

In [19]:
from sklearn.model_selection import GridSearchCV

# First, coarse search over tree depth and minimum split size,
# scored by accuracy with 5-fold cross-validation.
params = {
    'max_depth': [4, 7, 10],
    'min_samples_split': [2, 3, 4],
}
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

In [20]:
# Parameter combination that scored best in the grid search just fitted
grid_rf.best_params_

{'max_depth': 4, 'min_samples_split': 3}

In [21]:
# Second search with a fresh estimator and a finer max_depth grid (3-6);
# same scoring and 5-fold cross-validation as the first search.
rfc = RandomForestClassifier(random_state=2023)
params = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 3, 4],
}
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

In [22]:
# Parameter combination that scored best in the refined grid search
grid_rf.best_params_

{'max_depth': 6, 'min_samples_split': 4}

4. 예측 및 평가

In [23]:
# Take the best estimator found by the grid search, predict the held-out
# set, and line up actual vs. predicted labels for a quick visual check.
best_rf = grid_rf.best_estimator_
pred = best_rf.predict(X_test)
rf = pd.DataFrame(
    {
        'y 실제값': y_test,
        'y 예측값': pred,
    }
)
rf.head(n=5)

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0


In [24]:
# Mean accuracy of the tuned model on the held-out test set
best_rf.score(X_test, y_test)

0.8100558659217877

5. 오차 행렬(Confusion matrix)

In [None]:
# Reference (Data Science School, "classification performance evaluation"):
#https://datascienceschool.net/03%20machine%20learning/09.04%20%EB%B6%84%EB%A5%98%20%EC%84%B1%EB%8A%A5%ED%8F%89%EA%B0%80.html

In [None]:
# Accuracy: fraction of all samples that were predicted correctly. (TP + TN) / (TP + TN + FP + FN)
# Precision: of the samples predicted as positive, the fraction that are actually positive. TP / (TP + FP)
# Recall: of the samples that are actually positive, the fraction that were predicted as positive. TP / (TP + FN)