타이타닉 생존자 예측

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Load seaborn's 'titanic' example dataset and preview the first rows.
df = sns.load_dataset('titanic')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


1. 데이터 전처리
- Feature selection: 중복 필드 제거

In [4]:
# Feature selection: keep the target ('survived') and one representative of
# each piece of information; columns that repeat it (e.g. 'class' mirrors
# 'pclass', 'alive' mirrors 'survived') are left out.
# .copy() makes df an independent frame, so the in-place edits performed in
# the following cells do not act on a slice of the originally loaded frame.
df = df[
    ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
     'fare', 'embarked', 'who', 'deck']
].copy()
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck
886,0,2,male,27.0,0,0,13.0,S,man,
887,1,1,female,19.0,0,0,30.0,S,woman,B
888,0,3,female,,1,2,23.45,S,woman,
889,1,1,male,26.0,0,0,30.0,C,man,C
890,0,3,male,32.0,0,0,7.75,Q,man,


- 결측치 처리

In [5]:
# Number of missing values (NaN) in each column.
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
who           0
deck        688
dtype: int64

In [6]:
# Frequency of each distinct value in the 'who' column.
df['who'].value_counts()

who
man      537
woman    271
child     83
Name: count, dtype: int64

In [7]:
# Missing-age counts per 'who' category, in the order (man, woman, child).
tuple(
    df.loc[df['who'] == group, 'age'].isna().sum()
    for group in ('man', 'woman', 'child')
)

(124, 53, 0)

In [8]:
# Fill missing ages with the mean age of the adult passengers ('man' and
# 'woman' rows combined, rounded to one decimal). Every missing age belongs
# to an adult row -- the 'child' group has no missing ages -- so children are
# excluded from the mean.
adult_mean_age = df.loc[df['who'].isin(['man', 'woman']), 'age'].mean().round(1)
# Assign the filled Series back to the column instead of calling
# fillna(inplace=True) on df.age: that chained in-place form is not
# guaranteed to update df and has no effect under pandas copy-on-write.
df['age'] = df['age'].fillna(adult_mean_age)

In [9]:
# Number of missing values (NaN) left in the 'age' column.
df['age'].isna().sum()

0

In [10]:
# Frequency of each distinct value in the 'embarked' column.
df['embarked'].value_counts()

embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [11]:
# Fill the missing 'embarked' values with 'S', the most frequent port.
# The result is assigned back to the column: fillna(inplace=True) on
# df.embarked is a chained in-place update that is not guaranteed to modify
# df and has no effect under pandas copy-on-write.
df['embarked'] = df['embarked'].fillna('S')

In [12]:
# Remove the 'deck' and 'who' columns from the frame.
df = df.drop(columns=['deck', 'who'])

In [13]:
# Total number of missing values left in the whole frame.
df.isna().to_numpy().sum()

0

- 카테고리형 데이터를 숫자로 변환

In [14]:
# Preview the remaining columns; 'sex' and 'embarked' still hold strings.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [15]:
# LabelEncoder encodes string categories as integers.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [16]:
# LabelEncoder numbers the classes in sorted order, so the resulting codes are
#   sex:      female -> 0, male -> 1
#   embarked: C -> 0, Q -> 1, S -> 2
# The same encoder object is refit for each column, so after this cell `le`
# only remembers the 'embarked' classes.
# Bracket assignment is used so the existing columns are unambiguously
# replaced (attribute assignment only works when the column already exists).
df['sex'] = le.fit_transform(df['sex'])
df['embarked'] = le.fit_transform(df['embarked'])
df.tail(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
888,0,3,0,32.8,1,2,23.45,2
889,1,1,1,26.0,0,0,30.0,0
890,0,3,1,32.0,0,0,7.75,1


2. 훈련/테스트 데이터셋 분리
- Random Forest: Decision Tree 100개
- Grid Search CV

In [17]:
# Target vector: the first column of df only.
y = df.iloc[:, 0].to_numpy()
# Feature matrix: every column except the first one.
X = df.iloc[:, 1:].to_numpy()

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2023,
)

3. 학습

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Parameters of interest in the listing produced below:
#   criterion         - criterion used to split a tree (default 'gini')
#   max_depth         - maximum depth of a tree
#   max_features      - maximum number of features considered for a split
#                       at each node
#   min_samples_leaf  - minimum number of samples required in a leaf node
#   min_samples_split - minimum number of samples required to split an
#                       internal node
#   n_estimators      - number of trees
#   random_state      - random seed (makes the results reproducible)
rfc = RandomForestClassifier(random_state=2023)
rfc.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2023,
 'verbose': 0,
 'warm_start': False}

In [32]:
from sklearn.model_selection import GridSearchCV

# Train the random forest on the training data (X_train, y_train) and use
# grid search with 5-fold cross-validated accuracy to find the best
# hyperparameters among the combinations listed in `params`.
params = {
    'max_depth': [4, 7, 10],
    'min_samples_split': [2, 3, 4],
}
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

In [21]:
# Hyperparameter combination selected by the fitted grid search.
grid_rf.best_params_

{'max_depth': 4, 'min_samples_split': 3}

In [22]:
# Second grid search with a finer max_depth grid (3 to 6), again scored by
# 5-fold cross-validated accuracy on the training data.
rfc = RandomForestClassifier(random_state=2023)
params = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 3, 4],
}
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

In [23]:
# Hyperparameter combination selected by the fitted grid search.
grid_rf.best_params_

{'max_depth': 6, 'min_samples_split': 4}

4. 예측 및 평가

In [24]:
# Predict the test set with the best estimator found by the grid search and
# place the actual and predicted labels side by side.
best_rf = grid_rf.best_estimator_
pred = best_rf.predict(X_test)
comparison_columns = {'y 실제값': y_test, 'y 예측값': pred}
rf = pd.DataFrame(comparison_columns)
rf.head()

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0


In [25]:
# Mean accuracy of the tuned random forest on the held-out test set.
best_rf.score(X_test, y_test)

0.8100558659217877

5. Logistic Regression 모델

In [26]:
# First five rows of the unscaled feature matrix.
X[:5]

array([[ 3.    ,  1.    , 22.    ,  1.    ,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    , 38.    ,  1.    ,  0.    , 71.2833,  0.    ],
       [ 3.    ,  0.    , 26.    ,  0.    ,  0.    ,  7.925 ,  2.    ],
       [ 1.    ,  0.    , 35.    ,  1.    ,  0.    , 53.1   ,  2.    ],
       [ 3.    ,  1.    , 35.    ,  0.    ,  0.    ,  8.05  ,  2.    ]])

- 표준화

In [27]:
# Standardization: rescale each feature to zero mean and unit standard
# deviation. This only shifts and scales the values; it does not change the
# shape of a feature's distribution (it does not make the data normal).
# NOTE(review): the scaler is fit on all of X, i.e. before the train/test
# split, so test-set statistics influence the scaling -- consider fitting on
# the training split only.

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_std = ss.fit_transform(X)
X_std[:5]

array([[ 0.82737724,  0.73769513, -0.63700389,  0.43279337, -0.47367361,
        -0.50244517,  0.58595414],
       [-1.56610693, -1.35557354,  0.58872284,  0.43279337, -0.47367361,
         0.78684529, -1.9423032 ],
       [ 0.82737724, -1.35557354, -0.3305722 , -0.4745452 , -0.47367361,
        -0.48885426,  0.58595414],
       [-1.56610693, -1.35557354,  0.35889908,  0.43279337, -0.47367361,
         0.42073024,  0.58595414],
       [ 0.82737724,  0.73769513,  0.35889908, -0.4745452 , -0.47367361,
        -0.48633742,  0.58595414]])

In [28]:
# Split the raw features first, then fit the scaler on the training split
# only and apply it to both splits, so that no test-set statistics leak into
# the preprocessing. The split itself (same seed, same stratification)
# selects the same rows as splitting a pre-scaled matrix would.
# NOTE: re-run the downstream cells after this change; a score recorded with
# a scaler that was fit on the full dataset may differ slightly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [29]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the current training split and report its
# mean accuracy on the test split.
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.7486033519553073

- 정규화

In [30]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: rescale each feature of X linearly so that its
# minimum maps to 0 and its maximum maps to 1, then preview five rows.
minmax_scaler = MinMaxScaler()
X_mm = minmax_scaler.fit_transform(X)
X_mm[:5]

array([[1.        , 1.        , 0.27117366, 0.125     , 0.        ,
        0.01415106, 1.        ],
       [0.        , 0.        , 0.4722292 , 0.125     , 0.        ,
        0.13913574, 0.        ],
       [1.        , 0.        , 0.32143755, 0.        , 0.        ,
        0.01546857, 1.        ],
       [0.        , 0.        , 0.43453129, 0.125     , 0.        ,
        0.1036443 , 1.        ],
       [1.        , 1.        , 0.43453129, 0.        , 0.        ,
        0.01571255, 1.        ]])

In [31]:
# Split the raw features first, then fit the min-max scaler on the training
# split only and apply it to both splits, so that the test set does not
# influence the scaling range. Scaled test values may therefore fall
# slightly outside [0, 1].
# NOTE: re-run this cell after the change; a score recorded with a scaler
# that was fit on the full dataset may differ slightly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)
mm = MinMaxScaler().fit(X_train)
X_train = mm.transform(X_train)
X_test = mm.transform(X_test)
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.770949720670391