# 01-4. titanic train data set

In [40]:
# 라이브러리 로드
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [32]:
# 전처리 함수 정의
# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(),inplace=True)
    df['Cabin'].fillna('N',inplace=True)
    df['Embarked'].fillna('N',inplace=True)
    df['Fare'].fillna(0,inplace=True)
    return df

# 불필요한 속성 제거 함수
def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'],axis=1,inplace=True)
    return df

# 레이블 인코딩
def format_features(df):
    df['Cabin']=df['Cabin'].str[:1] # Carbin값의 첫번째 문자만 가져옴
    features=['Cabin','Sex','Embarked']
    for feature in features:
        le=LabelEncoder()
        le=le.fit(df[feature])
        df[feature]=le.transform(df[feature])
    return df

# 전처리 함수 호출
def transform_features(df):
    df=fillna(df)
    df=drop_features(df)
    df=format_features(df)
    return df

In [16]:
# 데이터 로드
titanic_df = pd.read_csv('titanic_train.csv')
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [17]:
# 형태
titanic_df.shape

(891, 12)

In [18]:
# info
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [19]:
# describe
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [39]:
# 결정트리, 랜덤포레스트, 로지스틱회귀

y_df = titanic_df['Survived'] # 레이블 데이터 셋 - 생존 유무
X_df = titanic_df.drop('Survived',axis=1) # 레이블 행 제거

X_df = transform_features(X_df) # 전처리 함수 실행

# train, test 데이터 셋 분리
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=1)

# 클래스 생성
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
lr_clf = LogisticRegression()

# dt 학습/예측/평가
dt_clf.fit(X_train,y_train)
dt_pred = dt_clf.predict(X_test)
print('DecisionTree 정확도: {0:.4f}'.format(accuracy_score(y_test,dt_pred)))

# rf 학습/예측/평가
rf_clf.fit(X_train,y_train)
rf_pred = rf_clf.predict(X_test)
print('RandomForest 정확도: {0:.4f}'.format(accuracy_score(y_test,rf_pred)))

# lr 학습/예측/평가
lr_clf.fit(X_train,y_train)
lr_pred = lr_clf.predict(X_test)
print('LogisticRegression 정확도: {0:.4f}'.format(accuracy_score(y_test,lr_pred)))

DecisionTree 정확도: 0.7486
RandomForest 정확도: 0.7765
LogisticRegression 정확도: 0.7989


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
# 교차검증
# K 폴드
from sklearn.model_selection import KFold

def exec_kfold(clf,folds=5):
    # 폴드 세트를 5개인 객체 생성, 폴드 수만큼 예측 결과 저장 리스트 객체 생성
    kfold = KFold(n_splits=folds)
    scores=[]
    
    # 교차검증 수행
    for iter_count, (train_index,test_index) in enumerate(kfold.split(X_df)):
        X_train, X_test = X_df.values[train_index],X_df.values[test_index]
        y_train, y_test = y_df.values[train_index],y_df.values[test_index]
        # classifier 학습, 예측, 정확도 계산
        clf.fit(X_train,y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test,predictions)
        scores.append(accuracy)
        print("교차검증 {0} 정확도: {1:4f}".format(iter_count,accuracy))
    
    # 평균 정확도
    mean_score = np.mean(scores)
    print("평균 정확도: {0:4f}".format(mean_score))
    
exec_kfold(dt_clf,folds=5) # 4% 상승

교차검증 0 정확도: 0.754190
교차검증 1 정확도: 0.764045
교차검증 2 정확도: 0.792135
교차검증 3 정확도: 0.758427
교차검증 4 정확도: 0.820225
평균 정확도: 0.777804


In [53]:
# cross_val_score
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dt_clf,X_df,y_df,cv=5)
for iter_count, accuracy in enumerate(scores):
    print("교차검증 {0} 정확도: {1:4f}".format(iter_count,accuracy))

print("평균 정확도: {0:4f}".format(np.mean(scores)))

교차검증 0 정확도: 0.743017
교차검증 1 정확도: 0.769663
교차검증 2 정확도: 0.792135
교차검증 3 정확도: 0.775281
교차검증 4 정확도: 0.825843
평균 정확도: 0.781188


In [55]:
# GridSearchCV
from sklearn.model_selection import GridSearchCV

parameters={'max_depth':[2,3,5,10],
            'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8]}

grid_clf=GridSearchCV(dt_clf,param_grid=parameters,scoring='accuracy',cv=5)
grid_clf.fit(X_train,y_train)

print('최적 파라미터:', grid_clf.best_params_)
print('최고 정확도: {0:.4f}'.format(grid_clf.best_score_))
best_clf = grid_clf.best_estimator_

dpredictions = best_clf.predict(X_test)
accuracy = accuracy_score(y_test,dpredictions)
print('DecisionTree 정확도: {0:.4f}'.format(accuracy))

최적 파라미터: {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}
최고 정확도: 0.8343
DecisionTree 정확도: 0.8045
