# 타이타닉 생존 예측

In [None]:
# environment setup
# If the google.colab package can be imported, clone the lecture repository
# and change into it; presumably so the relative data/ paths read by the
# following cells resolve (TODO confirm the repo layout).
try:
    import google.colab  # used only as a probe; the module itself is not referenced
    !git clone https://github.com/DevSlem/AI-Lecture-Incheon-Youngsun-HS.git
    %cd AI-Lecture-Incheon-Youngsun-HS
except ImportError:
    # google.colab is unavailable: skip the clone and keep the current directory.
    pass

In [None]:
import pandas as pd

# Load the train and test CSV files into DataFrames.
train_dt, test_dt = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Keep both frames in one list so later cells can preprocess them in a single loop.
data_train_test = [train_dt, test_dt]

In [None]:
# Extract the honorific from Name into a new Title column: the run of ASCII
# letters immediately followed by a literal period.
# The pattern is a raw string so that `\.` reaches the regex engine as written;
# in a normal string literal `\.` is an invalid escape sequence and triggers a
# warning on recent Python versions.
for data in data_train_test:
    data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

train_dt.head()

In [None]:
# Collapse rare or variant titles into a small set of canonical ones.
# Each entry is (title or titles to replace, canonical title); the entries are
# applied in the listed order.
title_groups = [
    (['Capt', 'Col', 'Countess', 'Dr', 'Jonkheer', 'Major', 'Rev', 'Sir'], 'Others'),
    (['Mlle', 'Ms'], 'Miss'),
    ('Don', 'Mr'),
    (['Mme', 'Lady', 'Dona'], 'Mrs'),
]

for frame in data_train_test:
    for aliases, canonical in title_groups:
        frame['Title'] = frame['Title'].replace(aliases, canonical)

# Mean survival rate per Title, highest first.
(
    train_dt[['Title', 'Survived']]
    .groupby(['Title'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Remove the Name column from both frames, in place.
for frame in data_train_test:
    frame.drop(columns='Name', inplace=True)

train_dt.head()

In [None]:
# Integer code for each canonical title, stored as a dict (code = list position).
Title_mapping = {
    title: code
    for code, title in enumerate(['Mr', 'Mrs', 'Miss', 'Master', 'Others'])
}

# Replace the Title strings with their integer codes in both frames.
for frame in data_train_test:
    title_codes = frame['Title'].map(Title_mapping)
    frame['Title'] = title_codes.astype(int)

train_dt.head()

In [None]:
# Fill missing Age values with the mean Age of the rows sharing the same Title.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the data['Age'] selection: that chained in-place form
# warns on recent pandas and leaves the frame unchanged under Copy-on-Write.
for data in data_train_test:
    title_mean_age = data.groupby('Title')['Age'].transform('mean')
    data['Age'] = data['Age'].fillna(title_mean_age)

# Inspect the remaining missing values in both frames.
train_dt.info()
print('-'*6)
test_dt.info()

In [None]:
# Cut Age into 5 equal-width intervals and store them as AgeRange.
train_dt['AgeRange'] = pd.cut(train_dt['Age'], bins=5)

# Mean survival rate per AgeRange interval, in ascending interval order.
(
    train_dt[['AgeRange', 'Survived']]
    .groupby(['AgeRange'], as_index=False)
    .mean()
    .sort_values(by='AgeRange', ascending=True)
)

In [None]:
# Replace Age with an ordinal bucket index:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4
inner_age_bins = [(16, 32), (32, 48), (48, 64)]

for frame in data_train_test:
    frame.loc[frame['Age'] <= 16, 'Age'] = 0
    # Bucket indices already written (0..3) never exceed 16, so they cannot
    # match any of the later ranges.
    for bucket, (low, high) in enumerate(inner_age_bins, start=1):
        in_bucket = frame['Age'].gt(low) & frame['Age'].le(high)
        frame.loc[in_bucket, 'Age'] = bucket
    frame.loc[frame['Age'] > 64, 'Age'] = 4
    # Convert the column to int.
    frame['Age'] = frame['Age'].astype(int)

train_dt.head()

In [None]:
# Remove the AgeRange column from the training frame, in place.
train_dt.drop(columns='AgeRange', inplace=True)

train_dt.head()

In [None]:
# Familysize = SibSp + Parch + 1
for frame in data_train_test:
    relatives = frame['SibSp'].add(frame['Parch'])
    frame['Familysize'] = relatives + 1

train_dt.head()

In [None]:
# Columns to remove from both frames.
drop_feature = ['SibSp', 'Parch']

for frame in data_train_test:
    frame.drop(columns=drop_feature, inplace=True)

train_dt.head()

In [None]:
# Encode Sex as an int: male -> 0, female -> 1.
sex_codes = {'male':0, 'female':1}

for frame in data_train_test:
    encoded_sex = frame['Sex'].map(sex_codes)
    frame['Sex'] = encoded_sex.astype(int)

train_dt.head()

In [None]:
# Fill missing Embarked values in the training frame with 'S'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the train_dt['Embarked'] selection: that chained
# in-place form warns on recent pandas and leaves the frame unchanged under
# Copy-on-Write.
# NOTE(review): only train_dt is filled here; test_dt is not touched — confirm
# the test file has no missing Embarked values.
train_dt['Embarked'] = train_dt['Embarked'].fillna('S')

# Number of missing Embarked values left in the training frame.
print('결측값의 개수 :',train_dt['Embarked'].isnull().sum())

In [None]:
# Integer code for each Embarked value, stored as a dict.
embarked_mapping = {port: code for code, port in enumerate(['S', 'C', 'Q'])}

# Replace the Embarked letters with their integer codes in both frames.
for frame in data_train_test:
    embarked_codes = frame['Embarked'].map(embarked_mapping)
    frame['Embarked'] = embarked_codes.astype(int)

train_dt.head()

In [None]:
# Fill missing Fare values with the mean Fare of the rows sharing the same Pclass.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the data['Fare'] selection: that chained in-place form
# warns on recent pandas and leaves the frame unchanged under Copy-on-Write.
for data in data_train_test:
    pclass_mean_fare = data.groupby('Pclass')['Fare'].transform('mean')
    data['Fare'] = data['Fare'].fillna(pclass_mean_fare)

# Inspect the remaining missing values in both frames.
train_dt.info()
print('-'*40)
test_dt.info()

In [None]:
# Cut Fare into 4 equal-width intervals and store them as FareRange.
train_dt['FareRange'] = pd.cut(train_dt['Fare'], bins=4)

# Mean survival rate per FareRange interval, in ascending interval order.
(
    train_dt[['FareRange', 'Survived']]
    .groupby(['FareRange'], as_index=False)
    .mean()
    .sort_values(by='FareRange', ascending=True)
)

In [None]:
# Replace Fare with an ordinal bucket index:
#   <=128 -> 0, (128, 256] -> 1, (256, 384] -> 2, >384 -> 3
inner_fare_bins = [(128, 256), (256, 384)]

for frame in data_train_test:
    frame.loc[frame['Fare'] <= 128, 'Fare'] = 0
    # Bucket indices already written (0..2) never exceed 128, so they cannot
    # match any of the later ranges.
    for bucket, (low, high) in enumerate(inner_fare_bins, start=1):
        in_bucket = frame['Fare'].gt(low) & frame['Fare'].le(high)
        frame.loc[in_bucket, 'Fare'] = bucket
    frame.loc[frame['Fare'] > 384, 'Fare'] = 3
    # Convert the column to int.
    frame['Fare'] = frame['Fare'].astype(int)

train_dt.head()

In [None]:
# Columns to remove from both frames, kept in a list.
data_drop = ['Ticket', 'Cabin', 'PassengerId']

for frame in data_train_test:
    frame.drop(columns=data_drop, inplace=True)

train_dt.head()

In [None]:
# One-hot encode Title in both frames (new columns are prefixed with 'Title').
# get_dummies returns new DataFrames, so train_dt/test_dt are rebound here;
# the data_train_test list still refers to the frames from before this step.
train_dt, test_dt = (
    pd.get_dummies(frame, columns=['Title'], prefix='Title')
    for frame in (train_dt, test_dt)
)

In [None]:
# Remove the FareRange column from the training frame, in place.
train_dt.drop(columns='FareRange', inplace=True)

train_dt.head()

In [None]:
# K-Fold를 위한 준비
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# DecisionTreeClassifier is used below but no earlier cell imports it, so
# import it here to avoid a NameError.
from sklearn.tree import DecisionTreeClassifier

# K-Fold splitter with K = 10.
kfold = KFold(n_splits=10)

# Lists that collect the per-model results.
mean = []
accuracy = []
std = []

# Names of the models to evaluate. Written as an explicit one-element tuple
# (the original relied on a bare trailing comma to build the same tuple).
classifiers = ('Decision Tree',)

# Model instances to evaluate, in the same order as `classifiers`.
models = (DecisionTreeClassifier(),)


In [None]:
import numpy as np

# The cross-validation call needs a feature matrix, a label vector and a model,
# none of which were defined before this point. Build them from the prepared
# training frame: `Survived` is the label, every other column is a feature.
X_train = train_dt.drop(columns='Survived')
target_label = train_dt['Survived']

# Evaluate the first entry of `models`.
model = models[0]

# Run cross-validation with the prepared splitter and score by accuracy.
cv_result = cross_val_score(model, X_train, target_label, cv=kfold, scoring='accuracy')
print(cv_result)

# Mean accuracy across folds.
print("Mean CV Accuracy: ", np.mean(cv_result))
