# 타이타닉 생존 예측

In [3]:
# Environment setup.
# Importing google.colab is used as a probe: if the import succeeds, the cell
# clones the lecture repository and changes into it (IPython shell/magic
# commands). Presumably this is so the relative 'data/...' paths used later
# resolve -- confirm against the repository layout.
try:
    import google.colab
    !git clone https://github.com/DevSlem/AI-Lecture-Incheon-Youngsun-HS.git
    %cd AI-Lecture-Incheon-Youngsun-HS
except ImportError:
    # google.colab is not importable: skip the clone and keep the current
    # working directory unchanged.
    pass

In [1]:
import pandas as pd

# Load the train and test splits. Both frames are also kept together in one
# list so later cells can apply the same preprocessing step to each of them.
csv_paths = ('data/train.csv', 'data/test.csv')
data_train_test = [pd.read_csv(path) for path in csv_paths]
train_dt, test_dt = data_train_test

In [2]:
# Extract the honorific from 'Name' into a new 'Title' column: the run of
# letters immediately followed by a period (e.g. 'Mr' in
# 'Braund, Mr. Owen Harris').
# The pattern is a raw string so that '\.' reaches the regex engine as an
# escaped dot instead of being parsed as an (invalid) Python string escape.
for data in data_train_test:
    data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [4]:
# Collapse the less common titles into a small set of groups.
# Each entry is (titles to replace, replacement); they are applied in order.
title_groups = (
    (['Capt', 'Col', 'Countess', 'Dr', 'Jonkheer', 'Major', 'Rev', 'Sir'], 'Others'),
    (['Mlle', 'Ms'], 'Miss'),
    ('Don', 'Mr'),
    (['Mme', 'Lady', 'Dona'], 'Mrs'),
)

for frame in data_train_test:
    titles = frame['Title']
    for old_titles, new_title in title_groups:
        titles = titles.replace(old_titles, new_title)
    frame['Title'] = titles

# Mean of 'Survived' per title group (training data only), highest first.
title_survival = train_dt[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
title_survival.sort_values(by='Survived', ascending=False)

Unnamed: 0,Title,Survived
3,Mrs,0.795276
1,Miss,0.702703
0,Master,0.575
4,Others,0.333333
2,Mr,0.156371


In [5]:
# Remove the 'Name' column from both frames (in place).
for frame in data_train_test:
    frame.drop(columns='Name', inplace=True)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,,S,Mr


In [6]:
# Integer code for each title group: Mr=0, Mrs=1, Miss=2, Master=3, Others=4.
Title_mapping = {
    title: code
    for code, title in enumerate(('Mr', 'Mrs', 'Miss', 'Master', 'Others'))
}

# Replace the title strings by their codes. The int cast fails if any title is
# missing from the mapping, because map() leaves NaN for unmapped values.
for frame in data_train_test:
    title_codes = frame['Title'].map(Title_mapping)
    frame['Title'] = title_codes.astype(int)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,male,35.0,0,0,373450,8.05,,S,0


In [7]:
# Fill missing 'Age' values with the mean age of passengers sharing the same
# 'Title'; the means are computed separately within each frame.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on data['Age']: an in-place call on a column selection
# is a chained assignment, which pandas warns about and which does not update
# the frame when Copy-on-Write is enabled.
for data in data_train_test:
    title_mean_age = data.groupby('Title')['Age'].transform('mean')
    data['Age'] = data['Age'].fillna(title_mean_age)

# Check the remaining missing values in both frames.
train_dt.info()
print('-'*6)
test_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  Title        891 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB
------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   P

In [8]:
# Split 'Age' into 5 equal-width intervals and store them as 'AgeRange'
# (training data only).
age_intervals = pd.cut(train_dt['Age'], 5)
train_dt['AgeRange'] = age_intervals

# Mean of 'Survived' per age interval, ordered from youngest to oldest.
age_survival = train_dt[['AgeRange', 'Survived']].groupby(['AgeRange'], as_index=False).mean()
age_survival.sort_values(by='AgeRange', ascending=True)

Unnamed: 0,AgeRange,Survived
0,"(0.34, 16.336]",0.548077
1,"(16.336, 32.252]",0.39267
2,"(32.252, 48.168]",0.32
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


In [9]:
# Replace 'Age' by an ordinal bucket code in both frames:
#   0: <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: > 64
age_edges = [float('-inf'), 16, 32, 48, 64, float('inf')]

for frame in data_train_test:
    # labels=False yields the bucket index; intervals are closed on the right.
    age_codes = pd.cut(frame['Age'], bins=age_edges, labels=False)
    # Store the codes as integers.
    frame['Age'] = age_codes.astype(int)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,AgeRange
0,1,0,3,male,1,1,0,A/5 21171,7.25,,S,0,"(16.336, 32.252]"
1,2,1,1,female,2,1,0,PC 17599,71.2833,C85,C,1,"(32.252, 48.168]"
2,3,1,3,female,1,0,0,STON/O2. 3101282,7.925,,S,2,"(16.336, 32.252]"
3,4,1,1,female,2,1,0,113803,53.1,C123,S,1,"(32.252, 48.168]"
4,5,0,3,male,2,0,0,373450,8.05,,S,0,"(32.252, 48.168]"


In [10]:
# Remove the 'AgeRange' column from the training frame (in place).
train_dt.drop(columns='AgeRange', inplace=True)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,1,1,0,A/5 21171,7.25,,S,0
1,2,1,1,female,2,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,female,1,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,female,2,1,0,113803,53.1,C123,S,1
4,5,0,3,male,2,0,0,373450,8.05,,S,0


In [11]:
# Add 'Familysize' = SibSp + Parch + 1 to both frames.
for frame in data_train_test:
    relatives = frame['SibSp'].add(frame['Parch'])
    frame['Familysize'] = relatives.add(1)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Familysize
0,1,0,3,male,1,1,0,A/5 21171,7.25,,S,0,2
1,2,1,1,female,2,1,0,PC 17599,71.2833,C85,C,1,2
2,3,1,3,female,1,0,0,STON/O2. 3101282,7.925,,S,2,1
3,4,1,1,female,2,1,0,113803,53.1,C123,S,1,2
4,5,0,3,male,2,0,0,373450,8.05,,S,0,1


In [12]:
# Columns to remove from both frames.
drop_feature = ['SibSp', 'Parch']

for frame in data_train_test:
    frame.drop(columns=drop_feature, inplace=True)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,Title,Familysize
0,1,0,3,male,1,A/5 21171,7.25,,S,0,2
1,2,1,1,female,2,PC 17599,71.2833,C85,C,1,2
2,3,1,3,female,1,STON/O2. 3101282,7.925,,S,2,1
3,4,1,1,female,2,113803,53.1,C123,S,1,2
4,5,0,3,male,2,373450,8.05,,S,0,1


In [13]:
# Encode 'Sex' as an integer in both frames: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}

for frame in data_train_test:
    encoded_sex = frame['Sex'].map(sex_codes)
    frame['Sex'] = encoded_sex.astype(int)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,Title,Familysize
0,1,0,3,0,1,A/5 21171,7.25,,S,0,2
1,2,1,1,1,2,PC 17599,71.2833,C85,C,1,2
2,3,1,3,1,1,STON/O2. 3101282,7.925,,S,2,1
3,4,1,1,1,2,113803,53.1,C123,S,1,2
4,5,0,3,0,2,373450,8.05,,S,0,1


In [14]:
# Fill missing 'Embarked' values with 'S'.
# Both frames are filled (not only train_dt) because 'Embarked' is later cast
# to int on every frame in data_train_test, which would fail on a leftover NaN.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection, which is a chained assignment that pandas warns about and
# that does not update the frame when Copy-on-Write is enabled.
for data in data_train_test:
    data['Embarked'] = data['Embarked'].fillna('S')

# Number of missing 'Embarked' values left in the training frame.
print('결측값의 개수 :',train_dt['Embarked'].isnull().sum())

결측값의 개수 : 0


In [15]:
# Integer code for each 'Embarked' value: S -> 0, C -> 1, Q -> 2.
embarked_mapping = dict(S=0, C=1, Q=2)

for frame in data_train_test:
    embarked_codes = frame['Embarked'].map(embarked_mapping)
    frame['Embarked'] = embarked_codes.astype(int)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,Title,Familysize
0,1,0,3,0,1,A/5 21171,7.25,,0,0,2
1,2,1,1,1,2,PC 17599,71.2833,C85,1,1,2
2,3,1,3,1,1,STON/O2. 3101282,7.925,,0,2,1
3,4,1,1,1,2,113803,53.1,C123,0,1,2
4,5,0,3,0,2,373450,8.05,,0,0,1


In [16]:
# Fill missing 'Fare' values with the mean fare of passengers in the same
# 'Pclass'; the means are computed separately within each frame.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on data['Fare']: an in-place call on a column selection
# is a chained assignment, which pandas warns about and which does not update
# the frame when Copy-on-Write is enabled.
for data in data_train_test:
    class_mean_fare = data.groupby('Pclass')['Fare'].transform('mean')
    data['Fare'] = data['Fare'].fillna(class_mean_fare)

# Check the remaining missing values in both frames.
train_dt.info()
print('-'*40)
test_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    int64  
 5   Ticket       891 non-null    object 
 6   Fare         891 non-null    float64
 7   Cabin        204 non-null    object 
 8   Embarked     891 non-null    int64  
 9   Title        891 non-null    int64  
 10  Familysize   891 non-null    int64  
dtypes: float64(1), int64(8), object(2)
memory usage: 76.7+ KB
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass   

In [17]:
# Split 'Fare' into 4 equal-width intervals and store them as 'FareRange'
# (training data only).
fare_intervals = pd.cut(train_dt['Fare'], 4)
train_dt['FareRange'] = fare_intervals

# Mean of 'Survived' per fare interval, ordered from cheapest to most expensive.
fare_survival = train_dt[['FareRange', 'Survived']].groupby(['FareRange'], as_index=False).mean()
fare_survival.sort_values(by='FareRange', ascending=True)

Unnamed: 0,FareRange,Survived
0,"(-0.512, 128.082]",0.368113
1,"(128.082, 256.165]",0.724138
2,"(256.165, 384.247]",0.666667
3,"(384.247, 512.329]",1.0


In [18]:
# Replace 'Fare' by an ordinal bucket code in both frames:
#   0: <= 128, 1: (128, 256], 2: (256, 384], 3: > 384
fare_edges = [float('-inf'), 128, 256, 384, float('inf')]

for frame in data_train_test:
    # labels=False yields the bucket index; intervals are closed on the right.
    fare_codes = pd.cut(frame['Fare'], bins=fare_edges, labels=False)
    # Store the codes as integers.
    frame['Fare'] = fare_codes.astype(int)

train_dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,Title,Familysize,FareRange
0,1,0,3,0,1,A/5 21171,0,,0,0,2,"(-0.512, 128.082]"
1,2,1,1,1,2,PC 17599,0,C85,1,1,2,"(-0.512, 128.082]"
2,3,1,3,1,1,STON/O2. 3101282,0,,0,2,1,"(-0.512, 128.082]"
3,4,1,1,1,2,113803,0,C123,0,1,2,"(-0.512, 128.082]"
4,5,0,3,0,2,373450,0,,0,0,1,"(-0.512, 128.082]"


In [19]:
# Columns to remove from both frames.
data_drop = ['Ticket', 'Cabin', 'PassengerId']

for frame in data_train_test:
    frame.drop(columns=data_drop, inplace=True)

train_dt.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Familysize,FareRange
0,0,3,0,1,0,0,0,2,"(-0.512, 128.082]"
1,1,1,1,2,0,1,1,2,"(-0.512, 128.082]"
2,1,3,1,1,0,0,2,1,"(-0.512, 128.082]"
3,1,1,1,2,0,0,1,2,"(-0.512, 128.082]"
4,0,3,0,2,0,0,0,1,"(-0.512, 128.082]"


In [20]:
# One-hot encode 'Title' in both frames (columns are named 'Title_<code>').
# NOTE: get_dummies returns new DataFrames, so after this rebinding the list
# data_train_test still refers to the frames from before the encoding.
train_dt, test_dt = [
    pd.get_dummies(frame, columns=['Title'], prefix='Title')
    for frame in (train_dt, test_dt)
]

In [21]:
# Remove the 'FareRange' column from the training frame (in place).
train_dt.drop(columns=['FareRange'], inplace=True)

train_dt.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Familysize,Title_0,Title_1,Title_2,Title_3,Title_4
0,0,3,0,1,0,0,2,True,False,False,False,False
1,1,1,1,2,0,1,2,False,True,False,False,False
2,1,3,1,1,0,0,1,False,False,True,False,False
3,1,1,1,2,0,0,2,False,True,False,False,False
4,0,3,0,2,0,0,1,True,False,False,False,False


In [30]:
# K-Fold를 위한 준비
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [34]:
# DecisionTreeClassifier is used below but was not imported anywhere in the
# notebook, so running this cell raised NameError.
from sklearn.tree import DecisionTreeClassifier

# 10-fold cross-validation splitter (all other KFold options left at defaults).
kfold = KFold(n_splits=10)

# Lists intended to collect per-model results (per the original comment);
# nothing in the visible notebook appends to them yet.
mean = []
accuracy = []
std = []

# Names of the models to evaluate. The original trailing comma made this a
# one-element tuple; the parentheses make that explicit.
classifiers = ('Decision Tree',)

# Model instances to evaluate, in the same order as `classifiers`
# (also a one-element tuple).
models = (DecisionTreeClassifier(),)


In [40]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# `model`, `X_train` and `target_label` were used here without being defined
# anywhere in the notebook, so a top-to-bottom run raised NameError.
# NOTE(review): the definitions below assume the features are every remaining
# column of train_dt except 'Survived' and the estimator is a default decision
# tree -- confirm against the cells that originally produced the recorded output.
X_train = train_dt.drop(columns='Survived')
target_label = train_dt['Survived']
model = DecisionTreeClassifier()

# Accuracy of the model on each cross-validation fold defined by `kfold`.
cv_result = cross_val_score(model, X_train, target_label, cv=kfold, scoring='accuracy')
print(cv_result)

# Mean accuracy over all folds.
print("Mean CV Accuracy: ", np.mean(cv_result))


[0.83333333 0.87640449 0.78651685 0.87640449 0.80898876 0.82022472
 0.79775281 0.78651685 0.86516854 0.86516854]
Mean CV Accuracy:  0.8316479400749064
