In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

## 머신러닝으로 타이타닉 생존자 예측하기
* 다양한 머신러닝 알고리즘을 이용해서
* 교차검증 방식으로 모델을 훈련시키고
* 예측 정확도를 통해 평가해 봄

In [2]:
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

## 데이터 불러오기

In [3]:
# Load the passenger data into a DataFrame and preview its first five rows.
titanic = pd.read_csv(filepath_or_buffer='data/titanic.csv')
titanic.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S


In [4]:
# Summarize the columns: dtype and non-null count per column, which shows
# where values are missing.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   name      1309 non-null   object 
 3   sex       1309 non-null   object 
 4   age       1046 non-null   float64
 5   sibsp     1309 non-null   int64  
 6   parch     1309 non-null   int64  
 7   ticket    1309 non-null   object 
 8   fare      1308 non-null   float64
 9   cabin     295 non-null    object 
 10  embarked  1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


## 전처리 
* 분석대상 컬럼들은 반드시 숫자형 값들로 구성되어야 함
   + sex, embarked 컬럼을 숫자형으로 변환
* cabin은 결측치가 많기 때문에 컬럼자체를 제거
* ticket은 분석하기에 너무 많은 범주를 포함함 - 과감히 제거
* 승객 직함(title)을 이름에서 추출해서 분석대상 컬럼으로 지정하고 숫자형으로 변환
* 최종컬럼 : pclass, age, sibsp, parch, fare, gender(sex), Embarked(embarked), Title(title)


### 1. cabin, ticket 컬럼 제거

In [5]:
# Remove `cabin` (mostly missing) and `ticket` (too many distinct values).
# The frame is reassigned rather than mutated in place, and errors='ignore'
# makes a second run of this cell a no-op instead of a KeyError.
titanic = titanic.drop(columns=['cabin', 'ticket'], errors='ignore')

### 2. 승객나이, 요금, 승선위치 결측치 처리

In [6]:
# Count the missing values in each column.
titanic.isna().sum(axis=0)

pclass        0
survived      0
name          0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [7]:
# Replace missing ages with the median age.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on `titanic.age` acts on an intermediate Series
# (chained assignment), which pandas warns about and which leaves the frame
# unchanged when Copy-on-Write is enabled.
median = titanic['age'].median()
titanic['age'] = titanic['age'].fillna(median)

In [8]:
# Only a few fare / embarked values are missing, so drop every row that
# still contains a missing value.
titanic.dropna(axis=0, how='any', inplace=True)

### 3. 승객이름에서 직함이라는 파생변수 생성

In [9]:
# Passenger names look like: "Allen, Miss. Elisabeth Walton".
# The pattern captures the run of letters that follows a space and is
# terminated by a period, i.e. the title.
# Raw string: '\.' is not a valid Python string escape, so the non-raw
# literal triggers an invalid-escape warning on recent Python versions.
fmt = r' ([A-Za-z]+)\.'
# expand=False returns a Series for the single capture group, so this is a
# plain Series-to-column assignment.
titanic['title'] = titanic.name.str.extract(fmt, expand=False)

In [10]:
# Frequency of each extracted title.
titanic['title'].value_counts()

Mr          756
Miss        259
Mrs         196
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Ms            2
Major         2
Capt          1
Sir           1
Dona          1
Jonkheer      1
Countess      1
Don           1
Mme           1
Lady          1
Name: title, dtype: int64

### 4. 성별, 승선위치, 직함 등을 숫자형으로 변환

In [11]:
# Sex -> integer code stored in the new `gender` column.
gender = titanic['sex']
encoder = LabelEncoder().fit(gender)  # female : 0, male : 1
titanic['gender'] = encoder.transform(gender)

In [12]:
# Port of embarkation -> integer code stored in the new `Embarked` column
# (capital E; the original string column `embarked` is kept).
embarked = titanic['embarked']
encoder = LabelEncoder().fit(embarked)  # C : 0, Q : 1, S : 2
titanic['Embarked'] = encoder.transform(embarked)

In [13]:
# Title -> integer code stored in the new `Title` column (capital T; the
# original string column `title` is kept).
# Capt : 0, Col : 1, Countess : 2, Don : 3, Dona : 4, Dr : 5, Jonkheer : 6, Lady : 7,
# Major : 8, Master : 9, Miss : 10, Mlle : 11, Mme : 12, Mr : 13, Mrs : 14, Ms : 15, Rev : 16, Sir : 17
Title = titanic['title']
encoder = LabelEncoder().fit(Title)
titanic['Title'] = encoder.transform(Title)

In [14]:
# Column rename, kept for reference only (not executed): titanic.rename(columns={'embark':'Embarked'})

In [16]:
# Show each original string column next to its integer-encoded counterpart.
titanic[['sex', 'gender', 'embarked', 'Embarked', 'title', 'Title']].head()

Unnamed: 0,sex,gender,embarked,Embarked,title,Title
0,female,0,S,2,Miss,10
1,male,1,S,2,Master,9
2,female,0,S,2,Miss,10
3,male,1,S,2,Mr,13
4,female,0,S,2,Mrs,14


### 5. feature, target 추출

In [17]:
# Inspect the frame after preprocessing: columns, dtypes and non-null counts.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1306 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1306 non-null   int64  
 1   survived  1306 non-null   int64  
 2   name      1306 non-null   object 
 3   sex       1306 non-null   object 
 4   age       1306 non-null   float64
 5   sibsp     1306 non-null   int64  
 6   parch     1306 non-null   int64  
 7   fare      1306 non-null   float64
 8   embarked  1306 non-null   object 
 9   title     1306 non-null   object 
 10  gender    1306 non-null   int32  
 11  Embarked  1306 non-null   int32  
 12  Title     1306 non-null   int32  
dtypes: float64(2), int32(3), int64(4), object(4)
memory usage: 127.5+ KB


In [18]:
# Feature matrix and target vector.
# Features are selected by column name instead of by position, so the
# selection stays correct if the column order of `titanic` ever changes;
# a missing column raises a KeyError instead of silently picking another one.
data = titanic.loc[:, ['pclass', 'age', 'sibsp', 'parch', 'fare',
                       'gender', 'Embarked', 'Title']]
target = titanic['survived']

In [19]:
# Preview the first rows of the feature matrix.
data.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,gender,Embarked,Title
0,1,29.0,0,0,211.3375,0,2,10
1,1,0.9167,1,2,151.55,1,2,9
2,1,2.0,1,2,151.55,0,2,10
3,1,30.0,1,2,151.55,1,2,13
4,1,25.0,1,2,151.55,0,2,14


In [20]:
# Class balance of the target (0 / 1 counts of `survived`).
target.value_counts()

0    808
1    498
Name: survived, dtype: int64

### 6. 훈련/평가 데이터 분할

In [21]:
# 70 % train split, stratified on the target so both splits keep the
# same class ratio; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    target,
    train_size=0.7,
    stratify=target,
    random_state=2211161315,
)

### 7. 머신러닝 알고리즘 적용

In [22]:
# Fit a decision tree on the training split, predict the test split and
# display the accuracy of those predictions.
dtclf = DecisionTreeClassifier(random_state=2211161315).fit(X_train, Y_train)
pred = dtclf.predict(X_test)

accuracy_score(y_true=Y_test, y_pred=pred)

0.7448979591836735

In [23]:
# Fit a random forest on the training split, predict the test split and
# display the accuracy of those predictions.
rfclf = RandomForestClassifier(random_state=2211161315).fit(X_train, Y_train)
pred = rfclf.predict(X_test)

accuracy_score(y_true=Y_test, y_pred=pred)

0.7729591836734694

### 8. 작업한 데이터프레임 파일로 저장

In [24]:
# Combine the features with the target column into a new frame and write
# it to CSV without the index.
df = data.assign(survived=target)

df.to_csv('data/titanic2.csv', index=False)