## Kaggle competition - 타이타닉 생존자 예측

In [None]:
# 1) 캐글 데이터(https://www.kaggle.com/c/titanic/data) 다운로드
# 2) 데이터 확인
# 3) 결손치 처리
# 4) 문자열 처리
# 5) Survived 속성을 y로 나머지 피쳐를 X로 만듦
# 6) 학습/테스트 데이터 세트 분리
# 7) 적용할 알고리즘 선정:
# • 결정 트리 – DecisionTreeClassifier
# • 랜덤 포레스트 – RandomForestClassifier
# • 로지스틱 회귀 – LogisticRegression
# 8) 학습, 예측 및 평가
# 9) 교차 검증
# 10) GridSearchCV를 통해 최적 파라미터 도출

In [1]:
import pandas as pd
import numpy as np

### 데이터 확인

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train.shape

(891, 12)

In [5]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
test.shape

(418, 11)

In [7]:
# 결측치 확인
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# 결측치 확인
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

### 데이터 전처리

In [9]:
# 다중대체 하기 전, train과 test데이터 셋을 합치기
df = pd.concat([train,test])
df.shape

(1309, 12)

In [10]:
# 불필요한 컬럼인 이름, 캐빈 번호, 티켓 번호 삭제
# 추가로 타겟변수인 survived은 결측치 채울 때에는 없애고 진행하기
df = df.drop(['Name','Cabin','Ticket','Survived'], axis=1) 
df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


In [11]:
# 결측치 확인
df.isnull().sum()

PassengerId      0
Pclass           0
Sex              0
Age            263
SibSp            0
Parch            0
Fare             1
Embarked         2
dtype: int64

- Fare 컬럼 결측치 채우기

In [12]:
# Fare 결측치 포함된 행 확인
df[df['Fare'].isnull()]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
152,1044,3,male,60.5,0,0,,S


In [13]:
# Fare 결측치 행의 Pclass가 3이고, Embarked가 S 이므로 Pclass=3, Embarked=S인 승객들의 평균으로 결측치를 대체하기
pc3 = df['Pclass'] == 3
ems = df['Embarked'] == 'S'
is3s = df[pc3 & ems]
round(is3s['Fare'].mean(), 2)

14.44

In [14]:
# 평균치인 14.44로 값 채우기
df.loc[152, 'Fare'] = 14.44
# 확인
df.iloc[152:153]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
152,153,3,male,55.5,0,0,14.44,S


- Embarked 컬럼 결측치 채우기

In [15]:
# Embarked 결측치 포함된 행 확인
df[df['Embarked'].isnull()]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,62,1,female,38.0,0,0,80.0,
829,830,1,female,62.0,0,0,80.0,


In [16]:
# Fare가 70이상 90이하이고 Pclass가 1인 승객들에 대한 데이터프레임 생성
is70 = df['Fare'] >= 70
is90 = df['Fare'] <= 90
age38 = df['Age'] >= 38
age62 = df['Age'] <= 62
pc1 = df['Pclass'] == 1
is_want = df[is70 & is90 & age38 & age62 & pc1]
is_want.shape

(27, 8)

In [17]:
# 생성한 데이터프레임에서 Embarked의 빈도 확인
is_want.Embarked.value_counts()

C    16
S     8
Q     1
Name: Embarked, dtype: int64

In [18]:
# 최빈값인 C로 대체하기
df.loc[61, 'Embarked'] = 'C'
df.loc[829, 'Embarked'] = 'C'

# 확인
df.iloc[61:62] , df.iloc[829:830]

(    PassengerId  Pclass     Sex   Age  SibSp  Parch  Fare Embarked
 61           62       1  female  38.0      0      0  80.0        C,
      PassengerId  Pclass     Sex   Age  SibSp  Parch  Fare Embarked
 829          830       1  female  62.0      0      0  80.0        C)

- Age 컬럼 결측치 채우기

In [19]:
# 원-핫 인코딩 방식으로 성별을 숫자형 변수로 범주화
df = pd.get_dummies(data = df, columns = ['Sex'], prefix = 'Sex')
df.head(3)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male
0,1,3,22.0,1,0,7.25,S,0,1
1,2,1,38.0,1,0,71.2833,C,1,0
2,3,3,26.0,0,0,7.925,S,1,0


In [20]:
# 원-핫 인코딩 방식으로 Embarked를 숫자형 변수로 범주화
df = pd.get_dummies(data = df, columns = ['Embarked'], prefix = 'Embarked')
df.head(3)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,3,26.0,0,0,7.925,1,0,0,0,1


In [21]:
# 원-핫 인코딩 방식으로 Pclass를 숫자형 변수로 범주화 (범주가 아닌 실수값으로 인식하는 것 방지 위함)
df = pd.get_dummies(data = df, columns = ['Pclass'], prefix = 'Pclass')
df.head(3)

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,2,38.0,1,0,71.2833,1,0,1,0,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1,0,0,1


### 1 - 다중대체법 이용하여 age 결측치 처리하기

In [22]:
from impyute.imputation.cs import mice

In [23]:
# 다중대체법 이용시 PassengerId는 필요하지 않은 데이터이므로 제외한 것으로 분석 진행
np_imputed = mice(df.iloc[:, 1:].values)       # mice 학습시작
df_imputed = pd.DataFrame(np_imputed)  # 대체한 값으로 데이터프레임 만들기
df_imputed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,1.0,0.0,71.2833,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,35.0,1.0,0.0,53.1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,35.0,0.0,0.0,8.05,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [24]:
# 대체한 값으로 Age 컬럼 채우기
df['Age'] = df_imputed[[0]].values

In [25]:
# 결측치 없음을 확인
df.isnull().sum()

PassengerId    0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
Pclass_1       0
Pclass_2       0
Pclass_3       0
dtype: int64

### 2 - 딥러닝 이용하여 age 결측치 처리하기

In [36]:
df.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,2,38.0,1,0,71.2833,1,0,1,0,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1,0,0,1
3,4,35.0,1,0,53.1,1,0,0,0,1,1,0,0
4,5,35.0,0,0,8.05,0,1,0,0,1,0,0,1


In [None]:
##################결측치 대체 완료###################

### 머신러닝 진행

In [26]:
# 이제 결측치 채운 df2를 train과 test set 으로 분리하기
train2 = df[0:891]
test2 = df[891:]

In [27]:
# 행 개수 확인- 처음의 train과 test 와 일치하는지 확인
train2.shape, test2.shape

((891, 13), (418, 13))

In [48]:
type(train['Survived'])

pandas.core.series.Series

In [28]:
# 분석할 때 잠시 제외했던 Survived 컬럼을 다시 추가하기 
train2['Survived'] = train['Survived']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train2['Survived'] = train['Survived']


In [29]:
train2.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Survived
0,1,22.0,1,0,7.25,0,1,0,0,1,0,0,1,0
1,2,38.0,1,0,71.2833,1,0,1,0,0,1,0,0,1
2,3,26.0,0,0,7.925,1,0,0,0,1,0,0,1,1
3,4,35.0,1,0,53.1,1,0,0,0,1,1,0,0,1
4,5,35.0,0,0,8.05,0,1,0,0,1,0,0,1,0


In [61]:
test2.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,892,34.5,0,0,7.8292,0,1,0,1,0,0,0,1
1,893,47.0,1,0,7.0,1,0,0,0,1,0,0,1
2,894,62.0,0,0,9.6875,0,1,0,1,0,0,1,0
3,895,27.0,0,0,8.6625,0,1,0,0,1,0,0,1
4,896,22.0,1,1,12.2875,1,0,0,0,1,0,0,1


In [30]:
# 속성변수 뽑기
feature = train2.iloc[:, 1:13] 
feature.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,38.0,1,0,71.2833,1,0,1,0,0,1,0,0
2,26.0,0,0,7.925,1,0,0,0,1,0,0,1
3,35.0,1,0,53.1,1,0,0,0,1,1,0,0
4,35.0,0,0,8.05,0,1,0,0,1,0,0,1


In [31]:
# 타겟변수 뽑기
target = train2['Survived']
target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

### 1. Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
# train/test set 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature, target, stratify = target, test_size=0.2, random_state=2021)

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
dtc = DecisionTreeClassifier(random_state=2021)
params = {
    'max_depth':[2,3,4,5,6],
    'min_samples_split':[2,3,4]
}
grid_dtc = GridSearchCV(dtc, param_grid=params, cv=5)

In [36]:
# 학습 수행
grid_dtc.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]})

In [37]:
# 최적 예측 
from sklearn.metrics import accuracy_score
pred = grid_dtc.best_estimator_.predict(X_test)
accuracy_score(y_test, pred)

0.7932960893854749

In [62]:
# test2 데이터에 대해 예측하기
predict = grid_dtc.best_estimator_.predict(test2.iloc[:, 1:13] )

In [63]:
predict

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [64]:
# 예측한 값을 Survived 컬럼에 채워넣기
test2['Survived'] = predict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test2['Survived'] = predict


In [65]:
test2.shape

(418, 14)

In [66]:
test2.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Survived
0,892,34.5,0,0,7.8292,0,1,0,1,0,0,0,1,0
1,893,47.0,1,0,7.0,1,0,0,0,1,0,0,1,0
2,894,62.0,0,0,9.6875,0,1,0,1,0,0,1,0,0
3,895,27.0,0,0,8.6625,0,1,0,0,1,0,0,1,0
4,896,22.0,1,1,12.2875,1,0,0,0,1,0,0,1,1


### 2. SVM

In [38]:
from sklearn.svm import SVC
svc=SVC() # 객체 생성

In [39]:
# train/test set 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature, target, stratify = target, test_size=0.2, random_state=2021)

In [40]:
# 학습 수행
svc.fit(X_train,y_train)

SVC()

In [45]:
# 예측 수행
pred_sv = svc.predict(X_test)

In [46]:
# 모델 평가
score = accuracy_score(y_test, pred_sv)
print(f'SVM 예측 정확도{score:.4f}')

SVM 예측 정확도0.6983


### 3. 로지스틱 회귀

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [50]:
# 스케일링
scaled_data = StandardScaler().fit_transform(feature)

In [52]:
# 학습, 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(scaled_data, target, stratify=target, test_size=0.2)

In [53]:
lr_clf = LogisticRegression(random_state=2021) # 객체 생성
lr_clf.fit(X_train,y_train)  # 학습 수행

LogisticRegression(random_state=2021)

In [54]:
# 예측 수행
pred_lr = lr_clf.predict(X_test)

In [55]:
# 모델 평가
score = accuracy_score(y_test, pred_lr)
print(f'로지스틱회귀 예측 정확도{score:.4f}')

로지스틱회귀 예측 정확도0.7765


### 4. 랜덤포레스트

In [56]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
# train/test set 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature, target, stratify = target, test_size=0.2, random_state=2021)

In [62]:
params ={
    'n_estimators':[100],
    'max_depth':[6,8,10,12],
    'min_samples_leaf':[8,12,18],
    'min_samples_split':[8,16,20]
}

In [63]:
rf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train,y_train)  # 학습 수행

GridSearchCV(cv=2, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10, 12],
                         'min_samples_leaf': [8, 12, 18],
                         'min_samples_split': [8, 16, 20],
                         'n_estimators': [100]})

In [64]:
pred_rf = grid_cv.best_estimator_.predict(X_test)
accuracy_score(y_test, pred_rf)

0.7988826815642458