In [1]:
import warnings
warnings.filterwarnings(action='ignore')
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost

In [2]:
df = pd.read_csv('data/titanic.csv')

In [3]:
## 데이터 개요
df.head()

## 데이터 변수 확인
df.columns # 변수 이름 확인
df.columns.size # 변수 개수 확인
df.shape # 행의 개수, 열의 개수
df.dtypes # 변수의 데이터 타입 (수치형으로 변환하기 위해)

## 특징(변수)는 절대 Na가 있으면 안되므로 반드시 Na 확인
df.isna().sum()

## 필요한 column(변수)만 select
selected_cols = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived']
data = df[selected_cols]

In [4]:
## 분석 데이터 개요
data.head()

## 데이터 변수 확인
data.columns # 변수 이름 확인
data.columns.size # 변수 개수 확인
data.shape # 행의 개수, 열의 개수
data.dtypes # 변수의 데이터 타입 (수치형으로 변환하기 위해)

## 특징(변수)는 절대 Na가 있으면 안되므로 반드시 Na 확인
data.isna().sum()
# data.info() # row뿐만이 아니라 column의 개수가 많아도 데이터가 크다고 할 수 있음

# Pclass        0
# Name          0
# Sex           0
# Age         177
# SibSp         0
# Parch         0
# Ticket        0
# Fare          0
# Cabin       687
# Embarked      2
# Survived      0
# dtype: int64


## na 정리
# 1. drop데이터 선별 (column)
# Cabin

# 2. drop데이터 선별 (row)
# Embarked
# data.drop(data.Embarked.isna(), axis=0)
data = data.loc[~data.Embarked.isna(),:]
data.isna().sum() ## 해당 row가 지워졌는지 확인



Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      0
Survived      0
dtype: int64

In [5]:
## Cabin imputation(채워넣기)
# 수치형 --> 외곡의 우려 있어서 세심한 주의 필요 (평균 사용시 주의 요망)
# 범주형데이터 --> real Na? 범주외의 대체값이 존재하는가?

## Cabin EDA
data.Cabin.str[:1].value_counts() # 첫글자를 기준으로 분류
data.Cabin.fillna('N',inplace=True) # Na 값을 전부 'N'으로 치환
data.isna().sum() # Na 개수 확인
data.Cabin.str[:1].value_counts() # 채워졌는지 확인 
newCabin = data.Cabin.str[:1]
data.drop('Cabin',axis=1,inplace=True)
data['Cabin'] = newCabin

In [6]:
## Age imputation

## Age EDA
data.Age.describe()
data.loc[data.Age <= 18,:] # 18세 미만인 사람
data.loc[data.Age.isna(),['Pclass','Fare']] # Age값이 NULL인 사람의 선실, 요금 정보
data.groupby(['Pclass','Sex'])['Fare'].mean()

## Age 예측을 위한 회귀모델 생성
df_age = data.loc[~data.Age.isna(),['Pclass','Sex','Fare','Embarked','Age']] # 예측을 위한 데이터
pred_age = data.loc[data.Age.isna(),['Pclass','Sex','Fare','Embarked']] # 예측할 데이터

le_sex = LabelEncoder()
le_sex.fit(df_age.Sex)
newSex = le_sex.transform(df_age.Sex)

le_embarked = LabelEncoder()
le_embarked.fit(df_age.Embarked)
newEmbarked = le_embarked.transform(df_age.Embarked)

df_age.drop(['Sex','Embarked'], axis=1, inplace=True)
newCols = np.c_[newSex, newEmbarked]
df_age = pd.concat([pd.DataFrame(newCols, index=df_age.index), df_age], axis=1)

age_model = RandomForestRegressor()
age_model.fit(df_age.iloc[:,:-1], df_age.iloc[:,-1])

## Null 예측 실행
newSex = le_sex.transform(pred_age.Sex)
newEmbarked = le_embarked.transform(pred_age.Embarked)

pred_age.drop(['Sex','Embarked'], axis=1, inplace=True)
newCols = np.c_[newSex, newEmbarked]
pred_age = pd.concat([pd.DataFrame(newCols, index=pred_age.index), pred_age], axis=1)

newAge = age_model.predict(pred_age)
pred_age['Age'] = newAge

## 
newAge = pd.merge(df_age, pred_age, how='outer')[['Age']]
newAge.reset_index(inplace=True, drop=True) # 인덱스 맞추기


## 기존 data에 newAge컬럼 추가
data.reset_index(inplace=True, drop=True) # 인덱스 맞추기
data.drop('Age', axis=1, inplace=True)
data['Age'] = newAge


In [7]:
## 파생 변수 생성 (Name) 결혼유무판단
## data.Name에서 Mrs. 가 존재하면 기혼
## data.Name에서 Mr. 가 가족(SibSp)이 있으면 기혼 또는 25세 이상이면 기혼
married = np.where(data.Name.str.contains('Mrs.') | (data.Name.str.contains('Mr.') & data.SibSp > 0), 1, 0)
data['Married'] = married

In [8]:
## Name, Ticket 컬럼 제거
data.drop(['Name', 'Ticket'], axis=1, inplace=True)
data.head()

## data의 컬럼 타입 확인
data.dtypes

Pclass        int64
Sex          object
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
Survived      int64
Cabin        object
Age         float64
Married       int64
dtype: object

In [9]:
## preprocessing (전처리) 
# categorical(범주형) 변수에서 종류(unique)가 2개면 정수 인코딩, 나머지는 원핫 인코딩
# obj = data.dtypes[data.dtypes == 'object'].index
# data[obj[0]].unique().size

# 수치형(연속형) 변수일 경우 정규화(MinMax, Robust, Standard)
# mmScale = (X - np.min(X)) / (np.max(X) - np.min(X))
# rbScale = (X - np.percentile(X, 50)) / (np.percentile(X, 75) - (np.percentile(X, 25))
# stScale = (X - np.mean(X)) / (np.sd(X))

In [10]:
# plt.boxplot(data.Age)
newAge = np.where(data.Age > np.percentile(data.Age, 90), 0, 
                  np.where(data.Age > np.percentile(data.Age, 75), 1, 
                           np.where(data.Age > np.percentile(data.Age, 25), 2, 3)))
data.drop('Age',axis=1,inplace=True)
data['Age'] = newAge

In [11]:
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# newPclass = OneHotEncoder().fit_transform(data[['Pclass']]).toarray()
# newSex = OneHotEncoder().fit_transform(data[['Sex']]).toarray()
# newSibSp = OneHotEncoder().fit_transform(data[['SibSp']]).toarray()
# newCols = np.c_[newPclass, newSex, newSibSp]
# newCols.shape

selected_cols = ['Pclass','Sex','SibSp', 'Age', 'Parch','Embarked','Cabin','Married']
ct = make_column_transformer(
    (OneHotEncoder(),selected_cols)
)
newCols = ct.fit_transform(data).toarray()
data.drop(selected_cols, axis=1, inplace=True)
data = pd.concat([pd.DataFrame(newCols), data], axis=1)
data.shape

(889, 39)

In [12]:
## train, test 분리
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
## Model Instance 생성
rf_model = RandomForestClassifier()
ada_model = AdaBoostClassifier()
lr_model = LogisticRegression()
svc_model = SVC(probability=True)
knn_model = KNeighborsClassifier()
xgb_param = {
    'n_estimators': 100, 
    'learning_rate': 0.08, 
    'gamma': 0, 
    'subsample': 0.75,
    'colsample_bytree': 1, 
    'max_depth': 7,
    'verbosity': 0
}
xgb_model = xgboost.XGBClassifier(**xgb_param)

In [14]:
## Model Training
rf_model.fit(X_train,y_train)
ada_model.fit(X_train,y_train)
lr_model.fit(X_train,y_train)
svc_model.fit(X_train,y_train)
knn_model.fit(X_train,y_train)
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.08, max_delta_step=0,
              max_depth=7, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=1,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.75,
              tree_method='exact', validate_parameters=1, verbosity=0)

In [15]:
rf_result = rf_model.predict_proba(X_test)
ada_result = ada_model.predict_proba(X_test)
lr_result = lr_model.predict_proba(X_test)
svc_result = svc_model.predict_proba(X_test)
knn_result = knn_model.predict_proba(X_test)
xgb_result = xgb_model.predict_proba(X_test)

# hard_voting_result = rf_model.predict(X_test) + ada_model.predict(X_test) + lr_model.predict(X_test) + svc_model.predict(X_test) + knn_model.predict(X_test)
hard_voting_result = rf_model.predict(X_test) + ada_model.predict(X_test) + lr_model.predict(X_test) + svc_model.predict(X_test) + xgb_model.predict(X_test)
hard_acc = sum(np.where(hard_voting_result > 2, 1, 0) == y_test) / hard_voting_result.size

soft_voting_result = rf_result + ada_result + lr_result + svc_result + knn_result
soft_acc = sum(np.array(soft_voting_result[:,0] < soft_voting_result[:,1], dtype='int') == y_test) / y_test.size

In [16]:
print('rf_model: ', rf_model.score(X_test,y_test))
print('ada_model: ', ada_model.score(X_test,y_test))
print('lr_model: ', lr_model.score(X_test,y_test))
print('svc_model: ', svc_model.score(X_test,y_test))
print('knn_model: ', knn_model.score(X_test,y_test))
print('xgb_model: ', xgb_model.score(X_test,y_test))
print('hard_voting_model: ', hard_acc)
print('soft_voting_model: ', soft_acc)

rf_model:  0.8089887640449438
ada_model:  0.8089887640449438
lr_model:  0.797752808988764
svc_model:  0.6966292134831461
knn_model:  0.7640449438202247
xgb_model:  0.8202247191011236
hard_voting_model:  0.8089887640449438
soft_voting_model:  0.8202247191011236


In [17]:
## Meta Model의 train x 생성
v1 = rf_model.predict(X_train)
v2 = ada_model.predict(X_train)
v3 = lr_model.predict(X_train)
v4 = svc_model.predict(X_train)
v5 = xgb_model.predict(X_train)
X_train_meta = np.c_[v1,v2,v3,v4,v5]

In [18]:
## Meta Model
meta_model = knn_model
meta_model.fit(X_train_meta,y_train)

KNeighborsClassifier()

In [19]:
## Meta Model test x 생성
v1 = rf_model.predict(X_test)
v2 = ada_model.predict(X_test)
v3 = lr_model.predict(X_test)
v4 = svc_model.predict(X_test)
v5 = xgb_model.predict(X_test)
X_test_meta = np.c_[v1,v2,v3,v4,v5]

In [20]:
## Meta Model 검증
meta_model.score(X_test_meta,y_test)

0.8089887640449438