In [41]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [42]:
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")
combine = [train_df, test_df]

# 1. 데이터 확인

In [43]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [44]:
train_df.describe(include = ['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Coleridge, Mr. Reginald Charles",male,1601,G6,S
freq,1,577,7,4,644


# 2. 전처리

## 1) 분석에 전혀 필요없는 변수 제거

In [45]:
train_df = train_df.drop(['Ticket','Cabin'],axis=1)
test_df = test_df.drop(['Ticket','Cabin'],axis=1)
train_df=train_df.drop(['Name','PassengerId'],axis=1)
test_df=test_df.drop(['Name'],axis=1)
combine = [train_df,test_df]

## 2) CATEGORICAL STR -> CATEGORICAL INT

In [46]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map({'female':1,'male':0}).astype(int)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


## 3) CATEGORICAL STR (결측치 존재 + 대상이 많은 경우) -> CATEGORICAL INT 
##### 2-1) IMPUTATION: MODE (string이기 때문에 최빈값으로)
##### 2-2) Mapping by Dict(카테고리가 많기 때문)

In [47]:
# 1. IMPUTATION - 최빈값
freq_port=train_df['Embarked'].dropna().mode()[0]

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

# 2. CONVERT
Embarked_mapping=defaultdict(int)
for i,t in enumerate(train_df['Embarked'].unique()):
    Embarked_mapping[t]=i

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(Embarked_mapping)

## 4) INT -> CATEGORICAL INT 
##### 3-1) IMPUTATION: MEDIAN
##### 3-2) Mapping by Dict(카테고리가 많기 때문)

In [48]:
# 1. TEST SET의 IMPUTATION: MEDIAN 이용
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)

# 2. TRAIN SET에서 구간을 정해서 만들어 냄
train_df['FareBand']=pd.qcut(train_df['Fare'],4)
train_df[['FareBand','Survived']].groupby(['FareBand'],as_index=False).mean().sort_values(by='FareBand',ascending=True)

FareBand_lst=sorted(train_df['FareBand'].unique())

# 3. FareBand 구간에 포함되는 정도로 변수값을 Catregorical하게 변경
for dataset in combine:
    for i in range(len(FareBand_lst)):
        dataset.loc[(train_df['Fare'] > FareBand_lst[i].left) & (dataset['Fare'] <= FareBand_lst[i].right),'Fare']=i
        
    dataset['Fare']=dataset['Fare'].astype(int)
    
#4. 임시로 만들어뒀던 구간 변수를 제거
train_df = train_df.drop(['FareBand'],axis=1)

#5. 다시 재결합
combine=[train_df,test_df]

## 5) 계층별 Imputation

In [49]:
guess_ages=np.zeros((2,3))

for dataset in combine:
    for i in range(2): #성별에 따라
        for j in range(3): #탑승 Class에 따라
            guess_df=dataset[(dataset['Sex']==i) & (dataset['Pclass']==j+1)]['Age'].dropna()
            age_guess=guess_df.median()
            guess_ages[i,j]=int(age_guess/0.5+0.5)*0.5
            
    for i in range(0,2):
        for j in range(0,3):
            dataset.loc[(dataset['Age'].isnull()) & (dataset['Sex']==i) & (dataset['Pclass']==j+1),'Age']=guess_ages[i,j]
            
    dataset['Age']=dataset['Age'].astype(int)

# 3. 1차 모델링

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

### 1) X, Y 분리

In [52]:
X_train = combine[0].drop("Survived",axis=1)
Y_train = combine[0]["Survived"]
X_test  = combine[1].drop("PassengerId",axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

### 2) Classifier

In [64]:
params = { 'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],# OVERFITTING 방지: max_depth
           'min_samples_leaf' : [8, 12, 18],# OVERFITTING 방지: min_samples_leaf
           'min_samples_split' : [2, 4, 8, 16, 20] # OVERFITTING 방지
            #Default = 2 → 작게 설정할 수록 분할 노드가 많아져 과적합 가능성 증가
          }


# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 3, n_jobs = -1)
grid_cv.fit(X_train, Y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 8, 'min_samples_leaf': 12, 'min_samples_split': 2, 'n_estimators': 10}
최고 예측 정확도: 0.8193


In [63]:
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, Y_train)
pred = rf_model.predict(X_test)
pred_df = pd.DataFrame(pred, columns=['Pred'])
pred_df

Unnamed: 0,Pred
0,0
1,0
2,0
3,1
4,0
...,...
413,1
414,1
415,0
416,1


### 3) Regressor

In [66]:
from sklearn.ensemble import RandomForestRegressor

In [67]:
rf_model = RandomForestRegressor(n_estimators=100)
rf_model.fit(X_train, Y_train)
pred = rf_model.predict(X_test)
pred_df = pd.DataFrame(pred, columns=['Pred'])
pred_df

Unnamed: 0,Pred
0,0.010588
1,0.050000
2,0.450000
3,0.759667
4,0.414000
...,...
413,0.570071
414,1.000000
415,0.000000
416,0.570071
