### <b>■ Random Forest</b>
    의사결정트리 + Ensemble
    
    Ensemble
        약한 학습자를 여러개 결합하면 강한 학습자를 만들 수 있는 아이디어를 기반으로 하고
        앙상블 모형은 여러 개의 분류 모형을 같이 사용하여 한꺼번에 평가하는 모델
    

In [3]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')

# 컬럼이 모두다 출력될 수 있도록 출력할 열의 개수 한도를 늘리기
pd.set_option('display.max_columns',15)

# 2단계 결측치 확인하고 제거하거나 치환한다.
# 2.1 타이타닉 데이터 프레임의 자료형을 확인한다.
mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)
rdf = rdf.dropna( subset=['age'], how='any', axis=0)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women']]

# 파이썬의 의사결정트리 모델을 사용하려면 데이터가 다 숫자여야한다.
# 랜덤포레스트 : 의사결정트리 + Bagging

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 33)

from  sklearn.ensemble   import  RandomForestClassifier 
tree_model = RandomForestClassifier( n_estimators=100, oob_score=True, random_state= 9 ).fit(X_train, y_train)
# n_estimator : 생성할 트리의 개수. 약한 학습자 100개 생성. 수가 많아지면 시간이 오래걸림
# oob_score : out of bag 기능의 사용여부. 훈련이 끝난 후 자동으로 oob 평가를 수행. 평가를 보고싶으면 아래의 print를 수행
            # out of bag : 100개의 tree가 훈련 데이터를 사용할 때 63%만 사용, 나머지 37%의 oob sample로 평가. 앙상블의 평가를 oob 평가를 평균하여 획득
print ( tree_model.oob_score_)

y_hat = tree_model.predict( X_test )

from sklearn import metrics
randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )
print( randomforest_matrix )

tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
f1_report = metrics.classification_report( y_test, y_hat )
print( f1_report )

from sklearn.metrics import accuracy_score
accuracy = accuracy_score( y_test, y_hat)
print(accuracy)

0.7454909819639278
[[102  24]
 [ 18  71]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       126
           1       0.75      0.80      0.77        89

    accuracy                           0.80       215
   macro avg       0.80      0.80      0.80       215
weighted avg       0.81      0.80      0.81       215

0.8046511627906977


### ※ 문제235. 아까 점심시간 문제처럼 지금 수행하고 있는 Random Forest 모델의 최적의 하이퍼 파라미터를 for loop문으로 찾는데 oob_score=False로 수행
    알아야 할 파라미터
        1. 훈련 데이터와 테스트 데이터 나눌 때 random_state
        2. RandomForestClassifier 모델 생성시 random_state

In [5]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns',15)

df = sns.load_dataset('titanic')

mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)
rdf = rdf.dropna( subset=['age'], how='any', axis=0)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women']]

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

b=[]
c=[]
for i in range(1, 50):
    X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = i)
    
    for k in range(1,50):
        b.append((i,k))
        
        from  sklearn.ensemble   import  RandomForestClassifier 
        tree_model = RandomForestClassifier( n_estimators=100, oob_score=False, random_state= k ).fit(X_train, y_train)
                
        y_hat = tree_model.predict( X_test )
        
        from sklearn import metrics
        randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )

        tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
        f1_report = metrics.classification_report( y_test, y_hat )
        
        from sklearn.metrics import accuracy_score
        accuracy = accuracy_score( y_test, y_hat)
        c.append(accuracy)

idx = c.index(np.max(c))
print('data_random_state: ', b[idx][0], 'model_random_state: ', b[idx][1], 'accuracy:', c[idx])

data_random_state:  47 model_random_state:  26 accuracy: 0.8372093023255814


### <b>■ 랜덤포레스트 모델의 성능을 올리기 위해 데이터 전처리 하는 방법</b>
    1. 결측치가 많은 컬럼은 삭제 - cabin
    2. 나이의 결측치를 치환 - 최빈값으로 치환
        - Kaggle : 정확도 0.80861, 상위 6%
    3. 이상치 제거 - fare의 이상치를 제거
        - Kaggle : 정확도 0.81339, 상위 5%
        
    * 파생변수 생성
        1. 이름의 호칭을 나이 결측치를 채워넣는다 - 상위 4%
        
        

In [32]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')

pd.set_option('display.max_columns',15)

mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)
rdf = rdf.dropna( subset=['age'], how='any', axis=0)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women']]

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 47)

from  sklearn.ensemble   import  RandomForestClassifier 
tree_model = RandomForestClassifier( n_estimators=200, oob_score=False, random_state= 26).fit(X_train, y_train)
# print ( tree_model.oob_score_)

y_hat = tree_model.predict( X_test )

from sklearn import metrics
randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )
print( randomforest_matrix )

tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
f1_report = metrics.classification_report( y_test, y_hat )
print( f1_report )

from sklearn.metrics import accuracy_score
accuracy = accuracy_score( y_test, y_hat)
print(accuracy)

[[105  16]
 [ 20  74]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.85       121
           1       0.82      0.79      0.80        94

    accuracy                           0.83       215
   macro avg       0.83      0.83      0.83       215
weighted avg       0.83      0.83      0.83       215

0.8325581395348837


### ※ 문제236. seaborn 타이타닉에서 결측치가 가장 많은 컬럼인 cabin을 삭제하시오

In [33]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns',15)

df = sns.load_dataset('titanic')
df = df.drop(col)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True
