### <b>■ Random Forest</b>
    의사결정트리 + Ensemble
    
    Ensemble
        약한 학습자를 여러개 결합하면 강한 학습자를 만들 수 있는 아이디어를 기반으로 하고
        앙상블 모형은 여러 개의 분류 모형을 같이 사용하여 한꺼번에 평가하는 모델
    

In [3]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')

# 컬럼이 모두다 출력될 수 있도록 출력할 열의 개수 한도를 늘리기
pd.set_option('display.max_columns',15)

# 2단계 결측치 확인하고 제거하거나 치환한다.
# 2.1 타이타닉 데이터 프레임의 자료형을 확인한다.
mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)
rdf = rdf.dropna( subset=['age'], how='any', axis=0)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women']]

# 파이썬의 의사결정트리 모델을 사용하려면 데이터가 다 숫자여야한다.
# 랜덤포레스트 : 의사결정트리 + Bagging

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 33)

from  sklearn.ensemble   import  RandomForestClassifier 
tree_model = RandomForestClassifier( n_estimators=100, oob_score=True, random_state= 9 ).fit(X_train, y_train)
# n_estimator : 생성할 트리의 개수. 약한 학습자 100개 생성. 수가 많아지면 시간이 오래걸림
# oob_score : out of bag 기능의 사용여부. 훈련이 끝난 후 자동으로 oob 평가를 수행. 평가를 보고싶으면 아래의 print를 수행
            # out of bag : 100개의 tree가 훈련 데이터를 사용할 때 63%만 사용, 나머지 37%의 oob sample로 평가. 앙상블의 평가를 oob 평가를 평균하여 획득
print ( tree_model.oob_score_)

y_hat = tree_model.predict( X_test )

from sklearn import metrics
randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )
print( randomforest_matrix )

tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
f1_report = metrics.classification_report( y_test, y_hat )
print( f1_report )

from sklearn.metrics import accuracy_score
accuracy = accuracy_score( y_test, y_hat)
print(accuracy)

0.7454909819639278
[[102  24]
 [ 18  71]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       126
           1       0.75      0.80      0.77        89

    accuracy                           0.80       215
   macro avg       0.80      0.80      0.80       215
weighted avg       0.81      0.80      0.81       215

0.8046511627906977


### ※ 문제235. 아까 점심시간 문제처럼 지금 수행하고 있는 Random Forest 모델의 최적의 하이퍼 파라미터를 for loop문으로 찾는데 oob_score=False로 수행
    알아야 할 파라미터
        1. 훈련 데이터와 테스트 데이터 나눌 때 random_state
        2. RandomForestClassifier 모델 생성시 random_state

In [5]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns',15)

df = sns.load_dataset('titanic')

mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)
rdf = rdf.dropna( subset=['age'], how='any', axis=0)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women']]

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

b=[]
c=[]
for i in range(1, 50):
    X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = i)
    
    for k in range(1,50):
        b.append((i,k))
        
        from  sklearn.ensemble   import  RandomForestClassifier 
        tree_model = RandomForestClassifier( n_estimators=100, oob_score=False, random_state= k ).fit(X_train, y_train)
                
        y_hat = tree_model.predict( X_test )
        
        from sklearn import metrics
        randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )

        tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
        f1_report = metrics.classification_report( y_test, y_hat )
        
        from sklearn.metrics import accuracy_score
        accuracy = accuracy_score( y_test, y_hat)
        c.append(accuracy)

idx = c.index(np.max(c))
print('data_random_state: ', b[idx][0], 'model_random_state: ', b[idx][1], 'accuracy:', c[idx])

data_random_state:  47 model_random_state:  26 accuracy: 0.8372093023255814


### <b>■ 랜덤포레스트 모델의 성능을 올리기 위해 데이터 전처리 하는 방법</b>
    1. 결측치가 많은 컬럼은 삭제 - cabin
    2. 나이의 결측치를 치환 - 최빈값으로 치환
        - Kaggle : 정확도 0.80861, 상위 6%
    3. 이상치 제거 - fare의 이상치를 제거
        - Kaggle : 정확도 0.81339, 상위 5%
        
    * 파생변수 생성
        1. 이름의 호칭을 나이 결측치를 채워넣는다 - 상위 4%
        
        

In [32]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')

pd.set_option('display.max_columns',15)

mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)
rdf = rdf.dropna( subset=['age'], how='any', axis=0)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women']]

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 47)

from  sklearn.ensemble   import  RandomForestClassifier 
tree_model = RandomForestClassifier( n_estimators=200, oob_score=False, random_state= 26).fit(X_train, y_train)
# print ( tree_model.oob_score_)

y_hat = tree_model.predict( X_test )

from sklearn import metrics
randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )
print( randomforest_matrix )

tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
f1_report = metrics.classification_report( y_test, y_hat )
print( f1_report )

from sklearn.metrics import accuracy_score
accuracy = accuracy_score( y_test, y_hat)
print(accuracy)

[[105  16]
 [ 20  74]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.85       121
           1       0.82      0.79      0.80        94

    accuracy                           0.83       215
   macro avg       0.83      0.83      0.83       215
weighted avg       0.83      0.83      0.83       215

0.8325581395348837


### ※ 문제236. seaborn 타이타닉에서 결측치가 가장 많은 컬럼인 cabin을 삭제하시오

In [36]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns',15)

df = sns.load_dataset('titanic')
df = df.drop(['deck'],axis=1,)
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


### ※ 문제237. seaborn 타이타닉의 나이의 결측치를 최빈값으로 치환하시오

In [56]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns',15)

df = sns.load_dataset('titanic')

mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)

most_freq = df['age'].value_counts(dropna=True).idxmax()
rdf['age'].fillna(most_freq, inplace=True)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women']]

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 47)

from  sklearn.ensemble   import  RandomForestClassifier 
tree_model = RandomForestClassifier( n_estimators=200, oob_score=True, random_state= 26).fit(X_train, y_train)
# print ( tree_model.oob_score_)

y_hat = tree_model.predict( X_test )

from sklearn import metrics
randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )
print( randomforest_matrix )

tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
f1_report = metrics.classification_report( y_test, y_hat )
print( f1_report )

from sklearn.metrics import accuracy_score
accuracy = accuracy_score( y_test, y_hat)
print(accuracy)

[[152  14]
 [ 29  73]]
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       166
           1       0.84      0.72      0.77       102

    accuracy                           0.84       268
   macro avg       0.84      0.82      0.82       268
weighted avg       0.84      0.84      0.84       268

0.8395522388059702


In [49]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns',15)

df = sns.load_dataset('titanic')

mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)

most_freq = df['age'].value_counts(dropna=True).idxmax()
rdf['age'].fillna(most_freq, inplace=True)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women']]

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

b=[]
c=[]
for i in range(1, 50):
    X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = i)
    
    for k in range(1,50):
        b.append((i,k))
        
        from  sklearn.ensemble   import  RandomForestClassifier 
        tree_model = RandomForestClassifier( n_estimators=100, oob_score=False, random_state= k ).fit(X_train, y_train)
                
        y_hat = tree_model.predict( X_test )
        
        from sklearn import metrics
        randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )

        tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
        f1_report = metrics.classification_report( y_test, y_hat )
        
        from sklearn.metrics import accuracy_score
        accuracy = accuracy_score( y_test, y_hat)
        c.append(accuracy)

idx = c.index(np.max(c))
print('data_random_state: ', b[idx][0], 'model_random_state: ', b[idx][1], 'accuracy:', c[idx])

[[142  22]
 [ 29  75]]
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       164
           1       0.77      0.72      0.75       104

    accuracy                           0.81       268
   macro avg       0.80      0.79      0.80       268
weighted avg       0.81      0.81      0.81       268

0.8097014925373134


### ※ 문제238. 운임(fare)의 이상치를 제거하시오
    이상치 = (운임 > 운임평균 + 5*표준편차)

In [59]:
# 예제 : seaborn 타이타닉 데이터로 Random Forest 모델 생성
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns',15)

df = sns.load_dataset('titanic')

mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)

most_freq = df['age'].value_counts(dropna=True).idxmax()
rdf['age'].fillna(most_freq, inplace=True)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

local_std = df.fare.std()*5
res = df['fare'][df.fare>local_std]
rdf = rdf[rdf.fare < local_std]

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women','fare']]

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women','fare'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

b=[]
c=[]
for i in range(1, 50):
    X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = i)
    
    for k in range(1,50):
        b.append((i,k))
        
        from  sklearn.ensemble   import  RandomForestClassifier 
        tree_model = RandomForestClassifier( n_estimators=100, oob_score=False, random_state= k ).fit(X_train, y_train)
                
        y_hat = tree_model.predict( X_test )
        
        from sklearn import metrics
        randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )

        tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
        f1_report = metrics.classification_report( y_test, y_hat )
        
        from sklearn.metrics import accuracy_score
        accuracy = accuracy_score( y_test, y_hat)
        c.append(accuracy)

idx = c.index(np.max(c))
print('data_random_state: ', b[idx][0], 'model_random_state: ', b[idx][1], 'accuracy:', c[idx])

data_random_state:  6 model_random_state:  11 accuracy: 0.8679245283018868


In [58]:
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns',15)

df = sns.load_dataset('titanic')

mask4 = (df.age<10) | (df.sex=='female') 
df['child_women'] = mask4.astype(int)

rdf = df.drop(['deck','embark_town'], axis =1)

most_freq = df['age'].value_counts(dropna=True).idxmax()
rdf['age'].fillna(most_freq, inplace=True)

most_freq = rdf['embarked'].value_counts().idxmax()
rdf['embarked'].fillna(most_freq, inplace = True)

local_std = rdf.fare.std()*5
res = rdf['fare'][df.fare>local_std]
rdf = rdf[rdf.fare < local_std]

ndf = rdf[['survived','pclass','sex','age','sibsp','parch','embarked','child_women','fare']]

gender = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf,gender], axis= 1)
onehot_embarked = pd.get_dummies(ndf['embarked'])
ndf = pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['sex','embarked'], axis=1, inplace = True)

x = ndf[ ['pclass', 'age' ,'sibsp', 'parch' ,'female' ,'male', 'C' ,'Q' ,'S', 'child_women','fare'] ]
y = ndf['survived'] # 종속변수

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(x).transform(x)

from  sklearn.ensemble   import  RandomForestClassifier 
tree_model = RandomForestClassifier( n_estimators=100, oob_score=False, random_state= 37).fit(X_train, y_train)
# print ( tree_model.oob_score_)

y_hat = tree_model.predict( X_test )

from sklearn import metrics
randomforest_matrix = metrics.confusion_matrix( y_test, y_hat )
print( randomforest_matrix )

tn, fp, fn, tp = metrics.confusion_matrix( y_test, y_hat ).ravel()
f1_report = metrics.classification_report( y_test, y_hat )
print( f1_report )

from sklearn.metrics import accuracy_score
accuracy = accuracy_score( y_test, y_hat)
print(accuracy)

[[148  19]
 [ 39  59]]
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       167
           1       0.76      0.60      0.67        98

    accuracy                           0.78       265
   macro avg       0.77      0.74      0.75       265
weighted avg       0.78      0.78      0.77       265

0.7811320754716982


### ※ 문제239. Kaggle의 타이타닉 데이터를 랜덤포레스트로 모델을 생성하고 결과를 Kaggle에 제출해서 순위를 확인하시오

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns', 15)

df = pd.read_csv("train.csv")

mask4 = (df.Age < 10) | (df.Sex == 'female')
df['child_women'] = mask4.astype(int)

rdf = df.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], axis=1)

most_freq = rdf['Age'].value_counts(dropna=True).idxmax()
rdf['Age'].fillna(most_freq, inplace=True)

most_freq = rdf['Embarked'].value_counts().idxmax()
rdf['Embarked'].fillna(most_freq, inplace=True)

local_std = rdf.Fare.std() * 5
rdf = rdf[:][rdf['Fare'] < local_std]
ndf = rdf

gender = pd.get_dummies(ndf['Sex'])
ndf = pd.concat([ndf, gender], axis=1)
onehot_embarked = pd.get_dummies(ndf['Embarked'])
ndf = pd.concat([ndf, onehot_embarked], axis=1)
ndf.drop(['Sex', 'Embarked'], axis=1, inplace=True)

x = ndf[['Fare', 'Pclass', 'Age', 'SibSp', 'Parch',
         'female', 'male', 'C', 'Q', 'S', 'child_women']]
y = ndf['Survived']  # 종속변수

X = preprocessing.StandardScaler().fit(x).transform(x)

"""
test data
"""

x_ktest = pd.read_csv("test.csv")
mask4 = (x_ktest.Age < 10) | (x_ktest.Sex == 'female')
x_ktest['child_women'] = mask4.astype(int)

rdf_x_ktest = x_ktest.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], axis=1)

most_freq = rdf_x_ktest['Age'].value_counts(dropna=True).idxmax()
rdf_x_ktest['Age'].fillna(most_freq, inplace=True)

most_freq = rdf_x_ktest['Embarked'].value_counts().idxmax()
rdf_x_ktest['Embarked'].fillna(most_freq, inplace=True)

most_freq = rdf_x_ktest['Fare'].value_counts().idxmax()
rdf_x_ktest['Fare'].fillna(most_freq, inplace=True)

ndf_x_ktest = rdf_x_ktest

gender = pd.get_dummies(ndf_x_ktest['Sex'])
ndf_x_ktest = pd.concat([ndf_x_ktest, gender], axis=1)
onehot_embarked = pd.get_dummies(ndf_x_ktest['Embarked'])
ndf_x_ktest = pd.concat([ndf_x_ktest, onehot_embarked], axis=1)
ndf_x_ktest.drop(['Sex', 'Embarked'], axis=1, inplace=True)

x = ndf_x_ktest[['Fare', 'Pclass', 'Age', 'SibSp', 'Parch','female', 'male', 'C', 'Q', 'S', 'child_women']]

X_test = preprocessing.StandardScaler().fit(x).transform(x)

tree_model = RandomForestClassifier(n_estimators=200, oob_score=False, random_state=14).fit(X, y)

y_hat = tree_model.predict(X_test)

for i, a in enumerate(y_hat):
    print(str(i+892) + ',' + str(a))

892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,0
900,1
901,0
902,0
903,0
904,1
905,0
906,1
907,1
908,0
909,0
910,0
911,1
912,1
913,1
914,1
915,0
916,1
917,0
918,1
919,0
920,0
921,0
922,0
923,0
924,1
925,0
926,0
927,0
928,0
929,0
930,0
931,0
932,0
933,0
934,0
935,1
936,1
937,0
938,0
939,0
940,1
941,1
942,0
943,0
944,1
945,0
946,0
947,0
948,0
949,0
950,0
951,1
952,0
953,0
954,0
955,0
956,1
957,1
958,1
959,0
960,0
961,1
962,0
963,0
964,1
965,0
966,1
967,0
968,0
969,1
970,0
971,0
972,1
973,0
974,0
975,0
976,0
977,0
978,0
979,1
980,0
981,1
982,0
983,0
984,1
985,0
986,0
987,0
988,1
989,0
990,0
991,0
992,1
993,0
994,0
995,0
996,1
997,0
998,0
999,0
1000,0
1001,0
1002,0
1003,0
1004,1
1005,1
1006,1
1007,0
1008,0
1009,1
1010,0
1011,1
1012,1
1013,0
1014,1
1015,0
1016,0
1017,1
1018,0
1019,1
1020,0
1021,0
1022,0
1023,0
1024,0
1025,0
1026,0
1027,0
1028,0
1029,0
1030,0
1031,0
1032,0
1033,1
1034,0
1035,0
1036,0
1037,0
1038,0
1039,0
1040,0
1041,0
1042,1
1043,0
1044,0
1045,1
1046,0
1047,0
1048,1
1049,0
10

### ※ 문제240. 나이의 결측치를 최빈값이 아니라 호칭의 평균나이로 채워넣고 다시 학습시켜 Kaggle에 올리시오

In [143]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns', 15)

df = pd.read_csv("train.csv")

mask4 = (df.Age < 10) | (df.Sex == 'female')
df['child_women'] = mask4.astype(int)

# most_freq = rdf['Age'].value_counts(dropna=True).idxmax()
# rdf['Age'].fillna(most_freq, inplace=True)

appel = df['Name'].str.split(',').str[1].str.split('.')
df['title'] = appel.str.get(0)
age_m = pd.DataFrame({'mean_age':round(df.groupby('title')['Age'].mean(),1)})
age_m

df = pd.merge(df, age_m, on='title',how='left')
df.Age.fillna(df.mean_age, inplace=True)

most_freq = df['Embarked'].value_counts().idxmax()
df['Embarked'].fillna(most_freq, inplace=True)

local_std = df.Fare.std() * 5
df = df[:][df['Fare'] < local_std]

rdf = df.drop(['PassengerId', 'Cabin', 'Name', 'Ticket', 'mean_age','title'], axis=1)

ndf = rdf

gender = pd.get_dummies(ndf['Sex'])
ndf = pd.concat([ndf, gender], axis=1)
onehot_embarked = pd.get_dummies(ndf['Embarked'])
ndf = pd.concat([ndf, onehot_embarked], axis=1)
ndf.drop(['Sex', 'Embarked'], axis=1, inplace=True)

x = ndf[['Fare', 'Pclass', 'Age', 'SibSp', 'Parch', 'female', 'male', 'C', 'Q', 'S', 'child_women']]
y = ndf['Survived']  # 종속변수

X = preprocessing.StandardScaler().fit(x).transform(x)

"""
test data
"""

x_test = pd.read_csv("test.csv")
mask4 = (x_ktest.Age < 10) | (x_ktest.Sex == 'female')
x_test['child_women'] = mask4.astype(int)

# most_freq = rdf_x_ktest['Age'].value_counts(dropna=True).idxmax()
# rdf_x_ktest['Age'].fillna(most_freq, inplace=True)

appel = x_test['Name'].str.split(',').str[1].str.strip().str.split('.')
x_test['title'] = appel.str.get(0)

test_age_m = pd.DataFrame({'mean_age':round(x_test.groupby('title')['Age'].mean(),1)})
test_age_m.loc['Ms']=28.0

x_test = pd.merge(x_test, test_age_m, on='title',how='left')
x_test.Age.fillna(x_test.mean_age, inplace=True)

most_freq = x_test['Embarked'].value_counts().idxmax()
x_test['Embarked'].fillna(most_freq, inplace=True)

most_freq = x_test['Fare'].value_counts().idxmax()
x_test['Fare'].fillna(most_freq, inplace=True)

rdf_x_ktest = x_test.drop(['PassengerId', 'Cabin', 'Name', 'Ticket','mean_age','title'], axis=1,)
ndf_x_ktest = rdf_x_ktest


gender = pd.get_dummies(ndf_x_ktest['Sex'])
ndf_x_ktest = pd.concat([ndf_x_ktest, gender], axis=1)
onehot_embarked = pd.get_dummies(ndf_x_ktest['Embarked'])
ndf_x_ktest = pd.concat([ndf_x_ktest, onehot_embarked], axis=1)
ndf_x_ktest.drop(['Sex', 'Embarked'], axis=1, inplace=True)

x = ndf_x_ktest[['Fare', 'Pclass', 'Age', 'SibSp', 'Parch','female', 'male', 'C', 'Q', 'S', 'child_women']]

X_test = preprocessing.StandardScaler().fit(x).transform(x)

tree_model = RandomForestClassifier(n_estimators=400, oob_score=False,).fit(X, y)

y_hat = tree_model.predict(X_test)

for i, a in enumerate(y_hat):
    print(str(i+892) + ',' + str(a))

892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,0
900,1
901,0
902,0
903,0
904,1
905,0
906,1
907,1
908,0
909,1
910,0
911,1
912,1
913,1
914,1
915,1
916,1
917,0
918,1
919,0
920,0
921,0
922,0
923,0
924,1
925,0
926,0
927,1
928,0
929,0
930,0
931,1
932,0
933,0
934,0
935,1
936,1
937,0
938,0
939,0
940,1
941,1
942,0
943,0
944,1
945,0
946,0
947,0
948,0
949,0
950,0
951,1
952,0
953,0
954,0
955,0
956,1
957,1
958,1
959,0
960,0
961,1
962,0
963,0
964,1
965,0
966,1
967,0
968,0
969,1
970,0
971,0
972,1
973,0
974,0
975,0
976,0
977,0
978,0
979,1
980,0
981,1
982,0
983,0
984,1
985,0
986,0
987,0
988,1
989,0
990,1
991,0
992,1
993,0
994,0
995,0
996,1
997,0
998,0
999,0
1000,0
1001,0
1002,0
1003,0
1004,1
1005,1
1006,1
1007,0
1008,0
1009,1
1010,0
1011,1
1012,1
1013,0
1014,1
1015,0
1016,0
1017,1
1018,0
1019,1
1020,0
1021,0
1022,0
1023,0
1024,0
1025,0
1026,0
1027,0
1028,0
1029,0
1030,0
1031,0
1032,0
1033,1
1034,0
1035,0
1036,0
1037,0
1038,0
1039,0
1040,0
1041,0
1042,1
1043,0
1044,0
1045,1
1046,0
1047,0
1048,1
1049,0
10