### sklearn.ensemble.RandomForestClassifier
### sklearn.ensemble.RandomForestRegressor

##### 무작위 데이터 샘플링 과정
- 데이터에서 부트스트래핑을 통해 N개의 데이터셋 생성
- 샘플링된 각 데이터셋에서 임의의 변수를 선택(총 M개 중 sqrt(M)개 또는 M/3개)
- Decision Tree를 종합하여 앙상블 모델 생성 이후 OOB Error를 통해 오분류율 평가

#### 주요 Hyperparameter
- n_estimators : 나무의 수(의사결정나무 모델의 구성 수), 기본값 100
- max_features : 선택 변수의 수로 특성치의 반영 정도
- 'auto' / 'sqrt' : sqrt(n_features)
- 'log2' : log2
- none : n_features

##### DecisionTreeClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
##### DecisionTreeRegressor(n_estimators, *, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)

# 분석 코드 - Classification

In [1]:
# 라이브러리 및 데이터 로드
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error

df = pd.read_csv('../input/big-data-certification-study/breast-cancer-wisconsin.csv', encoding='utf-8')
df.head()

Unnamed: 0,code,Clump_Thickness,Cell_Size,Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [2]:
# 데이터 분리
X=df.drop(columns=['code','Class'])
y=df[['Class']]
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,random_state=42)

In [3]:
# 정규화
scaler=MinMaxScaler()
scaler.fit(X_train)
mm_X_train=scaler.transform(X_train)
mm_X_test=scaler.transform(X_test)

In [4]:
# 모델 적용
model=RandomForestClassifier()
model.fit(mm_X_train, y_train)
pred_train=model.predict(mm_X_train)
model.score(mm_X_train,y_train)

# 과대적합

1.0

In [5]:
# 혼동행렬, 분류예측 보고서
cm_train=confusion_matrix(y_train,pred_train)
cfr_train=classification_report(y_train,pred_train)
print('혼동행렬 :\n',cm_train,
      '\n\n\n분류예측 보고서 :\n',cfr_train)

혼동행렬 :
 [[333   0]
 [  0 179]] 


분류예측 보고서 :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512



In [6]:
# 모델 적용
pred_test=model.predict(mm_X_test)
model.score(mm_X_test, y_test)

0.9707602339181286

In [7]:
# 혼동행렬, 분류예측 보고서
cm_test=confusion_matrix(y_test,pred_test)
cfr_test=classification_report(y_test,pred_test)
print('혼동행렬 :\n',cm_test,'\n\n\n분류예측 보고서 :\n',cfr_test)

혼동행렬 :
 [[106   5]
 [  0  60]] 


분류예측 보고서 :
               precision    recall  f1-score   support

           0       1.00      0.95      0.98       111
           1       0.92      1.00      0.96        60

    accuracy                           0.97       171
   macro avg       0.96      0.98      0.97       171
weighted avg       0.97      0.97      0.97       171



In [8]:
# Hyperparameter Tuning
# Grid Search
param_g = {'n_estimators':range(100, 1000, 100),
           'max_features':['auto', 'log2']}
grid= GridSearchCV(RandomForestClassifier(), param_g, cv=5,
                   return_train_score=True)
grid.fit(mm_X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': ['auto', 'log2'],
                         'n_estimators': range(100, 1000, 100)},
             return_train_score=True)

In [9]:
print('Best Parameter :', grid.best_params_)
print('Best Score :',round(grid.best_score_,4))
print('Test Score :',round(grid.score(mm_X_test, y_test),4))

Best Parameter : {'max_features': 'auto', 'n_estimators': 100}
Best Score : 0.9746
Test Score : 0.9649


In [10]:
# Randomized Search
param_r={'n_estimators':randint(low=100, high=1000),
         'max_features':['auto','log2']}
random=RandomizedSearchCV(RandomForestClassifier(),
                          param_distributions=param_r,
                          cv=5, n_iter=20,
                          return_train_score=True)
random.fit(mm_X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=20,
                   param_distributions={'max_features': ['auto', 'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f1ec077c050>},
                   return_train_score=True)

In [11]:
print('Best Parameter :', random.best_params_)
print('Best Score :',round(random.best_score_,4))
print('Test Score :',round(random.score(mm_X_test, y_test),4))

Best Parameter : {'max_features': 'auto', 'n_estimators': 755}
Best Score : 0.9746
Test Score : 0.9649


# 분석 코드 - Regression

In [12]:
df2=pd.read_csv('../input/big-data-certification-study/house_price.csv', encoding='utf-8')
df2.head()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [13]:
# 데이터 분리
X=df2.drop(columns=['house_value'])
y=df2[['house_value']]
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42)

In [14]:
# 정규화
scale=MinMaxScaler()
scale.fit(X_train)
ms_X_train=scale.transform(X_train)
ms_x_test=scale.transform(X_test)

In [15]:
# 모델 적용
model_r=RandomForestRegressor()
model_r.fit(ms_X_train, y_train)
pred_x=model_r.predict(ms_X_train)
model_r.score(ms_X_train, y_train)

0.9462693236131015

In [16]:
pred_y=model_r.predict(ms_x_test)
model_r.score(ms_x_test,y_test)

0.6244026736061874

In [17]:
# RMSE
rmse_train=np.sqrt(mean_squared_error(y_train,pred_x))
rmse_test=np.sqrt(mean_squared_error(y_test,pred_y))
print('Train RMSE :', round(rmse_train),
      '\nTest RMSE :', round(rmse_test))

Train RMSE : 22124 
Test RMSE : 58590


In [18]:
# Hyperparameter Tuning
# Grid Search
g_param = {'n_estimators':range(100, 1000, 100),
           'max_features':['auto', 'log2']}
g_search= GridSearchCV(RandomForestRegressor(),
                       g_param, cv=5,
                       return_train_score=True)
g_search.fit(ms_X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_features': ['auto', 'log2'],
                         'n_estimators': range(100, 1000, 100)},
             return_train_score=True)

In [19]:
print('Best Parameter :', g_search.best_params_)
print('Best Score :',round(g_search.best_score_,4))
print('Test Score :',round(g_search.score(ms_x_test, y_test),4))

Best Parameter : {'max_features': 'log2', 'n_estimators': 800}
Best Score : 0.6266
Test Score : 0.6389


In [20]:
# Randomized Search
r_param={'n_estimators':randint(low=100, high=1000),
         'max_features':['auto','log2']}
r_search=RandomizedSearchCV(RandomForestRegressor(),
                            param_distributions=r_param,
                            cv=5, n_iter=20,
                            return_train_score=True)
r_search.fit(ms_X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=20,
                   param_distributions={'max_features': ['auto', 'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f1ec06992d0>},
                   return_train_score=True)

In [21]:
print('Best Parameter :', r_search.best_params_)
print('Best Score :',round(r_search.best_score_,4))
print('Test Score :',round(r_search.score(ms_x_test, y_test),4))

Best Parameter : {'max_features': 'log2', 'n_estimators': 853}
Best Score : 0.6269
Test Score : 0.6384
