# 랜덤 포레스트(Random Forest)

In [1]:
import pandas as pd
# Load the feature-name table: whitespace-separated rows of (column index, feature name).
# The separator is a raw string so the regex `\s+` is not parsed as an
# (invalid) Python string escape.
feature_name_df = pd.read_csv(
    '../00.data/UCI_HAR_Dataset/features.txt',
    sep=r'\s+',
    header=None,
    names=['col_index', 'col_name'],
)

In [2]:
def get_new_feature_name_df(old_df):
    """Return a copy of a feature-name table with duplicate names made unique.

    The first occurrence of a name is kept unchanged; every later occurrence
    gets ``_<n>`` appended, where ``n`` is the number of earlier rows with the
    same name (``_1``, ``_2``, ...).

    Args:
        old_df: DataFrame with a string ``col_name`` column. Its index is
            discarded.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, containing the columns of
        ``old_df`` (with ``col_name`` de-duplicated) followed by a ``dup_cnt``
        column holding the per-name occurrence counter. ``old_df`` itself is
        not modified.
    """
    new_df = old_df.reset_index(drop=True)
    # Count duplicates on the table that was passed in, not on the
    # module-level feature_name_df, so the function works for any input.
    dup_cnt = new_df.groupby('col_name').cumcount()
    new_df['dup_cnt'] = dup_cnt
    # Vectorized suffixing: rows with dup_cnt == 0 get an empty suffix. This
    # also avoids integer lookups (x[0], x[1]) on a label-indexed row.
    suffix = ('_' + dup_cnt.astype(str)).where(dup_cnt > 0, '')
    new_df['col_name'] = new_df['col_name'] + suffix
    return new_df

In [3]:
# De-duplicate the feature names and keep them as a plain list; the list is
# used as the column names when the feature matrices are read.
new_feature_df = get_new_feature_name_df(feature_name_df)
feature_list = new_feature_df['col_name'].tolist()

In [4]:
# Read the train/test feature matrices (columns named from feature_list) and
# the matching label files (single column named 'action'). All files are
# whitespace-separated and have no header row. The separator is written as a
# raw string so `\s+` is not parsed as an invalid Python string escape.
X_train = pd.read_csv('../00.data/UCI_HAR_Dataset/train/X_train.txt',
                      header=None, sep=r'\s+', names=feature_list)
X_test = pd.read_csv('../00.data/UCI_HAR_Dataset/test/X_test.txt',
                     header=None, sep=r'\s+', names=feature_list)
y_train = pd.read_csv('../00.data/UCI_HAR_Dataset/train/y_train.txt',
                      header=None, sep=r'\s+', names=['action'])
y_test = pd.read_csv('../00.data/UCI_HAR_Dataset/test/y_test.txt',
                     header=None, sep=r'\s+', names=['action'])
# --------------------------------------------------------------- end of data preparation

## 랜덤 포레스트 모델 생성/학습/예측/평가

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline random forest built with the library's default hyperparameters.
rf_clf = RandomForestClassifier()
# Pass the label column as a 1-D Series: giving fit() the single-column
# DataFrame makes scikit-learn emit a column-vector DataConversionWarning.
rf_clf.fit(X_train, y_train['action'])
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {acc:.4f}')
# Note: the decision tree in the 03_ex notebook scored 0.8717; an earlier run of
# this random forest reached 0.9220. No random_state is set here, so the score
# varies from run to run.
# Open question from the original notes: is using many models generally better?

랜덤 포레스트 정확도: 0.9274


### 최적 파라미터 찾기 

In [6]:
# Show the hyperparameters of the current rf_clf estimator (it was constructed
# without arguments, so these are the library defaults).
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random-forest search: every combination of the
# values listed below is evaluated.
params = {
    'n_estimators': [10, 30, 50, 100, 200],
    'max_depth': [8, 12, 16],
    'min_samples_split': [12, 16, 20],
}

In [8]:
# n_jobs=-1 asks for all available CPU cores to speed things up (the original
# note says the search still takes a while).
rf_clf = RandomForestClassifier(random_state=156, n_jobs=-1)
# 3-fold cross-validated search over the `params` grid.
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1)
# Pass the label column as a 1-D Series to avoid scikit-learn's
# column-vector DataConversionWarning on every fit.
grid_cv.fit(X_train, y_train['action'])
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9146
최적 파라미터: {'max_depth': 8, 'min_samples_split': 12, 'n_estimators': 50}


In [9]:
# Tabulate the search results, keeping only the searched parameters and the
# mean cross-validation score of each combination.
summary_columns = [
    'param_n_estimators',
    'param_max_depth',
    'param_min_samples_split',
    'mean_test_score',
]
df = pd.DataFrame(grid_cv.cv_results_)[summary_columns]
df

Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,mean_test_score
0,10,8,12,0.89785
1,30,8,12,0.908324
2,50,8,12,0.914581
3,100,8,12,0.911861
4,200,8,12,0.911861
5,10,8,16,0.897443
6,30,8,16,0.903155
7,50,8,16,0.907781
8,100,8,16,0.909414
9,200,8,16,0.908325


- 튜닝된 파라미터로 재평가

In [10]:
# Score the best estimator found by the grid search on the held-out test set.
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9141


### 재탐색

In [11]:
# Second search: vary only the number of trees and leave every other
# hyperparameter at its default.
params = {
    'n_estimators': [10, 50, 100]
}
rf_clf = RandomForestClassifier(random_state=156, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1)
# Pass the label column as a 1-D Series to avoid scikit-learn's
# column-vector DataConversionWarning on every fit.
grid_cv.fit(X_train, y_train['action'])
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9085
최적 파라미터: {'n_estimators': 50}


In [12]:
# Score the best estimator from the most recent grid search on the test set.
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9247


## K 최근접 이웃(K-Nearest Neighbor)

In [13]:
from sklearn.neighbors import KNeighborsClassifier
# Create a k-nearest-neighbors classifier with no arguments and display the
# hyperparameters it will use.
knn = KNeighborsClassifier()
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [14]:
# Original note: this takes a while to run even though KNN does no real training.
# Pass the label column as a 1-D Series: giving fit() the single-column
# DataFrame makes scikit-learn emit a column-vector DataConversionWarning.
knn.fit(X_train, y_train['action'])
pred = knn.predict(X_test)
accuracy_score(y_test, pred)
# Result: 0.9015

0.9015948422124194