# 1. 파일 읽기

In [1]:
import pandas as pd

# Feature-name lookup table. Because `names=` is supplied, the first line of
# the file is read as data rather than as a header row.
filepath = 'data/features_dup.txt'
feature_table_columns = ['index', 'name']
df = pd.read_csv(filepath, names=feature_table_columns)
df.head()

Unnamed: 0,index,name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y


In [3]:
# All four split files are parsed with a whitespace separator and explicit
# column labels: the feature names from `df` for X, a single 'action' column for y.
feature_names = df['name']
label_columns = ['action']

X_train = pd.read_csv('data/train/X_train.txt', names=feature_names, sep=r'\s+')
X_test = pd.read_csv('data/test/X_test.txt', names=feature_names, sep=r'\s+')

y_train = pd.read_csv('data/train/y_train.txt', names=label_columns, sep=r'\s+')
y_test = pd.read_csv('data/test/y_test.txt', names=label_columns, sep=r'\s+')

In [4]:
# Peek at the first three training labels.
y_train.head(3)

Unnamed: 0,action
0,5
1,5
2,5


In [5]:
# Class balance check: number of training rows per action label.
y_train.value_counts()

action
6         1407
5         1374
4         1286
1         1226
2         1073
3          986
Name: count, dtype: int64

# 2. 모델 만들기

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: a random forest with default hyperparameters and a fixed seed,
# scored on the held-out test split.
rf = RandomForestClassifier(random_state=100)

train_labels = y_train.values.reshape(-1)  # single-column label frame -> 1-D array
rf.fit(X_train, train_labels)

pred = rf.predict(X_test)
baseline_accuracy = accuracy_score(y_test, pred)
print('정확도 :', baseline_accuracy)

정확도 : 0.9321343739395996


# 3. 튜닝

In [10]:
from sklearn.model_selection import GridSearchCV

def get_best_params(n_estimators_list, max_depth_list, min_samples_leaf_list, min_samples_split_list,
                    *, estimator=None, X=None, y=None, scoring='accuracy', cv=5):
    """Grid-search four random-forest hyperparameters and print the winner.

    Builds a parameter grid from the four candidate collections, runs
    ``GridSearchCV`` over every combination, and prints the best
    cross-validation score and the parameter combination that produced it.

    Parameters
    ----------
    n_estimators_list, max_depth_list, min_samples_leaf_list, min_samples_split_list : iterable
        Candidate values for ``n_estimators``, ``max_depth``,
        ``min_samples_leaf`` and ``min_samples_split`` respectively. Each is
        copied into a list, so ranges and other iterables are accepted.
    estimator : estimator object, optional (keyword-only)
        Estimator handed to ``GridSearchCV``. Defaults to the notebook-level
        ``rf``.
    X : array-like, optional (keyword-only)
        Training features. Defaults to the notebook-level ``X_train``.
    y : array-like, optional (keyword-only)
        Training labels. Defaults to the notebook-level ``y_train`` flattened
        to one dimension.
    scoring : str, optional (keyword-only)
        Forwarded to ``GridSearchCV``; defaults to ``'accuracy'``.
    cv : int, optional (keyword-only)
        Forwarded to ``GridSearchCV``; defaults to ``5``.

    Returns
    -------
    None
        Results are reported with ``print`` only.
    """
    # Fall back to the notebook-level objects so the existing four-argument
    # calls behave exactly as before.
    if estimator is None:
        estimator = rf
    if X is None:
        X = X_train
    if y is None:
        y = y_train.values.reshape(-1)  # single-column label frame -> 1-D array

    # Copy every candidate collection into a list so any iterable can be passed.
    params = {
        'n_estimators': list(n_estimators_list),
        'max_depth': list(max_depth_list),
        'min_samples_leaf': list(min_samples_leaf_list),
        'min_samples_split': list(min_samples_split_list),
    }

    # n_jobs=-1 lets the search run candidates in parallel.
    grid_cv = GridSearchCV(estimator, param_grid=params, scoring=scoring, cv=cv, n_jobs=-1)
    grid_cv.fit(X, y)

    print('Best Score :', grid_cv.best_score_)
    print('Best Hyper Parameters :', grid_cv.best_params_)

In [9]:
# Coarse search with 100 trees: 3 depths x 3 leaf sizes x 2 split sizes = 18 candidates.
# NOTE(review): `n_estimatos_list` looks like a typo for `n_estimators_list`;
# the name is kept unchanged in case other cells read it.
n_estimatos_list = [100]
max_depth_list = range(5, 16, 5)          # 5, 10, 15
min_samples_leaf_list = range(5, 16, 5)   # 5, 10, 15
min_samples_split_list = range(5, 15, 5)  # 5, 10

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.9236986269694827
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 100}


> 5분 소요

In [11]:
# Best values noted from the 100-tree search:
#   max_depth = 15, min_samples_leaf = 10, min_samples_split = 5
# Repeat the same coarse grid with 300 trees.
n_estimatos_list = [300]
max_depth_list = range(5, 16, 5)          # 5, 10, 15
min_samples_leaf_list = range(5, 16, 5)   # 5, 10, 15
min_samples_split_list = range(5, 15, 5)  # 5, 10

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.9249220993631987
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 300}


> 13분정도 걸림

In [12]:
# Single candidate: depth / leaf / split pinned at 15 / 10 / 5, forest grown to 500 trees.
n_estimatos_list = [500]
max_depth_list = [15]
min_samples_leaf_list = [10]
min_samples_split_list = [5]

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.9247865073969764
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 500}


>4분 소요

In [13]:
# Single candidate: depth / leaf / split pinned at 15 / 10 / 5, now with 400 trees.
n_estimatos_list = [400]
max_depth_list = [15]
min_samples_leaf_list = [10]
min_samples_split_list = [5]

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.925058338767186
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 400}


>3분 소요

In [14]:
# Sweep only the tree count around 400; depth / leaf / split stay at 15 / 10 / 5.
n_estimatos_list = range(350, 450, 20)  # 350, 370, 390, 410, 430
max_depth_list = [15]
min_samples_leaf_list = [10]
min_samples_split_list = [5]

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.925330632592942
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 430}


> 3분 소요

In [15]:
# Finer sweep of the tree count in steps of 10; depth / leaf / split stay at 15 / 10 / 5.
n_estimatos_list = range(410, 470, 10)  # 410, 420, 430, 440, 450, 460
max_depth_list = [15]
min_samples_leaf_list = [10]
min_samples_split_list = [5]

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.925330632592942
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 430}


In [16]:
# Tree count fixed at 430; sweep max_depth in steps of 5.
n_estimatos_list = [430]
max_depth_list = range(10, 30, 5)  # 10, 15, 20, 25
min_samples_leaf_list = [10]
min_samples_split_list = [5]

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.925330632592942
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 430}


In [17]:
# Tree count fixed at 430; sweep max_depth one unit at a time around 15.
n_estimatos_list = [430]
max_depth_list = range(13, 18)  # 13, 14, 15, 16, 17
min_samples_leaf_list = [10]
min_samples_split_list = [5]

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.925330632592942
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 430}


In [18]:
# Tree count 430 and max_depth 15 fixed; sweep min_samples_leaf around 10.
n_estimatos_list = [430]
max_depth_list = [15]
min_samples_leaf_list = range(8, 13)  # 8, 9, 10, 11, 12
min_samples_split_list = [5]

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.9257390733315759
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 9, 'min_samples_split': 5, 'n_estimators': 430}


In [19]:
# Tree count 430, max_depth 15 and min_samples_leaf 9 fixed; sweep min_samples_split.
n_estimatos_list = [430]
max_depth_list = [15]
min_samples_leaf_list = [9]
min_samples_split_list = range(2, 7)  # 2, 3, 4, 5, 6

get_best_params(
    n_estimators_list=n_estimatos_list,
    max_depth_list=max_depth_list,
    min_samples_leaf_list=min_samples_leaf_list,
    min_samples_split_list=min_samples_split_list,
)

Best Score : 0.9257390733315759
Best Hyper Parameters : {'max_depth': 15, 'min_samples_leaf': 9, 'min_samples_split': 2, 'n_estimators': 430}
