In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw CSV without treating any row as a header, so the columns get
# default integer labels, then display the frame.
df = pd.read_csv(
    filepath_or_buffer='../data/dataSet.csv',
    header=None,
)
df

Unnamed: 0,0,1,2,3,4,5
0,2.602562,2.563770,37.098059,185.083765,M,Bad
1,3.057403,5.110418,74.813240,190.168347,M,Bad
2,2.571436,2.770216,60.483945,192.010416,M,Bad
3,3.703299,6.709837,47.163448,191.049732,M,Bad
4,4.334830,3.814075,66.392621,184.219977,M,Bad
...,...,...,...,...,...,...
9020,2.445309,1.606003,1.208863,177.634456,M,Turtle
9021,4.492581,0.550904,3.183047,178.549713,M,Turtle
9022,2.882963,0.556252,1.638212,179.546651,M,Turtle
9023,2.480577,0.545658,4.890720,178.549713,M,Turtle


In [3]:
# Replace the default integer column labels with descriptive names
# (four numeric measurements, then gender, then the target class).
df.columns = [
    'shoulder_angle_lst',
    'ear_angle_lst',
    'turtle_dis_lst',
    'face_dis_lst',
    'gender',
    'class',
]
df

Unnamed: 0,shoulder_angle_lst,ear_angle_lst,turtle_dis_lst,face_dis_lst,gender,class
0,2.602562,2.563770,37.098059,185.083765,M,Bad
1,3.057403,5.110418,74.813240,190.168347,M,Bad
2,2.571436,2.770216,60.483945,192.010416,M,Bad
3,3.703299,6.709837,47.163448,191.049732,M,Bad
4,4.334830,3.814075,66.392621,184.219977,M,Bad
...,...,...,...,...,...,...
9020,2.445309,1.606003,1.208863,177.634456,M,Turtle
9021,4.492581,0.550904,3.183047,178.549713,M,Turtle
9022,2.882963,0.556252,1.638212,179.546651,M,Turtle
9023,2.480577,0.545658,4.890720,178.549713,M,Turtle


In [4]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9025 entries, 0 to 9024
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   shoulder_angle_lst  9025 non-null   float64
 1   ear_angle_lst       9025 non-null   float64
 2   turtle_dis_lst      9025 non-null   float64
 3   face_dis_lst        9025 non-null   float64
 4   gender              9025 non-null   object 
 5   class               9025 non-null   object 
dtypes: float64(4), object(2)
memory usage: 423.2+ KB


In [5]:
# Target labels come from the 'class' column; the features are every column
# except the last one.
y = df.loc[:, 'class']
X = df.loc[:, df.columns[:-1]]
X.info(), y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9025 entries, 0 to 9024
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   shoulder_angle_lst  9025 non-null   float64
 1   ear_angle_lst       9025 non-null   float64
 2   turtle_dis_lst      9025 non-null   float64
 3   face_dis_lst        9025 non-null   float64
 4   gender              9025 non-null   object 
dtypes: float64(4), object(1)
memory usage: 352.7+ KB
<class 'pandas.core.series.Series'>
RangeIndex: 9025 entries, 0 to 9024
Series name: class
Non-Null Count  Dtype 
--------------  ----- 
9025 non-null   object
dtypes: object(1)
memory usage: 70.6+ KB


(None, None)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Echo the shape of each piece as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((6317, 5), (2708, 5), (6317,), (2708,))

In [7]:
# One-hot encoding of the categorical feature columns.
X_train = pd.get_dummies(X_train)
# Encoding each split independently can yield different (or differently
# ordered) dummy columns when a category is absent from one split, which would
# break or silently corrupt predict(). Align the test columns to the training
# columns: categories unseen in the test split become all-zero columns, and
# categories unseen in training are dropped.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6317, 6), (2708, 6), (6317,), (2708,))

### 스케일링, 모델링, 하이퍼파라미터 수정

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
import pickle
from sklearn.model_selection import GridSearchCV

In [11]:
# Base random forest used as the estimator for the grid search: seeded for
# repeatable results, with n_jobs=-1 to parallelize across all CPU cores.
model_rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=123,
)

In [30]:
# Hyperparameter grid: 5 depths x 1 split size x 3 tree counts
# x 3 feature fractions x 3 leaf sizes = 135 candidate settings.
params_rf = {
    'max_depth': range(18, 23),
    'min_samples_split': range(2, 3),
    'n_estimators': range(270, 281, 5),
    'max_features': [0.6, 0.7, 0.8],
    'min_samples_leaf': range(2, 5),
}

# Exhaustive 5-fold cross-validated search; verbose=2 logs every single fit.
grid_rf = GridSearchCV(
    estimator=model_rf,
    param_grid=params_rf,
    cv=5,
    verbose=2,
)
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV] END max_depth=18, max_features=0.6, min_samples_leaf=2, min_samples_split=2, n_estimators=270; total time=   0.7s
[CV] END max_depth=18, max_features=0.6, min_samples_leaf=2, min_samples_split=2, n_estimators=270; total time=   0.7s
[CV] END max_depth=18, max_features=0.6, min_samples_leaf=2, min_samples_split=2, n_estimators=270; total time=   0.8s
[CV] END max_depth=18, max_features=0.6, min_samples_leaf=2, min_samples_split=2, n_estimators=270; total time=   0.8s
[CV] END max_depth=18, max_features=0.6, min_samples_leaf=2, min_samples_split=2, n_estimators=270; total time=   0.8s
[CV] END max_depth=18, max_features=0.6, min_samples_leaf=2, min_samples_split=2, n_estimators=275; total time=   0.8s
[CV] END max_depth=18, max_features=0.6, min_samples_leaf=2, min_samples_split=2, n_estimators=275; total time=   0.8s
[CV] END max_depth=18, max_features=0.6, min_samples_leaf=2, min_samples_split=2, n_estimators=275; tota

In [31]:
# Report the winning hyperparameters and their mean cross-validated score.
print(grid_rf.best_params_)
print(grid_rf.best_score_)

# GridSearchCV was built with the default refit=True, so best_estimator_ has
# already been retrained on the whole of X_train with the best parameters.
# Fitting it again here would only repeat that work (and, with the fixed
# random_state, reproduce the same forest), so it is used as-is.
final_rf = grid_rf.best_estimator_

{'max_depth': 22, 'max_features': 0.7, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 280}
0.9236948394920672


In [32]:
# Score the tuned forest on the held-out test split.
rf_pred = final_rf.predict(X_test)
rf_report = classification_report(y_test, rf_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

# Per-class precision/recall/F1 table followed by the overall accuracy.
print('RandomForest\n', rf_report, '\n', rf_accuracy)

RandomForest
               precision    recall  f1-score   support

         Bad       0.94      0.90      0.92       934
        Good       0.95      0.96      0.96       656
      Turtle       0.93      0.95      0.94      1118

    accuracy                           0.94      2708
   macro avg       0.94      0.94      0.94      2708
weighted avg       0.94      0.94      0.94      2708
 
 0.9361152141802068


In [33]:
# Serialize the tuned forest to disk; the context manager closes the file
# handle even if pickling fails.
with open('../models/rf_model_opt4.pkl', 'wb') as model_file:
    pickle.dump(final_rf, model_file)


### opt2
{'max_depth': 25, 'max_features': 0.7, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 250}

0.9178381540836064

RandomForest

               precision    recall  f1-score   support

         Bad       0.94      0.90      0.92       934
        Good       0.94      0.96      0.95       656
      Turtle       0.93      0.94      0.93      1118

    accuracy                           0.93      2708
   macro avg       0.93      0.94      0.93      2708
weighted avg       0.93      0.93      0.93      2708
 
 0.9327917282127031

### opt3
{'max_depth': 21, 'max_features': 0.7, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 275}

0.9221119346141897

RandomForest

               precision    recall  f1-score   support

         Bad       0.94      0.90      0.92       934
        Good       0.94      0.96      0.95       656
      Turtle       0.93      0.95      0.94      1118

    accuracy                           0.94      2708
   macro avg       0.94      0.94      0.94      2708
weighted avg       0.94      0.94      0.94      2708
 
 0.9353766617429837

Test accuracy ranking: opt1 < opt2 (0.9328) < opt3 (0.9354) < opt4 (0.9361)