In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../data/dataSet.csv', header=None)
df

Unnamed: 0,0,1,2,3,4,5
0,2.602562,2.563770,37.098059,185.083765,M,Bad
1,3.057403,5.110418,74.813240,190.168347,M,Bad
2,2.571436,2.770216,60.483945,192.010416,M,Bad
3,3.703299,6.709837,47.163448,191.049732,M,Bad
4,4.334830,3.814075,66.392621,184.219977,M,Bad
...,...,...,...,...,...,...
9020,2.445309,1.606003,1.208863,177.634456,M,Turtle
9021,4.492581,0.550904,3.183047,178.549713,M,Turtle
9022,2.882963,0.556252,1.638212,179.546651,M,Turtle
9023,2.480577,0.545658,4.890720,178.549713,M,Turtle


In [3]:
df.columns=['shoulder_angle_lst', 'ear_angle_lst', 'turtle_dis_lst', 'face_dis_lst', 'gender', 'class']
df

Unnamed: 0,shoulder_angle_lst,ear_angle_lst,turtle_dis_lst,face_dis_lst,gender,class
0,2.602562,2.563770,37.098059,185.083765,M,Bad
1,3.057403,5.110418,74.813240,190.168347,M,Bad
2,2.571436,2.770216,60.483945,192.010416,M,Bad
3,3.703299,6.709837,47.163448,191.049732,M,Bad
4,4.334830,3.814075,66.392621,184.219977,M,Bad
...,...,...,...,...,...,...
9020,2.445309,1.606003,1.208863,177.634456,M,Turtle
9021,4.492581,0.550904,3.183047,178.549713,M,Turtle
9022,2.882963,0.556252,1.638212,179.546651,M,Turtle
9023,2.480577,0.545658,4.890720,178.549713,M,Turtle


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9025 entries, 0 to 9024
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   shoulder_angle_lst  9025 non-null   float64
 1   ear_angle_lst       9025 non-null   float64
 2   turtle_dis_lst      9025 non-null   float64
 3   face_dis_lst        9025 non-null   float64
 4   gender              9025 non-null   object 
 5   class               9025 non-null   object 
dtypes: float64(4), object(2)
memory usage: 423.2+ KB


In [5]:
X = df.iloc[:, :-1]
y = df['class']
X.info(), y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9025 entries, 0 to 9024
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   shoulder_angle_lst  9025 non-null   float64
 1   ear_angle_lst       9025 non-null   float64
 2   turtle_dis_lst      9025 non-null   float64
 3   face_dis_lst        9025 non-null   float64
 4   gender              9025 non-null   object 
dtypes: float64(4), object(1)
memory usage: 352.7+ KB
<class 'pandas.core.series.Series'>
RangeIndex: 9025 entries, 0 to 9024
Series name: class
Non-Null Count  Dtype 
--------------  ----- 
9025 non-null   object
dtypes: object(1)
memory usage: 70.6+ KB


(None, None)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6317, 5), (2708, 5), (6317,), (2708,))

In [7]:
# one-hot incoding
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6317, 6), (2708, 6), (6317,), (2708,))

### 스케일링, 모델링

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
import pickle

In [11]:
pipelines = {
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier()),
    'sgd' : make_pipeline(StandardScaler(), SGDClassifier()),
    'ada' : make_pipeline(StandardScaler(), AdaBoostClassifier())
}

In [12]:
fit_model = {}
for name, pipeline in pipelines.items():
    model = pipeline.fit(X_train, y_train)
    fit_model[name] = model

In [13]:
rf_pred = fit_model['rf'].predict(X_test)
sgd_pred = fit_model['sgd'].predict(X_test)
ada_pred = fit_model['ada'].predict(X_test)

In [14]:
print('RandomForest\n', classification_report(y_test, rf_pred), '\n', accuracy_score(y_test, rf_pred))

RandomForest
               precision    recall  f1-score   support

         Bad       0.94      0.90      0.92       934
        Good       0.95      0.97      0.96       656
      Turtle       0.93      0.95      0.94      1118

    accuracy                           0.94      2708
   macro avg       0.94      0.94      0.94      2708
weighted avg       0.94      0.94      0.94      2708
 
 0.9353766617429837


In [15]:
print('SGD\n', classification_report(y_test, sgd_pred), '\n', accuracy_score(y_test, sgd_pred))

SGD
               precision    recall  f1-score   support

         Bad       0.91      0.66      0.76       934
        Good       0.77      0.89      0.82       656
      Turtle       0.77      0.87      0.82      1118

    accuracy                           0.80      2708
   macro avg       0.82      0.81      0.80      2708
weighted avg       0.82      0.80      0.80      2708
 
 0.8028064992614475


In [16]:
print('Ada\n', classification_report(y_test, ada_pred), '\n', accuracy_score(y_test, ada_pred))

Ada
               precision    recall  f1-score   support

         Bad       0.86      0.79      0.82       934
        Good       0.89      0.93      0.91       656
      Turtle       0.83      0.87      0.85      1118

    accuracy                           0.86      2708
   macro avg       0.86      0.86      0.86      2708
weighted avg       0.86      0.86      0.85      2708
 
 0.8552437223042836


In [17]:
with open('../models/rf_model.pkl', 'wb') as f:
    pickle.dump(fit_model['rf'], f)


In [18]:
X_test

Unnamed: 0,shoulder_angle_lst,ear_angle_lst,turtle_dis_lst,face_dis_lst,gender_F,gender_M
7557,4.222230,1.005086,109.406958,162.111073,False,True
784,18.949788,31.607502,44.546128,158.795466,False,True
3850,0.926536,6.632515,79.156765,210.736803,False,True
6325,1.444126,7.037941,18.540328,119.519873,False,True
6724,1.027617,3.764035,33.591009,141.509717,False,True
...,...,...,...,...,...,...
3498,0.612766,2.108095,42.222184,231.311478,False,True
5730,1.959927,1.070824,48.173323,166.075284,False,True
6663,0.815543,4.044486,36.718700,167.191507,False,True
2674,1.468801,5.088260,21.864608,224.439301,True,False


In [19]:
rf_pred

array(['Bad', 'Bad', 'Good', ..., 'Turtle', 'Turtle', 'Good'],
      dtype=object)