# [DACON] 손동작 분류 경진대회

## 모듈 불러오기

In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# metrics로 accuracy를 사용
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# stacking model에 사용할 알고리즘
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

## 데이터 불러오기

In [2]:
# Directory that holds the CSV files. The trailing slash is required because
# file names are appended to this string with plain `+` concatenation.
path = "/content/drive/MyDrive/Colab Notebooks/dacon/손동작 분류 경진대회/hand_gesture_data/"

In [3]:
# Load the train, test and sample-submission CSV files found under `path`.
train, test, sample_submission = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Print (rows, columns) for the train table and then the test table.
for frame in (train, test):
  print(frame.shape)

(2335, 34)
(9343, 33)


## 데이터 스케일링

In [5]:
# Separate the label from the features; the 'id' column is removed from both
# feature tables.
train_y = train['target']
train_x = train.drop(columns=['id', 'target'])
test_x = test.drop(columns=['id'])

# Per-column minimum and maximum of the training features.
mins, maxs = train_x.min(), train_x.max()
mins.head(5)

sensor_1    -94.746969
sensor_2    -63.942094
sensor_3   -122.195138
sensor_4   -111.870691
sensor_5    -94.147972
dtype: float64

데이터 내 칼럼별로 최솟값, 최댓값을 추출했습니다. 데이터들을 스케일링 하기 위한 목적입니다.

In [None]:
# Min-max scale both feature tables using the training-set minimum and range,
# so the test set is transformed with exactly the same per-column parameters.
# NOTE(review): a column whose max equals its min would give a zero range and
# produce NaN/inf here - confirm no sensor column is constant.
value_range = maxs - mins
train_x = train_x.sub(mins).div(value_range)
test_x = test_x.sub(mins).div(value_range)

# Show the post-scaling min and max of every training column.
train_x.describe().loc[['min', 'max']].T

(데이터 - 최솟값) / (최댓값 - 최솟값) 연산을 거치면 학습 데이터의 값들이 모두 0~1 사이의 값을 가지게 됩니다. 테스트 데이터는 학습 데이터의 최솟값·최댓값으로 변환하므로 이 범위를 조금 벗어날 수 있습니다.

딥러닝뿐 아니라 KNN처럼 거리 기반으로 동작하는 모델에서도 입력값의 스케일을 맞추는 것(정규화)이 상당히 중요합니다.

In [13]:
# Preview the first rows of the training feature table.
train_x.head()

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,...,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27,sensor_28,sensor_29,sensor_30,sensor_31,sensor_32
0,0.541473,0.606731,0.526447,0.490227,0.497738,0.671732,0.464596,0.447091,0.561398,0.548431,...,0.404892,0.473689,0.592187,0.55188,0.492702,0.588748,0.557137,0.625632,0.46574,0.488933
1,0.565373,0.606021,0.510563,0.472168,0.509495,0.641768,0.588175,0.412336,0.484649,0.462025,...,0.502381,0.493417,0.579928,0.571626,0.500264,0.53834,0.657252,0.66456,0.474202,0.471883
2,0.695714,0.595535,0.493912,0.420966,0.563662,0.838724,0.3756,0.381509,0.465033,0.441859,...,0.456094,0.474226,0.974169,0.551168,0.51503,0.515998,0.26947,0.637364,0.457429,0.518783
3,0.56554,0.597091,0.577908,0.508696,0.512573,0.698242,0.340187,0.452233,0.516345,0.526889,...,0.498393,0.492909,0.623802,0.608209,0.505761,0.483252,0.550562,0.646261,0.43758,0.528218
4,0.603221,0.644232,0.345542,0.527245,0.660879,0.774011,0.371651,0.398969,0.388327,0.462653,...,0.404961,0.493201,0.699825,0.606327,0.405131,0.567022,0.597186,0.709483,0.538139,0.422955


In [None]:
# Preview the first rows of the test feature table.
test_x.head()

## 모델 정의하기

In [18]:
# 기반 모델(개별 ML 모델 객체 생성)
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=30)
dt_clf = DecisionTreeClassifier()
ada_clf = AdaBoostClassifier(n_estimators=100)

# 메타 모델(스태킹으로 만들어진 데이터 학습 및 예측)
lr_final = LogisticRegression(C=10)

## 학습하기
k-fold 기법 활용

In [22]:
# 5-fold cross-validation of the random forest on the training features.
# NOTE(review): KFold is used with its defaults, i.e. folds follow row order
# and are not stratified by class - consider shuffling if rows are ordered.
kf = KFold(n_splits=5)
cv_accuracy = []

for n_iter, (train_index, test_index) in enumerate(kf.split(train_x), start=1):
  # Positional slices of features and labels for this fold.
  X_train, y_train = train_x.iloc[train_index], train_y.iloc[train_index]
  X_test, y_test = train_x.iloc[test_index], train_y.iloc[test_index]

  # Fit on the training part, then score on the held-out part.
  rf_pred = rf_clf.fit(X_train, y_train).predict(X_test)
  accuracy = accuracy_score(y_test, rf_pred)
  cv_accuracy.append(accuracy)

  # Report this fold's accuracy together with the split sizes.
  print('\n {} 교차 검증 정확도 : {} , 학습 데이터 크기 : {} , 검증 데이터 크기 : {} '.format(n_iter, accuracy, len(X_train), len(X_test)))

# Mean accuracy over all folds.
print("\n")
print("\n 평균검증 정확도 : ", np.mean(cv_accuracy))


 1 교차 검증 정확도 : 0.7537473233404711 , 학습 데이터 크기 : 1868 , 검증 데이터 크기 : 467 

 2 교차 검증 정확도 : 0.7623126338329764 , 학습 데이터 크기 : 1868 , 검증 데이터 크기 : 467 

 3 교차 검증 정확도 : 0.7708779443254818 , 학습 데이터 크기 : 1868 , 검증 데이터 크기 : 467 

 4 교차 검증 정확도 : 0.7451820128479657 , 학습 데이터 크기 : 1868 , 검증 데이터 크기 : 467 

 5 교차 검증 정확도 : 0.7773019271948608 , 학습 데이터 크기 : 1868 , 검증 데이터 크기 : 467 



 평균검증 정확도 :  0.7618843683083512


In [None]:
# 개별 모델 학습
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=100)

In [None]:
# 기반 모델 예측 세트와 정확도 확인
knn_pred = knn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)

print('KNN 정확도 :',accuracy_score(y_test, knn_pred))
print('RF 정확도 :',accuracy_score(y_test, rf_pred))
print('DT 정확도 :',accuracy_score(y_test, dt_pred))
print('ADA부스트 정확도 :',accuracy_score(y_test, ada_pred))

KNN 정확도 : 0.5203426124197003
RF 정확도 : 0.7794432548179872
DT 정확도 : 0.5417558886509636
ADA부스트 정확도 : 0.6638115631691649


## 예측하기

In [23]:
# 랜덤 포레스트 모델
rf_predict = rf_clf.predict(test_x)
rf_predict

array([0, 0, 1, ..., 2, 0, 3])

In [24]:
# Work on a copy so the loaded sample_submission frame stays unchanged.
rf_submit = sample_submission.copy()

In [25]:
# Store the random-forest predictions in the 'target' column and preview the result.
rf_submit["target"] = rf_predict
rf_submit.head()

Unnamed: 0,id,target
0,1,0
1,2,0
2,3,1
3,4,3
4,5,2


In [26]:
# Write the submission into the data directory; index=False leaves the
# DataFrame index out of the file.
rf_submit.to_csv(path+"rf_submission.csv", index=False)

## 참고
* https://continuous-development.tistory.com/166