# [DACON] 손동작 분류 경진대회

## 모듈 불러오기

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# metrics로 accuracy를 사용
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# stacking model에 사용할 알고리즘
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

## 데이터 불러오기

In [None]:
# Data directory (looks like a Colab Google Drive mount path).
# The trailing slash matters: file names are appended to it with `+` when reading/writing CSVs.
path = "/content/drive/MyDrive/Colab Notebooks/dacon/손동작 분류 경진대회/hand_gesture_data/"

In [None]:
# Load the training set, the test set and the submission template from the
# data directory, in that order.
train, test, sample_submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Show (rows, columns) of the training and test frames, one per line.
print(train.shape, test.shape, sep="\n")

(2335, 34)
(9343, 33)


## 데이터 스케일링

In [None]:
# Separate the label from the features. The `id` column is dropped from both
# sets; `target` is dropped from the training frame and kept as the label.
train_y = train['target']
train_x = train.drop(columns=['id', 'target'])
test_x = test.drop(columns=['id'])

# Per-column minimum and maximum of the training features (used for scaling).
mins, maxs = train_x.min(), train_x.max()
mins.head()  # preview the first five column minima

sensor_1    -94.746969
sensor_2    -63.942094
sensor_3   -122.195138
sensor_4   -111.870691
sensor_5    -94.147972
dtype: float64

데이터 내 칼럼별로 최솟값, 최댓값을 추출했습니다. 데이터들을 스케일링 하기 위한 목적입니다.

In [None]:
# Min-max scale both feature sets with the training-set statistics:
# (value - column min) / (column max - column min).
train_x = train_x.sub(mins).div(maxs - mins)
test_x = test_x.sub(mins).div(maxs - mins)

# Sanity check: every training column should now span exactly [0, 1].
train_x.describe().loc[['min', 'max']].T

Unnamed: 0,min,max
sensor_1,0.0,1.0
sensor_2,0.0,1.0
sensor_3,0.0,1.0
sensor_4,0.0,1.0
sensor_5,0.0,1.0
sensor_6,0.0,1.0
sensor_7,0.0,1.0
sensor_8,0.0,1.0
sensor_9,0.0,1.0
sensor_10,0.0,1.0


(데이터 - 최솟값) / (최댓값 - 최솟값) 연산을 거치면 학습 데이터의 값들이 모두 0~1 사이의 값을 가지게 됩니다. (테스트 데이터는 학습 데이터의 최솟값·최댓값으로 변환하므로 이 범위를 조금 벗어날 수 있습니다.)

이러한 최소-최대 정규화(min-max normalization)처럼 입력값의 스케일을 맞추는 것은 딥러닝뿐 아니라 KNN, 로지스틱 회귀와 같이 피처 스케일에 민감한 모델에서도 상당히 중요합니다.

In [None]:
# Preview the first rows of the scaled training features.
train_x.head()

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,...,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27,sensor_28,sensor_29,sensor_30,sensor_31,sensor_32
0,0.541473,0.606731,0.526447,0.490227,0.497738,0.671732,0.464596,0.447091,0.561398,0.548431,...,0.404892,0.473689,0.592187,0.55188,0.492702,0.588748,0.557137,0.625632,0.46574,0.488933
1,0.565373,0.606021,0.510563,0.472168,0.509495,0.641768,0.588175,0.412336,0.484649,0.462025,...,0.502381,0.493417,0.579928,0.571626,0.500264,0.53834,0.657252,0.66456,0.474202,0.471883
2,0.695714,0.595535,0.493912,0.420966,0.563662,0.838724,0.3756,0.381509,0.465033,0.441859,...,0.456094,0.474226,0.974169,0.551168,0.51503,0.515998,0.26947,0.637364,0.457429,0.518783
3,0.56554,0.597091,0.577908,0.508696,0.512573,0.698242,0.340187,0.452233,0.516345,0.526889,...,0.498393,0.492909,0.623802,0.608209,0.505761,0.483252,0.550562,0.646261,0.43758,0.528218
4,0.603221,0.644232,0.345542,0.527245,0.660879,0.774011,0.371651,0.398969,0.388327,0.462653,...,0.404961,0.493201,0.699825,0.606327,0.405131,0.567022,0.597186,0.709483,0.538139,0.422955


In [None]:
# Preview the first rows of the scaled test features.
test_x.head()

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,...,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27,sensor_28,sensor_29,sensor_30,sensor_31,sensor_32
0,0.579466,0.664216,0.478221,0.462479,0.503306,0.566129,0.47703,0.543749,0.574315,0.504881,...,0.470875,0.524279,0.642406,0.566189,0.465995,0.510944,0.519182,0.710178,0.482063,0.588771
1,0.621169,0.500035,0.589914,0.705113,0.487918,0.726033,0.503889,0.468561,0.548885,0.431895,...,0.396176,0.39493,0.522646,0.516345,0.462824,0.519457,0.642173,0.647421,0.716899,0.772796
2,0.549661,0.624955,0.48252,0.4942,0.492335,0.679272,0.526176,0.411687,0.561481,0.513985,...,0.428931,0.469933,0.630224,0.613736,0.483554,0.437953,0.543224,0.610437,0.501635,0.527867
3,0.579167,0.577235,0.538253,0.630389,0.502305,0.613327,0.478447,0.455935,0.529974,0.504574,...,0.386216,0.512341,0.636133,0.628702,0.640278,0.482561,0.571459,0.673724,0.389071,0.510093
4,0.531131,0.595174,0.521691,0.481178,0.667155,0.62261,0.420499,0.385399,0.156312,0.411781,...,0.42379,0.40734,0.587512,0.59894,0.513942,0.502221,0.944224,0.700688,0.417758,0.511256


## 데이터 분리

In [None]:
# Hold out 20% of the labelled data for validation (fixed seed for a
# repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=5,
)
# The training labels are converted to a flat NumPy array.
y_train = y_train.to_numpy().ravel()

# Report the shape of every split, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(1868, 32)
(467, 32)
(1868,)
(467,)


## 모델 정의하기

In [None]:
# Base models (one estimator instance per individual learner).
# Every randomized estimator gets a fixed seed, matching the seeded split and
# random forest, so that re-running the notebook reproduces the same fits.
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=30)
dt_clf = DecisionTreeClassifier(random_state=30)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=30)

# Meta model (learns from the stacked base-model predictions and makes the
# final prediction).
lr_final = LogisticRegression(C=10)

## 학습하기
stacking 기법 활용

In [None]:
# Fit each base model on the training split.
# The last call is left as the final statement, so its return value (the
# fitted AdaBoost estimator) is what the notebook cell displays.
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=100)

In [None]:
# Predict the hold-out split with every base model (KNN, RF, DT, AdaBoost).
knn_pred, rf_pred, dt_pred, ada_pred = (
    base_clf.predict(X_test)
    for base_clf in (knn_clf, rf_clf, dt_clf, ada_clf)
)

# Report the hold-out accuracy of each base model.
print('KNN 정확도 :', accuracy_score(y_test, knn_pred))
print('RF 정확도 :', accuracy_score(y_test, rf_pred))
print('DT 정확도 :', accuracy_score(y_test, dt_pred))
print('ADA부스트 정확도 :', accuracy_score(y_test, ada_pred))

KNN 정확도 : 0.5203426124197003
RF 정확도 : 0.7794432548179872
DT 정확도 : 0.5417558886509636
ADA부스트 정확도 : 0.6638115631691649


In [None]:
# Stack the base-model predictions row-wise: one row per model, one column
# per hold-out sample.
stacked_pred = np.stack((knn_pred, rf_pred, dt_pred, ada_pred), axis=0)
print(stacked_pred.shape)

(4, 467)


In [None]:
# Swap rows and columns so that each sample is a row and each base model's
# prediction becomes a feature column.
stacked_pred = stacked_pred.T
print(stacked_pred.shape)

(467, 4)


In [None]:
# Train the meta model on out-of-fold base-model predictions for the training
# split, then score it on the hold-out split.
# Fitting and scoring the meta model on the very same hold-out predictions
# would report training accuracy rather than generalization accuracy.
from sklearn.model_selection import cross_val_predict

# One column per base model, in the same order as the columns of
# `stacked_pred` (KNN, RF, DT, AdaBoost). cross_val_predict fits clones of
# the estimators, so the already-fitted base models are left untouched.
oof_pred = np.column_stack([
    cross_val_predict(base_clf, X_train, y_train, cv=5)
    for base_clf in (knn_clf, rf_clf, dt_clf, ada_clf)
])

lr_final.fit(oof_pred, y_train)
final_pred = lr_final.predict(stacked_pred)

print('최종 메타 모델 정확도 : ',accuracy_score(y_test, final_pred))

최종 메타 모델 정확도 :  0.576017130620985


## 예측하기

In [None]:
# Random forest model: predict labels for the scaled competition test features.
rf_predict = rf_clf.predict(test_x)
rf_predict

array([0, 0, 1, ..., 2, 0, 3])

In [None]:
# Work on a copy so the loaded submission template stays unmodified.
rf_submit = sample_submission.copy()

In [None]:
# Set the `target` column to the random-forest predictions and preview the result.
rf_submit["target"] = rf_predict
rf_submit.head()

Unnamed: 0,id,target
0,1,0
1,2,0
2,3,1
3,4,3
4,5,2


In [None]:
# Write the random-forest submission into the data directory, without the index column.
rf_submit.to_csv(path+"rf_submission.csv", index=False)

In [None]:
# Stacking model.
# The meta model was fitted on one column per base-model prediction, so the
# raw sensor features cannot be passed to it directly (doing so raises a
# ValueError for the mismatched feature count). Build the stacked input
# first, with the columns in the same order used for training:
# KNN, RF, DT, AdaBoost.
stacked_test = np.column_stack([
    knn_clf.predict(test_x),
    rf_clf.predict(test_x),
    dt_clf.predict(test_x),
    ada_clf.predict(test_x),
])
lr_predict = lr_final.predict(stacked_test)

# Fill a copy of the submission template and write it into the data directory.
lr_submit = sample_submission.copy()
lr_submit["target"] = lr_predict

lr_submit.to_csv(path+"lr_submission.csv", index=False)

ValueError: ignored

## 참고
* https://dacon.io/competitions/official/235876/codeshare/4656?page=1&dtype=recent