# 결정 트리 실습

사용자 행동 인식 데이터 세트

[원본 데이터셋]
https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones

데이터 세트 정보:

실험은 19-48세 사이의 30명의 지원자 그룹으로 수행되었습니다. 각자 스마트폰(Samsung Galaxy S II)을 허리에 차고 6가지 활동(
1:WALKING, 
2:WALKING_UPSTAIRS, 
3:WALKING_DOWNSTAIRS, 4:SITTING, 5:STANDING, 6:LAYING)을 수행했습니다. 내장된 가속도계와 자이로스코프를 사용하여 50Hz의 일정한 속도로 3축 선형 가속도와 3축 각속도를 캡처했습니다. 실험은 데이터에 수동으로 레이블을 지정하기 위해 비디오로 녹화되었습니다. 얻은 데이터 세트는 훈련 데이터 생성을 위해 70%, 테스트 데이터 생성을 위해 30%가 선택된 두 세트로 무작위로 분할되었습니다.

센서 신호(가속도계 및 자이로스코프)는 노이즈 필터를 적용하여 사전 처리된 다음 2.56초 및 50% 중첩(128 판독/창)의 고정 너비 슬라이딩 창에서 샘플링되었습니다. 중력 및 신체 운동 성분을 갖는 센서 가속도 신호는 Butterworth 저역 통과 필터를 사용하여 신체 가속도와 중력으로 분리되었습니다. 중력은 저주파 성분만 있다고 가정하므로 차단 주파수가 0.3Hz인 필터를 사용했습니다. 각 창에서 시간 및 주파수 영역에서 변수를 계산하여 특징 벡터를 얻었습니다.

이 데이터 세트에 대한 자세한 내용은 README.txt 파일을 확인하십시오.

참가자 중 한 명과 함께 녹화된 6가지 활동의 예를 포함하는 실험 비디오는 다음 링크에서 볼 수 있습니다.

https://www.youtube.com/watch?v=XOEN9W05_4A]


**Mission**

수집된 데이터를 기반으로 결정트리를 이용해 어떤 행동을 하고 있는지 예측해 봅시다.

In [None]:
# ## 코랩을 사용할 때
# #드라이브 마운트
# from google.colab import drive
# drive.mount('/content/drive')

# #현재 작업 위치 이동
# #띄어쓰기에 \붙일 것
# %cd /content/drive/Othercomputers/내\ 노트북_before/Devpy/13.머신러닝

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## 데이터 로드

1. 피처만 있는 파일을 로드
2. 훈련데이터와 피처를 합쳐 훈련 데이터 프레임 만듬
3. 데스트데이터와 피처를 합쳐 테스트 데이터 프레임 만듬

In [2]:
# 피처이름 로드 
feature_name_df = pd.read_csv('./dataset/human_activity/features_new.txt', header=None,names=['column_name'])
feature_name_df #561개. 피처이름만 -> 데이터프레임의 컬럼명이 되어야함

Unnamed: 0,column_name
0,tBodyAcc-mean()-X
1,tBodyAcc-mean()-Y
2,tBodyAcc-mean()-Z
3,tBodyAcc-std()-X
4,tBodyAcc-std()-Y
...,...
556,"angle(tBodyGyroMean,gravityMean)"
557,"angle(tBodyGyroJerkMean,gravityMean)"
558,"angle(X,gravityMean)"
559,"angle(Y,gravityMean)"


In [3]:
# 피처이름을 리스트로
feature_name = feature_name_df['column_name'].values.tolist()
feature_name

['tBodyAcc-mean()-X',
 'tBodyAcc-mean()-Y',
 'tBodyAcc-mean()-Z',
 'tBodyAcc-std()-X',
 'tBodyAcc-std()-Y',
 'tBodyAcc-std()-Z',
 'tBodyAcc-mad()-X',
 'tBodyAcc-mad()-Y',
 'tBodyAcc-mad()-Z',
 'tBodyAcc-max()-X',
 'tBodyAcc-max()-Y',
 'tBodyAcc-max()-Z',
 'tBodyAcc-min()-X',
 'tBodyAcc-min()-Y',
 'tBodyAcc-min()-Z',
 'tBodyAcc-sma()',
 'tBodyAcc-energy()-X',
 'tBodyAcc-energy()-Y',
 'tBodyAcc-energy()-Z',
 'tBodyAcc-iqr()-X',
 'tBodyAcc-iqr()-Y',
 'tBodyAcc-iqr()-Z',
 'tBodyAcc-entropy()-X',
 'tBodyAcc-entropy()-Y',
 'tBodyAcc-entropy()-Z',
 'tBodyAcc-arCoeff()-X,1',
 'tBodyAcc-arCoeff()-X,2',
 'tBodyAcc-arCoeff()-X,3',
 'tBodyAcc-arCoeff()-X,4',
 'tBodyAcc-arCoeff()-Y,1',
 'tBodyAcc-arCoeff()-Y,2',
 'tBodyAcc-arCoeff()-Y,3',
 'tBodyAcc-arCoeff()-Y,4',
 'tBodyAcc-arCoeff()-Z,1',
 'tBodyAcc-arCoeff()-Z,2',
 'tBodyAcc-arCoeff()-Z,3',
 'tBodyAcc-arCoeff()-Z,4',
 'tBodyAcc-correlation()-X,Y',
 'tBodyAcc-correlation()-X,Z',
 'tBodyAcc-correlation()-Y,Z',
 'tGravityAcc-mean()-X',
 'tGravityA

### 데이터셋

In [18]:

X_test_df1 = pd.read_csv('./dataset/human_activity/test/X_test.txt',header=None,names=feature_name) # 테스트용 #2947 rows × 561 columns
X_test_df1
# 첫번째 컬럼에 모든 데이터가 들어감 -> 띄어쓰기를 기준으로 다음 칸으로 가야하니 split응용해볼 것

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,2.5717778e-001 -2.3285230e-002 -1.4653762e-0...,,,,,,,,,,...,,,,,,,,,,
1,2.8602671e-001 -1.3163359e-002 -1.1908252e-0...,,,,,,,,,,...,,,,,,,,,,
2,2.7548482e-001 -2.6050420e-002 -1.1815167e-0...,,,,,,,,,,...,,,,,,,,,,
3,2.7029822e-001 -3.2613869e-002 -1.1752018e-0...,,,,,,,,,,...,,,,,,,,,,
4,2.7483295e-001 -2.7847788e-002 -1.2952716e-0...,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2942,3.1015462e-001 -5.3391250e-002 -9.9108716e-0...,,,,,,,,,,...,,,,,,,,,,
2943,3.6338465e-001 -3.9214016e-002 -1.0591509e-0...,,,,,,,,,,...,,,,,,,,,,
2944,3.4996609e-001 3.0077442e-002 -1.1578796e-0...,,,,,,,,,,...,,,,,,,,,,
2945,2.3759383e-001 1.8466870e-002 -9.6498932e-0...,,,,,,,,,,...,,,,,,,,,,


In [19]:
type(X_test_df1['tBodyAcc-mean()-X'])

pandas.core.series.Series

In [20]:
X_test_df1['tBodyAcc-mean()-X'][0]

'  2.5717778e-001 -2.3285230e-002 -1.4653762e-002 -9.3840400e-001 -9.2009078e-001 -6.6768331e-001 -9.5250112e-001 -9.2524867e-001 -6.7430222e-001 -8.9408755e-001 -5.5457721e-001 -4.6622295e-001  7.1720847e-001  6.3550240e-001  7.8949666e-001 -8.7776423e-001 -9.9776606e-001 -9.9841381e-001 -9.3434525e-001 -9.7566897e-001 -9.4982365e-001 -8.3047780e-001 -1.6808416e-001 -3.7899553e-001  2.4621698e-001  5.2120364e-001 -4.8779311e-001  4.8228047e-001 -4.5462113e-002  2.1195505e-001 -1.3489443e-001  1.3085848e-001 -1.4176313e-002 -1.0597085e-001  7.3544013e-002 -1.7151642e-001  4.0062978e-002  7.6988933e-002 -4.9054573e-001 -7.0900265e-001  9.3648925e-001 -2.8271916e-001  1.1528825e-001 -9.2542727e-001 -9.3701413e-001 -5.6428842e-001 -9.3001992e-001 -9.3782195e-001 -6.0558770e-001  9.0608259e-001 -2.7924413e-001  1.5289519e-001  9.4446140e-001 -2.6215956e-001 -7.6161676e-002 -1.7826920e-002  8.2929682e-001 -8.6462060e-001 -9.6779531e-001 -9.4972666e-001 -9.4611920e-001 -7.5971815e-001 -4.249

In [21]:
X_test_df1['tBodyAcc-mean()-X'][0].split()

['2.5717778e-001',
 '-2.3285230e-002',
 '-1.4653762e-002',
 '-9.3840400e-001',
 '-9.2009078e-001',
 '-6.6768331e-001',
 '-9.5250112e-001',
 '-9.2524867e-001',
 '-6.7430222e-001',
 '-8.9408755e-001',
 '-5.5457721e-001',
 '-4.6622295e-001',
 '7.1720847e-001',
 '6.3550240e-001',
 '7.8949666e-001',
 '-8.7776423e-001',
 '-9.9776606e-001',
 '-9.9841381e-001',
 '-9.3434525e-001',
 '-9.7566897e-001',
 '-9.4982365e-001',
 '-8.3047780e-001',
 '-1.6808416e-001',
 '-3.7899553e-001',
 '2.4621698e-001',
 '5.2120364e-001',
 '-4.8779311e-001',
 '4.8228047e-001',
 '-4.5462113e-002',
 '2.1195505e-001',
 '-1.3489443e-001',
 '1.3085848e-001',
 '-1.4176313e-002',
 '-1.0597085e-001',
 '7.3544013e-002',
 '-1.7151642e-001',
 '4.0062978e-002',
 '7.6988933e-002',
 '-4.9054573e-001',
 '-7.0900265e-001',
 '9.3648925e-001',
 '-2.8271916e-001',
 '1.1528825e-001',
 '-9.2542727e-001',
 '-9.3701413e-001',
 '-5.6428842e-001',
 '-9.3001992e-001',
 '-9.3782195e-001',
 '-6.0558770e-001',
 '9.0608259e-001',
 '-2.7924413e-0

In [22]:
import numpy as np
x = np.array(X_test_df1['tBodyAcc-mean()-X'][0].split())
x

array(['2.5717778e-001', '-2.3285230e-002', '-1.4653762e-002',
       '-9.3840400e-001', '-9.2009078e-001', '-6.6768331e-001',
       '-9.5250112e-001', '-9.2524867e-001', '-6.7430222e-001',
       '-8.9408755e-001', '-5.5457721e-001', '-4.6622295e-001',
       '7.1720847e-001', '6.3550240e-001', '7.8949666e-001',
       '-8.7776423e-001', '-9.9776606e-001', '-9.9841381e-001',
       '-9.3434525e-001', '-9.7566897e-001', '-9.4982365e-001',
       '-8.3047780e-001', '-1.6808416e-001', '-3.7899553e-001',
       '2.4621698e-001', '5.2120364e-001', '-4.8779311e-001',
       '4.8228047e-001', '-4.5462113e-002', '2.1195505e-001',
       '-1.3489443e-001', '1.3085848e-001', '-1.4176313e-002',
       '-1.0597085e-001', '7.3544013e-002', '-1.7151642e-001',
       '4.0062978e-002', '7.6988933e-002', '-4.9054573e-001',
       '-7.0900265e-001', '9.3648925e-001', '-2.8271916e-001',
       '1.1528825e-001', '-9.2542727e-001', '-9.3701413e-001',
       '-5.6428842e-001', '-9.3001992e-001', '-9.37821

In [23]:
X_test_df1['tBodyAcc-mean()-X'][560].split()

['2.7454548e-001',
 '1.0493163e-002',
 '-1.4317331e-001',
 '-3.6553942e-001',
 '-1.0841591e-001',
 '-6.1563483e-001',
 '-3.8040627e-001',
 '-1.1764577e-001',
 '-6.0930513e-001',
 '-1.9371192e-001',
 '1.3402426e-002',
 '-5.6738071e-001',
 '3.7532070e-001',
 '2.4486604e-001',
 '6.3054848e-001',
 '-2.9007984e-001',
 '-7.9739135e-001',
 '-8.4447824e-001',
 '-9.2855280e-001',
 '-3.9704234e-001',
 '-3.3849529e-001',
 '-6.2726682e-001',
 '3.8273298e-001',
 '3.9374440e-001',
 '-1.5050823e-001',
 '-4.9013668e-001',
 '5.8860731e-001',
 '-5.1772400e-001',
 '3.0004139e-001',
 '-1.8632082e-001',
 '1.5976058e-001',
 '6.1536244e-002',
 '1.3801873e-001',
 '-2.4874812e-001',
 '1.9510160e-001',
 '-1.0113565e-001',
 '1.8198168e-002',
 '7.6693031e-002',
 '-7.6398068e-002',
 '-9.4939023e-002',
 '9.7397732e-001',
 '-6.3024324e-002',
 '1.1338760e-001',
 '-9.7595296e-001',
 '-9.3198363e-001',
 '-9.6477019e-001',
 '-9.7511019e-001',
 '-9.3738541e-001',
 '-9.6242941e-001',
 '9.0668741e-001',
 '-5.3458955e-002',

In [24]:
X_test_data_list =[]
for i in range(561) :
    x = X_test_df1['tBodyAcc-mean()-X'][i].split()
    X_test_data_list.append(x)

X_test_data =  np.array(X_test_data_list)

In [25]:
X_test_data

array([['2.5717778e-001', '-2.3285230e-002', '-1.4653762e-002', ...,
        '-7.2000927e-001', '2.7680104e-001', '-5.7978304e-002'],
       ['2.8602671e-001', '-1.3163359e-002', '-1.1908252e-001', ...,
        '-6.9809082e-001', '2.8134292e-001', '-8.3898014e-002'],
       ['2.7548482e-001', '-2.6050420e-002', '-1.1815167e-001', ...,
        '-7.0277146e-001', '2.8008303e-001', '-7.9346197e-002'],
       ...,
       ['2.6841076e-001', '2.0047468e-002', '-1.9595794e-001', ...,
        '-8.9839095e-001', '1.2516693e-001', '-5.0890161e-002'],
       ['3.0029053e-001', '-5.5739891e-002', '-1.3905246e-001', ...,
        '-8.9016893e-001', '1.1299863e-001', '-6.1195312e-002'],
       ['2.7454548e-001', '1.0493163e-002', '-1.4317331e-001', ...,
        '-8.8938747e-001', '1.2682593e-001', '-5.7172255e-002']],
      dtype='<U15')

In [26]:
# 학습(X_train) 피처 데이터 셋과 테스트(X_test) 피처 데이터을 DataFrame으로 로딩. 컬럼명은 feature_name 적용
X_test_df = pd.DataFrame(data=X_test_data, columns=feature_name)

In [27]:
X_test_df

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,2.5717778e-001,-2.3285230e-002,-1.4653762e-002,-9.3840400e-001,-9.2009078e-001,-6.6768331e-001,-9.5250112e-001,-9.2524867e-001,-6.7430222e-001,-8.9408755e-001,...,7.1645446e-002,-3.3037044e-001,-7.0597388e-001,6.4624029e-003,1.6291982e-001,-8.2588562e-001,2.7115145e-001,-7.2000927e-001,2.7680104e-001,-5.7978304e-002
1,2.8602671e-001,-1.3163359e-002,-1.1908252e-001,-9.7541469e-001,-9.6745790e-001,-9.4495817e-001,-9.8679880e-001,-9.6840133e-001,-9.4582340e-001,-8.9408755e-001,...,-4.0118872e-001,-1.2184509e-001,-5.9494387e-001,-8.3494968e-002,1.7499572e-002,-4.3437455e-001,9.2059323e-001,-6.9809082e-001,2.8134292e-001,-8.3898014e-002
2,2.7548482e-001,-2.6050420e-002,-1.1815167e-001,-9.9381904e-001,-9.6992551e-001,-9.6274798e-001,-9.9440345e-001,-9.7073498e-001,-9.6348267e-001,-9.3926027e-001,...,6.2891313e-002,-1.9042189e-001,-6.4073573e-001,-3.4956250e-002,2.0230203e-001,6.4103354e-002,1.4506843e-001,-7.0277146e-001,2.8008303e-001,-7.9346197e-002
3,2.7029822e-001,-3.2613869e-002,-1.1752018e-001,-9.9474279e-001,-9.7326761e-001,-9.6709068e-001,-9.9527433e-001,-9.7447101e-001,-9.6889736e-001,-9.3860975e-001,...,1.1669529e-001,-3.4441804e-001,-7.3612380e-001,-1.7067021e-002,1.5443783e-001,3.4013408e-001,2.9640709e-001,-6.9895383e-001,2.8411379e-001,-7.7108002e-002
4,2.7483295e-001,-2.7847788e-002,-1.2952716e-001,-9.9385248e-001,-9.6744548e-001,-9.7829499e-001,-9.9411140e-001,-9.6595259e-001,-9.7734600e-001,-9.3860975e-001,...,-1.2171128e-001,-5.3468487e-001,-8.4659517e-001,-2.2226521e-003,-4.0046393e-002,7.3671509e-001,-1.1854473e-001,-6.9224496e-001,2.9072202e-001,-7.3856810e-002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,3.0344228e-001,-2.2841831e-002,-1.3996845e-001,-3.8373339e-001,-3.3969521e-002,-6.3071670e-001,-3.9208541e-001,-5.8326636e-003,-6.3413964e-001,-3.5431646e-001,...,2.6258041e-001,-2.8190306e-001,-5.6819631e-001,-2.2625916e-001,-2.6998847e-001,-7.3409439e-001,7.6796734e-001,-8.8900900e-001,1.2149650e-001,-5.9582326e-002
557,2.9192158e-001,-1.0980288e-002,-1.1394106e-001,-4.0129463e-001,-7.2742596e-002,-6.4613001e-001,-4.2599808e-001,-8.0388989e-002,-6.2908100e-001,-2.3866725e-001,...,4.4684767e-001,-5.0706398e-002,-3.1641981e-001,-3.6485957e-001,4.9343299e-001,6.9080429e-001,6.7589673e-001,-8.8890006e-001,1.2782088e-001,-5.7105091e-002
558,2.6841076e-001,2.0047468e-002,-1.9595794e-001,-5.2335240e-001,-1.6378318e-001,-5.8627242e-001,-5.6398056e-001,-1.9904573e-001,-5.7146197e-001,-2.3866725e-001,...,-1.3191085e-001,-3.8967932e-001,-7.6430068e-001,1.0591469e-001,-2.4404317e-001,9.9453228e-001,6.9252847e-001,-8.9839095e-001,1.2516693e-001,-5.0890161e-002
559,3.0029053e-001,-5.5739891e-002,-1.3905246e-001,-4.4272372e-001,2.2990934e-002,-6.0842339e-001,-4.7712633e-001,1.9864571e-003,-6.0065467e-001,-1.9371192e-001,...,1.3324050e-001,-2.2610151e-001,-5.4878465e-001,-1.0621677e-001,-6.6290391e-001,-9.5887822e-001,6.9142459e-001,-8.9016893e-001,1.1299863e-001,-6.1195312e-002


In [42]:
X_test_df

array([['2.5717778e-001', '-2.3285230e-002', '-1.4653762e-002', ...,
        '-7.2000927e-001', '2.7680104e-001', '-5.7978304e-002'],
       ['2.8602671e-001', '-1.3163359e-002', '-1.1908252e-001', ...,
        '-6.9809082e-001', '2.8134292e-001', '-8.3898014e-002'],
       ['2.7548482e-001', '-2.6050420e-002', '-1.1815167e-001', ...,
        '-7.0277146e-001', '2.8008303e-001', '-7.9346197e-002'],
       ...,
       ['2.6841076e-001', '2.0047468e-002', '-1.9595794e-001', ...,
        '-8.9839095e-001', '1.2516693e-001', '-5.0890161e-002'],
       ['3.0029053e-001', '-5.5739891e-002', '-1.3905246e-001', ...,
        '-8.9016893e-001', '1.1299863e-001', '-6.1195312e-002'],
       ['2.7454548e-001', '1.0493163e-002', '-1.4317331e-001', ...,
        '-8.8938747e-001', '1.2682593e-001', '-5.7172255e-002']],
      dtype='<U15')

In [44]:
X_testDf = pd.DataFrame(data=X_test_df, columns=feature_name)
X_testDf

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,2.5717778e-001,-2.3285230e-002,-1.4653762e-002,-9.3840400e-001,-9.2009078e-001,-6.6768331e-001,-9.5250112e-001,-9.2524867e-001,-6.7430222e-001,-8.9408755e-001,...,7.1645446e-002,-3.3037044e-001,-7.0597388e-001,6.4624029e-003,1.6291982e-001,-8.2588562e-001,2.7115145e-001,-7.2000927e-001,2.7680104e-001,-5.7978304e-002
1,2.8602671e-001,-1.3163359e-002,-1.1908252e-001,-9.7541469e-001,-9.6745790e-001,-9.4495817e-001,-9.8679880e-001,-9.6840133e-001,-9.4582340e-001,-8.9408755e-001,...,-4.0118872e-001,-1.2184509e-001,-5.9494387e-001,-8.3494968e-002,1.7499572e-002,-4.3437455e-001,9.2059323e-001,-6.9809082e-001,2.8134292e-001,-8.3898014e-002
2,2.7548482e-001,-2.6050420e-002,-1.1815167e-001,-9.9381904e-001,-9.6992551e-001,-9.6274798e-001,-9.9440345e-001,-9.7073498e-001,-9.6348267e-001,-9.3926027e-001,...,6.2891313e-002,-1.9042189e-001,-6.4073573e-001,-3.4956250e-002,2.0230203e-001,6.4103354e-002,1.4506843e-001,-7.0277146e-001,2.8008303e-001,-7.9346197e-002
3,2.7029822e-001,-3.2613869e-002,-1.1752018e-001,-9.9474279e-001,-9.7326761e-001,-9.6709068e-001,-9.9527433e-001,-9.7447101e-001,-9.6889736e-001,-9.3860975e-001,...,1.1669529e-001,-3.4441804e-001,-7.3612380e-001,-1.7067021e-002,1.5443783e-001,3.4013408e-001,2.9640709e-001,-6.9895383e-001,2.8411379e-001,-7.7108002e-002
4,2.7483295e-001,-2.7847788e-002,-1.2952716e-001,-9.9385248e-001,-9.6744548e-001,-9.7829499e-001,-9.9411140e-001,-9.6595259e-001,-9.7734600e-001,-9.3860975e-001,...,-1.2171128e-001,-5.3468487e-001,-8.4659517e-001,-2.2226521e-003,-4.0046393e-002,7.3671509e-001,-1.1854473e-001,-6.9224496e-001,2.9072202e-001,-7.3856810e-002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,3.0344228e-001,-2.2841831e-002,-1.3996845e-001,-3.8373339e-001,-3.3969521e-002,-6.3071670e-001,-3.9208541e-001,-5.8326636e-003,-6.3413964e-001,-3.5431646e-001,...,2.6258041e-001,-2.8190306e-001,-5.6819631e-001,-2.2625916e-001,-2.6998847e-001,-7.3409439e-001,7.6796734e-001,-8.8900900e-001,1.2149650e-001,-5.9582326e-002
557,2.9192158e-001,-1.0980288e-002,-1.1394106e-001,-4.0129463e-001,-7.2742596e-002,-6.4613001e-001,-4.2599808e-001,-8.0388989e-002,-6.2908100e-001,-2.3866725e-001,...,4.4684767e-001,-5.0706398e-002,-3.1641981e-001,-3.6485957e-001,4.9343299e-001,6.9080429e-001,6.7589673e-001,-8.8890006e-001,1.2782088e-001,-5.7105091e-002
558,2.6841076e-001,2.0047468e-002,-1.9595794e-001,-5.2335240e-001,-1.6378318e-001,-5.8627242e-001,-5.6398056e-001,-1.9904573e-001,-5.7146197e-001,-2.3866725e-001,...,-1.3191085e-001,-3.8967932e-001,-7.6430068e-001,1.0591469e-001,-2.4404317e-001,9.9453228e-001,6.9252847e-001,-8.9839095e-001,1.2516693e-001,-5.0890161e-002
559,3.0029053e-001,-5.5739891e-002,-1.3905246e-001,-4.4272372e-001,2.2990934e-002,-6.0842339e-001,-4.7712633e-001,1.9864571e-003,-6.0065467e-001,-1.9371192e-001,...,1.3324050e-001,-2.2610151e-001,-5.4878465e-001,-1.0621677e-001,-6.6290391e-001,-9.5887822e-001,6.9142459e-001,-8.9016893e-001,1.1299863e-001,-6.1195312e-002


In [35]:
X_train_df1 = pd.read_csv('./dataset/human_activity/train/X_train.txt',header=None,names=feature_name) 
X_train_df1

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,2.8858451e-001 -2.0294171e-002 -1.3290514e-0...,,,,,,,,,,...,,,,,,,,,,
1,2.7841883e-001 -1.6410568e-002 -1.2352019e-0...,,,,,,,,,,...,,,,,,,,,,
2,2.7965306e-001 -1.9467156e-002 -1.1346169e-0...,,,,,,,,,,...,,,,,,,,,,
3,2.7917394e-001 -2.6200646e-002 -1.2328257e-0...,,,,,,,,,,...,,,,,,,,,,
4,2.7662877e-001 -1.6569655e-002 -1.1536185e-0...,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,2.9966534e-001 -5.7193414e-002 -1.8123302e-0...,,,,,,,,,,...,,,,,,,,,,
7348,2.7385271e-001 -7.7493259e-003 -1.4746837e-0...,,,,,,,,,,...,,,,,,,,,,
7349,2.7338737e-001 -1.7010616e-002 -4.5021828e-0...,,,,,,,,,,...,,,,,,,,,,
7350,2.8965416e-001 -1.8843044e-002 -1.5828059e-0...,,,,,,,,,,...,,,,,,,,,,


In [37]:
X_train_df1['tBodyAcc-mean()-X'].size

7352

In [38]:
# 학습용 데이터 (train용)
X_train_df1 = pd.read_csv('./dataset/human_activity/train/X_train.txt',header=None,names=feature_name) 

X_train_data_list =[]
for i in range(7352) :
    x = X_train_df1['tBodyAcc-mean()-X'][i].split()
    X_train_data_list.append(x)

X_train_data =  np.array(X_train_data_list)

X_train_df = pd.DataFrame(data=X_train_data, columns=feature_name)

In [39]:
X_train_df

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,2.8858451e-001,-2.0294171e-002,-1.3290514e-001,-9.9527860e-001,-9.8311061e-001,-9.1352645e-001,-9.9511208e-001,-9.8318457e-001,-9.2352702e-001,-9.3472378e-001,...,-7.4323027e-002,-2.9867637e-001,-7.1030407e-001,-1.1275434e-001,3.0400372e-002,-4.6476139e-001,-1.8445884e-002,-8.4124676e-001,1.7994061e-001,-5.8626924e-002
1,2.7841883e-001,-1.6410568e-002,-1.2352019e-001,-9.9824528e-001,-9.7530022e-001,-9.6032199e-001,-9.9880719e-001,-9.7491437e-001,-9.5768622e-001,-9.4306751e-001,...,1.5807454e-001,-5.9505094e-001,-8.6149931e-001,5.3476955e-002,-7.4345661e-003,-7.3262621e-001,7.0351059e-001,-8.4478760e-001,1.8028889e-001,-5.4316717e-002
2,2.7965306e-001,-1.9467156e-002,-1.1346169e-001,-9.9537956e-001,-9.6718701e-001,-9.7894396e-001,-9.9651994e-001,-9.6366837e-001,-9.7746859e-001,-9.3869155e-001,...,4.1450281e-001,-3.9074815e-001,-7.6010372e-001,-1.1855926e-001,1.7789948e-001,1.0069921e-001,8.0852908e-001,-8.4893347e-001,1.8063731e-001,-4.9117815e-002
3,2.7917394e-001,-2.6200646e-002,-1.2328257e-001,-9.9609149e-001,-9.8340270e-001,-9.9067510e-001,-9.9709947e-001,-9.8274984e-001,-9.8930250e-001,-9.3869155e-001,...,4.0457253e-001,-1.1729020e-001,-4.8284451e-001,-3.6787973e-002,-1.2892494e-002,6.4001104e-001,-4.8536645e-001,-8.4864938e-001,1.8193476e-001,-4.7663183e-002
4,2.7662877e-001,-1.6569655e-002,-1.1536185e-001,-9.9813862e-001,-9.8081727e-001,-9.9048163e-001,-9.9832113e-001,-9.7967187e-001,-9.9044113e-001,-9.4246912e-001,...,8.7753013e-002,-3.5147093e-001,-6.9920515e-001,1.2332005e-001,1.2254196e-001,6.9357829e-001,-6.1597061e-001,-8.4786525e-001,1.8515116e-001,-4.3892254e-002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,2.9966534e-001,-5.7193414e-002,-1.8123302e-001,-1.9538652e-001,3.9904850e-002,7.7078081e-002,-2.8230064e-001,4.3615631e-002,6.0410083e-002,2.1079544e-001,...,-7.0156695e-002,-5.8843274e-001,-8.8032443e-001,-1.9043686e-001,8.2971842e-001,2.0697215e-001,-4.2561858e-001,-7.9188305e-001,2.3860439e-001,4.9819139e-002
7348,2.7385271e-001,-7.7493259e-003,-1.4746837e-001,-2.3530853e-001,4.8162805e-003,5.9279994e-002,-3.2255234e-001,-2.9456250e-002,8.0585116e-002,1.1744028e-001,...,1.6525919e-001,-3.9073832e-001,-6.8074445e-001,6.4906712e-002,8.7567905e-001,-8.7903279e-001,4.0021936e-001,-7.7183960e-001,2.5267595e-001,5.0052558e-002
7349,2.7338737e-001,-1.7010616e-002,-4.5021828e-002,-2.1821818e-001,-1.0382198e-001,2.7453270e-001,-3.0451515e-001,-9.8913034e-002,3.3258449e-001,4.3998772e-002,...,1.9503401e-001,2.5145333e-002,-3.0402936e-001,5.2805928e-002,-2.6672437e-001,8.6440401e-001,7.0116882e-001,-7.7913261e-001,2.4914484e-001,4.0811188e-002
7350,2.8965416e-001,-1.8843044e-002,-1.5828059e-001,-2.1913944e-001,-1.1141169e-001,2.6889320e-001,-3.1048749e-001,-6.8200325e-002,3.1947326e-001,1.0170184e-001,...,1.3865423e-002,6.3906883e-002,-3.4431361e-001,-1.0136012e-001,7.0073969e-001,9.3667394e-001,-5.8947895e-001,-7.8518142e-001,2.4643223e-001,2.5339478e-002


In [None]:
# 학습 레이블과 테스트 레이블 데이터을 DataFrame으로 로딩하고 컬럼명은 action으로 부여

In [40]:
y_train_df = pd.read_csv('./dataset/human_activity/train/y_train.txt', header=None,names=['action'])
y_train_df

Unnamed: 0,action
0,5
1,5
2,5
3,5
4,5
...,...
7347,2
7348,2
7349,2
7350,2


In [63]:
y_test_df = pd.read_csv('./dataset/human_activity/test/y_test.txt', header=None,names=['action'])
y_test_df

Unnamed: 0,action
0,5
1,5
2,5
3,5
4,5
...,...
2942,2
2943,2
2944,2
2945,2


In [42]:
#action 피처의 class별 갯수
y_train_df['action'].value_counts()

6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: action, dtype: int64

In [43]:
y_test_df['action'].value_counts()

6    537
5    532
1    496
4    491
2    471
3    420
Name: action, dtype: int64

In [None]:
# 두 데이터의 경우를 합쳐서 카운트 하는 부분...

In [64]:
type(X_train_data), type(X_train_df), type(y_train_df), type(y_test_df) # Series가 되어야함

(numpy.ndarray,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [65]:
y_test_df = y_test_df.squeeze() # 데이터프레임을 시리즈로 변환
type(y_test_df)

pandas.core.series.Series

In [66]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# DesisionTree분류 클래스 사용(randomstate 156)
dt_clf = DecisionTreeClassifier(random_state=156)

# 훈련
dt_clf.fit(X_train_df , y_train_df)


# 예측
pred = dt_clf.predict(X_test_df)


#평가
'예측 정확도: {0:.4f}'.format(accuracy_score(y_test_df, pred)) # y_test_df 에서 에러

ValueError: Found input variables with inconsistent numbers of samples: [2947, 561]

In [67]:
# DecisionTreeClassifier의 하이퍼 파라미터 추출
print('DecisionTreeClassifier 기본 하이퍼 파라미터:\n', dt_clf.get_params())

DecisionTreeClassifier 기본 하이퍼 파라미터:
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 156, 'splitter': 'best'}


## 하이퍼파라미터 튜닝1

In [68]:
from sklearn.model_selection import GridSearchCV

#max_depth를 바꿔가며 훈련해보자
#나무의 깊이를 얼마만큼 성장시킬것인가

params = {
    'max_depth' : [ 6, 8 ,10, 12, 16 ,20, 24]
}


In [None]:
# GridSearchCV 사용

import pandas as pd

# param_grid의 하이퍼 파라미터들을 3개의 train, test set fold 로 나누어서 테스트 수행 설정.  
# refit=True 가 default 임. True이면 가장 좋은 파라미터 설정으로 재 학습 시킴.  
grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True) # 알고리즘, 파라미터, 검증횟수

# 붓꽃 Train 데이터로 param_grid의 하이퍼 파라미터들을 순차적으로 학습/평가 .
grid_dtree.fit(X_train_df, y_train_df)

# GridSearchCV 결과 추출하여 DataFrame으로 변환
scores_df = pd.DataFrame(grid_dtree.cv_results_)


# 훈련



In [None]:
# GridSearchCV객체의 cv_results_ 속성을 DataFrame으로 생성. 


In [None]:
#베스트 분류기 선택



#예측



#평가


```
    하이퍼파라미터 튜닝으로 성능이 향상 되었나요?
```

## 하이퍼파라미터 튜닝2

In [None]:
#하이퍼파라미터
params = {
    'max_depth' : [ 8 , 12, 16 ,20], 
    'min_samples_split' : [16,24],
}


In [None]:
# GridSearchCV 사용


# 훈련



In [None]:
#베스트 분류기 선택



#예측



#평가


```
    하이퍼파라미터 튜닝으로 성능이 향상 되었나요?
```

---

**[생활탐구]**


```
방문을 꼭 닫은채 공부하고 있다는 보검이...
정말 보검이는 공부하고 있을까요?
생체 신호 측정기를 착용한 보검이로 부터 아래와 같은 신호를 가져왔습니다. 
지금 현재 보검이는 무엇을 하고 있나요?

[보검이 생체 신호 파일]X_quiz.txt 

```

In [None]:
# 561개의 생체신호로 측정된 보검이의 생체신호 파일을 강사님께 받을것.

In [None]:
#신호 읽어서 예측해보기
