In [5]:
import os 
import shutil
import zipfile
import sklearn
import graphviz 
import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, recall_score, precision_recall_curve, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier, export_graphviz 


#### 1. Dataset
- 사용자 행동 인식 Dataset (UCI Human Activity Recognition dataset)
- https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones

In [20]:
def set_extract_zip(zip_name, extract_dir="E:\\workSpace_Dataset\\"):
    """Extract every member of a zip archive into ``extract_dir``.

    Parameters
    ----------
    zip_name : str or path-like
        Path of the zip archive to open.
    extract_dir : str or path-like, optional
        Directory the archive members are written to. Defaults to the
        workspace dataset folder this notebook uses.

    Returns
    -------
    None
    """
    # Named `archive` rather than `zip` so the built-in zip() is not shadowed.
    with zipfile.ZipFile(zip_name) as archive:
        archive.extractall(extract_dir)

In [21]:
# Unpack the UCI HAR archive into the directory that set_extract_zip targets.
set_extract_zip("E:\\workSpace_Dataset\\UCI HAR Dataset.zip")


In [22]:
# Unpack the same archive with shutil, this time into the "UCI" sub-folder
# that the data-loading cells further down read from.
file_name = "E:\\workSpace_Dataset\\UCI HAR Dataset.zip"
EXTRACT_DIR = "E:\\workSpace_Dataset\\UCI"
ARCHIVE_FORMAT = "zip"
# The constant was originally misspelled; the old name is kept as an alias
# in case any other cell still refers to it.
ACRHIVE_FORMAT = ARCHIVE_FORMAT

shutil.unpack_archive(file_name, EXTRACT_DIR, ARCHIVE_FORMAT)

In [37]:
# Read the whitespace-separated feature list as (column_index, column_name) rows.
# The separator is a raw string: "\s" in a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df_feature_name = pd.read_csv("E:\\workSpace_Dataset\\UCI\\UCI HAR Dataset\\features.txt", sep=r"\s+", header=None, names=['column_index','column_name'])

In [38]:
# Summarize the feature-name table: row count, per-column non-null counts and dtypes.
df_feature_name.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561 entries, 0 to 560
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   column_index  561 non-null    int64 
 1   column_name   561 non-null    object
dtypes: int64(1), object(1)
memory usage: 8.9+ KB


In [39]:
# Plain Python list of the feature names (second column of the table),
# followed by a preview of the first ten entries.
lst_feature_name = df_feature_name["column_name"].tolist()
lst_feature_name[:10]

['tBodyAcc-mean()-X',
 'tBodyAcc-mean()-Y',
 'tBodyAcc-mean()-Z',
 'tBodyAcc-std()-X',
 'tBodyAcc-std()-Y',
 'tBodyAcc-std()-Z',
 'tBodyAcc-mad()-X',
 'tBodyAcc-mad()-Y',
 'tBodyAcc-mad()-Z',
 'tBodyAcc-max()-X']

In [40]:
# Count how many rows share each feature name; a count above 1 means the
# name appears more than once in features.txt.
df_feature_duplicate = df_feature_name.groupby("column_name").count()
df_feature_duplicate

Unnamed: 0_level_0,column_index
column_name,Unnamed: 1_level_1
"angle(X,gravityMean)",1
"angle(Y,gravityMean)",1
"angle(Z,gravityMean)",1
"angle(tBodyAccJerkMean),gravityMean)",1
"angle(tBodyAccMean,gravity)",1
...,...
tGravityAccMag-max(),1
tGravityAccMag-mean(),1
tGravityAccMag-min(),1
tGravityAccMag-sma(),1


In [41]:
# Rows per feature name, then restrict to names that occur more than once:
# print how many such names exist and preview the first ten of them.
df_feature_duplicate = df_feature_name.groupby('column_name').count()
print(df_feature_duplicate.loc[lambda counts: counts['column_index'] > 1].count())
df_feature_duplicate.loc[lambda counts: counts['column_index'] > 1].head(10)

column_index    42
dtype: int64


Unnamed: 0_level_0,column_index
column_name,Unnamed: 1_level_1
"fBodyAcc-bandsEnergy()-1,16",3
"fBodyAcc-bandsEnergy()-1,24",3
"fBodyAcc-bandsEnergy()-1,8",3
"fBodyAcc-bandsEnergy()-17,24",3
"fBodyAcc-bandsEnergy()-17,32",3
"fBodyAcc-bandsEnergy()-25,32",3
"fBodyAcc-bandsEnergy()-25,48",3
"fBodyAcc-bandsEnergy()-33,40",3
"fBodyAcc-bandsEnergy()-33,48",3
"fBodyAcc-bandsEnergy()-41,48",3


In [45]:
# Toy frame illustrating groupby(...).cumcount(): every row receives its
# 0-based position among the rows that share its group key, in order of
# appearance.
df_example = pd.DataFrame({"A": ["A", "A", "A", "B", "B", "A"]})
display(df_example)
df_example.groupby(by="A").cumcount()

Unnamed: 0,A
0,A
1,A
2,A
3,B
4,B
5,A


0    0
1    1
2    2
3    0
4    1
5    3
dtype: int64

#### Resolving duplicate feature names in the dataset

In [55]:
def get_new_feature_name(df_old_feature_name):
    """Make repeated feature names unique by appending an occurrence suffix.

    The first occurrence of a name is left unchanged; the k-th repeat
    (k >= 1) is renamed to ``"<name>_<k>"``.

    Parameters
    ----------
    df_old_feature_name : pandas.DataFrame
        Table with a string ``column_name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with a fresh 0..n-1 index, ``column_name``
        rewritten as described above, and an extra integer
        ``duplicate_cnt`` column holding how many earlier rows carried the
        same original name.
    """
    # Work on a re-indexed copy so the caller's frame stays untouched and the
    # result does not depend on whatever index the input happened to have.
    df_new_feature = df_old_feature_name.reset_index(drop=True)

    # 0 for the first occurrence of a name, 1 for the second, and so on.
    duplicate_cnt = df_new_feature.groupby("column_name").cumcount()
    df_new_feature["duplicate_cnt"] = duplicate_cnt

    # Vectorized rename: only rows with duplicate_cnt > 0 receive the suffix.
    # This avoids row-wise apply and positional x[0]/x[1] lookups on a
    # label-indexed Series, which newer pandas versions deprecate.
    suffixed_name = df_new_feature["column_name"] + "_" + duplicate_cnt.astype(str)
    df_new_feature["column_name"] = df_new_feature["column_name"].mask(
        duplicate_cnt > 0, suffixed_name
    )

    return df_new_feature

In [62]:
def get_human_dataset(data_dir="E:\\workSpace_Dataset\\UCI\\UCI HAR Dataset"):
    """Load the UCI HAR train/test feature matrices and activity labels.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``features.txt`` and the ``train`` / ``test``
        sub-folders. Defaults to the location this notebook unpacks the
        archive to.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The X frames use the
        de-duplicated feature names as column labels; the y frames have a
        single ``action`` column.
    """
    def _read_table(*path_parts, names):
        # All files are header-less and whitespace-separated. The separator
        # is a raw string because "\s" is an invalid escape in a plain literal.
        return pd.read_csv(
            os.path.join(data_dir, *path_parts),
            sep=r"\s+",
            header=None,
            names=names,
        )

    df_feature_name = _read_table("features.txt", names=["column_index", "column_name"])
    # features.txt repeats some names; make them unique before using them as
    # DataFrame column labels.
    df_new_feature_name = get_new_feature_name(df_feature_name)
    feature_name = df_new_feature_name["column_name"].tolist()

    X_train = _read_table("train", "X_train.txt", names=feature_name)
    X_test = _read_table("test", "X_test.txt", names=feature_name)

    y_train = _read_table("train", "y_train.txt", names=["action"])
    y_test = _read_table("test", "y_test.txt", names=["action"])

    return X_train, X_test, y_train, y_test


In [63]:
# Load the train/test feature frames and label frames into the notebook namespace.
X_train, X_test, y_train, y_test = get_human_dataset()

In [64]:
# Show (rows, columns) of each split, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")


(7352, 561)
(2947, 561)
(7352, 1)
(2947, 1)


In [65]:
# Show how many training samples carry each 'action' label.
print(y_train['action'].value_counts())

6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: action, dtype: int64
