### 그래디언트 부스팅 계열의 앙상블 모델
- 대표 모델 : XGBoost, LightGBM

설치 여부 확인

In [1]:
# Confirm that both gradient-boosting libraries import and report their versions.
import xgboost
import lightgbm

print(f'xgboost : {xgboost.__version__}')
print(f'lightgbm : {lightgbm.__version__}')

xgboost : 1.7.3
lightgbm : 4.1.0


(1) 모듈 로딩 및 데이터 준비 <hr>

In [2]:
## Paths to the human-activity dataset files

# Lookup files: feature names and activity labels
feature_name_file, label_file = (
    '../DATA/human_activity/features.txt',
    '../DATA/human_activity/activity_labels.txt',
)

# Training split: X and y files
x_train_file, y_train_file = (
    '../DATA/human_activity/train/X_train.txt',
    '../DATA/human_activity/train/y_train.txt',
)

# Test split: X and y files
x_test_file, y_test_file = (
    '../DATA/human_activity/test/X_test.txt',
    '../DATA/human_activity/test/y_test.txt',
)

In [3]:
## Load the data
import pandas as pd
import numpy as np

# Read the feature-name file as whitespace-separated values with no header
# row, naming the two fields col_id and col_name.
# The separator is a regular expression, so it is written as a raw string:
# in a plain string literal the backslash-s pair is an invalid escape
# sequence that newer Python versions warn about.
feature_nameDF = pd.read_csv(
    feature_name_file,
    sep=r'\s+',
    header=None,
    names=['col_id', 'col_name'],
)
feature_nameDF.head(3)

Unnamed: 0,col_id,col_name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z


In [4]:
# Summarise the feature-name table (row count, columns, dtypes).
feature_nameDF.info()  # 561 feature names are listed, one per row

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561 entries, 0 to 560
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   col_id    561 non-null    int64 
 1   col_name  561 non-null    object
dtypes: int64(1), object(1)
memory usage: 8.9+ KB


In [5]:
# Count the feature names that repeat an earlier name
feature_nameDF['col_name'].duplicated().sum()

84

In [6]:
# Check what kind of object holds the feature-name column's values
type(feature_nameDF['col_name'].values)

numpy.ndarray

In [7]:
# List the rows whose feature name repeats an earlier one
feature_nameDF.loc[feature_nameDF['col_name'].duplicated()]

Unnamed: 0,col_id,col_name
316,317,"fBodyAcc-bandsEnergy()-1,8"
317,318,"fBodyAcc-bandsEnergy()-9,16"
318,319,"fBodyAcc-bandsEnergy()-17,24"
319,320,"fBodyAcc-bandsEnergy()-25,32"
320,321,"fBodyAcc-bandsEnergy()-33,40"
...,...,...
497,498,"fBodyGyro-bandsEnergy()-17,32"
498,499,"fBodyGyro-bandsEnergy()-33,48"
499,500,"fBodyGyro-bandsEnergy()-49,64"
500,501,"fBodyGyro-bandsEnergy()-1,24"


In [8]:
# Row labels of the repeated feature names
feature_nameDF.loc[feature_nameDF['col_name'].duplicated()].index

Index([316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
       330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343,
       395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408,
       409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422,
       474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487,
       488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501],
      dtype='int64')

In [9]:
# Keep the row labels of the repeated feature names; later cells pass the same
# labels to drop() with axis='columns' on the feature matrices.
columns_to_remove = feature_nameDF.loc[feature_nameDF['col_name'].duplicated()].index

In [10]:
# Remove the repeated feature names, renumber the remaining rows from 0,
# then preview the cleaned table.
feature_nameDF.drop(index=columns_to_remove, inplace=True)
feature_nameDF.reset_index(inplace=True, drop=True)
feature_nameDF.head()

Unnamed: 0,col_id,col_name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y


In [11]:
# Read the training feature matrix: whitespace-separated values, no header row.
# The regex separator is a raw string so the backslash is not treated as an
# (invalid) string escape, which newer Python versions warn about.
x_trainDF = pd.read_csv(x_train_file, sep=r'\s+', header=None)
x_trainDF.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118


In [12]:
# Summarise the training feature matrix as loaded (row count, column count, dtypes).
x_trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 561 entries, 0 to 560
dtypes: float64(561)
memory usage: 31.5 MB


In [13]:
# Drop the feature columns whose labels were collected in columns_to_remove.
x_trainDF.drop(columns=columns_to_remove, inplace=True)
# Renumber the remaining column labels 0..n-1. Assigning the labels directly
# avoids transposing the whole matrix twice just to reset them.
x_trainDF.columns = range(x_trainDF.shape[1])

In [14]:
# Summarise the training feature matrix again to check the column count and labels.
x_trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 477 entries, 0 to 476
dtypes: float64(477)
memory usage: 26.8 MB


In [15]:
# Read the test feature matrix: whitespace-separated values, no header row.
# The regex separator is a raw string so the backslash is not treated as an
# (invalid) string escape, which newer Python versions warn about.
x_testDF = pd.read_csv(x_test_file, sep=r'\s+', header=None)
# Drop the feature columns whose labels were collected in columns_to_remove.
x_testDF.drop(columns=columns_to_remove, inplace=True)
# Renumber the remaining column labels 0..n-1 directly instead of
# transposing the whole matrix twice.
x_testDF.columns = range(x_testDF.shape[1])
x_testDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2947 entries, 0 to 2946
Columns: 477 entries, 0 to 476
dtypes: float64(477)
memory usage: 10.7 MB


In [16]:
# Read the training target file: whitespace-separated values, no header row.
# The regex separator is a raw string so the backslash is not treated as an
# (invalid) string escape, which newer Python versions warn about.
y_trainDF = pd.read_csv(y_train_file, sep=r'\s+', header=None)
y_trainDF.head(3)

Unnamed: 0,0
0,5
1,5
2,5


In [17]:
# Read the test target file: whitespace-separated values, no header row.
# The regex separator is a raw string so the backslash is not treated as an
# (invalid) string escape, which newer Python versions warn about.
y_testDF = pd.read_csv(y_test_file, sep=r'\s+', header=None)
y_testDF.head(3)

Unnamed: 0,0
0,5
1,5
2,5
