In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 데이터 로딩 ###
dataset = pd.read_csv('/content/drive/MyDrive/빅데이터 팀플/dementia_analytics/data_processing/dataset_05/dataset_with_label_fill_user_mean.csv')
data_output = dataset[['DIAG_NM']]
# data_input에서 score 가 들어가는 column 등 필요 없는 column 제거
data_input = dataset.drop(['DIAG_NM', 'ID', 'date'], axis=1)
columns_to_drop = [col for col in data_input.columns if 'score' in col]
data_input = data_input[['sleep_breath_average', 'sleep_hr_average', 'sleep_hr_lowest', 'sleep_deep', 'sleep_rem', 'activity_cal_total', 'sleep_awake', 'activity_steps', 'activity_total', 'sleep_duration', 'activity_daily_movement']]

from sklearn.model_selection import train_test_split

train_input_data, test_input_data, train_output_data, test_output_data = train_test_split(data_input, data_output, test_size=0.2, shuffle=True, stratify=data_output, random_state=42)

### 레이블 인코딩 ### 

# 트리 기반 알고리즘에서는 숫자의 크기에 따른 중요도 차이가 없어 레이블 인코딩 OK
# 선형적 특징을 가지는 알고리즘들은 one hot encoding 통해 해결해야 함

from sklearn.preprocessing import LabelEncoder

# 라벨인코더 선언 및 Fitting
# CN : 0, Dem : 1, MCI : 2
items = ['CN', 'Dem', 'MCI']
encoder = LabelEncoder()
encoder.fit(items)

# train
train_output_data['DIAG_NM'] = encoder.transform(train_output_data['DIAG_NM'])

# test
test_output_data['DIAG_NM'] = encoder.transform(test_output_data['DIAG_NM'])


In [31]:
from sklearn.model_selection import train_test_split

train_input, val_input, train_output, val_output = train_test_split(train_input_data, train_output_data, test_size=0.2, shuffle=True, stratify=train_output_data, random_state=42)

In [32]:
t_input, v_input, t_output, v_output = train_test_split(train_input, train_output, test_size=0.2, shuffle=True, stratify=train_output, random_state=42)

In [33]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
scaler_fit = scaler.fit(t_input)
t_input = scaler_fit.transform(t_input)
v_input = scaler_fit.transform(v_input)
val_input = scaler_fit.transform(val_input)

In [34]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=32)

t_input, t_output = sm.fit_resample(t_input, t_output)

# LightGBM

In [35]:
### Light GBM ###
from lightgbm import LGBMClassifier

LGBM = LGBMClassifier()
evals = [(v_input, v_output)]
LGBM.fit(t_input, t_output, early_stopping_rounds=100, eval_metric='logloss', eval_set=evals, verbose=True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[1]	valid_0's multi_logloss: 1.05031
[2]	valid_0's multi_logloss: 1.00938
[3]	valid_0's multi_logloss: 0.972824
[4]	valid_0's multi_logloss: 0.941578
[5]	valid_0's multi_logloss: 0.915929
[6]	valid_0's multi_logloss: 0.891269
[7]	valid_0's multi_logloss: 0.871676
[8]	valid_0's multi_logloss: 0.85351
[9]	valid_0's multi_logloss: 0.836038
[10]	valid_0's multi_logloss: 0.819527
[11]	valid_0's multi_logloss: 0.80263
[12]	valid_0's multi_logloss: 0.788233
[13]	valid_0's multi_logloss: 0.775014
[14]	valid_0's multi_logloss: 0.761681
[15]	valid_0's multi_logloss: 0.747683
[16]	valid_0's multi_logloss: 0.735243
[17]	valid_0's multi_logloss: 0.72487
[18]	valid_0's multi_logloss: 0.714978
[19]	valid_0's multi_logloss: 0.705395
[20]	valid_0's multi_logloss: 0.697004
[21]	valid_0's multi_logloss: 0.687761
[22]	valid_0's multi_logloss: 0.680624
[23]	valid_0's multi_logloss: 0.6724
[24]	valid_0's multi_logloss: 0.665746
[25]	valid_0's multi_logloss: 0.658214
[26]	valid_0's multi_logloss: 0.652283
[2

In [36]:
#  validation 성능
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# validation 데이터 예측
val_pred = LGBM.predict(val_input)

# 정확도 계산
accuracy = accuracy_score(val_output, val_pred)
print("Accuracy:", accuracy)

# 정밀도 계산
precision = precision_score(val_output, val_pred, average='macro')
print("Precision:", precision)

# 재현율 계산
recall = recall_score(val_output, val_pred, average='macro')
print("Recall:", recall)

# F1 스코어 계산
f1 = f1_score(val_output, val_pred, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7652221612726275
Precision: 0.7395748360728578
Recall: 0.7510934047165931
F1 Score: 0.7451593756932641


# XGBoost

In [37]:
import xgboost as xgb

params = {'max_depth' : 3,
         'eta' : 0.1, 
         'objective' : 'multi:softmax',
         'num_class' : 3,
         'eval_metric' : 'mlogloss',
         'early_stoppings' : 100 }

num_rounds = 400

dt = xgb.DMatrix(data=t_input, label=t_output)
dv = xgb.DMatrix(data=v_input, label=v_output)

wlist=[(dt, 'train'), (dv, 'eval')]
xgb_model = xgb.train(params=params, dtrain=dt, num_boost_round=num_rounds, evals=wlist)

Parameters: { "early_stoppings" } are not used.

[0]	train-mlogloss:1.07356	eval-mlogloss:1.08230
[1]	train-mlogloss:1.05176	eval-mlogloss:1.06784
[2]	train-mlogloss:1.03165	eval-mlogloss:1.05466
[3]	train-mlogloss:1.01349	eval-mlogloss:1.04410
[4]	train-mlogloss:0.99649	eval-mlogloss:1.03102
[5]	train-mlogloss:0.98020	eval-mlogloss:1.02033
[6]	train-mlogloss:0.96664	eval-mlogloss:1.01097
[7]	train-mlogloss:0.95341	eval-mlogloss:1.00226
[8]	train-mlogloss:0.93736	eval-mlogloss:0.99154
[9]	train-mlogloss:0.92577	eval-mlogloss:0.98350
[10]	train-mlogloss:0.91241	eval-mlogloss:0.97428
[11]	train-mlogloss:0.90204	eval-mlogloss:0.96650
[12]	train-mlogloss:0.89188	eval-mlogloss:0.95883
[13]	train-mlogloss:0.88235	eval-mlogloss:0.95223
[14]	train-mlogloss:0.87460	eval-mlogloss:0.94628
[15]	train-mlogloss:0.86609	eval-mlogloss:0.93972
[16]	train-mlogloss:0.85861	eval-mlogloss:0.93461
[17]	train-mlogloss:0.85037	eval-mlogloss:0.92821
[18]	train-mlogloss:0.84170	eval-mlogloss:0.92245
[19]	train-

In [38]:
#  validation 성능
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# validation 데이터 예측

dval = xgb.DMatrix(data=val_input, label=val_output)

val_pred = xgb_model.predict(dval)

# 정확도 계산
accuracy = accuracy_score(val_output, val_pred)
print("Accuracy:", accuracy)

# 정밀도 계산
precision = precision_score(val_output, val_pred, average='macro')
print("Precision:", precision)

# 재현율 계산
recall = recall_score(val_output, val_pred, average='macro')
print("Recall:", recall)

# F1 스코어 계산
f1 = f1_score(val_output, val_pred, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7185957213384531
Precision: 0.676744492771816
Recall: 0.7284604016488073
F1 Score: 0.6988499773771236


# 결론

data leaking을 방지하기 위해 validation 데이터셋을 직접 지정할 수 있는 모델 두 가지를 비교해 보았다.

이 중 recall이 더 높은 LightGBM을 모델을 사용하자.

In [41]:
# test 데이터 예측

test_input =  scaler_fit.transform(test_input_data)

test_output = test_output_data

test_pred = LGBM.predict(test_input)

# 정확도 계산
accuracy = accuracy_score(test_output, test_pred)
print("Accuracy:", accuracy)

# 정밀도 계산
precision = precision_score(test_output, test_pred, average='macro')
print("Precision:", precision)

# 재현율 계산
recall = recall_score(test_output, test_pred, average='macro')
print("Recall:", recall)

# F1 스코어 계산
f1 = f1_score(test_output, test_pred, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7726075504828798
Precision: 0.7308847434425226
Recall: 0.7808662772477639
F1 Score: 0.7526889999748554


In [39]:
import joblib

joblib.dump(scaler_fit, '/content/drive/MyDrive/빅데이터 팀플/model/final_lgbm_scaler.pkl')

['/content/drive/MyDrive/빅데이터 팀플/model/final_lgbm_scaler.pkl']

In [40]:
import joblib

joblib.dump(LGBM, '/content/drive/MyDrive/빅데이터 팀플/model/final_lgbm_model.pkl')

['/content/drive/MyDrive/빅데이터 팀플/model/final_lgbm_model.pkl']