In [1]:
# Attach Google Drive to the Colab runtime. Every dataset and model path used
# by the later cells lives under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load data ###
dataset = pd.read_csv('/content/drive/MyDrive/빅데이터 팀플/dementia_analytics/data_processing/dataset_05/dataset_with_label_fill_user_mean.csv')

# Target: the diagnosis label. Kept as a one-column DataFrame (not a Series)
# because later cells index it with ['DIAG_NM'].
data_output = dataset[['DIAG_NM']].copy()

# Features: an explicit allow-list of sleep/activity columns. Selecting by name
# already leaves out 'DIAG_NM', 'ID', 'date' and every column whose name
# contains 'score', so the earlier drop() call and the never-used
# `columns_to_drop` list were dead code and have been removed.
feature_columns = [
    'sleep_breath_average',
    'sleep_hr_average',
    'sleep_hr_lowest',
    'sleep_deep',
    'sleep_rem',
    'activity_cal_total',
    'sleep_awake',
    'activity_steps',
    'activity_total',
    'sleep_duration',
    'activity_daily_movement',
]
data_input = dataset[feature_columns].copy()

# Columns that were tried and are intentionally not in the feature list:
#   * 'sleep_bedtime_end', 'sleep_bedtime_start', 'sleep_hr_5min'
#     -> selecting them raised KeyError ("not in index").
#   * 'activity_cal_active', 'activity_met_min_inactive', 'activity_high',
#     'sleep_efficiency', 'activity_rest', 'activity_met_min_low',
#     'activity_inactive', 'activity_medium', 'activity_low',
#     'activity_met_min_medium', 'activity_met_min_high'

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed for reproducibility and stratified on the diagnosis label.
train_input, test_input, train_output, test_output = train_test_split(
    data_input,
    data_output,
    test_size=0.2,
    shuffle=True,
    stratify=data_output,
    random_state=42,
)

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1]. The scaler is fitted on the training split
# only and then applied to both splits, so no test-set statistics leak in.
#
# NOTE: train_input / test_input are rebound to the scaled arrays. Re-running
# this cell without re-running the split would fit the scaler on data that is
# already scaled.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler_fit = scaler.fit(train_input)
train_input, test_input = (
    scaler_fit.transform(split) for split in (train_input, test_input)
)

In [5]:
### Label encoding ###

# Tree-based algorithms do not attach meaning to the magnitude of the encoded
# integers, so plain label encoding is fine here.
# Algorithms with linear behaviour would need one-hot encoding instead.

from sklearn.preprocessing import LabelEncoder

# Declare and fit the label encoder on the known diagnosis classes.
# Resulting mapping: CN -> 0, Dem -> 1, MCI -> 2
items = ['CN', 'Dem', 'MCI']
encoder = LabelEncoder().fit(items)

# Build new frames with .assign() instead of writing a column into the frames
# returned by train_test_split; in-place column assignment on those frames is
# the pattern that triggers pandas' SettingWithCopyWarning.
# NOTE: run once per split -- the names are rebound to the encoded frames, so a
# second run would pass integers (not the original strings) to the encoder.

# train
train_output = train_output.assign(DIAG_NM=encoder.transform(train_output['DIAG_NM']))
# display(encoder.classes_)
# display(encoder.inverse_transform(train_output['DIAG_NM']))

# test
test_output = test_output.assign(DIAG_NM=encoder.transform(test_output['DIAG_NM']))

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Gradient-boosted tree classifier for the three diagnosis classes.
XGB = XGBClassifier(
    colsample_bytree=0.9,
    gamma=0,
    n_estimators=500,
    learning_rate=0.15,
    max_depth=15,
    random_state=42,
)

XGB.fit(train_input, train_output)

train_pred = XGB.predict(train_input)
test_pred = XGB.predict(test_input)

# accuracy_score is declared as (y_true, y_pred); pass the ground truth first so
# this cell agrees with the metrics cell further down. The printed numbers are
# the same either way because accuracy is symmetric in its two arguments.
# NOTE(review): the recorded run printed 1.0 on train vs ~0.82 on test, which
# suggests overfitting -- consider a shallower max_depth or early stopping.
print(accuracy_score(train_output, train_pred))
print(accuracy_score(test_output, test_pred))

1.0
0.8208955223880597


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-set metrics. Precision, recall and F1 are macro-averaged, i.e. the
# unweighted mean of the per-class scores.
accuracy = accuracy_score(test_output, test_pred)
precision = precision_score(test_output, test_pred, average='macro')
recall = recall_score(test_output, test_pred, average='macro')
f1 = f1_score(test_output, test_pred, average='macro')

# Report in a fixed order: accuracy, precision, recall, F1.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

Accuracy: 0.8208955223880597
Precision: 0.8339416122334137
Recall: 0.7613286137269307
F1 Score: 0.7917878178903573


In [8]:
import joblib

# Persist the fitted MinMaxScaler so the same feature scaling can be re-applied
# wherever the saved model is used. The dump call is left as the cell's last
# expression so the written path is still displayed as the cell output.
scaler_path = '/content/drive/MyDrive/빅데이터 팀플/model/final_xgb_scaler.pkl'
joblib.dump(scaler_fit, scaler_path)

['/content/drive/MyDrive/빅데이터 팀플/model/final_xgb_scaler.pkl']

In [9]:
import joblib

# Persist the trained XGBoost classifier next to its scaler. The dump call is
# left as the cell's last expression so the written path is still displayed.
# NOTE(review): a pickled estimator is tied to the library versions that wrote
# it; confirm the loading environment uses matching xgboost/scikit-learn.
model_path = '/content/drive/MyDrive/빅데이터 팀플/model/final_xgb_model.pkl'
joblib.dump(XGB, model_path)

['/content/drive/MyDrive/빅데이터 팀플/model/final_xgb_model.pkl']