# Mediwhale Assignment

## Multiclass `dm` classification with XGBoost and LightGBM

### 1. Settings

In [46]:
# load packages
import numpy as np
import pandas as pd
import pickle

# model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
# Load the raw survey CSV (path is relative to the notebook's working directory).
data = pd.read_csv('../Data/adult_use_data.csv')

In [6]:
# Drop HHX, FMX, FPX and the '.1'-suffixed height/weight/bmi columns
# (presumably duplicates of height/weight/bmi -- TODO confirm) so they are
# not used as model features. `data` itself is left untouched.
unused_columns = ['HHX', 'FMX', 'FPX', 'height.1', 'weight.1', 'bmi.1']
df = data.drop(columns=unused_columns)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of df.
df.describe()

Unnamed: 0,SEX,AGE_P,REGION,RACERPI2,height,weight,bmi,hypert,chol,chdev,...,preg,kidney,dm,smoke,drink,comp_use,sleep_hour,exercise,live_alone,imbal_meal
count,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,...,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0,4358.0
mean,1.631941,50.96581,2.661083,1.311152,168.426709,81.043047,28.533984,0.753786,0.628499,0.074805,...,0.013768,0.044286,0.383433,0.293713,0.201698,3.065397,6.958697,0.326985,0.376778,0.199403
std,0.482333,17.791746,1.0093,0.875033,9.831944,18.63013,6.041287,0.916131,0.850487,0.263107,...,0.116539,0.205754,0.722485,0.455514,0.401314,1.274776,1.760803,0.469166,0.484634,0.399598
min,1.0,18.0,1.0,1.0,149.9,45.4,15.221981,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,1.0,36.0,2.0,1.0,160.0,67.1,24.017768,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0
50%,2.0,52.0,3.0,1.0,167.6,79.4,27.459684,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,7.0,0.0,0.0,0.0
75%,2.0,65.0,3.0,1.0,175.3,93.0,32.289347,2.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,4.0,8.0,1.0,1.0,0.0
max,2.0,85.0,4.0,5.0,193.0,135.2,51.054409,2.0,2.0,1.0,...,1.0,1.0,2.0,1.0,1.0,4.0,20.0,1.0,1.0,1.0


In [8]:
# Show the dtype of every remaining column.
df.dtypes

SEX             int64
AGE_P           int64
REGION          int64
RACERPI2        int64
height        float64
weight        float64
bmi           float64
hypert        float64
chol          float64
chdev         float64
strev         float64
depress       float64
preg            int64
kidney        float64
dm            float64
smoke         float64
drink           int64
comp_use      float64
sleep_hour    float64
exercise      float64
live_alone    float64
imbal_meal    float64
dtype: object

### 2. Split Train Test

In [36]:
# Split into train and test sets using stratified sampling on the target
# `dm`, to keep the model evaluation robust. 30% of the rows are held out,
# and the seed is fixed so the split is reproducible.
X = df.drop(columns="dm")
y = df["dm"]
split_options = dict(test_size=0.3, shuffle=True, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### 3. Train and evaluate XGBoost and LightGBM

In [47]:
# XGBoost classifier: only the seed is set, every other hyperparameter is left at its default.
xgb = XGBClassifier(random_state=42)

In [48]:
# Fit the XGBoost classifier on the training split.
xgb.fit(X_train, y_train)

In [49]:
# Predict on the held-out test split and print the confusion matrix
# (scikit-learn convention: rows are true classes, columns are predicted classes).
xgb_pred = xgb.predict(X_test)
xgb_cm = confusion_matrix(y_test, xgb_pred)
print('\n', 'Confusion Matrix - Test:', '\n', xgb_cm)


 Confusion Matrix - Test: 
 [[930  12  51]
 [104   9  15]
 [126  13  48]]


In [50]:
# Per-class precision, recall and F1 of the XGBoost predictions on the test split.
xgb_report = classification_report(y_test, xgb_pred)
print(xgb_report)

              precision    recall  f1-score   support

         0.0       0.80      0.94      0.86       993
         1.0       0.26      0.07      0.11       128
         2.0       0.42      0.26      0.32       187

    accuracy                           0.75      1308
   macro avg       0.50      0.42      0.43      1308
weighted avg       0.69      0.75      0.71      1308



In [55]:
# LightGBM classifier with a fixed seed. force_row_wise=True is passed through
# to LightGBM explicitly (see the LightGBM parameter docs for its effect);
# every other hyperparameter is left at its default.
lgbm = LGBMClassifier(force_row_wise=True, random_state=42)

In [56]:
# Fit the LightGBM classifier on the training split.
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Total Bins 568
[LightGBM] [Info] Number of data points in the train set: 3050, number of used features: 21
[LightGBM] [Info] Start training from score -0.275300
[LightGBM] [Info] Start training from score -2.322453
[LightGBM] [Info] Start training from score -1.947551


In [57]:
# Predict on the held-out test split and print the confusion matrix
# (scikit-learn convention: rows are true classes, columns are predicted classes).
lgbm_pred = lgbm.predict(X_test)
lgbm_cm = confusion_matrix(y_test, lgbm_pred)
print('\n', 'Confusion Matrix - Test:', '\n', lgbm_cm)


 Confusion Matrix - Test: 
 [[938  14  41]
 [108   4  16]
 [130   9  48]]


In [58]:
# Per-class precision, recall and F1 of the LightGBM predictions on the test split.
lgbm_report = classification_report(y_test, lgbm_pred)
print(lgbm_report)

              precision    recall  f1-score   support

         0.0       0.80      0.94      0.86       993
         1.0       0.15      0.03      0.05       128
         2.0       0.46      0.26      0.33       187

    accuracy                           0.76      1308
   macro avg       0.47      0.41      0.42      1308
weighted avg       0.69      0.76      0.71      1308



In [60]:
# Serialise both fitted classifiers next to the input data with pickle.
# NOTE(review): pickle files execute code when loaded -- only unpickle these
# from a trusted location.
for model_path, fitted_model in (('../Data/model_xgb.pickle', xgb),
                                 ('../Data/model_lgbm.pickle', lgbm)):
    with open(model_path, 'wb') as fw:
        pickle.dump(fitted_model, fw)