In [None]:
Your Goal: The goal of this competition is to use various factors to predict obesity risk in individuals, which is related to cardiovascular disease. Good luck!

목표: 이 대회의 목표는 심혈관 질환과 관련된 개인의 비만 위험을 예측하기 위해 다양한 요인을 사용하는 것입니다. 행운을 빕니다

| 속성                                 | 이유                                       |
| ---------------------------------- | ---------------------------------------- |
| **Gender, Age, Height, Weight**    | 성별·연령·BMI는 비만 위험을 가장 잘 설명하는 기본 인구통계학적 특성 |
| **family_history_with_overweight** | 유전적 요인, 가족력은 비만·심혈관질환 모두에서 중요한 위험인자      |
| **FCVC (식사 시 채소 섭취 빈도)**           | 채소 섭취는 비만과 역상관 관계                        |
| **NCP (하루 식사 횟수)**                 | 과식/소식 패턴은 체중 변화에 영향                      |
| **CAEC (간식 섭취 빈도)**                | 잦은 간식은 비만 확률 상승                          |
| **CH2O (하루 물 섭취량)**                | 낮은 수분 섭취는 대사에 영향 가능                      |
| **FAF (주당 신체활동 빈도)**               | 운동량은 비만 위험 완화 요인                         |
| **TUE (화면 앞 시간, sedentary time)**  | 오래 앉아있는 습관은 비만/대사증후군 위험 인자               |


In [1]:
import pandas as pd

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [None]:
test.info()
train.info()

In [None]:
train.head()
# train.tail()
# train.sample(5)

test.sample(10)

In [None]:
train.shape,test.shape

In [None]:
train[['NObeyesdad']].value_counts()

In [None]:
train.info()

In [None]:
test.info()

In [None]:
train.describe()

In [None]:
test.describe()

In [None]:
train.describe(include='O')

In [None]:
test.describe(include='O')

In [None]:
train.isnull().sum().sum()

In [None]:
test.isnull().sum().sum()

In [2]:
# id 제거
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [3]:
y_train = train.pop("NObeyesdad")

In [4]:
y_train.value_counts()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          20758 non-null  object 
 1   Age                             20758 non-null  float64
 2   Height                          20758 non-null  float64
 3   Weight                          20758 non-null  float64
 4   family_history_with_overweight  20758 non-null  object 
 5   FAVC                            20758 non-null  object 
 6   FCVC                            20758 non-null  float64
 7   NCP                             20758 non-null  float64
 8   CAEC                            20758 non-null  object 
 9   SMOKE                           20758 non-null  object 
 10  CH2O                            20758 non-null  float64
 11  SCC                             20758 non-null  object 
 12  FAF                             

In [None]:
# # 기존 레이블인코딩
from sklearn.preprocessing import LabelEncoder

cols = train.columns[train.dtypes == object]

for col in cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col],test[col]] ,axis=0)) # 행합치기
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [5]:
# 인코딩대체방법(범주형 type -> category 로 변경)

train['Gender'] = train['Gender'].astype('category')
train['family_history_with_overweight'] = train['family_history_with_overweight'].astype('category')
train['FAVC'] = train['FAVC'].astype('category')
train['CAEC'] = train['CAEC'].astype('category')
train['SMOKE'] = train['SMOKE'].astype('category')
train['SCC'] = train['SCC'].astype('category')
train['CALC'] = train['CALC'].astype('category')
train['MTRANS'] = train['MTRANS'].astype('category')




test['Gender'] = train['Gender'].astype('category')
test['family_history_with_overweight'] = train['family_history_with_overweight'].astype('category')
test['FAVC'] = train['FAVC'].astype('category')
test['CAEC'] = train['CAEC'].astype('category')
test['SMOKE'] = train['SMOKE'].astype('category')
test['SCC'] = train['SCC'].astype('category')
test['CALC'] = train['CALC'].astype('category')
test['MTRANS'] = train['MTRANS'].astype('category')

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13840 entries, 0 to 13839
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Gender                          13840 non-null  category
 1   Age                             13840 non-null  float64 
 2   Height                          13840 non-null  float64 
 3   Weight                          13840 non-null  float64 
 4   family_history_with_overweight  13840 non-null  category
 5   FAVC                            13840 non-null  category
 6   FCVC                            13840 non-null  float64 
 7   NCP                             13840 non-null  float64 
 8   CAEC                            13840 non-null  category
 9   SMOKE                           13840 non-null  category
 10  CH2O                            13840 non-null  float64 
 11  SCC                             13840 non-null  category
 12  FAF               

In [None]:
train

In [None]:
#------------------------
#검증 데이터 분할
#------------------------

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train,
    y_train,
    test_size=0.2,
    random_state=0
)

In [7]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((16606, 16), (4152, 16), (16606,), (4152,))

In [None]:
#1. 랜던포레스트
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0) # 난수값고정 in->out 고정값 
rf.fit(X_train, y_train)  #모델학습
pred_rf=rf.predict_proba(X_val)
pred_rf

In [12]:
# 2.LIGHTGBM 학습
import lightgbm as lgb
lgbmc = lgb.LGBMClassifier(random_state=0, verbose=1) # 
lgbmc.fit(X_train, y_train)
pred=lgbmc.predict(X_val)  # 예측 y값
pred_proba = lgbmc.predict_proba(X_val) # 예측 확률값

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2058
[LightGBM] [Info] Number of data points in the train set: 16606, number of used features: 16
[LightGBM] [Info] Start training from score -2.094366
[LightGBM] [Info] Start training from score -1.912860
[LightGBM] [Info] Start training from score -1.964755
[LightGBM] [Info] Start training from score -1.863915
[LightGBM] [Info] Start training from score -1.634499
[LightGBM] [Info] Start training from score -2.139374
[LightGBM] [Info] Start training from score -2.110635


In [None]:
# 
!pip install lightgbm

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

roc_auc_ovr = roc_auc_score(y_val , pred_proba, multi_class='ovr') # pred 예측 확률값 [오답예측확률 , 정답예측확률]
roc_auc_ovo = roc_auc_score(y_val , pred_proba, multi_class='ovo') # pred 예측 확률값 [오답예측확률 , 정답예측확률]
accuracy = accuracy_score(y_val,pred) # pred 예측 y값 
f1 = f1_score(y_val,pred,average='macro') # pred 예측 y값 (클래스별(1,2,3정답값) 성능을 동등하게 반영)

print('roc_auc_ovr : ',  roc_auc_ovr)
print('roc_auc_ovo : ',  roc_auc_ovo)
print('accuracy : ',  accuracy)
print('f1 : ',  f1)

# OvR(One-vs-Rest): 각 클래스 vs 나머지 → 평균
# OvO(One-vs-One): 클래스 쌍별 AUC 평균

roc_auc_ovr :  0.9888169248146651
roc_auc_ovo :  0.9874571729788963
accuracy :  0.9051059730250481
f1 :  0.8949765427420396


In [10]:
#-----------------------
# 예측 파일 제출(.csv)
#-----------------------
pred_submit=lgbmc.predict(test)
pred_submit

array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype=object)

In [11]:
submit_csv = pd.DataFrame({'prediction':pred_submit})
submit_csv.to_csv("result.csv",index=False)