In [4]:
import pandas as pd
import numpy as np

# Read train.csv, test.csv and ObesityDataSet.csv from the working directory.
train, test, original = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'ObesityDataSet.csv')
)

# Preview the first five training rows, transposed so each column reads as a row.
train.head().T

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
Gender,Male,Female,Female,Female,Male
Age,24.443011,18.0,18.0,20.952737,31.641081
Height,1.699998,1.56,1.71146,1.71073,1.914186
Weight,81.66995,57.0,50.165754,131.274851,93.798055
family_history_with_overweight,yes,yes,yes,yes,yes
FAVC,yes,yes,yes,yes,yes
FCVC,2.0,2.0,1.880534,3.0,2.679664
NCP,2.983297,3.0,1.411685,3.0,1.971472
CAEC,Sometimes,Frequently,Sometimes,Sometimes,Sometimes


In [5]:
# Combine the train rows with the original dataset (row-wise).
# ignore_index=True rebuilds a clean 0..n-1 index. Without it each frame keeps
# its own 0-based labels from read_csv, so the result carries duplicate index
# labels, which breaks label-based lookups and index-aligned assignments.
train = pd.concat([train, original], axis=0, ignore_index=True)

In [6]:
# Names of every column whose dtype is 'object'
categorical_columns = [
    name for name in train.columns if train[name].dtype == 'object'
]

categorical_columns

['Gender',
 'family_history_with_overweight',
 'FAVC',
 'CAEC',
 'SMOKE',
 'SCC',
 'CALC',
 'MTRANS',
 'NObeyesdad']

In [7]:
# Categorical column encoding; manual encoding so the integer codes are explicit.
# One table drives both train and test, so the two frames cannot drift apart.

def _encode(series, codes, default=None):
    """Map the string categories in `series` to integers via `codes`.

    Values missing from `codes` (including NaN) become `default` when one is
    given, otherwise they become NaN. A series that is already numeric is
    returned unchanged, so re-running this cell does not wipe out the encoding.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    encoded = series.map(codes)
    if default is not None:
        encoded = encoded.fillna(default).astype(int)
    return encoded


mapping={'Public_Transportation':1,
         'Automobile':2,
         'Motorbike':3,
         'Bike':4,
         'Walking':5}

# column -> (explicit codes, code used for every other value)
encodings = {
    'Gender': ({'Male': 1}, 0),
    'family_history_with_overweight': ({'yes': 1}, 0),
    'FAVC': ({'yes': 1}, 0),
    # NOTE(review): every value other than no/Sometimes/Always gets 4, so
    # "Frequently" is coded above "Always" -- confirm this ordering is intended.
    'CAEC': ({'no': 1, 'Sometimes': 2, 'Always': 3}, 4),
    'SMOKE': ({'yes': 1}, 0),
    'SCC': ({'yes': 1}, 0),
    # Every value other than no/Sometimes shares code 3.
    'CALC': ({'no': 1, 'Sometimes': 2}, 3),
    # No fallback: an unknown transport mode becomes NaN instead of staying a string.
    'MTRANS': (mapping, None),
}

for frame in (train, test):
    for column, (codes, default) in encodings.items():
        frame[column] = _encode(frame[column], codes, default)

# target label encoding
target_mapping={'Insufficient_Weight':0,
                'Normal_Weight':1,
                'Overweight_Level_I':2,
                'Overweight_Level_II':3,
                'Obesity_Type_I':4,
                'Obesity_Type_II':5,
                'Obesity_Type_III':6}

# .map rather than .replace: replacing strings with ints relies on pandas
# silently downcasting the object column, which is deprecated behaviour.
train['NObeyesdad'] = _encode(train['NObeyesdad'], target_mapping)

In [8]:
# Feature Engineering

def feat_eng(df):
    """Add eleven engineered feature columns to `df` and return it.

    The frame is modified in place; the returned object is the same `df`.
    Every input column is used in arithmetic, so the categorical columns
    (Gender, FAVC, CAEC, SMOKE, SCC, CALC, family_history_with_overweight)
    must already hold numbers when this is called.
    """
    age = df["Age"]
    height = df['Height']
    weight = df['Weight']

    # Body-mass index: weight over height squared.
    bmi = weight / (height ** 2)
    df['BMI'] = bmi

    # Sum of FCVC, CH2O, FAF divided by sum of FAVC, CAEC, TUE and twice SMOKE.
    numerator = df["FCVC"] + df["CH2O"] + df["FAF"]
    denominator = df["FAVC"] + df["CAEC"] + df["TUE"] + df["SMOKE"] * 2
    habit_ratio = numerator / denominator
    df["HealthyHabitRatio"] = habit_ratio

    # Products with Age.
    df["Age_BMI"] = age * bmi
    df["Age_HealthyHabitRatio"] = age * habit_ratio

    # Pairwise combinations of the encoded inputs.
    df["Gender_SCC"] = df["Gender"] * df["SCC"]
    df["Height_Weight_Ratio"] = height / weight
    df["FAVC_CAEC_Index"] = df["FAVC"] / df["CAEC"]
    df["Activity_Index"] = df["FAF"] - df["TUE"]
    df["Water_Alcohol_Ratio"] = df["CH2O"] / df["CALC"]

    # The 1e-6 offset is added before the absolute value is taken.
    df["Meal_Frequency_Deviation"] = (df["NCP"] - 3 + 1e-6).abs()
    # The 1e-6 offset keeps the product non-zero when the flag is 0.
    df["FamilyHistory_BMI_Interaction"] = (df["family_history_with_overweight"] + 1e-6) * bmi

    return df


In [9]:
# Run the feature-engineering step on both frames.
train, test = feat_eng(train), feat_eng(test)

In [10]:
# Drop the target from the list of categorical column names.
# list.remove() mutates in place and returns None, so assigning its result
# would replace the list with None. Building a filtered list keeps the list
# and is also safe to re-run once the target is already gone.
categorical_columns = [col for col in categorical_columns if col != "NObeyesdad"]

In [11]:
# Build the model inputs.
# NOTE: nothing is standardized in this cell -- `scaler` is created but never
# applied to X or test here.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

# Re-assign instead of dropping in place, and tolerate a missing "id", so that
# re-running this cell does not raise KeyError.
test = test.drop(columns=["id"], errors="ignore")

# Features: everything except the target and the row identifier.
X = train.drop(columns=["NObeyesdad", "id"])
y = train["NObeyesdad"]



In [12]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score

# Seeded cross-validation splitter and Optuna sampler.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=13,
)
sampler = optuna.samplers.TPESampler(seed=13)

def objective(trial):
    """Score one Optuna trial: mean cross-validated accuracy of an XGBClassifier.

    Hyper-parameters are drawn from `trial`; the model is evaluated on the
    notebook-level `X` / `y` with the notebook-level `skf` splitter.

    Returns the mean of the per-fold accuracy scores.
    """
    # NOTE(review): keep the suggest_* calls in this order -- reordering may
    # change which values a seeded sampler draws; confirm before changing.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 800),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2, log=True),
        'gamma': trial.suggest_float('gamma', 0, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
    }

    # Settings that are the same for every trial.
    fixed = {
        'objective': 'multi:softmax',
        'num_class': len(y.unique()),
    }

    candidate = XGBClassifier(**fixed, **search_space)
    fold_scores = cross_val_score(candidate, X, y, cv=skf, scoring='accuracy')
    return fold_scores.mean()


In [13]:
# Hyper-parameter search: 200 trials, maximizing the value returned by `objective`.
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(objective, n_trials=200)

best_params = study.best_params
print("=" * 50, f"Best Parameters: {best_params}", sep="\n")

[I 2024-02-12 21:23:44,972] A new study created in memory with name: no-name-82c91488-82f6-4314-8607-4fbe890353c5
[I 2024-02-12 21:24:19,156] Trial 0 finished with value: 0.9118894662316628 and parameters: {'n_estimators': 689, 'max_depth': 5, 'learning_rate': 0.07882965546640572, 'gamma': 0.9657491980429997, 'reg_alpha': 0.6036829515606789, 'reg_lambda': 4.242227738456316e-05, 'colsample_bytree': 0.8045212313806389, 'subsample': 0.8877632573024233, 'min_child_weight': 4}. Best is trial 0 with value: 0.9118894662316628.
[I 2024-02-12 21:24:53,051] Trial 1 finished with value: 0.8928679973736072 and parameters: {'n_estimators': 661, 'max_depth': 4, 'learning_rate': 0.0048611742047654635, 'gamma': 0.058512491882074746, 'reg_alpha': 0.07186005424383446, 'reg_lambda': 9.61240859285251e-06, 'colsample_bytree': 0.8399239757890484, 'subsample': 0.6281399746633151, 'min_child_weight': 2}. Best is trial 0 with value: 0.9118894662316628.
[I 2024-02-12 21:25:13,816] Trial 2 finished with value: 0

Best Parameters: {'n_estimators': 774, 'max_depth': 4, 'learning_rate': 0.038398615995667275, 'gamma': 0.6780480101373416, 'reg_alpha': 0.03267603794070431, 'reg_lambda': 3.3111162830028565e-08, 'colsample_bytree': 0.5123182748611554, 'subsample': 0.8185281449911301, 'min_child_weight': 3}


In [14]:
# Display the best hyper-parameters found by the study as this cell's output.
best_params

{'n_estimators': 774,
 'max_depth': 4,
 'learning_rate': 0.038398615995667275,
 'gamma': 0.6780480101373416,
 'reg_alpha': 0.03267603794070431,
 'reg_lambda': 3.3111162830028565e-08,
 'colsample_bytree': 0.5123182748611554,
 'subsample': 0.8185281449911301,
 'min_child_weight': 3}

In [15]:
# Refit on all of X / y with the tuned hyper-parameters, then predict the
# encoded class for every test row.
model_XGB = XGBClassifier(**best_params)
y_pred = model_XGB.fit(X, y).predict(test)


In [16]:
sub = pd.read_csv('sample_submission.csv')

# Decode the integer predictions back to class names by inverting
# target_mapping, so the label table lives in exactly one place instead of
# being re-typed here.
code_to_label = {code: label for label, code in target_mapping.items()}
sub["NObeyesdad"] = y_pred
sub["NObeyesdad"] = sub["NObeyesdad"].map(code_to_label)

# A predicted code that is missing from the mapping would turn into NaN and be
# written to the file silently; fail loudly instead.
assert sub["NObeyesdad"].notna().all(), "prediction codes missing from target_mapping"

sub.to_csv('XGB_submission_no_transoform.csv', index=False)

# Preview goes last: only the final expression of a cell is displayed.
sub.head()