In [None]:
# pip install -q pytorch-tabnet

In [None]:
import os
import sys
import random
import numpy as np 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier, early_stopping
from catboost import CatBoostClassifier
# from pytorch_tabnet.tab_model import TabNetClassifier
import optuna
import pickle
import gc
import math
from functools import partial

In [None]:
def seed_everything(seed = 42):
    """
    Apply one seed to Python's `random` module and NumPy's global RNG,
    and export it as the PYTHONHASHSEED environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Both seeders take the same integer; call them in the original order
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    

def class2dict(f):
    """
    Build a {attribute name: value} dictionary from `f`, skipping every
    attribute whose name starts with a double underscore
    """
    visible_names = [attr for attr in dir(f) if not attr.startswith('__')]
    return {attr: getattr(f, attr) for attr in visible_names}

The arguments are as follows:
1. model (str) = The model to train. Models currently in use: 'randomforest', 'lightgbm', 'xgboost', 'catboost'
2. n_trials (int) = Hyperparameter optimization is done using Optuna. n_trials sets the number of optimization trials run for each fold
3. additional_data (boolean) = Whether to use the additional data from https://www.kaggle.com/datasets/aravindpcoder/obesity-or-cvd-risk-classifyregressorcluster
4. metric (str) = The metric Optuna uses to tune the hyperparameters. Accepts either 'accuracy' or 'f1'
5. wandb (boolean) = Whether to use wandb to track experiment runs
6. seed (int) = Seed value
7. target (str) = Target column for this dataset. Do not change.
8. folds (int) = Number of folds used for k-fold cross-validation
9. standardize (boolean) = Whether to standardize the features with StandardScaler before training

In [None]:
class args:
    # Run configuration. The class is never instantiated: the notebook reads these
    # values as class attributes, and class2dict(args) is logged as the wandb run config.

    # Name of the model branch taken by objective() and run_single_fold()
    model = 'lightgbm'
    # Number of Optuna trials per fold. The trailing note lists per-model values
    # (presumably the counts used in earlier runs -- confirm before relying on them).
    n_trials = 175   #Catboost = 15, lightgbm = 200, xgboost = 70, randomforest = 150, tabnet = 5
    # When True, ObesityDataSet.csv is loaded and appended to the training split of every fold
    additional_data = True 
    # Selects the Optuna objective metric
    metric = 'accuracy' #f1 or accuracy
    # When True, the run and its per-fold / CV scores are logged to Weights & Biases
    wandb = True
    # When True, features are scaled with a StandardScaler fitted on the training data
    standardize = True
    # Seed value
    seed = 42
    # Name of the label column
    target = 'NObeyesdad' #Do not change
    # Number of StratifiedKFold splits
    folds = 15
    
    
# Integer class id used for each obesity category in the target column
target_to_label = {
    'Overweight_Level_II': 0,
    'Normal_Weight': 1,
    'Insufficient_Weight': 2,
    'Obesity_Type_III': 3,
    'Obesity_Type_II': 4,
    'Overweight_Level_I': 5,
    'Obesity_Type_I': 6,
}
# Reverse lookup (class id -> category name), used to decode predictions
label_to_target = dict(zip(target_to_label.values(), target_to_label.keys()))

In [None]:
if args.wandb:
    # Experiment tracking with Weights & Biases; the API key is read from Kaggle secrets.
    # Both imports stay inside the branch so they are only needed when tracking is enabled.
    import wandb
    from kaggle_secrets import UserSecretsClient

    wandb_api_key = UserSecretsClient().get_secret("wandb_api")
    wandb.login(key=wandb_api_key)
    run = wandb.init(
        project='playground_s4e2',
        config=class2dict(args),
        group=args.model,
    )

## Feature Engineering

In [None]:
# Load the competition train and test splits
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e2/train.csv',
        '/kaggle/input/playground-series-s4e2/test.csv',
    )
)

### Features:
1. Gender (Male/Female)
2. Age (Continuous Variable)
3. Height (Continuous Variable)
4. family_history_with_overweight (yes/no)
5. FAVC - Frequent consumption of high caloric food (yes/no)
6. FCVC - Frequency of consumption of vegetables (Continuous Variable)
7. NCP - Number of main meals (Continuous Variable)
8. CAEC - Consumption of food between meals (no/Sometimes/Frequently/Always)
9. SMOKE (yes/no)
10. CH2O - Consumption of water daily (Continuous Variable)
11. SCC - Calories consumption monitoring (no/yes)
12. FAF - Physical activity frequency (Continuous Variable)
13. TUE - Time using technology devices (Continuous Variable)
14. CALC - Consumption of alcohol (no/Sometimes/Frequently/Always)
15. MTRANS - Transportation used (Public_Transportation/Automobile/Walking/Motorbike/Bike)
16. NObeyesdad - Target (Overweight_Level_II/Normal_Weight/Insufficient_Weight/Obesity_Type_III/Obesity_Type_II/Overweight_Level_I/Obesity_Type_I)

I took the feature engineering code from this notebook: https://www.kaggle.com/code/ravi20076/playgrounds4e02-eda-baseline

In [None]:
def feature_engineering(_df, training):
    """
    Perform feature engineering: encode the categorical columns as integers and
    add BMI-derived features.

    Args:
        _df (pd.DataFrame): raw data containing at least MTRANS, Gender,
            family_history_with_overweight, SMOKE, FAVC, CAEC, SCC, CALC, Weight and
            Height (plus 'NObeyesdad' when `training` is True).
        training (bool): when True, the 'NObeyesdad' target is mapped to its integer
            class id through `target_to_label`.

    Returns:
        pd.DataFrame: a new frame with MTRANS replaced by five one-hot columns
        (appended after the remaining original columns) followed by BMI and BMI_Grp.
        The input frame is not modified.
    """
    mtrans_columns = ['MTRANS_Automobile', 'MTRANS_Bike', 'MTRANS_Motorbike',
                      'MTRANS_Public_Transportation', 'MTRANS_Walking']

    # One-hot encode MTRANS. Reindexing to the fixed category list guarantees that all
    # five columns exist, in the same order, even when a category is absent from this
    # particular frame (the plain column selection used to raise KeyError in that case).
    mtrans_dummies = (
        pd.get_dummies(_df['MTRANS'], prefix='MTRANS')
        .reindex(columns=mtrans_columns, fill_value=0)
        .astype(np.int8)
    )
    _df = pd.concat([_df.drop(columns='MTRANS'), mtrans_dummies], axis=1)

    # Binary columns. NOTE: FAVC and SCC are encoded with "no" -> 1, the opposite
    # polarity of Gender / family history / SMOKE; kept as-is so features stay unchanged.
    _df["Gender"] = np.where(_df["Gender"] == "Male", 1,0).astype(np.uint8)
    _df["family_history_with_overweight"] = np.where(_df["family_history_with_overweight"] == "yes", 1,0).astype(np.uint8)
    _df['SMOKE'] = np.where(_df["SMOKE"] == "yes", 1,0).astype(np.uint8)
    _df['FAVC'] = np.where(_df["FAVC"] == "no", 1,0).astype(np.uint8)
    _df['SCC']  = np.where(_df["SCC"] == "no", 1,0).astype(np.uint8)

    # Ordinal columns. A value outside the mapping becomes NaN and makes astype raise.
    _df["CAEC"] = _df["CAEC"].map({"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 3}).astype(np.uint8)
    # NOTE(review): "Always" shares code 2 with "Frequently" here -- presumably intentional; confirm.
    _df["CALC"] = _df["CALC"].map({"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 2}).astype(np.uint8)

    # BMI = Weight / Height^2, then binned at 18.5 / 25 / 30 / 35 / 40 into groups 0-5
    _df["BMI"] = _df["Weight"] / _df["Height"]**2
    _df["BMI_Grp"] = np.select([_df["BMI"] < 18.5, _df["BMI"] < 25, _df["BMI"] < 30, _df["BMI"] < 35, _df["BMI"] < 40], [0,1,2,3,4], 5).astype(np.int8)

    if training:
        _df['NObeyesdad'] = _df['NObeyesdad'].map(target_to_label)

    return _df

In [None]:
# Engineered training frame
df = feature_engineering(train, True)
if args.additional_data:
    # Optional extra training rows from
    # https://www.kaggle.com/datasets/aravindpcoder/obesity-or-cvd-risk-classifyregressorcluster
    add_data = feature_engineering(
        pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv'),
        True,
    )

if args.standardize:
    # Every column except the row id and the target is a feature
    features = [col for col in df.columns if col not in ('id', args.target)]

    # The scaler is fitted on the competition training data only and reused everywhere else
    scaler = StandardScaler().fit(df[features])

    # Rebuild the frame as: scaled features, then id, then target
    df = pd.DataFrame(scaler.transform(df[features]), columns=features).assign(
        **{'id': df['id'].values, args.target: df[args.target].values}
    )

    if args.additional_data:
        # The additional data has no id column to carry over: scaled features, then target
        add_data = pd.DataFrame(scaler.transform(add_data[features]), columns=features).assign(
            **{args.target: add_data[args.target].values}
        )

## Modelling

In [None]:
# Define the objective metric used to perform hyperparameter tuning. Note that the competition uses accuracy as metric, but
# we can tune the model based on other metrics to allow more variation of models.
# NOTE(review): for single-label multiclass predictions, micro-averaged F1 is numerically equal to accuracy, so the
# 'f1' option as configured here tunes towards the same value; a different average (e.g. 'macro') would be needed for
# a genuinely different objective.

if args.metric == 'f1':
    objective_metric = partial(f1_score, average='micro')
elif args.metric == 'accuracy':
    objective_metric = accuracy_score
else:
    # Fail here instead of printing: otherwise `objective_metric` stays undefined and the
    # first Optuna trial dies later with an unrelated NameError.
    raise ValueError(f"Objective Metric not defined: args.metric={args.metric!r} (expected 'f1' or 'accuracy')")

In [None]:
"""
This is the objective function Optuna uses to perform hyperparameter tuning. In each trial/run, Optuna selects a value from the range
given (defined within trial.suggest_int or trial.suggest_float), trains a model, and measures its performance on the validation dataset.
Based on the current performance and the performance from past trials, Optuna automatically adjusts the hyperparameter settings and tries
to converge to an optimal value within the fewest iterations.

"""

def objective(trial, model_name, X_train, y_train, X_val, y_val):
    """
    Optuna objective: sample hyperparameters for `model_name`, fit the model on the
    training split and return its score on the validation split.

    Args:
        trial: Optuna trial used to sample values (`suggest_int` / `suggest_float`).
        model_name (str): one of 'lightgbm', 'xgboost', 'catboost', 'randomforest',
            'tabnet', 'adaboost'.
        X_train, y_train: training features and labels (pandas objects; `.values` is
            taken for 'tabnet' and 'adaboost').
        X_val, y_val: validation features and labels. For 'lightgbm', 'xgboost' and
            'tabnet' they are also passed as `eval_set` during fitting.

    Returns:
        The value of the module-level `objective_metric(y_val, preds)`.

    Raises:
        ValueError: if `model_name` is not one of the supported names.

    NOTE: the `TabNetClassifier` import at the top of the notebook is commented out, so
    the 'tabnet' branch needs that import re-enabled before it can run.
    """
    if model_name == 'lightgbm':
        lightgbm_params = {
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'num_leaves': trial.suggest_int('num_leaves', 30, 200),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 600),
            'n_estimators': trial.suggest_int('n_estimators', 5, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 10),
            'reg_alpha' : trial.suggest_float('reg_alpha', 0.0, 50),
            'reg_lambda' : trial.suggest_float('reg_lambda', 0.0, 40),
            'verbose': -1
        }
        model = LGBMClassifier(**lightgbm_params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[early_stopping(50, verbose=False)])

    elif model_name == 'xgboost':
        xgboost_params = {
            'eta': trial.suggest_float('eta', 0.02, 0.3),
            'subsample': trial.suggest_float('subsample', 0.6, 0.9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.8),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'lambda': trial.suggest_int('lambda', 1, 10),
            'early_stopping_rounds': 50,
        }
        model = XGBClassifier(**xgboost_params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

    elif model_name == 'catboost':
        catboost_params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.2),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 3, 30),
            'depth': trial.suggest_int('depth', 4, 10),
            'rsm' : trial.suggest_float('rsm', 0.5, 1.0),
            'verbose' : 0,
        }

        model = CatBoostClassifier(**catboost_params)
        model.fit(X_train, y_train)

    elif model_name == 'randomforest':
        randomforest_params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 4, 40),
            'min_samples_split': trial.suggest_int('min_samples_split', 10, 300),
        }
        model = RandomForestClassifier(**randomforest_params)
        model.fit(X_train, y_train)

    elif model_name == 'tabnet':
        tabnet_params = {
            'n_d': trial.suggest_int('n_d', 8, 16),
            'n_steps': trial.suggest_int('n_steps', 3, 5)
        }
        # n_a is tied to n_d rather than tuned separately
        tabnet_params['n_a'] = tabnet_params['n_d']
        model = TabNetClassifier(**tabnet_params)
        model.fit(X_train.values, y_train.values, eval_set=[(X_val.values, y_val.values)])

    elif model_name == 'adaboost':
        adaboost_params = {
            'n_estimators': trial.suggest_int('n_estimators', 30, 100),
            'learning_rate' : trial.suggest_float('learning_rate', 0.1, 1.5),
        }
        model = AdaBoostClassifier(**adaboost_params)
        model.fit(X_train.values, y_train.values)

    else:
        # Without this branch an unsupported name surfaced as UnboundLocalError on `model`
        raise ValueError(f"Unknown model_name: {model_name!r}")

    # Score with the same container type the model was fitted on: raw arrays for the
    # models fitted on `.values`, the DataFrame (with its column names) for the rest.
    X_eval = X_val.values if model_name in ('tabnet', 'adaboost') else X_val
    preds = model.predict(X_eval)
    metric = objective_metric(y_val, preds)
    return metric

In [None]:
def run_single_fold(fold, model_name, train_idx, val_idx):
    """
    Tune, refit, evaluate and persist one model for a single cross-validation fold.

    Reads the module-level `df`, `args` and (when `args.additional_data` is set) `add_data`.

    Args:
        fold (int): fold number, used in log messages and in the saved model's file name.
        model_name (str): one of 'lightgbm', 'xgboost', 'catboost', 'randomforest',
            'tabnet', 'adaboost'.
        train_idx, val_idx: row labels of `df` selecting the training / validation rows
            (used with `df.loc`).

    Returns:
        pd.DataFrame: out-of-fold frame with one probability column per class followed
        by 'id', 'preds' and 'ground_truth' for the validation rows.

    Raises:
        ValueError: if `model_name` is not one of the supported names.

    Side effects: prints the best parameters and fold accuracy, logs the fold score to
    wandb when `args.wandb` is set, and pickles the fitted model to
    "{model_name}_fold{fold}" in the current working directory.
    """
    features = [col for col in df.columns if col != 'id' and col != args.target]

    X_train = df.loc[train_idx, features]
    y_train = df.loc[train_idx, args.target]
    X_val = df.loc[val_idx, features]
    y_val = df.loc[val_idx, args.target]

    if args.additional_data:
        # Concatenate the original data with the additional data. Note that the additional data is used on train only and not
        # validation. The additional data may be of a different distribution so we should not use it to validate our model.
        X_train = pd.concat((X_train, add_data[features]))
        y_train = pd.concat((y_train, add_data[args.target]))

    # Begin hyperparameter tuning to find out the best choice of hyperparameters.
    # The sampler is seeded from args.seed so the sequence of suggested values is repeatable.
    f = partial(objective, model_name=model_name, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=args.seed))
    study.optimize(f, n_trials=args.n_trials)
    model_params = dict(study.best_trial.params)
    print('Best trial:', model_params)

    # Once the optimal hyperparameters are obtained, we use them to fit the model.
    # Fixed (non-tuned) settings are not part of best_trial.params and are re-added here.
    if model_name == 'lightgbm':
        model_params['verbose'] = -1
        best_model = LGBMClassifier(**model_params)
        best_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[early_stopping(50, verbose=False)])

    elif model_name == 'xgboost':
        model_params['early_stopping_rounds'] = 50
        best_model = XGBClassifier(**model_params)
        best_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

    elif model_name == 'catboost':
        model_params['verbose'] = 0
        best_model = CatBoostClassifier(**model_params)
        best_model.fit(X_train, y_train)

    elif model_name == 'randomforest':
        best_model = RandomForestClassifier(**model_params)
        best_model.fit(X_train, y_train)

    elif model_name == 'tabnet':
        # objective() ties n_a to n_d outside the sampled parameters, so the tie has to
        # be restored here for the refit to match the configuration that was tuned.
        model_params['n_a'] = model_params['n_d']
        best_model = TabNetClassifier(**model_params)
        best_model.fit(X_train.values, y_train.values, eval_set=[(X_val.values, y_val.values)])

    elif model_name == 'adaboost':
        best_model = AdaBoostClassifier(**model_params)
        best_model.fit(X_train.values, y_train.values)

    else:
        raise ValueError(f"Unknown model_name: {model_name!r}")

    # Predict with the same container type the model was fitted on
    X_eval = X_val.values if model_name in ('tabnet', 'adaboost') else X_val
    # ravel() flattens predictions returned as an (n, 1) column into a 1-D vector
    preds = np.asarray(best_model.predict(X_eval)).ravel()

    # Evaluating the best model performance
    score = accuracy_score(y_val, preds)
    print(f"Best {model_name} at fold {fold} has accuracy = {score:.4f}")

    if args.wandb:
        wandb.log({f"Fold_{fold}": score})

    # Saving Out of fold
    probs = best_model.predict_proba(X_eval)
    _oof = pd.DataFrame(probs)
    _oof['id'] = df.loc[val_idx, 'id'].values
    _oof['preds'] = preds
    _oof['ground_truth'] = y_val.values

    # Saving the best model; the context manager closes the file handle deterministically
    with open(f"{model_name}_fold{fold}", "wb") as model_file:
        pickle.dump(best_model, model_file)

    del best_model
    gc.collect()

    return _oof

In [None]:
# Apply the configured seed to Python's and NumPy's global RNGs before any model is
# trained; args.seed and seed_everything() were defined but never used otherwise.
seed_everything(args.seed)

# Stratified folds without shuffling: fold membership follows the row order of `df`
skf = StratifiedKFold(n_splits=args.folds)
oof = []
for i, (train_idx, val_idx) in enumerate(skf.split(df.drop(columns=args.target), df[args.target])):
    # Each call tunes, refits and pickles one model and returns its out-of-fold predictions
    _oof = run_single_fold(i, args.model, train_idx, val_idx)
    oof.append(_oof)

# Stack the per-fold frames into a single out-of-fold table and persist it
oof = pd.concat(oof, axis=0)
oof.to_csv(f"oof_{args.model}.csv", index=False)

# Overall CV accuracy across the out-of-fold predictions of every fold
cv_score = accuracy_score(oof['ground_truth'], oof['preds'])
print(f"Overall CV score = {cv_score}")
if args.wandb:
    wandb.log({"CV": cv_score})

## Inference

In [None]:
# Keep the ids for the output files, then run the remaining test columns through
# the same feature engineering as the training data (no target to encode)
test_ids = test['id'].values
test = feature_engineering(test.drop(columns='id'), training=False)

if args.standardize:
    # Reuse the scaler fitted on the training features; `test` becomes a NumPy array here
    test = scaler.transform(test)

In [None]:
# Class probabilities on the test set from each fold's saved model, keyed by fold number
results = {}
for fold in range(args.folds):
    with open(f"/kaggle/working/{args.model}_fold{fold}", 'rb') as f:
        model = pickle.load(f)
    results[fold] = model.predict_proba(test)

# Ensemble by averaging the probabilities over folds, then take the most likely class
test_probs = np.stack([results[fold] for fold in results]).mean(axis=0)
predictions = test_probs.argmax(axis=1)

# Persist the averaged probabilities next to their ids
test_probs = pd.DataFrame(test_probs).assign(id=test_ids)
test_probs.to_csv(f"test_probs_{args.model}.csv", index=False)

In [None]:
# Decode the predicted class ids back to category names and write the submission file
decoded_predictions = pd.Series(predictions).map(label_to_target)
submission = pd.DataFrame({'id': test_ids, args.target: decoded_predictions})
submission.to_csv('submission.csv', index=False)

# Close the Weights & Biases run once everything has been logged
if args.wandb:
    wandb.finish()