In [143]:
import pandas as pd
import numpy as np

# Load the data
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
original=pd.read_csv('ObesityDataSet.csv')
submission=pd.read_csv('sample_submission.csv')

In [144]:
train=pd.concat([train,original],axis=0)

In [145]:
# Get categorical columns
categorical_columns=[]
for col in train.columns:
    if train[col].dtype=='object':
        categorical_columns.append(col)

categorical_columns

['Gender',
 'family_history_with_overweight',
 'FAVC',
 'CAEC',
 'SMOKE',
 'SCC',
 'CALC',
 'MTRANS',
 'NObeyesdad']

In [146]:
# Categorical column encoding; Here I use manual encoding
train['Gender']=train["Gender"].apply(lambda x: 1 if x=="Male" else 0)
test['Gender']=test["Gender"].apply(lambda x: 1 if x=="Male" else 0)

train['family_history_with_overweight']=train["family_history_with_overweight"].apply(lambda x: 1 if x=="yes" else 0)
test['family_history_with_overweight']=test["family_history_with_overweight"].apply(lambda x: 1 if x=="yes" else 0)

train['FAVC']=train["FAVC"].apply(lambda x: 1 if x=="yes" else 0)
test['FAVC']=test["FAVC"].apply(lambda x: 1 if x=="yes" else 0)

train['CAEC']=train["CAEC"].apply(lambda x: 1 if x=="no" else 2 if x=="Sometimes" else 3 if x=="Always" else 4)
test['CAEC']=test["CAEC"].apply(lambda x: 1 if x=="no" else 2 if x=="Sometimes" else 3 if x=="Always" else 4)

train['SMOKE']=train["SMOKE"].apply(lambda x: 1 if x=="yes" else 0)
test['SMOKE']=test["SMOKE"].apply(lambda x: 1 if x=="yes" else 0)

train['SCC']=train["SCC"].apply(lambda x: 1 if x=="yes" else 0)
test['SCC']=test["SCC"].apply(lambda x: 1 if x=="yes" else 0)

train['CALC']=train["CALC"].apply(lambda x: 1 if x=="no" else 2 if x=="Sometimes" else 3)
test['CALC']=test["CALC"].apply(lambda x: 1 if x=="no" else 2 if x=="Sometimes" else 3)

mapping={'Public_Transportation':1,
         'Automobile':2,
         'Motorbike':3,
         'Bike':4,
         'Walking':5}
train['MTRANS']=train["MTRANS"].replace(mapping)
test["MTRANS"]=test["MTRANS"].replace(mapping)

# target label encoding
target_mapping={'Insufficient_Weight':0,
                'Normal_Weight':1,
                'Overweight_Level_I':2,
                'Overweight_Level_II':3,
                'Obesity_Type_I':4,
                'Obesity_Type_II':5,
                'Obesity_Type_III':6}

train['NObeyesdad']=train["NObeyesdad"].replace(target_mapping)


In [147]:
# Feature Engineering

def feat_eng(df):
    df['BMI'] = df['Weight'] / (df['Height']**2)
    df["HealthyHabitRatio"] = (df["FCVC"] + df["CH2O"] + df["FAF"]) / (df["FAVC"] + df["CAEC"] + df["TUE"] + df["SMOKE"] * 2)
    df["Age_BMI"] = df["Age"] * df["BMI"]
    df["Age_HealthyHabitRatio"] = df["Age"] * df["HealthyHabitRatio"]
    df["Gender_SCC"]=df["Gender"]*df["SCC"]
    df["Height_Weight_Ratio"]=df["Height"]/df["Weight"]
    df["FAVC_CAEC_Index"]=df["FAVC"]/df["CAEC"]
    df["Activity_Index"]=df["FAF"]-df["TUE"]
    df["Water_Alcohol_Ratio"]=df["CH2O"]/df["CALC"]
    df["Meal_Frequency_Deviation"]=abs(df["NCP"]-3+1e-6)
    df["FamilyHistory_BMI_Interaction"]=(df["family_history_with_overweight"]+1e-6)*df["BMI"]

    return df


In [148]:
train=feat_eng(train)
test=feat_eng(test)

In [149]:
# Standardize the data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

test.drop(columns=["id"],axis=1,inplace=True)
X= train.drop(columns=["NObeyesdad","id"],axis=1)
y=train["NObeyesdad"]
X=scaler.fit_transform(X) 
test=scaler.transform(test)

In [150]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
import torch.optim.lr_scheduler as lr_sheduler
from torch.utils.data import DataLoader,TensorDataset

from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_curve, precision_recall_curve, auc
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import datetime
from torch.utils.tensorboard import SummaryWriter
plt.style.use('seaborn-whitegrid')

class DNN(object):
    def __init__(self,model,loss_fn,optimizer,es_patience):
        # arguments as attributes
        self.model=model
        self.loss_fn=loss_fn
        self.optimizer=optimizer
        self.device='cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)

        # attributes to use in future, currently empty
        self.train_loader=None
        self.val_loader=None
        self.writer=None

        self.patience=es_patience
        self.min_delta=0
        self.counter=0
        self.min_validation_loss=float('inf')

        # attributes to be computed internally
        self.losses=[]
        self.val_losses=[]
        self.total_epochs=0

        # train step function
        self.train_step_fn=self._make_train_step_fn()
        # validation step function
        self.val_step_fn=self._make_val_step_fn()

    def to(self,device):
        try:
            self.device=device
            self.model.to(self.device)
        except RuntimeError:
            self.device='cuda' if torch.cuda.is_available() else 'cpu'
            print(f"Can't move to {device}, moving to {self.device} instead")
            self.model.to(self.device)

    def set_loaders(self,train_loader,val_loader=None):
        self.train_loader=train_loader
        self.val_loader=val_loader

    def set_tensorboard(self,name,folder='runs'):
        suffix=datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        self.writer=SummaryWriter(f"{folder}/{name}_{suffix}")

    def _make_train_step_fn(self):
        def peform_train_step_fn(X,y):
            # set model to train mode
            self.model.train()
            # forward pass
            yhat=self.model(X)
            # compute the loss
            loss=self.loss_fn(yhat,y.squeeze())
            # compute gradients  
            loss.backward()
            # update weights
            self.optimizer.step()
            self.optimizer.zero_grad()

            return loss.item()
        return peform_train_step_fn 

    def _make_val_step_fn(self):
        def perform_val_step_fn(X,y):
            # set model to eval mode
            self.model.eval()

            yhat=self.model(X) 
            loss=self.loss_fn(yhat,y)
            return loss.item()
        return perform_val_step_fn
    

    def _should_early_stop(self,val_loss):
        if val_loss<self.min_validation_loss:
            self.min_validation_loss=val_loss
            self.counter=0
        elif val_loss>self.min_validation_loss:
            self.counter+=1
            if self.counter>=self.patience:
                return True
        return False

    
    def _mini_batch(self,validation=False):
        if validation:
            data_loader=self.val_loader
            step_fn=self.val_step_fn
        else:
            data_loader=self.train_loader
            step_fn=self.train_step_fn
        if data_loader is None:
            return None
        
        mini_batch_losses=[]
        for X_batch,y_batch in data_loader:
            X_batch=X_batch.to(self.device)
            y_batch=y_batch.to(self.device)

            mini_batch_loss=step_fn(X_batch,y_batch)
            mini_batch_losses.append(mini_batch_loss)

        loss=np.mean(mini_batch_losses)
        return loss
    
    def set_scheduler(self,patience=5,mode="min",factor=0.1,min_lr=1e-6):
        self.scheduler=lr_sheduler.ReduceLROnPlateau(self.optimizer,
                                                     patience=patience,
                                                     mode=mode,
                                                     factor=factor,
                                                     min_lr=min_lr)
    
    def set_seed(self,seed=42):
        torch.backends.cudnn.deterministic=True
        torch.backends.cudnn.benchmark=False
        torch.manual_seed(seed)
        np.random.seed(seed)

    def train(self,n_epochs,seed=42):
        self.set_seed(seed)

        for epoch in tqdm(range(n_epochs)):
            self.total_epochs+=1

            # inner loop, perform training using mini batches
            loss=self._mini_batch(validation=False) # validation=False because we are training
            self.losses.append(loss)

            # validation
            with torch.no_grad():
                val_loss=self._mini_batch(validation=True)
                self.val_losses.append(val_loss)
                # Update learning rate sheduler
                self.scheduler.step(val_loss) 
                # Early stopping
                if self._should_early_stop(val_loss):
                    print("Early Stopping")
                    break
            

            # for summary writer
            if self.writer:
                scalars={'training':loss}
                if val_loss is not None:
                    scalars.update({'validation':val_loss})
                self.writer.add_scalars(main_tag='loss',tag_scalar_dict=scalars,global_step=epoch) # global step is the x-axis

        if self.writer:
            self.writer.close() # close the writer




        
    def save_checkpoint(self,filename):
        # Build a dictionary with all information for resume training
        checkpoint={'epoch':self.total_epochs,
                    'model_state_dict':self.model.state_dict(),
                    'optimizer_state_dict':self.optimizer.state_dict(),
                    'loss':self.losses,
                    'val_loss':self.val_losses}
        torch.save(checkpoint,filename)

    def load_checkpoint(self,filename):
        checkpoint=torch.load(filename)

        # Restore the state of the model
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.total_epochs=checkpoint['epoch']
        self.losses=checkpoint['loss']
        self.val_losses=checkpoint['val_loss']

        self.model.train() # always set the model to training mode after loading the checkpoint
        
    def predict(self,X):
        self.model.eval()
        X_tensor=torch.as_tensor(X,dtype=torch.float32,device=self.device)
        yhat_tensor=self.model(X_tensor)
        self.model.train()
        return yhat_tensor.detach().cpu().numpy()
    
    def plot_losses(self):
        fig=plt.figure(figsize=(10,4))
        plt.plot(self.losses,label='Training Loss',c='b')
        plt.plot(self.val_losses,label='Validation Loss',c='r')
        plt.yscale('log')
        plt.xlabel('Epochs')
        plt.ylabel("loss")
        plt.legend()
        plt.tight_layout()
        plt.show()
        return fig
    
    def add_graph(self):
        # fetches a single mini batch
        if self.train_loader and self.writer:
            X_sample,y_sample=next(iter(self.train_loader))
            self.writer.add_graph(self.model,X_sample.to(self.device))


  plt.style.use('seaborn-whitegrid')


In [151]:
def get_mode(arr):
    return max(set(arr), key=arr.count)

def get_mode_by_position(list_of_arrays):
    # Use zip to group elements at the same positions
    grouped_elements = zip(*list_of_arrays)
    # Apply get_mode to each group of elements
    mode_array = [get_mode(group) for group in grouped_elements]
    return mode_array

In [152]:
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

skf=StratifiedKFold(n_splits=10,shuffle=True,random_state=42)
total_losses=[]
all_preds=[]
acc_scores=[]
for train_idx, val_idx in skf.split(X,y):

    # Data preprocessing
    X_train,X_val=X[train_idx],X[val_idx]
    y_train,y_val=y.iloc[train_idx],y.iloc[val_idx]

    # convert pd series objectd to numpy arrays
    y_train=y_train.to_numpy()
    y_val=y_val.to_numpy()

    torch.manual_seed(13)
    X_train_tensor=torch.as_tensor(X_train).float()
    y_train_tensor=torch.as_tensor(y_train).long()

    X_val_tensor=torch.as_tensor(X_val).float()
    y_val_tensor=torch.as_tensor(y_val).long()

    train_dataset=TensorDataset(X_train_tensor,y_train_tensor)
    val_dataset=TensorDataset(X_val_tensor,y_val_tensor)

    train_loader=DataLoader(train_dataset,batch_size=256,shuffle=True)
    val_loader=DataLoader(val_dataset,batch_size=256)
        
    
    # Model config
    learning_rate=0.01

    torch.manual_seed(42)
    model=nn.Sequential()
    model.add_module('hidden1',nn.Linear(27,128))
    model.add_module('activation1',nn.ReLU())
    model.add_module('batchNorm1',nn.BatchNorm1d(128))
    model.add_module('dropout1',nn.Dropout(p=0.2))

    model.add_module('hidden2',nn.Linear(128,64))
    model.add_module('activation2',nn.ReLU())
    model.add_module('batchNorm2',nn.BatchNorm1d(64))
    model.add_module('dropout2',nn.Dropout(p=0.2))

    model.add_module('hidden3',nn.Linear(64,24))
    model.add_module('activation3',nn.ReLU())
    model.add_module('batchNorm3',nn.BatchNorm1d(24))
    model.add_module('dropout3',nn.Dropout(p=0.2))

    model.add_module('output',nn.Linear(24,7))
    model.add_module('softmax',nn.Softmax(dim=1))

    optimizer=optim.Adam(model.parameters(),lr=learning_rate)
    loss_fn=nn.CrossEntropyLoss() 

    es_patience=25

    # Model Training
    n_epochs=250
    ann=DNN(model,loss_fn,optimizer,es_patience)
    ann.set_loaders(train_loader,val_loader)
    ann.set_scheduler(patience=5,factor=0.3) #ann.set_scheduler(patience=5,factor=0.4)
    ann.train(n_epochs)

    '''---------------------------------------------------------------------------------------------'''
    
    y_pred_test=ann.predict(test).argmax(axis=1)
    all_preds.append(y_pred_test)
    acc_scores.append(accuracy_score(y_val,ann.predict(X_val).argmax(axis=1)))
    total_losses.append(ann.val_losses)

final_preds=get_mode_by_position(all_preds)
print(f"\n\nMean Accuracy: {np.mean(acc_scores)}\nCustom score: {np.round((1.29-np.mean([np.mean(losses) for losses in total_losses]))*1e5,2)}")
    
# create a submission dataframe
prediction=pd.DataFrame({'id':submission['id'],'NObeyesdad':final_preds})
submission["NObeyesdad"]=prediction["NObeyesdad"].map({0:'Insufficient_Weight',
                                                        1:'Normal_Weight',
                                                        2:'Overweight_Level_I',
                                                        3:'Overweight_Level_II',
                                                        4:'Obesity_Type_I',
                                                        5:'Obesity_Type_II',
                                                        6:'Obesity_Type_III'})

submission.to_csv('submission_torch_mode3.csv',index=False)


  0%|          | 0/250 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
{np.round((1.29-np.mean([np.mean(losses) for losses in total_losses]))*1e5,2)}

{1591.15}

In [None]:
submission.head()

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
