In [47]:
# Import packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support as score
import os

In [48]:
# Load the training and validation splits.
# The relative paths assume the working directory is the folder that contains
# `data/` (per the original note: run this from an ipynb in the models folder).
# On Windows the working directory is first switched to this file's own folder.
# `__file__` only exists when the code runs as a script; a notebook kernel does
# not define it, so the chdir is skipped there instead of raising NameError.
if os.name == 'nt' and '__file__' in globals():
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
df = pd.read_csv('data/train.csv')
df_val = pd.read_csv('data/val.csv')

In [49]:
# Build `df_numerical`: an all-numeric version of the training frame `df`.
numerical_features = ['Age', 'Height', 'Weight', 'Veg_Consump', 'Water_Consump', 'Meal_Count', 'Phys_Act', 'Time_E_Dev']
binary_categorical_features = ['H_Cal_Consump', 'Smoking', 'Fam_Hist', 'H_Cal_Burn'] # Yes or no
multi_categorical_features = ['Alcohol_Consump', 'Food_Between_Meals', 'Transport']

# Numeric columns are carried over unchanged.
df_numerical = df[numerical_features].copy()

# Two-valued columns become 0/1 codes; any other value maps to NaN.
gender_codes = {"Female" : 0, "Male" : 1}
yes_no_codes = {"no" : 0, "yes" : 1}
df_numerical['Gender'] = df['Gender'].map(gender_codes)
for feature in binary_categorical_features:
    df_numerical[feature] = df[feature].map(yes_no_codes)

# Multi-valued columns are expanded into one 0/1 indicator column per distinct
# value observed in this frame, named "<feature>_<value>", in sorted value order.
for feature in multi_categorical_features:
    values = sorted(set(df[feature]))
    for value in values:
        df_numerical[feature+"_"+value] = (df[feature] == value).astype('int64')

# The target column is appended last, unencoded.
df_numerical['Body_Level'] = df['Body_Level']

In [50]:
# Display the column labels of the encoded training frame.
df_numerical.columns

Index(['Age', 'Height', 'Weight', 'Veg_Consump', 'Water_Consump', 'Meal_Count',
       'Phys_Act', 'Time_E_Dev', 'Gender', 'H_Cal_Consump', 'Smoking',
       'Fam_Hist', 'H_Cal_Burn', 'Alcohol_Consump_Always',
       'Alcohol_Consump_Frequently', 'Alcohol_Consump_Sometimes',
       'Alcohol_Consump_no', 'Food_Between_Meals_Always',
       'Food_Between_Meals_Frequently', 'Food_Between_Meals_Sometimes',
       'Food_Between_Meals_no', 'Transport_Automobile', 'Transport_Bike',
       'Transport_Motorbike', 'Transport_Public_Transportation',
       'Transport_Walking', 'Body_Level'],
      dtype='object')

In [51]:
# Build `df_numerical_val`: an all-numeric version of the validation frame `df_val`,
# using the same encoding scheme as the training frame.
numerical_features = ['Age', 'Height', 'Weight', 'Veg_Consump', 'Water_Consump', 'Meal_Count', 'Phys_Act', 'Time_E_Dev']
binary_categorical_features = ['H_Cal_Consump', 'Smoking', 'Fam_Hist', 'H_Cal_Burn'] # Yes or no
multi_categorical_features = ['Alcohol_Consump', 'Food_Between_Meals', 'Transport']

# Numeric columns are carried over unchanged.
df_numerical_val = df_val[numerical_features].copy()

# Two-valued columns become 0/1 codes; any other value maps to NaN.
gender_codes = {"Female" : 0, "Male" : 1}
yes_no_codes = {"no" : 0, "yes" : 1}
df_numerical_val['Gender'] = df_val['Gender'].map(gender_codes)
for feature in binary_categorical_features:
    df_numerical_val[feature] = df_val[feature].map(yes_no_codes)

# Indicator columns are derived from the values present in the validation frame
# itself, so a category that never occurs here produces no column at this point.
for feature in multi_categorical_features:
    values = sorted(set(df_val[feature]))
    for value in values:
        df_numerical_val[feature+"_"+value] = (df_val[feature] == value).astype('int64')

# The target column is appended last, unencoded.
df_numerical_val['Body_Level'] = df_val['Body_Level']


In [52]:
# Split the training columns into the target (`label`) and the model inputs
# (`features`). The target is picked by name rather than by position: a later
# cell re-sorts the columns of `df_numerical`, after which the last column is no
# longer the target, so a positional pop would leak 'Body_Level' into `features`
# if this cell were re-run.
label = 'Body_Level'
features = [column for column in df_numerical.columns if column != label]
# An extra step done for the validation set since it may not contain all the
# categories of a specific attribute due to its smaller size: any indicator
# column missing from the validation frame is added as an all-zero column.
for feature in features:
    if feature not in df_numerical_val.columns:
        df_numerical_val[feature] = 0

In [53]:
# Display the column labels of the encoded validation frame.
df_numerical_val.columns

Index(['Age', 'Height', 'Weight', 'Veg_Consump', 'Water_Consump', 'Meal_Count',
       'Phys_Act', 'Time_E_Dev', 'Gender', 'H_Cal_Consump', 'Smoking',
       'Fam_Hist', 'H_Cal_Burn', 'Alcohol_Consump_Frequently',
       'Alcohol_Consump_Sometimes', 'Alcohol_Consump_no',
       'Food_Between_Meals_Always', 'Food_Between_Meals_Frequently',
       'Food_Between_Meals_Sometimes', 'Food_Between_Meals_no',
       'Transport_Automobile', 'Transport_Motorbike',
       'Transport_Public_Transportation', 'Transport_Walking', 'Body_Level',
       'Alcohol_Consump_Always', 'Transport_Bike'],
      dtype='object')

In [54]:
# Put the columns of both encoded frames into the same (alphabetical) order.
df_numerical = df_numerical.reindex(columns=sorted(df_numerical.columns))
df_numerical_val = df_numerical_val.reindex(columns=sorted(df_numerical_val.columns))

# Use one sorted feature list for both splits so the inputs line up column by column.
features = sorted(features)
x_train, y_train = df_numerical[features], df_numerical['Body_Level']
x_val, y_val = df_numerical_val[features], df_numerical_val['Body_Level']

In [55]:
# Display the column labels of the training inputs.
x_train.columns

Index(['Age', 'Alcohol_Consump_Always', 'Alcohol_Consump_Frequently',
       'Alcohol_Consump_Sometimes', 'Alcohol_Consump_no', 'Fam_Hist',
       'Food_Between_Meals_Always', 'Food_Between_Meals_Frequently',
       'Food_Between_Meals_Sometimes', 'Food_Between_Meals_no', 'Gender',
       'H_Cal_Burn', 'H_Cal_Consump', 'Height', 'Meal_Count', 'Phys_Act',
       'Smoking', 'Time_E_Dev', 'Transport_Automobile', 'Transport_Bike',
       'Transport_Motorbike', 'Transport_Public_Transportation',
       'Transport_Walking', 'Veg_Consump', 'Water_Consump', 'Weight'],
      dtype='object')

In [56]:

# Display the column labels of the validation inputs.
x_val.columns

Index(['Age', 'Alcohol_Consump_Always', 'Alcohol_Consump_Frequently',
       'Alcohol_Consump_Sometimes', 'Alcohol_Consump_no', 'Fam_Hist',
       'Food_Between_Meals_Always', 'Food_Between_Meals_Frequently',
       'Food_Between_Meals_Sometimes', 'Food_Between_Meals_no', 'Gender',
       'H_Cal_Burn', 'H_Cal_Consump', 'Height', 'Meal_Count', 'Phys_Act',
       'Smoking', 'Time_E_Dev', 'Transport_Automobile', 'Transport_Bike',
       'Transport_Motorbike', 'Transport_Public_Transportation',
       'Transport_Walking', 'Veg_Consump', 'Water_Consump', 'Weight'],
      dtype='object')

In [58]:
# Sanity check: the training and validation inputs must expose the same columns
# in the same order. Then display the number of input columns.
assert list(x_train.columns) == list(x_val.columns), "x_train and x_val must have identical feature columns"
len(x_train.columns)

26