# GOAL:
- Experiment to determine methods of imputation
- Idea: Try different ways of imputation, use the baseline model (decision trees or random forests) and evaluate. Select best model.

Approach: Make the pipeline as much as efficient as possible, by using *"configurations"* for imputing values (exm: impute variable $X$ in a manner $f$)

In [1]:
import pandas as pd
import sklearn as sk


In [2]:
df = pd.read_csv(r"../our data/no_outliers.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight,obese_level
0,0,21.0,Never,no,up to 5,Sometimes,Female,1.62,3.0,no,yes,,3.0,no,Public,Sometimes,1 to 2,64.0,Normal_Weight
1,1,23.0,Frequently,no,up to 5,Sometimes,Male,1.8,3.0,no,yes,3 to 4,0.0,no,Public,Sometimes,1 to 2,77.0,Normal_Weight
2,2,,Frequently,no,up to 2,Sometimes,Male,1.8,3.0,no,no,3 to 4,2.0,no,Walk,Always,1 to 2,87.0,Overweight_Level_I


In [3]:
df = df.drop('Unnamed: 0', axis=1)

## Separate explanatory and targets

In [4]:
df_var, df_target = df.drop("obese_level", axis=1), df['obese_level']

# Separate train and validation

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_var, df_target,
    test_size = .33,
    random_state=20
)

In [6]:
df.dtypes

age                          float64
alcohol_freq                  object
caloric_freq                  object
devices_perday                object
eat_between_meals             object
gender                        object
height                       float64
meals_perday                 float64
monitor_calories              object
parent_overweight             object
physical_activity_perweek     object
siblings                     float64
smoke                         object
transportation                object
veggies_freq                  object
water_daily                   object
weight                       float64
obese_level                   object
dtype: object

# Start experimenting
## Setup configurations and results

In [7]:
from sklearn.experimental import enable_iterative_imputer

from data_preprocesser import preprocesser


In [8]:
my_preprocesser = preprocesser()

# Data preprocesser:
def run(o_train, o_val, configs):
    # Preserve original data as preprocesser does everything inplace
    train = o_train.copy()
    val = o_val.copy()

    for config in configs:
        l = config.split(";")

        if len(l) == 1: # Single-argument options only
            option = l[0]            
            if option == "knn_imputer":
                train, val = my_preprocesser.knn_imputer(train, val, configs[config])
            
            if option == "add_bmi":
                train, val = my_preprocesser.add_bmi(train, val)
            
        if len(l) == 2: # Two options
            option = l[0]
            arg = l[1]

            if option == "encode_data":
                train, val = my_preprocesser.encode_data(train, val, configs[config], type=arg)

            if option == "simp_imputer":
                train, val = my_preprocesser.simp_imputer(train, val, configs[config], strategy=arg)
            
            if option == "scaler":
                train, val = my_preprocesser.scaler(train, val, configs[config], method=arg)
            
            if option == "constant_imputer":
                train, val = my_preprocesser.constant_imputer(train, val, configs[config], filling=arg)

            if option == "iterative_imputer":
                train, val = my_preprocesser.iterative_imputer(train, val, configs[config], estimator=arg)
        
    return train, val 

Preprocesser loaded


# Start making some configurations

In [9]:
configs = []

In [10]:
config_6 = {
    "add_bmi": None,
    "scaler;standard": ['age', 'weight', 'height'],
    "iterative_imputer;lr": ['age', 'weight', 'height'],
    "simp_imputer;most_frequent": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'eat_between_meals',
                            'gender',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'smoke',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            ],
    "constant_imputer;0": [
                            'physical_activity_perweek'
                            ],
    "encode_data;ordinal": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'physical_activity_perweek',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            "bmi_class",
                            "eat_between_meals"
    ],
"encode_data;one_hot": [
        "gender",
        "smoke",
    ]
} # 4 but added bmi
configs.append(config_6)

In [11]:
config_7 = {
    "add_bmi": None,
    "scaler;standard": ['age', 'weight', 'height'],
    "iterative_imputer;lr": ['age', 'weight', 'height'],
    "knn_imputer":[
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'eat_between_meals',
                            'gender',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'physical_activity_perweek',
                            'smoke',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            ],
    "constant_imputer;0": [
                            'physical_activity_perweek'
                            ],

} # previous but i'm using knn
configs.append(config_7)

In [12]:
config_8 = {
    "add_bmi": None,
    "scaler;standard": ['age', 'weight', 'height'],
    "knn_imputer": ['age', 'weight', 'height'],
    "simp_imputer;most_frequent": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'eat_between_meals',
                            'gender',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'smoke',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            ],
    "constant_imputer;0": [
                            'physical_activity_perweek'
                            ],
    "encode_data;ordinal": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'physical_activity_perweek',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            "bmi_class",
                            "eat_between_meals"
    ],
"encode_data;one_hot": [
        "gender",
        "smoke",
    ]
} # 6 but using knn instead of lr
configs.append(config_8)


In [13]:
config_9 = {
    "add_bmi": None,
    "scaler;standard": ['age', 'weight', 'height'],
    "knn_imputer": ['age', 'weight', 'height'],
    "simp_imputer;most_frequent": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'eat_between_meals',
                            'gender',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'smoke',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            'physical_activity_perweek'
                            ],
    "encode_data;ordinal": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'physical_activity_perweek',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            "bmi_class",
                            "eat_between_meals"
    ],
"encode_data;one_hot": [
        "gender",
        "smoke",
    ]
} # before but imputing physical activity with mode

# IMPORTANT OBSERVATION: In some categories performance slightly improves with this
configs.append(config_9)


In [14]:
config_10 = {
    "knn_imputer": ['age', 'weight', 'height'],
    "add_bmi": None,
    "scaler;standard": ['age', 'weight', 'height'],

    "simp_imputer;most_frequent": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'eat_between_meals',
                            'gender',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'smoke',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            'physical_activity_perweek'
                            ],
    "encode_data;ordinal": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'physical_activity_perweek',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            "bmi_class",
                            "eat_between_meals"
    ],
"encode_data;one_hot": [
        "gender",
        "smoke",
    ]
} # mix orders; KNN first and BMI later (+0.01 better performance)

configs.append(config_10)


In [15]:
config_11 = {
    "knn_imputer": ['age', 'weight', 'height'],
    "add_bmi": None,
    "scaler;standard": ['age', 'weight', 'height'],

    "iterative_imputer;KNNclassifier": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'eat_between_meals',
                            'gender',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'smoke',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            'physical_activity_perweek'
                            ],
    "encode_data;ordinal": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'physical_activity_perweek',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            "bmi_class",
                            "eat_between_meals"
    ],
"encode_data;one_hot": [
        "gender",
        "smoke",
    ]
} # fill categorical with iterative KNN

configs.append(config_11)


In [16]:
config_12 = {
    "knn_imputer": ['age', 'weight', 'height'],
    "add_bmi": None,
    "scaler;standard": ['age', 'weight', 'height'],
    "constant_imputer;None": [
                            'physical_activity_perweek'
                            ],

    "iterative_imputer;KNNclassifier": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'eat_between_meals',
                            'gender',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'smoke',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            ],

    "encode_data;ordinal": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'physical_activity_perweek',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            "bmi_class",
                            "eat_between_meals"
    ],
"encode_data;one_hot": [
        "gender",
        "smoke",
    ]
} # same as before but constant imputer for physical activity

configs.append(config_12)


In [None]:
config_13 = {
    "knn_imputer": ['age', 'weight', 'height'],
    "add_bmi": None,
    "scaler;standard": ['age', 'weight', 'height'],
    "constant_imputer;None": [
                            'physical_activity_perweek'
                            ],

    "encode_data;ordinal": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'physical_activity_perweek',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            "bmi_class",
                            "eat_between_meals",
                            "gender",
                            "smoke",
                             ]
,
    "iterative_imputer;KNNclassifier": [
                            'alcohol_freq',
                            'caloric_freq',
                            'devices_perday',
                            'eat_between_meals',
                            'gender',
                            'meals_perday',
                            'monitor_calories',
                            'parent_overweight',
                            'smoke',
                            'transportation',
                            'veggies_freq',
                            'water_daily',
                            ],
} # previous but reversed order
configs.append(config_13)

# Mass Experiments with Baseline Model

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

trees = []
class_weights = {
    'Insufficient_Weight': 1.0,
    'Normal_Weight': 5,
    'Overweight_Level_I': 5,
    'Overweight_Level_II': 1.0,
    'Obesity_Type_I': 1.0,
    'Obesity_Type_II': 1.0,
    'Obesity_Type_III': 1.0
}
# For loop with configurations
for i,config in enumerate(configs):
    # Creates a random forest classifier, trains on preprocessed data with a certain configuration and evaluates
    tree_rf = RandomForestClassifier(random_state=42)
    print("="*50)
    print(f"CONFIGURATION {i+6}")
    print([e for e in config])
    print()
    train_curr, test_curr = run(X_train, X_test, config)
    tree_rf.fit(train_curr, y_train)

    print(classification_report(y_test, tree_rf.predict(test_curr)))
    
    trees.append(tree_rf)

#   importances = tree_rf.feature_importances_
#   importances_df = pd.DataFrame(importances, index=train_curr.columns, columns=['importance'])
#   importances_df = importances_df.sort_values('importance', ascending=False)
#   print(importances_df)
#   print()


CONFIGURATION 6
['add_bmi', 'scaler;standard', 'iterative_imputer;lr', 'simp_imputer;most_frequent', 'constant_imputer;0', 'encode_data;ordinal', 'encode_data;one_hot']

['age', 'weight', 'height']
['age', 'alcohol_freq', 'caloric_freq', 'devices_perday', 'eat_between_meals', 'gender', 'height', 'meals_perday', 'monitor_calories', 'parent_overweight', 'physical_activity_perweek', 'siblings', 'smoke', 'transportation', 'veggies_freq', 'water_daily', 'weight', 'bmi_class']
                     precision    recall  f1-score   support

Insufficient_Weight       1.00      0.89      0.94        75
      Normal_Weight       0.71      0.93      0.81        70
     Obesity_Type_I       0.94      0.94      0.94        80
    Obesity_Type_II       0.97      0.96      0.97        76
   Obesity_Type_III       0.99      1.00      0.99        75
 Overweight_Level_I       0.93      0.81      0.86        79
Overweight_Level_II       0.93      0.89      0.91        75

           accuracy               



                     precision    recall  f1-score   support

Insufficient_Weight       0.99      0.92      0.95        75
      Normal_Weight       0.84      0.91      0.88        70
     Obesity_Type_I       0.92      0.97      0.95        80
    Obesity_Type_II       0.99      0.96      0.97        76
   Obesity_Type_III       0.99      1.00      0.99        75
 Overweight_Level_I       0.93      0.87      0.90        79
Overweight_Level_II       0.93      0.93      0.93        75

           accuracy                           0.94       530
          macro avg       0.94      0.94      0.94       530
       weighted avg       0.94      0.94      0.94       530

CONFIGURATION 13
['knn_imputer', 'add_bmi', 'scaler;standard', 'constant_imputer;None', 'encode_data;ordinal']



ValueError: Found unknown categories ['2.2'] in column 0 during transform

# Conclusions
The best configurations is $12$, where we:
- Impute age, weight, height with KNN
- Add BMI class
- Scale age, weight, height with standard scaling
- Impute physical activity with "None" (NaN is interpreted as No activity)
- Impute qualitative variabiles either with mode or KNN classifier (the performance does not significantly change)
- Encode qualitative data

This yields us an average f1-score of $0.95$ on validation set.

**NOTE:** While this configuration has the best global f1-score, it is to note that we have some tradeoffs; in particular, it performs worse on predicting `Normal Weight` and `Overweight Level II`

In [None]:
importances = trees[12].feature_importances_
importances_df = pd.DataFrame(importances, index=train_curr.columns, columns=['importance'])
importances_df = importances_df.sort_values('importance', ascending=False)
print(importances_df) # Importances of last model


IndexError: list index out of range

Trying to predict my own obesity

In [None]:
my_data = {
    "age": 20.0,
    "alcohol_freq": "Always",
    "caloric_freq": "no",
    "devices_perday": "more than 5",
    "eat_between_meals": "Sometimes",
    "gender": "Male",
    "height": 1.72,
    "meals_perday": 3.0,
    "monitor_calories": "no",
    "parent_overweight": "no",
    "physical_activity_perweek": "1 to 2",
    "siblings": 1,
    "smoke": "no",
    "transportation": "Walk",
    "veggies_freq": "Sometimes",
    "water_daily": "1 to 2",
    "weight": 90.0
}

In [None]:
df_var.head(1).dtypes

age                          float64
alcohol_freq                  object
caloric_freq                  object
devices_perday                object
eat_between_meals             object
gender                        object
height                       float64
meals_perday                 float64
monitor_calories              object
parent_overweight             object
physical_activity_perweek     object
siblings                     float64
smoke                         object
transportation                object
veggies_freq                  object
water_daily                   object
weight                       float64
dtype: object

In [None]:
x = pd.DataFrame([my_data])
x

Unnamed: 0,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight
0,20.0,Always,no,more than 5,Sometimes,Male,1.72,3.0,no,no,1 to 2,1,no,Walk,Sometimes,1 to 2,90.0


In [None]:
a, b = run(X_train,x, config_12)
b



Unnamed: 0,age,height,siblings,weight,alcohol_freq_encoded,caloric_freq_encoded,devices_perday_encoded,meals_perday_encoded,monitor_calories_encoded,parent_overweight_encoded,physical_activity_perweek_encoded,transportation_encoded,veggies_freq_encoded,water_daily_encoded,bmi_class_encoded,eat_between_meals_encoded,gender_1.0,smoke_1.0
0,-0.683343,0.160452,1,0.094483,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,2.0,0.0,1.0,3.0,1.0,0.0


In [None]:
trees[12].predict(b)

array(['Normal_Weight'], dtype=object)

Bruh

# Save data, with best preprocessing configuration

In [None]:
# Load test csv file
test = pd.read_csv("../data/obesity_test.csv")

In [None]:
# Preprocess
_, test = run(X_train, test, config_12)



In [None]:
test = test.drop(columns=["marrital_status", "region"])
test.to_csv("../our data/preprocessed_test.csv")