# Manual Preprocessing

In [52]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB, MultinomialNB, GaussianNB
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import classification_report


In [53]:
df_train = pd.read_csv("../our data/no_outliers.csv")
df_test = pd.read_csv("../data/obesity_test.csv")

df_test = df_test.drop(columns=["region", "marrital_status"])

# Scale and KNN-impute data
scalers = {} # Preserve scalers for antitransformation

columns = ['age', 'height', 'weight']
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5, weights='uniform')

df_train[columns] = scaler.fit_transform(df_train[columns])
df_test[columns] = scaler.transform(df_test[columns])


# Impute
df_train[columns] = imputer.fit_transform(df_train[columns])
df_test[columns] = imputer.transform(df_test[columns])

# Transform back 
df_train[columns] = scaler.inverse_transform(df_train[columns])
df_test[columns] = scaler.inverse_transform(df_test[columns])
    


In [54]:
def classify_bmi_comprehensive(row):
    """
    Classify BMI based on age and BMI value.

    Input:
    row: A Pandas row with 'weight', 'height', and 'age' columns.

    Output:
    Returns a string that classifies the individual into BMI categories.
    """
    # Check if weight and height are valid
    if row['height'] <= 0 or row['weight'] <= 0:
        return 'Invalid data'

    # Calculate BMI
    bmi = row['weight'] / (row['height'] ** 2)

    # Age group: Children (2-19 years)
    if 2 <= row['age'] < 20:
        if bmi < 14:
            return 0 # Underweight
        elif 14 <= bmi < 18:
            return 1 # Normal weight
        elif 18 <= bmi < 21:
            return 2 # Overweight
        else:
            return 3 # Obesity 1

    # Age group: Adults (20-64 years)
    elif 20 <= row['age'] < 65:
        if bmi < 18.5:
            return 0 # "Underweight"
        elif 18.5 <= bmi < 25:
            return 1 # "Healthy Weight"
        elif 25 <= bmi < 30:
            return 2 #"Overweight"
        elif 30<= bmi < 35:
            return 3 #"Obese Class 1"
        elif 35 <= bmi < 40:
            return 4 #"Obese Class 2"
        else:
            return 5 #"Obese Class 3"

In [55]:
# Add BMI
df_train['bmi_class'] = df_train.apply(lambda row: classify_bmi_comprehensive(row), axis=1)
df_test['bmi_class'] = df_test.apply(lambda row: classify_bmi_comprehensive(row), axis=1)

In [56]:
# Fill missing activity with zero
df_train['physical_activity_perweek'].fillna('No Activity')
df_test['physical_activity_perweek'].fillna('No Activity')


0        5 or more
1      No Activity
2           1 to 2
3           1 to 2
4           3 to 4
          ...     
495    No Activity
496    No Activity
497    No Activity
498    No Activity
499         3 to 4
Name: physical_activity_perweek, Length: 500, dtype: object

In [79]:
hashmap = {
"Never": 0,
"Sometimes": 1,
"Frequently": 2,
"Always": 3,

"No Activity": 0,
"up to 2": 1,
"up to 5": 2,
"more than 5": 3,

"less than 1": 1,
"1 to 2": 2,
"more than 2": 3,
"3 to 4": 4,
"5 or more": 5,

"Bicycle": 1,
"Car": 3,
"Motorbike": 3,
"Public": 2,
"Walk": 0,

"no": 0,
"yes": 1,

"Male": 0,
"Female": 1
}


In [80]:
# Manually encode data

columns = ['alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'gender',
 'monitor_calories',
 'parent_overweight',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
 'bmi_class',
 'meals_perday',
 "siblings"]

for target in columns:
    df_train[target] = df_train[target].replace(hashmap)
    df_test[target]= df_test[target].replace(hashmap)


In [81]:
# Fill rest with KNN or smt else

# Scale numerical (again) and KNN-impute data

#columnsx = ['age', 'height', 'weight']
#scaler = StandardScaler()
imputer = KNeighborsClassifier()
imputer = IterativeImputer(imputer)

#df_train[columnsx] = scaler.fit_transform(df_train[columnsx])
#df_test[columnsx] = scaler.transform(df_test[columnsx])

df_train[columns] = imputer.fit_transform(df_train[columns])
df_test[columns] = imputer.transform(df_test[columns])

In [82]:
# Transform to life score
life_columns = [
 'alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'monitor_calories',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
 ]

df_train["life"] = 0
df_test["life"] = 0

for column in life_columns:
    df_train["life"] += df_train[column]
    df_test["life"] += df_test[column]


In [83]:
hash_obesity = {
 'Normal_Weight': 1,
 'Overweight_Level_I': 2,
 'Overweight_Level_II': 3,
 'Obesity_Type_I': 4,
 'Insufficient_Weight': 5,
 'Obesity_Type_II': 6,
 'Obesity_Type_III': 7
 }

X = df_train.iloc[:, 1:].drop(columns='obese_level')
y = df_train['obese_level'].replace(hash_obesity)



  y = df_train['obese_level'].replace(hash_obesity)


# Feature selection with wrapper method

In [84]:
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [86]:
baseline = RandomForestClassifier()
k = 5 # suppose i want to select 5

rfe = RFE(estimator=baseline, n_features_to_select=k)
rfe.fit_transform(X_train, y_train)

selected_features = pd.Series(rfe.support_, index = X.columns)
selected_features


rfe.score(X_test, y_test)

0.9459459459459459

In [87]:
# same as above but iterate k
max_score = -999 
info = {}

for i in range(1, len(X.columns.tolist())+1):
    rfe = RFE(estimator=baseline, n_features_to_select=i)
    rfe.fit_transform(X_train, y_train)
    s = rfe.score(X_test, y_test)
    print(f"k={i}: {s}")
    selected_features = pd.Series(rfe.support_, index = X.columns)
    S=selected_features[selected_features == True].index.tolist()
    print(f"\tSelected: {S}")

    if s>max_score:
        max_score = s
        info['i'] = i
        info['selected'] = S

print("=========================*2")
print(info)


k=1: 0.5925155925155925
	Selected: ['weight']
k=2: 0.7650727650727651
	Selected: ['weight', 'bmi_class']
k=3: 0.9230769230769231
	Selected: ['height', 'weight', 'bmi_class']
k=4: 0.9417879417879418
	Selected: ['age', 'height', 'weight', 'bmi_class']
k=5: 0.9417879417879418
	Selected: ['age', 'gender', 'height', 'weight', 'bmi_class']
k=6: 0.9438669438669439
	Selected: ['age', 'gender', 'height', 'weight', 'bmi_class', 'life']
k=7: 0.9417879417879418
	Selected: ['age', 'gender', 'height', 'meals_perday', 'weight', 'bmi_class', 'life']
k=8: 0.9501039501039501
	Selected: ['age', 'gender', 'height', 'meals_perday', 'veggies_freq', 'weight', 'bmi_class', 'life']
k=9: 0.9397089397089398
	Selected: ['age', 'alcohol_freq', 'gender', 'height', 'meals_perday', 'veggies_freq', 'weight', 'bmi_class', 'life']
k=10: 0.9397089397089398
	Selected: ['age', 'alcohol_freq', 'eat_between_meals', 'gender', 'height', 'meals_perday', 'veggies_freq', 'weight', 'bmi_class', 'life']
k=11: 0.9313929313929314
	Se

# Feature selection with lasso method (numerical variable)

In [88]:
cor = df_train.replace({"obese_level": hash_obesity}).iloc[:, 1:].corr()
import seaborn as sns

cor.style.highlight_between(left=-1, right=-.69, color="gold").highlight_between(left=.69, right=1, color="gold")

  cor = df_train.replace({"obese_level": hash_obesity}).iloc[:, 1:].corr()


Unnamed: 0,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight,obese_level,bmi_class,life
age,1.0,0.053023,0.085915,-0.268275,-0.088581,-0.050592,-0.004945,-0.046343,-0.122547,0.213234,-0.002371,-0.019872,0.095244,0.568066,0.0085,-0.029183,0.242086,0.117957,0.189108,0.154515
alcohol_freq,0.053023,1.0,0.094186,-0.044375,-0.071197,0.003417,0.111197,0.074707,-0.011809,-0.023852,-0.090642,-0.00404,0.072621,-0.009553,0.067472,0.089625,0.216953,0.118906,0.180477,0.241678
caloric_freq,0.085915,0.094186,1.0,0.076895,-0.174198,-0.060071,0.190019,-0.019144,-0.197428,0.21813,-0.0556,-0.019502,-0.031565,0.162454,-0.02494,0.016444,0.272368,0.227523,0.24494,0.176314
devices_perday,-0.268275,-0.044375,0.076895,1.0,0.029645,0.012176,0.052087,0.039087,-0.031854,0.034157,0.060716,-0.023628,0.022173,-0.13733,-0.086245,-0.046033,-0.041866,-0.023176,-0.06352,0.235298
eat_between_meals,-0.088581,-0.071197,-0.174198,0.029645,1.0,0.095136,-0.065139,0.105726,0.120509,-0.17483,0.045592,-0.001149,0.03151,-0.063806,0.072356,-0.155356,-0.263787,-0.146459,-0.285914,0.172996
gender,-0.050592,0.003417,-0.060071,0.012176,0.095136,1.0,-0.6283,-0.077816,0.105815,-0.111408,-0.00109,-0.01922,-0.07103,-0.116163,0.286763,-0.093643,-0.174701,0.134332,0.041802,0.08445
height,-0.004945,0.111197,0.190019,0.052087,-0.065139,-0.6283,1.0,0.231027,-0.129291,0.257137,0.051053,0.039531,0.081247,0.065138,-0.06799,0.186868,0.467835,0.127403,0.111077,0.131221
meals_perday,-0.046343,0.074707,-0.019144,0.039087,0.105726,-0.077816,0.231027,1.0,-0.011728,0.07847,0.118039,0.040633,0.026735,0.044057,0.041351,0.052918,0.104887,0.157345,0.045204,0.16691
monitor_calories,-0.122547,-0.011809,-0.197428,-0.031854,0.120509,0.105815,-0.129291,-0.011728,1.0,-0.205752,0.015096,-0.022916,0.03516,-0.05284,0.074932,0.004045,-0.208447,-0.144627,-0.143212,0.105932
parent_overweight,0.213234,-0.023852,0.21813,0.034157,-0.17483,-0.111408,0.257137,0.07847,-0.205752,1.0,-0.048791,-0.002182,0.029945,0.150427,-0.005956,0.119329,0.493115,0.261395,0.424806,0.058853


In [89]:
X_num = df_train.loc[:, ["age", "weight", "height", "life"]]
y_num = df_train['obese_level'].replace(hash_obesity)


# Lasso method: ONLY for numerical variables
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt 


reg = LassoCV()
reg.fit(X_num, y_num)

coef = pd.Series(reg.coef_, index = X_num.columns)
coef.sort_values(ascending=False)

  y_num = df_train['obese_level'].replace(hash_obesity)


weight    0.047478
age      -0.000000
height   -0.000000
life      0.000000
dtype: float64

# Model testing

In [90]:
to_keep = ['age', 'gender', 'height', 'weight', 'bmi_class']

In [91]:
X = df_train.drop(columns='obese_level')

y = df_train['obese_level']

In [92]:
X = X.iloc[:, 1:]
X

Unnamed: 0,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight,bmi_class,life
0,21.0,0.0,0.0,2.0,1.0,1.0,1.62,3.0,0.0,1.0,2.0,3.0,0.0,2.0,1.0,2.0,64.0,1.0,10.0
1,23.0,2.0,0.0,2.0,1.0,0.0,1.80,3.0,0.0,1.0,4.0,0.0,0.0,2.0,1.0,2.0,77.0,1.0,14.0
2,20.6,2.0,0.0,1.0,1.0,0.0,1.80,3.0,0.0,0.0,4.0,2.0,0.0,0.0,3.0,2.0,87.0,2.0,13.0
3,22.0,1.0,0.0,1.0,1.0,0.0,1.78,1.0,0.0,0.0,2.0,3.0,0.0,2.0,1.0,2.0,90.0,2.0,10.0
4,22.0,1.0,0.0,1.0,1.0,0.0,1.64,3.0,0.0,0.0,5.0,3.0,0.0,2.0,1.0,2.0,53.0,1.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1598,21.0,1.0,1.0,2.0,1.0,1.0,1.73,3.0,0.0,1.0,4.0,1.0,0.0,2.0,3.0,2.0,131.0,5.0,16.0
1599,22.0,1.0,1.0,2.0,1.0,1.0,1.75,3.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,134.0,5.0,14.0
1600,23.0,1.0,1.0,2.0,1.0,1.0,1.75,3.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,134.0,5.0,14.0
1601,24.0,1.0,1.0,2.0,1.0,1.0,1.74,3.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,3.0,133.0,5.0,15.0


In [93]:
# set up rkf

def run(model, X, y):
    my_model = model
    rkf = RepeatedKFold(n_splits=5)

    scores_train = []
    scores_val = []

    for (train_index, test_index) in rkf.split(X, y):
        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        my_model.fit(X_train, y_train)

        report1 = classification_report(y_test, y_hat:=my_model.predict(X_test), output_dict=True)
        report2 = classification_report(y_train, y_hat:=my_model.predict(X_train), output_dict=True)

        scores_val.append(report1["macro avg"]["f1-score"])
        scores_train.append(report2["macro avg"]["f1-score"])

    return np.array(scores_train).mean(), np.array(scores_val).mean()

        

In [94]:
run(RandomForestClassifier(), X, y)

(1.0, 0.9354135048307881)

In [95]:
run(DecisionTreeClassifier(), X, y)

(1.0, 0.8922654021090528)

In [96]:
run(LogisticRegression(solver="liblinear"), X, y)

(0.7411245257373434, 0.7143863564756999)

In [75]:
# run(LogisticRegression(solver="newton-cg"), X, y)

In [76]:
# run(LogisticRegression(solver="newton-cholesky"), X, y)

In [77]:
#run(GaussianNB(), X, y)