# Manual Preprocessing

In [8]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB, MultinomialNB, GaussianNB
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import classification_report


In [9]:
df_train = pd.read_csv("../our data/no_outliers.csv")
df_test = pd.read_csv("../data/obesity_test.csv")

df_test = df_test.drop(columns=["region", "marrital_status"])

# Scale and KNN-impute data
scalers = {} # Preserve scalers for antitransformation

columns = ['age', 'height', 'weight']
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5, weights='uniform')

df_train[columns] = scaler.fit_transform(df_train[columns])
df_test[columns] = scaler.transform(df_test[columns])


# Impute
df_train[columns] = imputer.fit_transform(df_train[columns])
df_test[columns] = imputer.transform(df_test[columns])

# Transform back 
df_train[columns] = scaler.inverse_transform(df_train[columns])
df_test[columns] = scaler.inverse_transform(df_test[columns])
    


In [10]:
def classify_bmi_comprehensive(row):
    """
    Classify BMI based on age and BMI value.

    Input:
    row: A Pandas row with 'weight', 'height', and 'age' columns.

    Output:
    Returns a string that classifies the individual into BMI categories.
    """
    # Check if weight and height are valid
    if row['height'] <= 0 or row['weight'] <= 0:
        return 'Invalid data'

    # Calculate BMI
    bmi = row['weight'] / (row['height'] ** 2)

    # Age group: Children (2-19 years)
    if 2 <= row['age'] < 20:
        if bmi < 14:
            return 0 # Underweight
        elif 14 <= bmi < 18:
            return 1 # Normal weight
        elif 18 <= bmi < 21:
            return 2 # Overweight
        else:
            return 3 # Obesity 1

    # Age group: Adults (20-64 years)
    elif 20 <= row['age'] < 65:
        if bmi < 18.5:
            return 0 # "Underweight"
        elif 18.5 <= bmi < 25:
            return 1 # "Healthy Weight"
        elif 25 <= bmi < 30:
            return 2 #"Overweight"
        elif 30<= bmi < 35:
            return 3 #"Obese Class 1"
        elif 35 <= bmi < 40:
            return 4 #"Obese Class 2"
        else:
            return 5 #"Obese Class 3"

In [11]:
# Add BMI
df_train['bmi_class'] = df_train.apply(lambda row: classify_bmi_comprehensive(row), axis=1)
df_test['bmi_class'] = df_test.apply(lambda row: classify_bmi_comprehensive(row), axis=1)

In [12]:
# Fill missing activity with zero
df_train['physical_activity_perweek'].fillna('No Activity')
df_test['physical_activity_perweek'].fillna('No Activity')


0        5 or more
1      No Activity
2           1 to 2
3           1 to 2
4           3 to 4
          ...     
495    No Activity
496    No Activity
497    No Activity
498    No Activity
499         3 to 4
Name: physical_activity_perweek, Length: 500, dtype: object

In [13]:
hashmap = {
"Never": 0,
"Sometimes": 1,
"Frequently": 2,
"Always": 3,

"No Activity": 0,
"up to 2": 1,
"up to 5": 2,
"more than 5": 3,

"less than 1": 1,
"1 to 2": 2,
"more than 2": 3,
"3 to 4": 4,
"5 or more": 5,

"Bicycle": 1,
"Car": 3,
"Motorbike": 3,
"Public": 2,
"Walk": 0,

"no": 0,
"yes": 1,

"Male": 0,
"Female": 1
}


In [14]:
# Manually encode data

columns = ['alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'gender',
 'monitor_calories',
 'parent_overweight',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
 'bmi_class',
 'meals_perday',
 "siblings"]

for target in columns:
    df_train[target] = df_train[target].replace(hashmap)
    df_test[target]= df_test[target].replace(hashmap)


  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)


In [None]:
# Fill rest with KNN or smt else

# Scale numerical (again) and KNN-impute data

#columnsx = ['age', 'height', 'weight']
#scaler = StandardScaler()
imputer = RandomForestClassifier()
imputer = IterativeImputer(imputer)

#df_train[columnsx] = scaler.fit_transform(df_train[columnsx])
#df_test[columnsx] = scaler.transform(df_test[columnsx])

df_train[columns] = imputer.fit_transform(df_train[columns])
df_test[columns] = imputer.transform(df_test[columns])

In [None]:
# Transform to life score
life_columns = [
 'alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'monitor_calories',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
 ]

df_train["life"] = 0
df_test["life"] = 0

for column in life_columns:
    df_train["life"] += df_train[column]
    df_test["life"] += df_test[column]


In [None]:
hash_obesity = {
 'Normal_Weight': 1,
 'Overweight_Level_I': 2,
 'Overweight_Level_II': 3,
 'Obesity_Type_I': 4,
 'Insufficient_Weight': 5,
 'Obesity_Type_II': 6,
 'Obesity_Type_III': 7
 }

X = df_train.iloc[:, 1:].drop(columns='obese_level')
y = df_train['obese_level'].replace(hash_obesity)



  y = df_train['obese_level'].replace(hash_obesity)


# Feature selection with wrapper method

In [None]:
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [None]:
baseline = DecisionTreeClassifier()
k = 5 # suppose i want to select 5

rfe = RFE(estimator=baseline, n_features_to_select=k)
rfe.fit_transform(X_train, y_train)

selected_features = pd.Series(rfe.support_, index = X.columns)
selected_features


rfe.score(X_test, y_test)

0.9064449064449065

In [None]:
# same as above but iterate k
max_score = -999 
info = {}

for i in range(1, len(X.columns.tolist())+1):
    rfe = RFE(estimator=baseline, n_features_to_select=i)
    rfe.fit_transform(X_train, y_train)
    s = rfe.score(X_test, y_test)
    print(f"k={i}: {s}")
    selected_features = pd.Series(rfe.support_, index = X.columns)
    S=selected_features[selected_features == True].index.tolist()
    print(f"\tSelected: {S}")

    if s>max_score:
        max_score = s
        info['i'] = i
        info['selected'] = S

print("=========================*2")
print(info)


k=1: 0.5758835758835759
	Selected: ['bmi_class']
k=2: 0.7796257796257796
	Selected: ['weight', 'bmi_class']
k=3: 0.8212058212058212
	Selected: ['gender', 'weight', 'bmi_class']
k=4: 0.8835758835758836
	Selected: ['age', 'gender', 'weight', 'bmi_class']
k=5: 0.8981288981288982
	Selected: ['age', 'gender', 'height', 'weight', 'bmi_class']
k=6: 0.8981288981288982
	Selected: ['age', 'gender', 'height', 'meals_perday', 'weight', 'bmi_class']
k=7: 0.8918918918918919
	Selected: ['age', 'gender', 'height', 'meals_perday', 'weight', 'bmi_class', 'life']
k=8: 0.893970893970894
	Selected: ['age', 'alcohol_freq', 'gender', 'height', 'meals_perday', 'weight', 'bmi_class', 'life']
k=9: 0.896049896049896
	Selected: ['age', 'alcohol_freq', 'gender', 'height', 'meals_perday', 'transportation', 'weight', 'bmi_class', 'life']
k=10: 0.8898128898128899
	Selected: ['age', 'alcohol_freq', 'caloric_freq', 'gender', 'height', 'meals_perday', 'transportation', 'weight', 'bmi_class', 'life']
k=11: 0.893970893970

# Feature selection with lasso method (numerical variable)

In [None]:
cor = df_train.replace({"obese_level": hash_obesity}).iloc[:, 1:].corr()
import seaborn as sns

cor.style.highlight_between(left=-1, right=-.69, color="gold").highlight_between(left=.69, right=1, color="gold")

  cor = df_train.replace({"obese_level": hash_obesity}).iloc[:, 1:].corr()


Unnamed: 0,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight,obese_level,bmi_class,life
age,1.0,0.053169,0.083063,-0.26876,-0.087264,-0.052043,-0.004945,-0.047727,-0.122547,0.214087,-0.021202,-0.01975,0.095244,0.490053,0.007512,-0.03117,0.242086,0.117957,0.189108,0.027955
alcohol_freq,0.053169,1.0,0.091256,-0.040455,-0.061084,0.006099,0.11208,0.073627,-0.012353,-0.026739,-0.077057,-0.004441,0.07236,0.00168,0.07633,0.092495,0.21627,0.119628,0.178584,0.274179
caloric_freq,0.083063,0.091256,1.0,0.073715,-0.176985,-0.06418,0.185171,-0.01649,-0.190706,0.22141,-0.076805,-0.020163,-0.032593,0.200901,-0.027106,0.01562,0.271642,0.226095,0.245236,0.151451
devices_perday,-0.26876,-0.040455,0.073715,1.0,0.028046,0.010833,0.052651,0.039102,-0.031393,0.03238,0.073672,-0.023344,0.022392,-0.13582,-0.088612,-0.035114,-0.041426,-0.021716,-0.063885,0.281126
eat_between_meals,-0.087264,-0.061084,-0.176985,0.028046,1.0,0.104823,-0.064115,0.103741,0.120509,-0.174141,0.047666,0.000995,0.03151,-0.073105,0.075445,-0.150834,-0.26228,-0.143678,-0.285914,0.19594
gender,-0.052043,0.006099,-0.06418,0.010833,0.104823,1.0,-0.629557,-0.075903,0.10471,-0.117113,-0.018158,-0.022311,-0.062651,-0.088236,0.295161,-0.0899,-0.177149,0.12986,0.038444,0.115871
height,-0.004945,0.11208,0.185171,0.052651,-0.064115,-0.629557,1.0,0.22868,-0.129291,0.259084,0.070126,0.034811,0.081247,0.039127,-0.068414,0.184279,0.467835,0.127403,0.111077,0.127066
meals_perday,-0.047727,0.073627,-0.01649,0.039102,0.103741,-0.075903,0.22868,1.0,-0.012074,0.074559,0.135844,0.043538,0.026554,0.028237,0.043082,0.060634,0.102348,0.158296,0.043383,0.174533
monitor_calories,-0.122547,-0.012353,-0.190706,-0.031393,0.120509,0.10471,-0.129291,-0.012074,1.0,-0.205118,0.026643,-0.024134,0.03516,-0.052209,0.078931,0.002863,-0.208447,-0.144627,-0.143212,0.129589
parent_overweight,0.214087,-0.026739,0.22141,0.03238,-0.174141,-0.117113,0.259084,0.074559,-0.205118,1.0,-0.075612,-0.000333,0.030132,0.156472,-0.00563,0.114653,0.494143,0.263508,0.426227,0.020685


In [None]:
X_num = df_train.loc[:, ["age", "weight", "height", "life"]]
y_num = df_train['obese_level'].replace(hash_obesity)


# Lasso method: ONLY for numerical variables
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt 


reg = LassoCV()
reg.fit(X_num, y_num)

coef = pd.Series(reg.coef_, index = X_num.columns)
coef.sort_values(ascending=False)

  y_num = df_train['obese_level'].replace(hash_obesity)


weight    0.047478
age      -0.000000
height   -0.000000
life      0.000000
dtype: float64

# Statistical tests ($\chi^2$)

In [None]:
import scipy.stats as stats
from scipy.stats import chi2_contingency


In [None]:
#note: slightly modify the code
catcol =  ['alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'gender',
 'monitor_calories',
 'parent_overweight',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
 'bmi_class',
 'meals_perday',
 "siblings"]


def TestIndependence(X,y,var,alpha=0.05):   
    dfObserved = pd.crosstab(index=X.index, values=y,columns=X, aggfunc="count") 
    chi2, p, dof, expected = stats.chi2_contingency(dfObserved.values)
    dfExpected = pd.DataFrame(expected, columns=dfObserved.columns, index = dfObserved.index)
    if p<alpha:
        result="{0} is IMPORTANT for Prediction".format(var)
    else:
        result="{0} is NOT an important predictor. (Discard {0} from model)".format(var)
    print(result)

for var in catcol:
    TestIndependence(X[var],y, var)


NameError: name 'X' is not defined

# Model testing

In [17]:
to_keep = ['age', 'gender', 'height', 'weight', 'bmi_class']

In [18]:
X = df_train.drop(columns='obese_level')

y = df_train['obese_level']

In [19]:
X = X.iloc[:, 1:]
X

Unnamed: 0,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight,bmi_class,life
0,21.0,0.0,0.0,2.0,1.0,1.0,1.62,3.0,0.0,1.0,2.0,3.0,0.0,2.0,1.0,2.0,64.0,1.0,10.0
1,23.0,2.0,0.0,2.0,1.0,0.0,1.80,3.0,0.0,1.0,4.0,0.0,0.0,2.0,1.0,2.0,77.0,1.0,14.0
2,20.6,2.0,0.0,1.0,1.0,0.0,1.80,3.0,0.0,0.0,4.0,2.0,0.0,0.0,3.0,2.0,87.0,2.0,13.0
3,22.0,1.0,0.0,1.0,1.0,0.0,1.78,1.0,0.0,0.0,2.0,3.0,0.0,2.0,1.0,2.0,90.0,2.0,10.0
4,22.0,1.0,0.0,1.0,1.0,0.0,1.64,3.0,0.0,0.0,5.0,3.0,0.0,2.0,1.0,2.0,53.0,1.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1598,21.0,1.0,1.0,2.0,1.0,1.0,1.73,3.0,0.0,1.0,4.0,1.0,0.0,2.0,3.0,2.0,131.0,5.0,16.0
1599,22.0,1.0,1.0,2.0,1.0,1.0,1.75,3.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,134.0,5.0,14.0
1600,23.0,1.0,1.0,2.0,1.0,1.0,1.75,3.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,134.0,5.0,14.0
1601,24.0,1.0,1.0,2.0,1.0,1.0,1.74,3.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,3.0,133.0,5.0,15.0


In [20]:
# set up rkf

def run(model, X, y):
    my_model = model
    rkf = RepeatedKFold(n_splits=10)

    scores_train = []
    scores_val = []

    for (train_index, test_index) in rkf.split(X, y):
        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        my_model.fit(X_train, y_train)

        report1 = classification_report(y_test, y_hat:=my_model.predict(X_test), output_dict=True)
        report2 = classification_report(y_train, y_hat:=my_model.predict(X_train), output_dict=True)

        scores_val.append(report1["macro avg"]["f1-score"])
        scores_train.append(report2["macro avg"]["f1-score"])

    return np.array(scores_train).mean(), np.array(scores_val).mean(), my_model.feature_importances_

        

In [21]:
run(RandomForestClassifier(), X, y)

(1.0,
 0.9368815683749335,
 array([0.08875508, 0.02382582, 0.01619864, 0.01829865, 0.02618396,
        0.06411401, 0.08277486, 0.03149703, 0.00514357, 0.02677576,
        0.01327112, 0.01220421, 0.0018141 , 0.01705636, 0.02699571,
        0.01658379, 0.26473557, 0.2307667 , 0.03300506]))

In [22]:
A = np.array([0.09067107, 0.02871339, 0.01791884, 0.01859065, 0.03083244,
        0.06170784, 0.08273799, 0.03168581, 0.0048444 , 0.02890144,
        0.01651799, 0.01432507, 0.00211583, 0.01874717, 0.02799509,
        0.01553866, 0.24649305, 0.22843336, 0.03322992])

pd.DataFrame(A, index=X_train.columns, columns=["b"]).sort_values(by="b",ascending=False).style.bar()

Unnamed: 0,b
weight,0.246493
bmi_class,0.228433
age,0.090671
height,0.082738
gender,0.061708
life,0.03323
meals_perday,0.031686
eat_between_meals,0.030832
parent_overweight,0.028901
alcohol_freq,0.028713


In [23]:
run(DecisionTreeClassifier(), X, y)

(1.0,
 0.8929497535529459,
 array([0.11022828, 0.02172396, 0.01425674, 0.00251735, 0.00153718,
        0.16000822, 0.07192763, 0.02332453, 0.        , 0.00637542,
        0.00209195, 0.00361604, 0.        , 0.00320358, 0.00147193,
        0.00339245, 0.21430421, 0.34936859, 0.01065196]))

In [24]:
run(LogisticRegression(solver="liblinear"), X, y)

AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

In [None]:
# run(LogisticRegression(solver="newton-cg"), X, y)

In [None]:
# run(LogisticRegression(solver="newton-cholesky"), X, y)

In [None]:
#run(GaussianNB(), X, y)