# Manual Preprocessing - but using embedded feature selection methods

In [1]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB, MultinomialNB, GaussianNB
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import classification_report


In [2]:
df_train = pd.read_csv("../our data/no_outliers.csv")
df_test = pd.read_csv("../data/obesity_test.csv")

df_test = df_test.drop(columns=["region", "marrital_status"])

# Scale and KNN-impute data
scalers = {} # Preserve scalers for antitransformation

columns = ['age', 'height', 'weight']
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5, weights='uniform')

df_train[columns] = scaler.fit_transform(df_train[columns])
df_test[columns] = scaler.transform(df_test[columns])


# Impute
df_train[columns] = imputer.fit_transform(df_train[columns])
df_test[columns] = imputer.transform(df_test[columns])

# Transform back 
df_train[columns] = scaler.inverse_transform(df_train[columns])
df_test[columns] = scaler.inverse_transform(df_test[columns])
    


In [3]:
def classify_bmi_comprehensive(row):
    """
    Classify BMI based on age and BMI value.

    Input:
    row: A Pandas row with 'weight', 'height', and 'age' columns.

    Output:
    Returns a string that classifies the individual into BMI categories.
    """
    # Check if weight and height are valid
    if row['height'] <= 0 or row['weight'] <= 0:
        return 'Invalid data'

    # Calculate BMI
    bmi = row['weight'] / (row['height'] ** 2)

    # Age group: Children (2-19 years)
    if 2 <= row['age'] < 20:
        if bmi < 14:
            return 0 # Underweight
        elif 14 <= bmi < 18:
            return 1 # Normal weight
        elif 18 <= bmi < 21:
            return 2 # Overweight
        else:
            return 3 # Obesity 1

    # Age group: Adults (20-64 years)
    elif 20 <= row['age'] < 65:
        if bmi < 18.5:
            return 0 # "Underweight"
        elif 18.5 <= bmi < 25:
            return 1 # "Healthy Weight"
        elif 25 <= bmi < 30:
            return 2 #"Overweight"
        elif 30<= bmi < 35:
            return 3 #"Obese Class 1"
        elif 35 <= bmi < 40:
            return 4 #"Obese Class 2"
        else:
            return 5 #"Obese Class 3"

In [4]:
# Add BMI
df_train['bmi_class'] = df_train.apply(lambda row: classify_bmi_comprehensive(row), axis=1)
df_test['bmi_class'] = df_test.apply(lambda row: classify_bmi_comprehensive(row), axis=1)

In [5]:
# Fill missing activity with zero
df_train['physical_activity_perweek'].fillna('No Activity')
df_test['physical_activity_perweek'].fillna('No Activity')


0        5 or more
1      No Activity
2           1 to 2
3           1 to 2
4           3 to 4
          ...     
495    No Activity
496    No Activity
497    No Activity
498    No Activity
499         3 to 4
Name: physical_activity_perweek, Length: 500, dtype: object

In [6]:
hashmap = {
"Never": 0,
"Sometimes": 1,
"Frequently": 2,
"Always": 3,

"No Activity": 0,
"up to 2": 1,
"up to 5": 2,
"more than 5": 3,

"less than 1": 1,
"1 to 2": 2,
"more than 2": 3,
"3 to 4": 4,
"5 or more": 5,

"Bicycle": 1,
"Car": 4,
"Motorbike": 3,
"Public": 2,
"Walk": 0,

"no": 0,
"yes": 1,

"Male": 0,
"Female": 1
}


In [7]:
# Manually encode data

columns = ['alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'gender',
 'monitor_calories',
 'parent_overweight',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
 'bmi_class',
 'meals_perday',
 "siblings"]

for target in columns:
    df_train[target] = df_train[target].replace(hashmap)
    df_test[target]= df_test[target].replace(hashmap)


  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)


In [8]:
# Fill rest with KNN or smt else

# Scale numerical (again) and KNN-impute data

#columnsx = ['age', 'height', 'weight']
#scaler = StandardScaler()
imputer = DecisionTreeClassifier()
imputer = IterativeImputer(imputer)

#df_train[columnsx] = scaler.fit_transform(df_train[columnsx])
#df_test[columnsx] = scaler.transform(df_test[columnsx])

df_train[columns] = imputer.fit_transform(df_train[columns])
df_test[columns] = imputer.transform(df_test[columns])



# Feature selection


In [None]:
hash_obesity = {
 'Normal_Weight': 0,
 'Overweight_Level_I': 1,
 'Overweight_Level_II': 2,
 'Obesity_Type_I': 3,
 'Insufficient_Weight': 4,
 'Obesity_Type_II': 5,
 'Obesity_Type_III': 6
 }

In [13]:
df_train['obese_level'].unique().tolist()

['Normal_Weight',
 'Overweight_Level_I',
 'Overweight_Level_II',
 'Obesity_Type_I',
 'Insufficient_Weight',
 'Obesity_Type_II',
 'Obesity_Type_III']

In [None]:
# Lasso method
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt 

def plot_importance(coef,name):
    imp_coef = coef.sort_values()
    plt.figure(figsize=(8,10))
    imp_coef.plot(kind = "barh")
    plt.title("Feature importance using " + name + " Model")
    plt.axvline(x=0, color="red", linestyle="--")
    plt.show()

reg = LassoCV()

X = df_train.drop("obese_level", axis=1)
y = df_train['obese_level']

reg.fit(X, y)

coef = pd.Series(reg.coef_, index = X.columns)

ValueError: could not convert string to float: 'Normal_Weight'

# Model testing

In [28]:
X = df_train.drop(columns='obese_level')
y = df_train['obese_level']

In [29]:
X = X.iloc[:, 1:]
X

Unnamed: 0,age,gender,height,meals_perday,parent_overweight,siblings,weight,bmi_class,life
0,21.0,1.0,1.62,3.0,1.0,3.0,64.0,1.0,10.0
1,23.0,0.0,1.80,3.0,1.0,0.0,77.0,1.0,14.0
2,20.6,0.0,1.80,3.0,0.0,2.0,87.0,2.0,13.0
3,22.0,0.0,1.78,1.0,0.0,3.0,90.0,2.0,13.0
4,22.0,0.0,1.64,3.0,0.0,3.0,53.0,1.0,13.0
...,...,...,...,...,...,...,...,...,...
1598,21.0,1.0,1.73,3.0,1.0,1.0,131.0,5.0,16.0
1599,22.0,1.0,1.75,3.0,1.0,0.0,134.0,5.0,14.0
1600,23.0,1.0,1.75,3.0,1.0,0.0,134.0,5.0,14.0
1601,24.0,1.0,1.74,3.0,1.0,0.0,133.0,5.0,15.0


In [30]:
# set up rkf

def run(model, X, y):
    my_model = model
    rkf = RepeatedKFold(n_splits=5)

    scores_train = []
    scores_val = []

    for (train_index, test_index) in rkf.split(X, y):
        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        my_model.fit(X_train, y_train)

        report1 = classification_report(y_test, y_hat:=my_model.predict(X_test), output_dict=True)
        report2 = classification_report(y_train, y_hat:=my_model.predict(X_train), output_dict=True)

        scores_val.append(report1["macro avg"]["f1-score"])
        scores_train.append(report2["macro avg"]["f1-score"])

    return np.array(scores_train).mean(), np.array(scores_val).mean()

        

In [31]:
run(RandomForestClassifier(), X, y)

(1.0, 0.9376914093470652)

In [32]:
run(DecisionTreeClassifier(), X, y)

(1.0, 0.900091427684042)

In [33]:
run(LogisticRegression(solver="liblinear"), X, y)

(0.6744568045474185, 0.6437766690805445)

In [34]:
run(LogisticRegression(solver="newton-cg"), X, y)

(0.8232783940902952, 0.8085095784861465)

In [35]:
# run(LogisticRegression(solver="newton-cholesky"), X, y)

In [36]:
run(GaussianNB(), X, y)

(0.6675767021586867, 0.6598028349360497)