# Manual Preprocessing

In [33]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB, MultinomialNB, GaussianNB
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')


In [3]:
df_train = pd.read_csv("../our data/no_outliers.csv")
df_test = pd.read_csv("../data/obesity_test.csv")

df_test = df_test.drop(columns=["region", "marrital_status"])

# Scale and KNN-impute data
scalers = {} # Preserve scalers for antitransformation

columns = ['age', 'height', 'weight']
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5, weights='uniform')

df_train[columns] = scaler.fit_transform(df_train[columns])
df_test[columns] = scaler.transform(df_test[columns])


# Impute
df_train[columns] = imputer.fit_transform(df_train[columns])
df_test[columns] = imputer.transform(df_test[columns])

# Transform back 
df_train[columns] = scaler.inverse_transform(df_train[columns])
df_test[columns] = scaler.inverse_transform(df_test[columns])
    


In [4]:
def classify_bmi_comprehensive(row):
    """
    Classify BMI based on age and BMI value.

    Input:
    row: A Pandas row with 'weight', 'height', and 'age' columns.

    Output:
    Returns a string that classifies the individual into BMI categories.
    """
    # Check if weight and height are valid
    if row['height'] <= 0 or row['weight'] <= 0:
        return 'Invalid data'

    # Calculate BMI
    bmi = row['weight'] / (row['height'] ** 2)

    # Age group: Children (2-19 years)
    if 2 <= row['age'] < 20:
        if bmi < 14:
            return 0 # Underweight
        elif 14 <= bmi < 18:
            return 1 # Normal weight
        elif 18 <= bmi < 21:
            return 2 # Overweight
        else:
            return 3 # Obesity 1

    # Age group: Adults (20-64 years)
    elif 20 <= row['age'] < 65:
        if bmi < 18.5:
            return 0 # "Underweight"
        elif 18.5 <= bmi < 25:
            return 1 # "Healthy Weight"
        elif 25 <= bmi < 30:
            return 2 #"Overweight"
        elif 30<= bmi < 35:
            return 3 #"Obese Class 1"
        elif 35 <= bmi < 40:
            return 4 #"Obese Class 2"
        else:
            return 5 #"Obese Class 3"

In [5]:
# Add BMI
df_train['bmi_class'] = df_train.apply(lambda row: classify_bmi_comprehensive(row), axis=1)
df_test['bmi_class'] = df_test.apply(lambda row: classify_bmi_comprehensive(row), axis=1)

In [6]:
# Fill missing activity with zero
df_train['physical_activity_perweek'].fillna('No Activity')
df_test['physical_activity_perweek'].fillna('No Activity')


0        5 or more
1      No Activity
2           1 to 2
3           1 to 2
4           3 to 4
          ...     
495    No Activity
496    No Activity
497    No Activity
498    No Activity
499         3 to 4
Name: physical_activity_perweek, Length: 500, dtype: object

In [7]:
hashmap = {
"Never": 0,
"Sometimes": 1,
"Frequently": 2,
"Always": 3,

"No Activity": 0,
"up to 2": 1,
"up to 5": 2,
"more than 5": 3,

"less than 1": 1,
"1 to 2": 2,
"more than 2": 3,
"3 to 4": 4,
"5 or more": 5,

"Bicycle": 1,
"Car": 3,
"Motorbike": 3,
"Public": 2,
"Walk": 0,

"no": 0,
"yes": 1,

"Male": 0,
"Female": 1
}


In [8]:
# Manually encode data

columns = ['alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'gender',
 'monitor_calories',
 'parent_overweight',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
 'bmi_class',
 'meals_perday',
 "siblings"]

for target in columns:
    df_train[target] = df_train[target].replace(hashmap)
    df_test[target]= df_test[target].replace(hashmap)


  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)


In [9]:
# Fill rest with KNN or smt else

# Scale numerical (again) and KNN-impute data

#columnsx = ['age', 'height', 'weight']
#scaler = StandardScaler()
imputer = DecisionTreeClassifier()
imputer = IterativeImputer(imputer)

#df_train[columnsx] = scaler.fit_transform(df_train[columnsx])
#df_test[columnsx] = scaler.transform(df_test[columnsx])

df_train[columns] = imputer.fit_transform(df_train[columns])
df_test[columns] = imputer.transform(df_test[columns])



In [10]:
# Transform to life score
life_columns = [
 'alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'monitor_calories',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
]

bad_columns = [
 'eat_between_meals',
 'alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'smoke',
 'transportation',
]

good_columns = [
 'monitor_calories',
 'physical_activity_perweek',
 'veggies_freq',
 'water_daily',
]

df_train["life"] = 0
df_test["life"] = 0

for column in good_columns:
    df_train["life"] += df_train[column]
    df_test["life"] += df_test[column]


for column in bad_columns:
    df_train["life"] -= df_train[column]
    df_test["life"] -= df_test[column]




In [11]:
hash_obesity = {
 'Normal_Weight': 1,
 'Overweight_Level_I': 2,
 'Overweight_Level_II': 3,
 'Obesity_Type_I': 4,
 'Insufficient_Weight': 5,
 'Obesity_Type_II': 6,
 'Obesity_Type_III': 7
 }

X = df_train.iloc[:, 1:].drop(columns='obese_level')
y = df_train['obese_level'].replace(hash_obesity)



  y = df_train['obese_level'].replace(hash_obesity)


In [12]:
# Define general function to preprocess data without information leak
def classify_bmi_comprehensive(row):
    """
    Classify BMI based on age and BMI value.

    Input:
    row: A Pandas row with 'weight', 'height', and 'age' columns.

    Output:
    Returns a string that classifies the individual into BMI categories.
    """
    # Check if weight and height are valid
    if row['height'] <= 0 or row['weight'] <= 0:
        return 'Invalid data'

    # Calculate BMI
    bmi = row['weight'] / (row['height'] ** 2)

    # Age group: Children (2-19 years)
    if 2 <= row['age'] < 20:
        if bmi < 14:
            return 0 # Underweight
        elif 14 <= bmi < 18:
            return 1 # Normal weight
        elif 18 <= bmi < 21:
            return 2 # Overweight
        else:
            return 3 # Obesity 1

    # Age group: Adults (20-64 years)
    elif 20 <= row['age'] < 65:
        if bmi < 18.5:
            return 0 # "Underweight"
        elif 18.5 <= bmi < 25:
            return 1 # "Healthy Weight"
        elif 25 <= bmi < 30:
            return 2 #"Overweight"
        elif 30<= bmi < 35:
            return 3 #"Obese Class 1"
        elif 35 <= bmi < 40:
            return 4 #"Obese Class 2"
        else:
            return 5 #"Obese Class 3"
        
def preprocess_data(X_train, X_test):
    df_train = X_train.copy()
    df_test = X_test.copy()
    # START
    # 1. Scale and KNN-impute data
    columns = ['age', 'height', 'weight']
    scaler = StandardScaler()
    imputer = KNNImputer(n_neighbors=5, weights='uniform')

    df_train[columns] = scaler.fit_transform(df_train[columns])
    df_test[columns] = scaler.transform(df_test[columns])


    # Impute
    df_train[columns] = imputer.fit_transform(df_train[columns])
    df_test[columns] = imputer.transform(df_test[columns])

    # Transform back 
    df_train[columns] = scaler.inverse_transform(df_train[columns])
    df_test[columns] = scaler.inverse_transform(df_test[columns])

    # Add BMI 
    df_train['bmi_class'] = df_train.apply(lambda row: classify_bmi_comprehensive(row), axis=1)
    df_test['bmi_class'] = df_test.apply(lambda row: classify_bmi_comprehensive(row), axis=1)

    # Fill missing activity with 0
    df_train['physical_activity_perweek'].fillna('No Activity')
    df_test['physical_activity_perweek'].fillna('No Activity')

    # Encode categoricals with hashmap (my favourite)
    hashmap = {
        "Never": 0,
        "Sometimes": 1,
        "Frequently": 2,
        "Always": 3,

        "No Activity": 0,
        "up to 2": 1,
        "up to 5": 2,
        "more than 5": 3,

        "less than 1": 1,
        "1 to 2": 2,
        "more than 2": 3,
        "3 to 4": 4,
        "5 or more": 5,

        "Bicycle": 1,
        "Car": 3,
        "Motorbike": 3,
        "Public": 2,
        "Walk": 0,

        "no": 0,
        "yes": 1,

        "Male": 0,
        "Female": 1
        }

    columns = ['alcohol_freq',
    'caloric_freq',
    'devices_perday',
    'eat_between_meals',
    'gender',
    'monitor_calories',
    'parent_overweight',
    'physical_activity_perweek',
    'smoke',
    'transportation',
    'veggies_freq',
    'water_daily',
    'bmi_class',
    'meals_perday',
    "siblings"]

    for target in columns:
        df_train[target] = df_train[target].replace(hashmap)
        df_test[target]= df_test[target].replace(hashmap)


    # Fill the ordinal variables with iterative Random Forests
    imputer = DecisionTreeClassifier()
    imputer = IterativeImputer(imputer)

    df_train[columns] = imputer.fit_transform(df_train[columns])
    df_test[columns] = imputer.transform(df_test[columns])

    # Transform to life score
    life_columns = [
    'alcohol_freq',
    'caloric_freq',
    'devices_perday',
    'eat_between_meals',
    'monitor_calories',
    'physical_activity_perweek',
    'smoke',
    'transportation',
    'veggies_freq',
    'water_daily',
    ]

    df_train["life"] = 0
    df_test["life"] = 0

    for column in life_columns:
        df_train["life"] += df_train[column]
        df_test["life"] += df_test[column]

    return df_train, df_test

# Feature selection with wrapper method

In [13]:
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

In [14]:
df = pd.read_csv("../our data/no_outliers.csv", index_col=0)

X = df.drop('obese_level', axis=1)
y = df['obese_level']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
X_train, X_test = preprocess_data(X_train, X_test)

  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)


In [19]:
baseline = DecisionTreeClassifier()
k = 5 # suppose i want to select 5

rfe = RFE(estimator=baseline, n_features_to_select=k)
rfe.fit_transform(X_train, y_train)

selected_features = pd.Series(rfe.support_, index = X_train.columns)
selected_features


rfe.score(X_test, y_test)

0.9168399168399168

In [24]:
# same as above but iterate k
max_score = -999 
info = {}

for i in range(1, len(X_train.columns.tolist())+1):
    rfe = RFE(estimator=baseline, n_features_to_select=i)
    rfe.fit_transform(X_train, y_train)
    s = rfe.score(X_test, y_test)
    print(f"k={i}: {s}")
    selected_features = pd.Series(rfe.support_, index = X_train.columns)
    S=selected_features[selected_features == True].index.tolist()
    print(f"\tSelected: {S}")

    if s>max_score:
        max_score = s
        info['i'] = i
        info['selected'] = S

print("=========================*2")
print(info)


k=1: 0.6507276507276507
	Selected: ['bmi_class']
k=2: 0.8253638253638254
	Selected: ['weight', 'bmi_class']
k=3: 0.8690228690228691
	Selected: ['age', 'weight', 'bmi_class']
k=4: 0.8898128898128899
	Selected: ['age', 'gender', 'weight', 'bmi_class']
k=5: 0.9126819126819127
	Selected: ['age', 'gender', 'height', 'weight', 'bmi_class']
k=6: 0.9147609147609148
	Selected: ['age', 'alcohol_freq', 'gender', 'height', 'weight', 'bmi_class']
k=7: 0.9022869022869023
	Selected: ['age', 'alcohol_freq', 'gender', 'height', 'weight', 'bmi_class', 'life']
k=8: 0.9064449064449065
	Selected: ['age', 'alcohol_freq', 'gender', 'height', 'physical_activity_perweek', 'weight', 'bmi_class', 'life']
k=9: 0.8981288981288982
	Selected: ['age', 'alcohol_freq', 'caloric_freq', 'gender', 'height', 'parent_overweight', 'weight', 'bmi_class', 'life']
k=10: 0.8898128898128899
	Selected: ['age', 'alcohol_freq', 'caloric_freq', 'gender', 'height', 'parent_overweight', 'physical_activity_perweek', 'weight', 'bmi_class

# Statistical tests for correlation

In [26]:
df_train

Unnamed: 0.1,Unnamed: 0,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,...,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight,obese_level,bmi_class,life
0,0,21.0,0.0,0.0,2.0,1.0,1.0,1.62,3.0,0.0,...,2.0,3.0,0.0,2.0,1.0,2.0,64.0,Normal_Weight,1.0,0.0
1,1,23.0,2.0,0.0,2.0,1.0,0.0,1.80,3.0,0.0,...,4.0,0.0,0.0,2.0,1.0,2.0,77.0,Normal_Weight,1.0,0.0
2,2,20.6,2.0,0.0,1.0,1.0,0.0,1.80,3.0,0.0,...,4.0,2.0,0.0,0.0,3.0,2.0,87.0,Overweight_Level_I,2.0,5.0
3,3,22.0,1.0,0.0,1.0,1.0,0.0,1.78,1.0,0.0,...,2.0,3.0,0.0,2.0,1.0,2.0,90.0,Overweight_Level_II,2.0,0.0
4,4,22.0,1.0,0.0,1.0,1.0,0.0,1.64,3.0,0.0,...,5.0,3.0,0.0,2.0,1.0,2.0,53.0,Normal_Weight,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1598,1598,21.0,1.0,1.0,2.0,1.0,1.0,1.73,3.0,0.0,...,4.0,1.0,0.0,2.0,3.0,2.0,131.0,Obesity_Type_III,5.0,2.0
1599,1599,22.0,1.0,1.0,2.0,1.0,1.0,1.75,3.0,0.0,...,2.0,0.0,0.0,2.0,3.0,2.0,134.0,Obesity_Type_III,5.0,0.0
1600,1600,23.0,1.0,1.0,2.0,1.0,1.0,1.75,3.0,0.0,...,2.0,0.0,0.0,2.0,3.0,2.0,134.0,Obesity_Type_III,5.0,0.0
1601,1601,24.0,1.0,1.0,2.0,1.0,1.0,1.74,3.0,0.0,...,2.0,0.0,0.0,2.0,3.0,3.0,133.0,Obesity_Type_III,5.0,1.0


In [27]:
cor = df_train[['age', 'weight', 'height', 'life']].replace({"obese_level": hash_obesity}).corr()
import seaborn as sns

cor.style.highlight_between(left=-1, right=-.69, color="gold").highlight_between(left=.69, right=1, color="gold")

Unnamed: 0,age,weight,height,life
age,1.0,0.242086,-0.004945,-0.087402
weight,0.242086,1.0,0.467835,-0.019529
height,-0.004945,0.467835,1.0,-0.05237
life,-0.087402,-0.019529,-0.05237,1.0


In [28]:
cor = df_train.replace({"obese_level": hash_obesity}).iloc[:, 1:].corr(method="spearman")
cor.style.highlight_between(left=-1, right=-.69, color="red").highlight_between(left=.69, right=1, color="red").highlight_between(left=-0.01, right=0.01, color="green")

  cor = df_train.replace({"obese_level": hash_obesity}).iloc[:, 1:].corr(method="spearman")


Unnamed: 0,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight,obese_level,bmi_class,life
age,1.0,0.119686,0.121819,-0.268003,-0.120648,-0.071634,0.025875,-0.066489,-0.167786,0.270327,-0.069277,0.002592,0.079838,0.455255,0.052384,0.016187,0.383939,0.21073,0.27601,-0.070984
alcohol_freq,0.119686,1.0,0.101283,-0.047702,-0.087374,0.004973,0.128104,0.074989,-0.020602,-0.020514,-0.092939,-0.008779,0.056651,-0.019874,0.072003,0.088146,0.228232,0.138893,0.180347,-0.195728
caloric_freq,0.121819,0.101283,1.0,0.073632,-0.191589,-0.069721,0.1885,-0.02423,-0.195028,0.21723,-0.080416,-0.014858,-0.031565,0.154487,-0.030855,0.016372,0.28543,0.226441,0.2442,-0.260117
devices_perday,-0.268003,-0.047702,0.073632,1.0,0.026981,0.027206,0.066757,0.047654,-0.031382,0.036709,0.083993,-0.026627,0.018212,-0.142962,-0.063698,-0.041897,-0.039662,-0.006146,-0.050578,-0.282539
eat_between_meals,-0.120648,-0.087374,-0.191589,0.026981,1.0,0.123208,-0.083571,0.129996,0.103018,-0.199365,0.060784,0.004127,0.021977,-0.056508,0.074809,-0.18475,-0.297773,-0.127092,-0.298836,-0.112621
gender,-0.071634,0.004973,-0.069721,0.027206,0.123208,1.0,-0.642204,-0.060759,0.107133,-0.113856,0.017055,-0.020317,-0.062651,-0.127466,0.291495,-0.089187,-0.213907,0.132238,0.023268,0.163479
height,0.025875,0.128104,0.1885,0.066757,-0.083571,-0.642204,1.0,0.211793,-0.131879,0.254538,0.034904,0.030048,0.068275,0.090216,-0.082022,0.177785,0.469418,0.122214,0.117376,-0.064236
meals_perday,-0.066489,0.074989,-0.02423,0.047654,0.129996,-0.060759,0.211793,1.0,-0.007751,0.054637,0.094117,0.043815,0.031847,0.039292,0.034488,0.048746,0.046974,0.127299,0.00863,0.027841
monitor_calories,-0.167786,-0.020602,-0.195028,-0.031382,0.103018,0.107133,-0.131879,-0.007751,1.0,-0.210182,0.034011,-0.016748,0.03451,-0.054406,0.069834,0.004096,-0.215413,-0.14726,-0.140436,0.143244
parent_overweight,0.270327,-0.020514,0.21723,0.036709,-0.199365,-0.113856,0.254538,0.054637,-0.210182,1.0,-0.086852,-0.00012,0.030132,0.143631,-0.006908,0.116254,0.499149,0.259903,0.427733,-0.063689


# Statistical tests (Spearman Rank)

In [29]:
import scipy.stats as stats
from scipy.stats import mannwhitneyu


In [30]:
#note: slightly modify the code
catcol =  ['alcohol_freq',
 'caloric_freq',
 'devices_perday',
 'eat_between_meals',
 'gender',
 'monitor_calories',
 'parent_overweight',
 'physical_activity_perweek',
 'smoke',
 'transportation',
 'veggies_freq',
 'water_daily',
 'bmi_class',
 'meals_perday',
 "siblings"]


import pandas as pd
from scipy.stats import spearmanr

def test_spearman_correlation(X, y, var, alpha=0.05):
    # Calculate Spearman's rank correlation between the predictor and the target
    spearman_corr, p_value = spearmanr(X[var], y)
    
    # Interpret the result based on the p-value
    if p_value < alpha:
        result = f"{var} is IMPORTANT for prediction (Spearman's correlation = {spearman_corr:.2f}, p = {p_value:.4f})"
    else:
        result = f"{var} is NOT an important predictor (Spearman's correlation = {spearman_corr:.2f}, p = {p_value:.4f})"
    
    print(result)

# Example usage
# X is a DataFrame containing predictors, y is a Series containing the target variable, and 'var' is the column name of the predictor to test.
# test_spearman_c

X_p, _ = preprocess_data(X, X_train)
for var in catcol:
    test_spearman_correlation(X_p, y, var)


  df_train[target] = df_train[target].replace(hashmap)


alcohol_freq is IMPORTANT for prediction (Spearman's correlation = 0.12, p = 0.0000)
caloric_freq is IMPORTANT for prediction (Spearman's correlation = 0.05, p = 0.0432)
devices_perday is NOT an important predictor (Spearman's correlation = -0.04, p = 0.1226)
eat_between_meals is IMPORTANT for prediction (Spearman's correlation = -0.36, p = 0.0000)
gender is NOT an important predictor (Spearman's correlation = -0.01, p = 0.5564)
monitor_calories is IMPORTANT for prediction (Spearman's correlation = -0.08, p = 0.0011)
parent_overweight is IMPORTANT for prediction (Spearman's correlation = 0.31, p = 0.0000)
physical_activity_perweek is IMPORTANT for prediction (Spearman's correlation = -0.13, p = 0.0000)
smoke is NOT an important predictor (Spearman's correlation = -0.03, p = 0.2438)
transportation is IMPORTANT for prediction (Spearman's correlation = 0.05, p = 0.0374)
veggies_freq is NOT an important predictor (Spearman's correlation = 0.02, p = 0.5043)
water_daily is IMPORTANT for pred



# Model testing

In [31]:
# set up rkf

def run(model, X, y):
    my_model = model
    rkf = RepeatedKFold(n_splits=10)

    scores_train = []
    scores_val = []

    i = 0
    for (train_index, test_index) in rkf.split(X, y):
        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        X_train, X_test = preprocess_data(X_train, X_test)

        my_model.fit(X_train, y_train)

        report1 = classification_report(y_test, y_hat:=my_model.predict(X_test), output_dict=True)
        report2 = classification_report(y_train, y_hat:=my_model.predict(X_train), output_dict=True)

        scores_val.append(report1["macro avg"]["f1-score"])
        scores_train.append(report2["macro avg"]["f1-score"])
        print(i)
        i+=1

    return np.array(scores_train).mean(), np.array(scores_val).mean(), my_model.feature_importances_

        

In [34]:
run(RandomForestClassifier(), X, y)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


(0.999992936886283,
 0.9342779367893845,
 array([0.08684592, 0.02916015, 0.01698479, 0.02002437, 0.02886051,
        0.06605276, 0.08397836, 0.02910226, 0.00474715, 0.0283351 ,
        0.01465299, 0.01462235, 0.00200634, 0.02188073, 0.02701335,
        0.01607257, 0.23612015, 0.23541477, 0.0381254 ]))

In [35]:
A = np. array([0.08684592, 0.02916015, 0.01698479, 0.02002437, 0.02886051,
        0.06605276, 0.08397836, 0.02910226, 0.00474715, 0.0283351 ,
        0.01465299, 0.01462235, 0.00200634, 0.02188073, 0.02701335,
        0.01607257, 0.23612015, 0.23541477, 0.0381254 ])

pd.DataFrame(A, index=X_train.columns, columns=["b"]).sort_values(by="b",ascending=False).style.bar()

Unnamed: 0,b
weight,0.23612
bmi_class,0.235415
age,0.086846
height,0.083978
gender,0.066053
life,0.038125
alcohol_freq,0.02916
meals_perday,0.029102
eat_between_meals,0.028861
parent_overweight,0.028335


In [78]:
run(DecisionTreeClassifier(), X, y)

  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target].replace(hashmap)
  df_test[target]= df_test[target].replace(hashmap)
  df_train[target] = df_train[target]

(1.0,
 0.8887583831335648,
 array([0.10193846, 0.03012724, 0.01176187, 0.00337386, 0.00445542,
        0.1610644 , 0.08341583, 0.01239505, 0.00134955, 0.01086837,
        0.00080973, 0.00495463, 0.        , 0.00766767, 0.00428283,
        0.00735991, 0.19896782, 0.34659038, 0.00861699]))

In [79]:
A = np.array([0.10193846, 0.03012724, 0.01176187, 0.00337386, 0.00445542,
        0.1610644 , 0.08341583, 0.01239505, 0.00134955, 0.01086837,
        0.00080973, 0.00495463, 0.        , 0.00766767, 0.00428283,
        0.00735991, 0.19896782, 0.34659038, 0.00861699])


pd.DataFrame(A, index=X_train.columns, columns=["b"]).sort_values(by="b",ascending=False).style.bar()

Unnamed: 0,b
bmi_class,0.34659
weight,0.198968
gender,0.161064
age,0.101938
height,0.083416
alcohol_freq,0.030127
meals_perday,0.012395
caloric_freq,0.011762
parent_overweight,0.010868
life,0.008617


In [27]:
run(LogisticRegression(solver="liblinear"), X, y)

AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

In [None]:
# run(LogisticRegression(solver="newton-cg"), X, y)

In [None]:
# run(LogisticRegression(solver="newton-cholesky"), X, y)

In [None]:
#run(GaussianNB(), X, y)