In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import scipy.stats as stats
from ucimlrepo import fetch_ucirepo 


# Load the dataset (UCI repository entry with id 544)
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)
# Work on an independent copy: the steps further down add and overwrite
# columns on df.
# NOTE(review): `.data.features` is presumably a column selection of the
# fetched table; writing into such a selection can raise pandas'
# SettingWithCopyWarning, and the copy avoids that either way.
df = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features.copy()

# 1. Creating variables 
def categorize_imc(imc):
    """Return the weight-category label for the index value ``imc``.

    The value is checked against ascending, exclusive upper bounds and the
    first bound it falls below decides the label. A value that is below
    none of them (40 or more, and also NaN, because every ``<`` comparison
    with NaN is false) is labelled 'Obesity_Type_III'.
    """
    upper_bounds = (
        (18.5, 'Insufficient_Weight'),
        (25, 'Normal_Weight'),
        (28, 'Overweight_Level_I'),
        (30, 'Overweight_Level_II'),
        (35, 'Obesity_Type_I'),
        (40, 'Obesity_Type_II'),
    )
    for bound, label in upper_bounds:
        if imc < bound:
            return label
    return 'Obesity_Type_III'

# IMC: each row's Weight divided by the square of its Height.
df['IMC'] = df['Weight'].div(df['Height'].pow(2))
# Label every row with the category categorize_imc assigns to its IMC value.
# Note that this label is derived from Weight and Height alone.
df['NObeyesdad'] = df['IMC'].apply(categorize_imc)


# cleaning the dataset
# 2. missing values (only reported; nothing is imputed or dropped here)
print('There is any missing value:', df.isnull().values.any())

# 3. duplicates
# Rebind to an independent copy instead of mutating in place, so the column
# assignments below write to a frame that pandas does not treat as a slice
# of another one.
df = df.drop_duplicates().copy()

# 4. binary (0/1) encoding of the two-level categorical variables
# This is not one-hot encoding: each column is replaced by a single indicator.
# The comparison is vectorised; any value other than the named level,
# including a missing one, becomes 0.
df['Gender'] = (df['Gender'] == 'Male').astype('int64')
for yes_no_column in ('family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC'):
    df[yes_no_column] = (df[yes_no_column] == 'yes').astype('int64')

# 5. removing outliers
def remove_outliers(df, columns, threshold=3):
    """Drop the rows whose value in any of ``columns`` is a z-score outlier.

    Columns are screened in the order given, and each column's z-scores are
    computed on the rows that survived the previous columns.

    Args:
        df: DataFrame to filter. It is not modified; a filtered frame is
            returned instead.
        columns: Iterable of numeric column names to screen.
        threshold: Rows whose absolute z-score is greater than or equal to
            this value are dropped. Defaults to 3.

    Returns:
        The DataFrame restricted to the rows that passed every column.
    """
    for column in columns:
        # nan_policy='omit' stops a single missing value from turning every
        # z-score of the column into NaN.
        z_scores = np.abs(
            np.asarray(stats.zscore(df[column].to_numpy(), nan_policy='omit'))
        )
        # A z-score is undefined (NaN) for a missing value or a constant
        # column. That is not evidence of an outlier, so such rows are kept;
        # a plain `z_scores < threshold` test would silently drop them all.
        df = df[~(z_scores >= threshold)]
    return df
# .copy() detaches the filtered rows from the frame they were selected from,
# so the column assignment below writes to df itself instead of to a slice
# (which is what triggers pandas' SettingWithCopyWarning).
df = remove_outliers(df, ['Age', 'Height', 'Weight']).copy()

# 6. standardizing the data
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split further down. The model in this script uses only the FAF column, so
# these scaled columns do not feed into it.
scaler = StandardScaler()
df[['Age', 'Height', 'Weight']] = scaler.fit_transform(df[['Age', 'Height', 'Weight']])

# 7. Preparing the data for modeling
# Integer code for each category label, numbered from 0 in the order listed.
nobesity_map = {
    label: code
    for code, label in enumerate([
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III',
    ])
}

# Swap every label for its integer code.
df['NObeyesdad'] = df['NObeyesdad'].map(nobesity_map)

# 8. splitting the data into training and testing sets
# A single predictor column (FAF) against the integer-coded category.
X = df[['FAF']]
y = df['NObeyesdad']
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# fit() returns the estimator itself, so construction and training are chained.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Each heading and its table are printed on consecutive lines.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")




There is any missing value: False
Classification Report:
              precision    recall  f1-score   support

           0       0.09      0.06      0.07        53
           1       0.40      0.58      0.47        66
           2       0.22      0.15      0.17        62
           3       0.00      0.00      0.00        29
           4       0.24      0.15      0.19        73
           5       0.29      0.23      0.25        71
           6       0.21      0.46      0.29        59

    accuracy                           0.25       413
   macro avg       0.21      0.23      0.21       413
weighted avg       0.23      0.25      0.23       413

Confusion Matrix:
[[ 3 16  2  3  7  7 15]
 [ 0 38  2  1  2  0 23]
 [ 7 14  9  1  9  6 16]
 [ 5  5  6  0  2  3  8]
 [ 4 16 10  3 11  9 20]
 [10  4  9  5 10 16 17]
 [ 6  2  3  2  4 15 27]]
