In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import scipy.stats as stats
from ucimlrepo import fetch_ucirepo 


# Load the dataset (UCI ML Repository, dataset id 544).
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544) 
# Work on an independent copy: the steps below add columns and drop rows in
# place, and those writes should not reach the frame owned by the fetched
# dataset object (which may itself be a slice of a larger frame -- in that
# case pandas would also emit SettingWithCopyWarning on the first assignment).
df = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features.copy()

# 0. First Analysis
print('Dataset shape:', df.shape)
print('Dataset columns:', df.columns)
print('Dataset head:\n', df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be wrapped in print(): that would emit the report *before* the
# header and then print a stray "None".
print('Dataset info:')
df.info()
print('Dataset description:\n', df.describe())

# 1. Creating variables 
def categorize_imc(imc):
    """Map a body-mass-index value to its obesity-level label.

    The value is compared against ascending, exclusive upper bounds; the
    label of the first bound that ``imc`` is strictly below is returned.

    Args:
        imc: Numeric body-mass-index value.

    Returns:
        One of ``'Insufficient_Weight'``, ``'Normal_Weight'``,
        ``'Overweight_Level_I'``, ``'Overweight_Level_II'``,
        ``'Obesity_Type_I'``, ``'Obesity_Type_II'`` or
        ``'Obesity_Type_III'``.

    Note:
        Anything that is not below 40 falls through to
        ``'Obesity_Type_III'``. That includes NaN, because every ``<``
        comparison against NaN is false.
    """
    # (exclusive upper bound, label), in ascending order of the bound.
    upper_bounds = (
        (18.5, 'Insufficient_Weight'),
        (25, 'Normal_Weight'),
        (28, 'Overweight_Level_I'),
        (30, 'Overweight_Level_II'),
        (35, 'Obesity_Type_I'),
        (40, 'Obesity_Type_II'),
    )
    for limit, label in upper_bounds:
        if imc < limit:
            return label
    return 'Obesity_Type_III'

# Body-mass index = weight / height squared; each row is then labelled with
# the obesity class that categorize_imc assigns to that index.
df['IMC'] = df['Weight'].div(df['Height'].pow(2))
df['NObeyesdad'] = df['IMC'].map(categorize_imc)


# cleaning the dataset
# 2. missing values -- only reported here; nothing is imputed or dropped.
print('There is any missing value?(True = Yes // False = No)', df.isnull().values.any())

# 3. duplicates -- fully identical rows are removed in place.
df.drop_duplicates(inplace=True)

# 4. binary (0/1) encoding of the two-valued categorical columns:
#    1 when the value equals the listed label, 0 for anything else.
df['Gender'] = df['Gender'].eq('Male').astype('int64')
df['family_history_with_overweight'] = df['family_history_with_overweight'].eq('yes').astype('int64')
df['FAVC'] = df['FAVC'].eq('yes').astype('int64')
df['SMOKE'] = df['SMOKE'].eq('yes').astype('int64')
df['SCC'] = df['SCC'].eq('yes').astype('int64')

# 5. removing outliers
def remove_outliers(df, columns, threshold=3):
    """Drop rows whose value in any of ``columns`` is a z-score outlier.

    Columns are processed in the given order, and each column's z-scores
    are computed on the rows that survived the previous columns. A row is
    removed when the absolute z-score (population standard deviation,
    ``ddof=0``) of its value is ``threshold`` or more.

    Args:
        df: Input DataFrame. It is not modified; a filtered frame is
            returned.
        columns: Iterable of numeric column names to screen.
        threshold: Absolute z-score at or above which a row is dropped.
            Defaults to 3.

    Returns:
        DataFrame containing only the rows that were not flagged.

    Note:
        A column with no spread (constant) or with no observed values
        defines no z-score, so it is skipped instead of wiping out every
        row. Rows whose value in a screened column is missing are kept:
        a missing value is not evidence of an outlier.
    """
    for column in columns:
        values = df[column].to_numpy(dtype=float)
        observed = values[~np.isnan(values)]
        if observed.size == 0:
            # Nothing to measure (empty frame or all-missing column).
            continue
        spread = observed.std()
        if spread == 0:
            # Constant column: dividing by zero would turn every z-score
            # into NaN and the filter below would discard all rows.
            continue
        z_scores = np.abs((values - observed.mean()) / spread)
        # Written as "not (z >= threshold)" so NaN z-scores (missing
        # values) compare False and the row is retained.
        df = df[~(z_scores >= threshold)]
    return df
# Continuous columns that are screened for outliers and then standardised.
numeric_columns = ['Age', 'Height', 'Weight', 'NCP', 'CH2O', 'FAF']
df = remove_outliers(df, numeric_columns)

# 6. standardizing the data
# NOTE(review): the scaler is fitted on every row *before* the train/test
# split below, so test rows contribute to the fitted statistics. Confirm
# whether that is acceptable for this analysis; for a scale-sensitive model
# the scaler should be fitted on the training rows only.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = standardized_values

# 7. Preparing the data for modeling
# Class labels are replaced by integer codes 0..6, in the order listed.
nobesity_map = {
    label: code
    for code, label in enumerate([
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III',
    ])
}
df['NObeyesdad'] = df['NObeyesdad'].map(nobesity_map)

# 8. splitting the data into training and testing sets
# Only these five columns are used as predictors; the target is the encoded
# class. 20% of the rows are held out, with a fixed seed.
feature_columns = ['family_history_with_overweight', 'FAVC', 'NCP', 'CH2O', 'FAF']
X = df[feature_columns]
y = df['NObeyesdad']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a decision tree on the training rows and predict the held-out rows.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))






Dataset shape: (2111, 16)
Dataset columns: Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS'],
      dtype='object')
Dataset head:
    Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no   2.0   no  2