In [48]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [49]:
import yaml

 #leemos el archivo Yaml en Python
try:
    with open("../config.yaml", 'r') as file:
        config = yaml.safe_load(file)
except Exception as e:
    print('Error leyendo el archivo .yaml:', e)

In [50]:
df = pd.read_csv(config['data']['df'], sep=",", header=0, low_memory=False)

In [51]:
#Hacemos que se vean rodas las columnas
pd.set_option('display.max_columns', None)

FAVC:" Do you eat high caloric food frequently? "

FCVC:" Do you usually eat vegetables in your meals? "

NCP:" How many main meals do you have daily? "

CAEC:" Do you eat any food between meals? "

SMOKE:" Do you smoke? "

CH2O:" How much water do you drink daily? "

SCC:" Do you monitor the calories you eat daily? "

FAF:" How often do you have physical activity? "

TUE:" How much time do you use technological devices such as cell phone, videogames, television, computer and others? "

CALC :" How often do you drink alcohol? "

MTRANS:" Which transportation do you usually use?

In [106]:
df

Unnamed: 0,age,gender,height,weight,calc,favc,fcvc,ncp,scc,smoke,ch2o,family_history,faf,tue,caec,mtrans,obesity_level,bmi
0,21.000000,Female,1.620000,64.000000,no,no,2.0,3.0,no,no,2.000000,yes,0.000000,1.000000,Sometimes,Public_Transportation,Normal_Weight,243865.264441
1,21.000000,Female,1.520000,56.000000,Sometimes,no,3.0,3.0,yes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight,242382.271468
2,23.000000,Male,1.800000,77.000000,Frequently,no,2.0,3.0,no,no,2.000000,yes,2.000000,1.000000,Sometimes,Public_Transportation,Normal_Weight,237654.320988
3,27.000000,Male,1.800000,87.000000,Frequently,no,3.0,3.0,no,no,2.000000,no,2.000000,0.000000,Sometimes,Walking,Overweight_Level_I,268518.518519
4,22.000000,Male,1.780000,89.800000,Sometimes,no,2.0,1.0,no,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II,283423.810125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,20.976842,Female,1.710730,131.408528,Sometimes,yes,3.0,3.0,no,no,1.728139,yes,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III,449014.750007
2107,21.982942,Female,1.748584,133.742943,Sometimes,yes,3.0,3.0,no,no,2.005130,yes,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III,437419.233525
2108,22.524036,Female,1.752206,133.689352,Sometimes,yes,3.0,3.0,no,no,2.054193,yes,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III,435438.165841
2109,24.361936,Female,1.739450,133.346641,Sometimes,yes,3.0,3.0,no,no,2.852339,yes,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III,440715.354033


In [23]:
#Miramos tanto sus dtypes como si tiene nulos
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             2111 non-null   float64
 1   Gender                          2111 non-null   object 
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   CALC                            2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   SCC                             2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  family_history_with_overweight  2111 non-null   object 
 12  FAF                             21

In [24]:
#Miramos si tiene datos nulos
df.isnull().sum()

Age                               0
Gender                            0
Height                            0
Weight                            0
CALC                              0
FAVC                              0
FCVC                              0
NCP                               0
SCC                               0
SMOKE                             0
CH2O                              0
family_history_with_overweight    0
FAF                               0
TUE                               0
CAEC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

### **Limpieza**

In [56]:
#Quitamos las mayusculas 
df.columns=df.columns.str.lower()

In [57]:
#Cambiamos los nombres de dos columnas "family_history_with_overweight" y "nobeyesdad"
df=df.rename(columns={"family_history_with_overweight": "family_history", "nobeyesdad": "obesity_level"})

In [58]:
#Calculo el índice de masa corporal (IMC) por nivel de obesidad
df['bmi'] = df['weight'] / (df['height'] / 100) ** 2

**Preparamos los datos**

In [61]:
#seleccionamos las columnas categoricas y numericas y las separamos en dataframes distintos.
potential_categorical_from_numerical = df.select_dtypes("number").loc[:, df.select_dtypes("number").nunique() < 20]

df_categorical = pd.concat([df.select_dtypes("object"), potential_categorical_from_numerical], axis=1)

df_numerical = df.select_dtypes("number").drop(columns=potential_categorical_from_numerical.columns)

In [65]:
# Realizar One-Hot Encoding usando pd.get_dummies()
df_encoded = pd.get_dummies(df_categorical, columns=['calc', 'caec', 'mtrans'], drop_first=False)

In [66]:
# Convertir 'gender' a formato binario
df_encoded['gender'] = df_encoded['gender'].apply(lambda x: 1 if x == 'Male' else 0)

# Asegurar que las columnas binarias estén en formato correcto (0 y 1)
binary_columns = ['favc', 'scc', 'smoke', 'family_history']
for col in binary_columns:
    df_encoded[col] = df_encoded[col].apply(lambda x: 1 if x == 'yes' else 0)

In [67]:
df_encoded

Unnamed: 0,gender,favc,scc,smoke,family_history,obesity_level,calc_Always,calc_Frequently,calc_Sometimes,calc_no,caec_Always,caec_Frequently,caec_Sometimes,caec_no,mtrans_Automobile,mtrans_Bike,mtrans_Motorbike,mtrans_Public_Transportation,mtrans_Walking
0,0,0,0,0,1,Normal_Weight,False,False,False,True,False,False,True,False,False,False,False,True,False
1,0,0,1,1,1,Normal_Weight,False,False,True,False,False,False,True,False,False,False,False,True,False
2,1,0,0,0,1,Normal_Weight,False,True,False,False,False,False,True,False,False,False,False,True,False
3,1,0,0,0,0,Overweight_Level_I,False,True,False,False,False,False,True,False,False,False,False,False,True
4,1,0,0,0,0,Overweight_Level_II,False,False,True,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,1,0,0,1,Obesity_Type_III,False,False,True,False,False,False,True,False,False,False,False,True,False
2107,0,1,0,0,1,Obesity_Type_III,False,False,True,False,False,False,True,False,False,False,False,True,False
2108,0,1,0,0,1,Obesity_Type_III,False,False,True,False,False,False,True,False,False,False,False,True,False
2109,0,1,0,0,1,Obesity_Type_III,False,False,True,False,False,False,True,False,False,False,False,True,False


In [68]:
# Unimos los dos dataframes
df_final = pd.concat([df_encoded, df_numerical], axis=1)

In [69]:
df_final

Unnamed: 0,gender,favc,scc,smoke,family_history,obesity_level,calc_Always,calc_Frequently,calc_Sometimes,calc_no,caec_Always,caec_Frequently,caec_Sometimes,caec_no,mtrans_Automobile,mtrans_Bike,mtrans_Motorbike,mtrans_Public_Transportation,mtrans_Walking,age,height,weight,fcvc,ncp,ch2o,faf,tue,bmi
0,0,0,0,0,1,Normal_Weight,False,False,False,True,False,False,True,False,False,False,False,True,False,21.000000,1.620000,64.000000,2.0,3.0,2.000000,0.000000,1.000000,243865.264441
1,0,0,1,1,1,Normal_Weight,False,False,True,False,False,False,True,False,False,False,False,True,False,21.000000,1.520000,56.000000,3.0,3.0,3.000000,3.000000,0.000000,242382.271468
2,1,0,0,0,1,Normal_Weight,False,True,False,False,False,False,True,False,False,False,False,True,False,23.000000,1.800000,77.000000,2.0,3.0,2.000000,2.000000,1.000000,237654.320988
3,1,0,0,0,0,Overweight_Level_I,False,True,False,False,False,False,True,False,False,False,False,False,True,27.000000,1.800000,87.000000,3.0,3.0,2.000000,2.000000,0.000000,268518.518519
4,1,0,0,0,0,Overweight_Level_II,False,False,True,False,False,False,True,False,False,False,False,True,False,22.000000,1.780000,89.800000,2.0,1.0,2.000000,0.000000,0.000000,283423.810125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,1,0,0,1,Obesity_Type_III,False,False,True,False,False,False,True,False,False,False,False,True,False,20.976842,1.710730,131.408528,3.0,3.0,1.728139,1.676269,0.906247,449014.750007
2107,0,1,0,0,1,Obesity_Type_III,False,False,True,False,False,False,True,False,False,False,False,True,False,21.982942,1.748584,133.742943,3.0,3.0,2.005130,1.341390,0.599270,437419.233525
2108,0,1,0,0,1,Obesity_Type_III,False,False,True,False,False,False,True,False,False,False,False,True,False,22.524036,1.752206,133.689352,3.0,3.0,2.054193,1.414209,0.646288,435438.165841
2109,0,1,0,0,1,Obesity_Type_III,False,False,True,False,False,False,True,False,False,False,False,True,False,24.361936,1.739450,133.346641,3.0,3.0,2.852339,1.139107,0.586035,440715.354033


**Entrenamiento**

In [88]:
# Eliminamos la columna 'obesity_level' de los features y lo colocamos como target
features = df_final.drop(columns = "obesity_level")
target = df_final[["obesity_level"]]

In [89]:
rf = RandomForestClassifier(max_depth=10)
rf.fit(features, target)

  return fit_method(estimator, *args, **kwargs)


**Predicción**

In [90]:
# Creamos un nuevo dataframe para datos artificiales categoricos
df_cat_to_predict = pd.DataFrame(columns= df_categorical.columns)

# Eliminamos la columna 'obesity_level' ya que es la que queremos predecir
df_cat_to_predict.drop(columns=['obesity_level'], inplace=True)

# Generamos datos aleatorios
gender_options = df_categorical['gender'].unique()
temp_gender = np.random.choice(gender_options)
df_cat_to_predict.loc[1, "gender"] = temp_gender

calc_options= df_categorical['calc'].unique()
temp_calc=np.random.choice(calc_options)
df_cat_to_predict.loc[1, "calc"] = temp_calc

favc_options= df_categorical['favc'].unique()
temp_favc=np.random.choice(favc_options)
df_cat_to_predict.loc[1, "favc"] = temp_favc

scc_options= df_categorical['scc'].unique()
temp_scc=np.random.choice(scc_options)
df_cat_to_predict.loc[1, "scc"] = temp_scc

smoke_options= df_categorical['smoke'].unique()
temp_smoke=np.random.choice(smoke_options)
df_cat_to_predict.loc[1, "smoke"] = temp_smoke

family_history_options= df_categorical['family_history'].unique()
temp_family_history=np.random.choice(family_history_options)
df_cat_to_predict.loc[1, "family_history"] = temp_family_history

caec_options= df_categorical['caec'].unique()
temp_caec=np.random.choice(caec_options)
df_cat_to_predict.loc[1, "caec"] = temp_caec

mtrans_options= df_categorical['mtrans'].unique()
temp_mtrans=np.random.choice(mtrans_options)
df_cat_to_predict.loc[1, "mtrans"] = temp_mtrans

In [91]:
# Convertir 'gender' a formato binario
df_cat_to_predict['gender'] = df_cat_to_predict['gender'].apply(lambda x: 1 if x == 'Male' else 0)

# Asegurar que las columnas binarias estén en formato correcto (0 y 1)
binary_columns = ['favc', 'scc', 'smoke', 'family_history']
for col in binary_columns:
    df_cat_to_predict[col] = df_cat_to_predict[col].apply(lambda x: 1 if x == 'yes' else 0)

In [92]:
#Creamos las columnas faltantes
new_columns = ['calc_Always','calc_Frequently',
       'calc_Sometimes', 'calc_no','caec_Always','caec_Frequently', 'caec_Sometimes',
       'caec_no','mtrans_Automobile','mtrans_Bike', 'mtrans_Motorbike',
       'mtrans_Public_Transportation', 'mtrans_Walking']

# Añadimos las nuevas columnas
for col in new_columns:
    df_cat_to_predict[col] = np.nan

In [93]:
# Rellenamos las filas de las columnas nuevas con los datos aleatorios que se generen
df_cat_to_predict.loc[1, "calc_" + df_cat_to_predict['calc'][1]] = 1 

In [94]:
df_cat_to_predict.loc[1, "caec_" + df_cat_to_predict['caec'][1]] = 1 

In [95]:
df_cat_to_predict.loc[1, "mtrans_" + df_cat_to_predict['mtrans'][1]] = 1 

In [96]:
# Eliminamos las columnas sobrantes 
df_cat_to_predict.drop(columns=['calc', 'caec', 'mtrans'], inplace=True)

In [97]:
# Rellenamos los Na con 0
df_cat_to_predict.fillna(0, inplace=True)

In [98]:
# # Creamos un nuevo dataframe para datos artificiales numericos
df_num_predict = pd.DataFrame(columns= df_numerical.columns)

# Generamos datos aleatorios
age_options= df_numerical['age'].unique()
temp_age=np.random.choice(age_options)
df_num_predict.loc[1, "age"] = temp_age

height_options= df_numerical['height'].unique()
temp_height=np.random.choice(height_options)
df_num_predict.loc[1, "height"] = temp_height

weight_options= df_numerical['weight'].unique()
temp_weight=np.random.choice(weight_options)
df_num_predict.loc[1, "weight"] = temp_weight

fcvc_options= df_numerical['fcvc'].unique()
temp_fcvc=np.random.choice(fcvc_options)
df_num_predict.loc[1, "fcvc"] = temp_fcvc

ncp_options= df_numerical['ncp'].unique()
temp_ncp=np.random.choice(ncp_options)
df_num_predict.loc[1, "ncp"] = temp_ncp

ch2o_options= df_numerical['ch2o'].unique()
temp_ch2o=np.random.choice(ch2o_options)
df_num_predict.loc[1, "ch2o"] = temp_ch2o

faf_options= df_numerical['faf'].unique()
temp_faf=np.random.choice(faf_options)
df_num_predict.loc[1, "faf"] = temp_faf

tue_options= df_numerical['tue'].unique()
temp_tue=np.random.choice(tue_options)
df_num_predict.loc[1, "tue"] = temp_tue

bmi_options= df_numerical['bmi'].unique()
temp_bmi=np.random.choice(bmi_options)
df_num_predict.loc[1, "bmi"] = temp_bmi

In [99]:
# Unimos los dos dataframes
df_predict = pd.concat([df_cat_to_predict, df_num_predict], axis=1)

In [100]:
# Cambiamos el dtype a integer
df_predict[df_predict.select_dtypes(include=["bool"]).columns]=df_predict[df_predict.select_dtypes(include=["bool"]).columns].astype(int)

In [101]:
df_predict

Unnamed: 0,gender,favc,scc,smoke,family_history,calc_Always,calc_Frequently,calc_Sometimes,calc_no,caec_Always,caec_Frequently,caec_Sometimes,caec_no,mtrans_Automobile,mtrans_Bike,mtrans_Motorbike,mtrans_Public_Transportation,mtrans_Walking,age,height,weight,fcvc,ncp,ch2o,faf,tue,bmi
1,1,0,0,1,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,22.935612,1.812259,133.10761,2.04516,3.292956,1.796257,1.999836,0.561661,356815.035918


In [103]:
# Generamos la predicción
rf.predict(df_predict)[0]

'Obesity_Type_II'