# Cleaning and EDA

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [7]:
# =======================================================
# I. Carga y Análisis Inicial
# =======================================================
df = pd.read_csv('../data/raw/obesity_estimation_modified.csv')
print("Análisis Inicial de Tipos de Datos y Nulos:")
df.info()


Análisis Inicial de Tipos de Datos y Nulos:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2153 entries, 0 to 2152
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Gender                          2135 non-null   object
 1   Age                             2126 non-null   object
 2   Height                          2125 non-null   object
 3   Weight                          2137 non-null   object
 4   family_history_with_overweight  2133 non-null   object
 5   FAVC                            2137 non-null   object
 6   FCVC                            2136 non-null   object
 7   NCP                             2129 non-null   object
 8   CAEC                            2131 non-null   object
 9   SMOKE                           2134 non-null   object
 10  CH2O                            2124 non-null   object
 11  SCC                             2138 non-null   object
 12  FAF 

In [8]:
# =======================================================
# II. Limpieza de Estructura
# =======================================================
df = df.drop(columns=['mixed_type_col'])
df['NObeyesdad'] = df['NObeyesdad'].str.lower().str.strip()



In [9]:
# =======================================================
# III. Conversión de Tipos de Datos y Manejo de Errores
# =======================================================
numerical_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
for col in numerical_cols:
    # Coercer a NaN los valores no numéricos
    df[col] = pd.to_numeric(df[col], errors='coerce')



In [10]:
# =======================================================
# IV. Imputación de Valores Nulos
# =======================================================
# Imputar Numéricas con Mediana
for col in df.select_dtypes(include=[np.float64]).columns:
    df[col].fillna(df[col].median(), inplace=True)
# Imputar Categóricas con Moda
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [11]:
# =======================================================
# V. Gestión de Outliers (Capping)
# =======================================================
capping_limits = {
    'Age': 100, 'Height': 2.5, 'Weight': 300, 'FCVC': 3,
    'NCP': 4, 'CH2O': 3, 'FAF': 3, 'TUE': 2
}
for col, limit in capping_limits.items():
    df[col] = df[col].clip(upper=limit)



In [12]:
# =======================================================
# VII. Preprocesamiento Final (Encoding)
# =======================================================
# 1. Label Encoding para la Variable Target
le = LabelEncoder()
df['NObeyesdad_Encoded'] = le.fit_transform(df['NObeyesdad'])
df_temp = df.drop(columns=['NObeyesdad']) # Quitar la columna original

# 2. One-Hot Encoding para Features
categorical_features = df_temp.select_dtypes(include=['object']).columns
df_final_encoded = pd.get_dummies(df_temp, columns=categorical_features, drop_first=True)


In [13]:

# =======================================================
# VIII. Guardar el Dataset Final
# =======================================================

df_final_encoded.to_csv('../data/processed/a01796211/obesity_estimation_final_preprocessed.csv', index=False)
print(f"\nDataset final guardado como: obesity_estimation_final_preprocessed.csv")



Dataset final guardado como: obesity_estimation_final_preprocessed.csv
