In [4]:
import pandas as pd
import numpy as np

import os
import shutil
import zipfile

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

In [5]:
# Raw training data, read relative to the notebook's working directory.
train_csv_path = r'../data/IN/train.csv'
df_raw_train = pd.read_csv(train_csv_path)

In [6]:
# Map the dataset's abbreviated column codes to descriptive names.
new_column_names = {
    "FAVC": "HighCalorieFoodFreq",
    "FCVC": "VegetableConsumptionFreq",
    "NCP": "MainMealsPerDay",
    "CAEC": "SnackingFrequency",
    "SMOKE": "SmokingHabit",
    "CH2O": "DailyWaterIntake",
    "SCC": "CaloricMonitoring",
    "FAF": "PhysicalActivityPerWeek",
    "TUE": "ScreenTimePerDay",
    "CALC": "AlcoholConsumption",
    "MTRANS": "ModeOfTransportation"
}

# Rebind the name instead of renaming in place: the result is the same frame
# content, but no previously displayed object is mutated behind the reader's back.
# Labels missing from the frame are ignored, so the cell can be re-run safely.
df_raw_train = df_raw_train.rename(columns=new_column_names)

In [7]:
# Show the column labels after the rename step.
df_raw_train.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'HighCalorieFoodFreq',
       'VegetableConsumptionFreq', 'MainMealsPerDay', 'SnackingFrequency',
       'SmokingHabit', 'DailyWaterIntake', 'CaloricMonitoring',
       'PhysicalActivityPerWeek', 'ScreenTimePerDay', 'AlcoholConsumption',
       'ModeOfTransportation', 'NObeyesdad'],
      dtype='object')

In [8]:
# Columns discarded when the prepared training frame is built.
cols_drop = ["SmokingHabit"]
# Every column label except the discarded ones. On a fresh run this is computed
# while "id" is still a regular column, so "id" is part of the result.
cols_keep = df_raw_train.columns[~df_raw_train.columns.isin(cols_drop)]
# Name of the target column.
predict_col = "NObeyesdad"

# Promote "id" to the index. The guard makes the cell re-runnable: once "id" is
# the index it is no longer a column, and an unconditional set_index("id")
# would raise KeyError on the second execution.
if "id" in df_raw_train.columns:
    df_raw_train = df_raw_train.set_index("id")


In [9]:
# Independent working frame without the discarded columns.
df_prepared_train = df_raw_train.drop(cols_drop, axis="columns").copy()

# Manejo de outliers

In [10]:
def windsorize_upper(df, columns, upper_percentile=0.99):
    """
    Cap the upper tail of the given columns at a percentile (one-sided winsorization).

    Args:
        df (pd.DataFrame): Input frame. It is not modified; a capped copy is returned.
        columns (str | iterable of str): Column name, or column names, to cap.
        upper_percentile (float): Quantile in [0, 1] used as the cap for each column.

    Returns:
        pd.DataFrame: Copy of ``df`` in which every value above a column's
        ``upper_percentile`` quantile is replaced by that quantile. Columns not
        listed in ``columns`` are returned unchanged.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    # Work on a copy so the caller's frame is never altered as a side effect.
    capped = df.copy()
    for col in columns:
        upper_limit = capped[col].quantile(upper_percentile)
        # Comparisons with NaN are False, so missing values pass through untouched.
        capped[col] = np.where(capped[col] > upper_limit, upper_limit, capped[col])
    return capped

# Cap the upper tails of Age and Height; each column is capped independently,
# so one call over both columns gives the same result as two separate calls.
df_prepared_train = windsorize_upper(df_prepared_train, ["Age", "Height"])



In [11]:
# Summary statistics with extra upper-tail percentiles, one row per column,
# rendered with a colour gradient.
tail_percentiles = [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
summary_table = df_prepared_train.describe(percentiles=tail_percentiles).T
summary_table.style.background_gradient(cmap='coolwarm')

Unnamed: 0,count,mean,std,min,25%,50%,75%,90%,95%,99%,max
Age,20758.0,23.778037,5.425799,14.0,20.0,22.815416,26.0,31.386405,35.460417,41.0,41.0
Height,20758.0,1.700016,0.086771,1.45,1.631856,1.7,1.762887,1.818641,1.84629,1.889104,1.889104
Weight,20758.0,87.887768,26.379443,39.0,66.0,84.064875,111.600553,120.996581,132.116491,137.855041,165.057269
VegetableConsumptionFreq,20758.0,2.445908,0.533218,1.0,2.0,2.393837,3.0,3.0,3.0,3.0,3.0
MainMealsPerDay,20758.0,2.761332,0.705375,1.0,3.0,3.0,3.0,3.0,3.520555,4.0,4.0
DailyWaterIntake,20758.0,2.029418,0.608467,1.0,1.792022,2.0,2.549617,2.868167,3.0,3.0,3.0
PhysicalActivityPerWeek,20758.0,0.981747,0.838302,0.0,0.008013,1.0,1.587406,2.0,2.545707,3.0,3.0
ScreenTimePerDay,20758.0,0.616756,0.602113,0.0,0.0,0.573887,1.0,1.444183,2.0,2.0,2.0


# Normalización

In [12]:
from sklearn.preprocessing import OneHotEncoder, RobustScaler, LabelEncoder, OrdinalEncoder
import pandas as pd

class PreprocessingPipeline:
    """
    Bundle of encoders and a scaler that turns the raw feature frame into a
    fully numeric one.

    Column handling:
        * columns with ``int64``/``float64`` dtype at fit time -> ``RobustScaler``
        * ``caracteristicas_categoricas_ordinal`` -> ``OrdinalEncoder`` with an
          explicit category order
        * ``caracteristicas_categoricas_label`` -> one ``LabelEncoder`` per column
        * ``caracteristicas_categoricas_onehot`` -> ``OneHotEncoder`` with the
          first category dropped
        * the target column, when present -> ``LabelEncoder``
    """

    def __init__(self, target_column):
        """
        Create the (still unfitted) encoders.

        Args:
            target_column (str): Name of the column holding the class label.
        """
        self.target_column = target_column
        self.label_encoders = {}

        # Explicit category lists fix the integer code of each level. The list
        # order must match `caracteristicas_categoricas_ordinal` below.
        # Values outside these lists are encoded as -1 instead of raising.
        self.ordinal_encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1,
            categories=[
                ['no', 'Sometimes', 'Frequently'],  # AlcoholConsumption
                ['no', 'Sometimes', 'Frequently', 'Always']  # SnackingFrequency
            ]
        )

        # `sparse_output` is the keyword of scikit-learn >= 1.2 (it replaced `sparse`).
        # With drop='first' and handle_unknown='ignore', a category unseen at fit
        # time is encoded as all zeros, i.e. exactly like the dropped first category.
        self.onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
        self.scaler = RobustScaler()
        self.Y_label_encoder = LabelEncoder()

        # Both lists are filled in by fit(); they are defined here so the
        # attributes exist on an unfitted instance as well.
        self.feature_names = []
        self.numeric_features = []

        self.caracteristicas_categoricas_ordinal = ["AlcoholConsumption", "SnackingFrequency"]
        self.caracteristicas_categoricas_label = ["Gender", "family_history_with_overweight", "HighCalorieFoodFreq", "CaloricMonitoring"]
        self.caracteristicas_categoricas_onehot = ["ModeOfTransportation"]
        # Follows the constructor argument instead of a hard-coded column name.
        self.target_column_label = [target_column]

    def _split_features_target(self, df):
        """Return ``(X, y)``; ``y`` is None when the target column is absent from ``df``."""
        if self.target_column in df.columns:
            return df.drop(columns=self.target_column), df[self.target_column]
        return df, None

    def fit(self, df):
        """
        Learn the scaling statistics and category encodings from ``df``.

        Args:
            df (pd.DataFrame): Training frame, with or without the target column.
                The target encoder is only fitted when the target is present.

        Returns:
            PreprocessingPipeline: ``self``, so calls can be chained.

        Raises:
            ValueError: If one of the expected categorical columns is missing.
        """
        X, y = self._split_features_target(df)

        # Check that every expected categorical column is present.
        required_cols = (
            self.caracteristicas_categoricas_ordinal +
            self.caracteristicas_categoricas_label +
            self.caracteristicas_categoricas_onehot
        )
        for col in required_cols:
            if col not in X.columns:
                raise ValueError(f"Colonne manquante : {col}")

        # Strict numeric detection: only int64/float64 columns are scaled.
        self.numeric_features = list(X.select_dtypes(include=["int64", "float64"]).columns)

        # Missing values are only reported here, not imputed.
        if X.isnull().any().any():
            print("⚠️ Attention : des valeurs manquantes sont présentes dans les données.")

        self.ordinal_encoder.fit(X[self.caracteristicas_categoricas_ordinal])
        self.onehot_encoder.fit(X[self.caracteristicas_categoricas_onehot])
        self.scaler.fit(X[self.numeric_features])

        # One independent LabelEncoder per column (LabelEncoder.fit returns the encoder).
        self.label_encoders = {
            col: LabelEncoder().fit(X[col])
            for col in self.caracteristicas_categoricas_label
        }

        # Output column order: numeric, ordinal, label-encoded, one-hot.
        onehot_names = self.onehot_encoder.get_feature_names_out(self.caracteristicas_categoricas_onehot)
        self.feature_names = (
            self.numeric_features +
            self.caracteristicas_categoricas_ordinal +
            self.caracteristicas_categoricas_label +
            list(onehot_names)
        )

        if y is not None:
            self.Y_label_encoder.fit(y)

        return self

    def transform(self, df):
        """
        Apply the fitted scaler and encoders to ``df``.

        Args:
            df (pd.DataFrame): Frame with the same feature columns used in fit().
                If it also contains the target column, the encoded target is
                appended as the last column of the result.

        Returns:
            pd.DataFrame: Numeric frame indexed like ``df``, with columns ordered
            as in ``get_feature_names()`` (plus the encoded target, if present).

        Raises:
            sklearn.exceptions.NotFittedError: If called before fit(). This class
                derives from both ValueError and AttributeError.
            ValueError: Raised by ``LabelEncoder.transform`` when a label-encoded
                column (or the target) contains a value not seen during fit().
        """
        if not self.feature_names:
            # Imported locally so the class does not depend on an extra notebook import.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("PreprocessingPipeline.transform() was called before fit().")

        X, y = self._split_features_target(df)

        X_num = pd.DataFrame(self.scaler.transform(X[self.numeric_features]), columns=self.numeric_features, index=X.index)
        X_ord = pd.DataFrame(self.ordinal_encoder.transform(X[self.caracteristicas_categoricas_ordinal]),
                             columns=self.caracteristicas_categoricas_ordinal, index=X.index)

        X_lbl = pd.DataFrame(index=X.index)
        for col in self.caracteristicas_categoricas_label:
            le = self.label_encoders[col]
            X_lbl[col] = le.transform(X[col])

        X_onehot = pd.DataFrame(self.onehot_encoder.transform(X[self.caracteristicas_categoricas_onehot]),
                                columns=self.onehot_encoder.get_feature_names_out(self.caracteristicas_categoricas_onehot),
                                index=X.index)

        X_final = pd.concat([X_num, X_ord, X_lbl, X_onehot], axis=1)
        X_final = X_final[self.feature_names]  # Guarantees the column order fixed in fit()

        if y is not None:
            y_transformed = self.Y_label_encoder.transform(y)
            df_final = pd.concat([X_final, pd.Series(y_transformed, index=X.index, name=self.target_column)], axis=1)
        else:
            df_final = X_final

        return df_final

    def fit_transform(self, df):
        """Fit on ``df`` and return its transformed version."""
        return self.fit(df).transform(df)

    def Y_inverse_transform(self, y_array):
        """Map encoded target values back to the original class labels."""
        return self.Y_label_encoder.inverse_transform(y_array)

    def get_feature_names(self):
        """Return the output feature column names, in order (empty before fit())."""
        return self.feature_names


In [13]:
# Learn the encodings and scaling on the training frame, then apply them to it.
prep = PreprocessingPipeline(target_column="NObeyesdad")
prep.fit(df_prepared_train)
df_train_ready = prep.transform(df_prepared_train)

df_train_ready

Unnamed: 0_level_0,Age,Height,Weight,VegetableConsumptionFreq,MainMealsPerDay,DailyWaterIntake,PhysicalActivityPerWeek,ScreenTimePerDay,AlcoholConsumption,SnackingFrequency,Gender,family_history_with_overweight,HighCalorieFoodFreq,CaloricMonitoring,ModeOfTransportation_Bike,ModeOfTransportation_Motorbike,ModeOfTransportation_Public_Transportation,ModeOfTransportation_Walking,NObeyesdad
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0.271266,-0.000015,-0.052520,-0.393837,-0.016703,1.007891,-0.633155,0.402586,1.0,1.0,1,1,1,0,0.0,0.0,1.0,0.0,6
1,-0.802569,-1.068449,-0.593521,-0.393837,0.000000,0.000000,0.000000,0.426113,0.0,2.0,0,1,1,0,0.0,0.0,0.0,0.0,1
2,-0.802569,0.087460,-0.743393,-0.513303,-1.588315,-0.118298,-0.084814,1.099697,0.0,1.0,0,1,1,0,0.0,0.0,1.0,0.0,0
3,-0.310447,0.081889,1.035294,0.606163,0.000000,-0.430229,0.296230,0.206312,1.0,1.0,0,1,1,0,0.0,0.0,1.0,0.0,4
4,1.470944,1.443200,0.213444,0.285827,-1.028528,-0.026600,0.612877,0.357834,1.0,1.0,1,1,1,0,0.0,0.0,1.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,0.386945,0.508475,0.660567,0.525747,0.000000,0.200383,0.209270,-0.377207,1.0,1.0,1,1,1,0,0.0,0.0,1.0,0.0,3
20754,-0.802569,0.076318,-0.747028,0.606163,1.000000,-1.319966,0.633155,0.426113,1.0,2.0,1,0,1,0,0.0,0.0,1.0,0.0,0
20755,-0.452398,0.912433,0.471828,0.013980,0.000000,0.000000,0.100064,0.624552,0.0,1.0,1,1,1,0,0.0,0.0,1.0,0.0,3
20756,1.839589,0.000000,-0.011946,0.277401,-1.028528,0.191181,-0.633155,0.399947,0.0,1.0,1,1,1,0,0.0,0.0,0.0,0.0,6


In [14]:
# Destination directory for the cleaned CSV, relative to the working directory.
OUTPUT_FOLDER = "../data/OUT/"

def save_dataframes_to_csv(output_folder, df_train, train_filename="train_clean.csv",
                           clear_folder=True, index=False):
    """
    Write the training DataFrame as a CSV file inside ``output_folder``.

    With the default ``clear_folder=True`` an existing folder is deleted together
    with everything it contains and then recreated, so any other file stored
    there is lost. Pass ``clear_folder=False`` to keep the existing content and
    only (over)write ``train_filename``.

    Args:
        output_folder (str): Folder where the CSV file is written.
        df_train (pd.DataFrame): Training DataFrame to save.
        train_filename (str, optional): File name of the CSV.
        clear_folder (bool, optional): Delete an existing ``output_folder``
            before writing. Defaults to True.
        index (bool, optional): Write the DataFrame index as the first CSV
            column(s). Defaults to False, which drops the index from the file.
    """
    # Optionally start from an empty folder.
    if clear_folder and os.path.exists(output_folder):
        shutil.rmtree(output_folder)
        print(f"Carpeta {output_folder} eliminada.")

    # Create the folder if it does not exist (or was just removed).
    os.makedirs(output_folder, exist_ok=True)

    train_path = os.path.join(output_folder, train_filename)
    df_train.to_csv(train_path, index=index)

    print(f"DataFrames guardados en {output_folder}:")
    print(f" - {train_filename}")

# Persist the preprocessed training frame into the output folder.
save_dataframes_to_csv(output_folder=OUTPUT_FOLDER, df_train=df_train_ready)

DataFrames guardados en ../data/OUT/:
 - train_clean.csv


In [20]:
import pickle

# Persist the fitted preprocessing pipeline so it can be reloaded later.
# Loading the pickle requires the PreprocessingPipeline class to be defined or
# importable under the same module name in the loading process.
#
# NOTE(review): this replaces a machine-specific absolute path (a user's
# OneDrive folder, with mixed path separators). It assumes the project root is
# the parent of the working directory (the same assumption the ../data paths
# make) and that it contains a src/ folder. Confirm before relying on it.
chemin_fichier = os.path.join("..", "src", "pipeline_preprocessing.pkl")

with open(chemin_fichier, "wb") as f:
    pickle.dump(prep, f)

In [18]:
# Record the Python interpreter version used to run this notebook.
import sys
print(sys.version)


3.12.9 | packaged by conda-forge | (main, Mar  4 2025, 22:37:18) [MSC v.1943 64 bit (AMD64)]
