In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection    import train_test_split
from sklearn.pipeline           import Pipeline
from sklearn.preprocessing      import OneHotEncoder, StandardScaler      
from sklearn.compose            import ColumnTransformer

from sklearn import set_config
set_config(transform_output="pandas") # !!! WARNING: global scikit-learn setting - it stays active for every cell run afterwards
# To go back to the default output container, call set_config(transform_output="default") at the end of the script
# (set_config has no `restore` argument).


# --- Data location ----------------------------------------------------------
k_AssetsDir = "assets/"                    # directory prefix (trailing slash included) for the CSV files
k_FileName = "walmart_store_sales.csv"     # dataset file name, appended to k_AssetsDir

# --- Width / height ratio ---------------------------------------------------
k_Gold = 1.618                             # golden ratio (rounded)
k_Width = 12
k_Height = k_Width / k_Gold                # chosen so that k_Width / k_Height == k_Gold

# --- Modelling --------------------------------------------------------------
k_target = "weekly_sales"                  # name of the target column (lower-case)
k_random_state = 42                        # seed handed to train_test_split
k_test_size = 0.2                          # fraction of the rows held out for the test set



In [15]:
from sklearn.base import TransformerMixin, BaseEstimator

class MyCustomImputer(BaseEstimator, TransformerMixin):
    """Column-wise imputer for numeric data: replaces NaN by a statistic learned in ``fit``.

    Parameters
    ----------
    strategy : {'mean', 'median', 'most_frequent'}, default='mean'
        Statistic computed per column (NaN ignored) and used as replacement value.
        For 'most_frequent', ties are resolved in favour of the smallest value.

    Attributes
    ----------
    fill_value : ndarray
        Per-column replacement values learned by ``fit``.
    n_features_in_ : int
        Number of columns seen by ``fit``.
    feature_names_in_ : ndarray of object
        Column names seen by ``fit``; only set when X exposes ``columns``
        (e.g. a pandas DataFrame).
    """

    def __init__(self, strategy='mean'):
        # Only store the parameter here; it is validated in fit().
        self.strategy = strategy

    @staticmethod
    def _nan_mode(X):
        """Return the most frequent non-NaN value of every column of X.

        A column that contains only NaN keeps NaN as its mode.
        """
        values = np.asarray(X, dtype=float)
        if values.ndim == 1:
            values = values[:, np.newaxis]
        modes = np.full(values.shape[1], np.nan)
        for j, column in enumerate(values.T):
            observed = column[~np.isnan(column)]
            if observed.size:
                # np.unique returns sorted values, so argmax picks the smallest
                # value among those sharing the highest count.
                uniques, counts = np.unique(observed, return_counts=True)
                modes[j] = uniques[np.argmax(counts)]
        return modes

    def fit(self, X, y=None):
        """Learn the per-column replacement values from X.

        Parameters
        ----------
        X : array-like of numeric values
            Training data; NaN marks the missing entries.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self : MyCustomImputer

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported values.
        """
        if self.strategy == 'mean':
            self.fill_value = np.nanmean(X, axis=0)
        elif self.strategy == 'median':
            self.fill_value = np.nanmedian(X, axis=0)
        elif self.strategy == 'most_frequent':
            # The mode, not the maximum: np.nanmax would return the largest value.
            self.fill_value = self._nan_mode(X)
        else:
            raise ValueError("Invalid strategy. Please choose 'mean', 'median', or 'most_frequent'.")

        # Remember the input layout so that get_feature_names_out() can answer
        # without being given the names explicitly.
        shape = np.shape(X)
        self.n_features_in_ = shape[1] if len(shape) > 1 else 1
        columns = getattr(X, "columns", None)
        if columns is not None:
            self.feature_names_in_ = np.asarray(columns, dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Drop names left over from a previous fit on named data.
            del self.feature_names_in_
        return self

    def transform(self, X):
        """Return X as an ndarray in which every NaN is replaced by its column's learned value."""
        return np.where(np.isnan(X), self.fill_value, X)

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (identical to the input names).

        Defining this method is what allows scikit-learn to wrap the output of
        ``transform`` in a DataFrame with the right column names when
        ``transform_output="pandas"`` is configured.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input names. When None, the names seen during ``fit`` are used,
            falling back to ``x0, x1, ...`` if X had no column names.

        Returns
        -------
        ndarray of object
        """
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        seen_names = getattr(self, "feature_names_in_", None)
        if seen_names is not None:
            return seen_names
        return np.asarray([f"x{i}" for i in range(self.n_features_in_)], dtype=object)


In [17]:


# ###################################################################################
# Load the raw dataset and normalise the column names to lower case
filename_in = k_AssetsDir+k_FileName
df = pd.read_csv(filename_in)
df.columns = df.columns.str.lower()

# Separate the features from the target
X = df.drop(columns=k_target)
y = df[k_target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=k_test_size, random_state=k_random_state) 

# Snapshot of the training features before preprocessing ("avant" = before)
X_train.to_csv(k_AssetsDir+"custom_train_avant.csv")

# Column groups are picked by dtype
numeric_features      = X.select_dtypes(include="number").columns
categorical_features  = X.select_dtypes(exclude="number").columns

# Numeric columns: NaN replaced by the column mean learned on the training set
numeric_transformer = Pipeline(
  steps=[
    ("custom1",     MyCustomImputer(strategy='mean'))
    # ("scaler_num",  StandardScaler()),
  ]
)

# Categorical columns: one-hot encoding.
# sparse_output=False is required because pandas output (set_config above)
# does not support sparse matrices.
categorical_transformer = Pipeline(
  steps=[
    ("encoder_cat", OneHotEncoder(drop="first", sparse_output=False)),
  ]
)

# Only the numeric branch is active; the categorical one is currently disabled
preprocessor = ColumnTransformer(
  transformers=[
    ("num", numeric_transformer,     numeric_features),
    # ("cat", categorical_transformer, categorical_features),
  ]
)

# Fit on the training set only, then apply the same fitted transformation to the test set
X_train = preprocessor.fit_transform(X_train)
X_test  = preprocessor.transform(X_test)

# Snapshots after preprocessing ("apres" = after)
X_train.to_csv(k_AssetsDir+"custom_train_apres.csv")
X_test.to_csv(k_AssetsDir+"custom_test_apres.csv")

