## Application des transformations définies dans "transformers.py"

In [1]:
from transformers import *

In [2]:
import pandas as pd
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pathlib import Path
from sklearn.model_selection import train_test_split


In [None]:
# Preprocessing cell: load the raw CSV, encode the target, split into
# train/validation sets, run the feature pipeline and export the results.

# Raw dataset; it holds both the feature columns and the target column.
path_src_dataset = Path("./data/src/X_train_Hi5.csv")

# Destination folder for the processed train/validation CSV files.
out_folder_dataset = Path("./data/cleaned")
# Create the folder if it doesn't exist
out_folder_dataset.mkdir(parents=True, exist_ok=True)

# Folder created up front; nothing in this cell writes into it.
# NOTE(review): presumably meant for serialized pipelines - confirm.
out_folder_config = Path("./data/cleaned/pipelines")
out_folder_config.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(path_src_dataset)

target = "piezo_groundwater_level_category"

# Features are every column except the target.
X = df.drop(columns=target)

# Encode the ordered category labels as integers 0..4.
# Series.map leaves NaN for any label that is not a key of `mapping`.
mapping = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}
y = df[target].map(mapping)

# 80/20 split; the fixed seed keeps the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Apply the transformers selected
# NOTE(review): DropNaRate, CleanFeatures, AltitudeTrans and DateTransformer
# are not imported by name here; presumably they come from the star import
# of the project's transformers module - confirm.
pipeline = Pipeline(steps=[
    ("DropNaRate", DropNaRate(0.7)),
    # Two separate column names. The previous single string
    # '"insee_%_agri","meteo_rain_height"' (quotes and comma included) could
    # not match either column.
    ("CleanFeatures", CleanFeatures(["insee_%_agri", "meteo_rain_height"])),
    ("Altitude", AltitudeTrans(columns=["piezo_station_altitude", "meteo_altitude"])),
    ("Dates", DateTransformer()),
    # ... Add others transformations
])


print("Pipeline ongoing...")
# Fit on the training split only, then reuse the fitted steps on validation.
processed_X_train = pipeline.fit_transform(X_train)
processed_X_val = pipeline.transform(X_val)

# Save the processed data to CSV
# NOTE(review): .to_csv assumes the last pipeline step returns a pandas
# DataFrame rather than a NumPy array - confirm against the transformers.
processed_X_train.to_csv(out_folder_dataset / "X_train.csv", index=False)
processed_X_val.to_csv(out_folder_dataset / "X_val.csv", index=False)
y_train.to_csv(out_folder_dataset / "y_train.csv", index=False)
y_val.to_csv(out_folder_dataset / "y_val.csv", index=False)

print("Data converted to csv")

  df = pd.read_csv(path_src_dataset)
