## Application des transformations definies dans "trasnfromers.py"

In [1]:
from transformers import *

In [2]:
import pandas as pd
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pathlib import Path
from sklearn.model_selection import train_test_split


In [3]:
columns_to_drop = ["piezo_station_department_name",
    "piezo_station_update_date",
    "piezo_station_commune_code_insee",
    "piezo_station_pe_label",
    "piezo_station_bdlisa_codes",
    "piezo_station_bss_code",
    "piezo_station_bss_id", 
    "piezo_bss_code",
    "piezo_measurement_date",
    "piezo_producer_name",
    "piezo_measure_nature_code",
    "meteo_name",
    "hydro_station_code",
    "hydro_method_code",
    "hydro_method_label", 
    "hydro_qualification_label", 
    "prelev_structure_code_0",
    "prelev_structure_code_2"]


In [7]:
path_src_dataset = Path("./data/src/X_train_Hi5.csv")

out_folder_dataset = Path("./data/cleaned")
# Create the folder if it doesn't exist
out_folder_dataset.mkdir(parents=True, exist_ok=True)

out_folder_config = Path("./data/cleaned/pipelines")
out_folder_config.mkdir(parents=True, exist_ok=True)



# Calculate the columns to keep
#all_columns = pd.read_csv(path_src_dataset, nrows=0).columns.tolist()
#columns_to_keep = [col for col in all_columns if col not in columns_to_drop] 
columns_to_keep = ["insee_%_agri","meteo_rain_height","piezo_station_altitude", "meteo_altitude","piezo_groundwater_level_category","meteo_date"]
# Step 3: Load the CSV file with only the relevant columns
df = pd.read_csv(path_src_dataset, usecols=columns_to_keep)

target = "piezo_groundwater_level_category"

X = df.drop(columns=target)

mapping = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}
y = df[target].map(mapping)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Apply the transformers selected
pipeline = Pipeline(steps=[
("DropNaRate", DropNaRate(0.7)),
("CleanFeatures", CleanFeatures(["insee_%_agri","meteo_rain_height"])),
("Altitude", AltitudeTrans(columns=["piezo_station_altitude", "meteo_altitude"])),
('Dates',DateTransformer())
# ... Add others transformations
])


print("Pipeline ongoing...")
processed_X_train = pipeline.fit_transform(X_train)
processed_X_val = pipeline.transform(X_val)

# Save the processed data to CSV
processed_X_train.to_csv(out_folder_dataset / "X_train.csv", index=False)
processed_X_val.to_csv(out_folder_dataset / "X_val.csv", index=False)
y_train.to_csv(out_folder_dataset / "y_train.csv", index=False)
y_val.to_csv(out_folder_dataset / "y_val.csv", index=False)

print("Data converted to csv")

Pipeline ongoing...
>> (Info) Droped columns : []
>> (Info) Column insee_%_agri has been standardized to numeric.
>> (Info) Missing values in insee_%_agri filled with median: 3.4
>> (Info) Missing values in meteo_rain_height filled with median: 0.1
>> (Info) Missing values in insee_%_agri filled with median: 3.4
>> (Info) Missing values in meteo_rain_height filled with median: 0.1
Data converted to csv


In [12]:
processed_X_train

Unnamed: 0,piezo_station_altitude,meteo_altitude,date,meteo_rain_height,insee_%_agri
1630794,92.50,73,0.155266,0.0,10.0
1086631,253.47,235,0.390368,0.4,0.1
783902,31.00,49,0.635898,0.4,3.3
2266722,102.00,116,0.289320,1.6,8.8
1451391,189.00,222,-0.866562,13.8,54.7
...,...,...,...,...,...
1692743,76.00,49,0.552997,0.0,0.0
2356330,152.00,191,0.791221,0.2,0.0
2229084,54.00,92,0.035473,0.2,3.1
2768307,253.00,238,-0.561923,0.6,11.1


In [9]:
#quick KNN
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Normalize the numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(processed_X_train)
X_test_scaled = scaler.transform(processed_X_val)

# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # Default is 5 neighbors
knn.fit(X_train_scaled, y_train)

# Make predictions
y_pred = knn.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))


ValueError: Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values