## Application des transformations definies dans "trasnfromers.py"

In [1]:
from transformers import *

In [2]:
import pandas as pd
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pathlib import Path
from sklearn.model_selection import train_test_split

import pickle


In [3]:
columns_to_drop = [
        "piezo_station_department_name",
        "piezo_station_update_date",
        "piezo_station_commune_code_insee",
        "piezo_station_pe_label",
        "piezo_station_bdlisa_codes",
        "piezo_station_bss_code",
        "piezo_station_bss_id", 
        "piezo_bss_code",
        "piezo_measurement_date",
        "piezo_producer_name",
        "piezo_measure_nature_code",
        "meteo_name",
        "meteo_id", 
        "meteo_latitude",
        "meteo_longitude",
        "hydro_station_code",
        "hydro_method_code", 
        "hydro_method_label", 
        "hydro_qualification_label", 
        "prelev_structure_code_0",
        "prelev_structure_code_2",
        "prelev_structure_code_0",
        "prelev_commune_code_insee_0",
        "piezo_station_department_code",
        
        "meteo_DRR", 
        "meteo_temperature_min_ground", 
        "meteo_temperature_min_50cm", 
        "meteo_pressure_avg",
        "meteo_pression_maxi", 
        "meteo_wind_speed_avg_2m", 
        "meteo_wind_max_2m", 
        "meteo_wind_direction_max_inst_2m", 
        "meteo_time_wind_max_2m", 
        "meteo_wetting_duration", 
        "meteo_sunshine_duration", 
        "meteo_radiation", 
        "meteo_radiation_direct", 
        "meteo_sunshine_%", 
        "meteo_radiation_IR", 
        "meteo_radiation_UV_max", 
        "meteo_cloudiness", 
        "meteo_cloudiness_height", 
        "meteo_if_snow", 
        "meteo_if_fog", 
        "meteo_if_thunderstorm", 
        "meteo_if_sleet", 
        "meteo_if_hail", 
        "meteo_if_dew", 
        "meteo_if_black_ice", 
        "meteo_if_snow_ground", 
        "meteo_if_frost", 
        "meteo_if_smoke", 
        "meteo_if_mist",
        "meteo_if_lightning", 
        "meteo_evapotranspiration_Monteith", 
        "meteo_radiation_UV", 
        "meteo_snow_height", 
        "meteo_snow_thickness_max", 
        "meteo_snow_thickness_6h"]

ajouts_drop_yael = ["meteo_altitude","meteo_temperature_min","meteo_temperature_max"]


columns_to_drop+=ajouts_drop_yael

In [4]:
# Specification des colonnes  a garder 

##ANCIENNE VERSION
#all_columns = pd.read_csv(path_src_dataset, nrows=0).columns.tolist()
#columns_to_keep = [col for col in all_columns if col not in columns_to_drop] 


##NOUVELLE VERSION 
#Je précise ci dessous les colonnes nécessaires pour chacun des transformers, et en commentaire l'ordre dans lequel ce transformer sera utilisé
#Date
col_yass = ['meteo_date'] + ['piezo_station_commune_name']+['prelev_volume_0', 'prelev_volume_1',
                             'prelev_volume_2', 'prelev_other_volume_sum']  # ORDRE = 4 (en dernier car il supprime les dates )
#Altitude
altitude_flo = ["piezo_station_altitude", "meteo_altitude"] # ORDRE 1
prelev_flo = ["prelev_volume_0", "prelev_usage_label_0", "prelev_volume_obtention_mode_label_0", "prelev_volume_1", "prelev_usage_label_1", "prelev_volume_obtention_mode_label_1", "prelev_volume_2", "prelev_usage_label_2", "prelev_volume_obtention_mode_label_2"]
col_flo = altitude_flo + prelev_flo
#Insee & rain "CleanFeatures"
cols_yael_input = ['insee_%_agri', 'meteo_rain_height', 'insee_pop_commune', 'insee_med_living_level', 'insee_%_ind', 'insee_%_const']
cols_yael_need = ["piezo_station_department_code", "meteo_date"]

#Temperature
cols_lucien_need = ['piezo_station_department_code', 'piezo_measurement_date']
cols_lucien_input = ['meteo_temperature_avg','meteo_temperature_min','meteo__pressure_saturation_avg','meteo_temperature_max']
#Lat Long
cols_mat = ["distance_piezo_meteo",'piezo_station_longitude','piezo_station_latitude','meteo_latitude','meteo_longitude', "meteo_temperature_avg", "meteo_temperature_avg_threshold", "meteo_temperature_min", "meteo_temperature_max", "meteo_temperature_min_ground", "hydro_observation_result_elab", "hydro_status_code", "hydro_qualification_code", "hydro_hydro_quantity_elab"]


#Clean pizo
pizo_cols = ['piezo_station_investigation_depth', 'piezo_obtention_mode', 'piezo_status', 'piezo_qualification', 'piezo_measure_nature_code','piezo_station_department_code']
 
#target
target = "piezo_groundwater_level_category"

columns_to_keep = col_yass + cols_yael_input + cols_yael_need + col_flo + cols_lucien_need + cols_lucien_input + cols_mat + [target] + pizo_cols

columns_to_keep

['meteo_date',
 'piezo_station_commune_name',
 'prelev_volume_0',
 'prelev_volume_1',
 'prelev_volume_2',
 'prelev_other_volume_sum',
 'insee_%_agri',
 'meteo_rain_height',
 'insee_pop_commune',
 'insee_med_living_level',
 'insee_%_ind',
 'insee_%_const',
 'piezo_station_department_code',
 'meteo_date',
 'piezo_station_altitude',
 'meteo_altitude',
 'prelev_volume_0',
 'prelev_usage_label_0',
 'prelev_volume_obtention_mode_label_0',
 'prelev_volume_1',
 'prelev_usage_label_1',
 'prelev_volume_obtention_mode_label_1',
 'prelev_volume_2',
 'prelev_usage_label_2',
 'prelev_volume_obtention_mode_label_2',
 'piezo_station_department_code',
 'piezo_measurement_date',
 'meteo_temperature_avg',
 'meteo_temperature_min',
 'meteo__pressure_saturation_avg',
 'meteo_temperature_max',
 'distance_piezo_meteo',
 'piezo_station_longitude',
 'piezo_station_latitude',
 'meteo_latitude',
 'meteo_longitude',
 'meteo_temperature_avg',
 'meteo_temperature_avg_threshold',
 'meteo_temperature_min',
 'meteo_te

In [5]:
path_src_dataset = Path("./data/src/X_train_Hi5.csv")


# Out folders
out_folder_dataset = Path("./data/cleaned")
out_folder_dataset.mkdir(parents=True, exist_ok=True) # Create the folder if it doesn't exist
out_folder_config = Path("./data/processed/pipelines")
out_folder_config.mkdir(parents=True, exist_ok=True)

# Load the CSV file with only the relevant columns
df = pd.read_csv(path_src_dataset, usecols=columns_to_keep, nrows=100_000) #  SI on veut charger moins de lignes : ajouter --> ,nrows=10e4)
df = df.drop_duplicates()



X = df.drop(columns=target)

#Mapping du target 
mapping = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}
y = df[target].map(mapping)

#Test-val split 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Apply the transformers selected
processing_pipeline = Pipeline(steps=[
("DropNaRate", DropNaRate(0.7)),
('PrelevVol', PrelevVol()),
("Prelevement", Prelev(columns=col_flo,usage_label_max_categories=4,mode_label_max_categories=4,scale=1)),
("CleanFeatures", CleanFeatures(cols_yael_input)),
("Altitude", AltitudeTrans(columns=["piezo_station_altitude", "meteo_altitude"])),
('LatLong',CleanLatLon()),
('CleanTemp',CleanTemp()),
('Temp',TemperaturePressionTrans(columns=cols_lucien_input)),
('CleanHydro', CleanHydro()),
('CleanPizo',  CleanPizo(pizo_cols)),
('Dates',DateTransformer()),
('DropCols',DropCols(columns_to_drop) )

# ... Add others transformations
])

In [12]:
print("Pipeline ongoing...")
processed_X_train = processing_pipeline.fit_transform(X_train)
processed_X_val = processing_pipeline.transform(X_val)

Pipeline ongoing...
>> (Info) Droped columns : []
>> (Info - Prelev) 'prelev_volume_0' has been filledna with mean = 1758783081.04119
>> (Info - Prelev) 'prelev_usage_label_0' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_volume_obtention_mode_label_0' has been one-hot-encoded in 4 features
>> (Info) Recuperations des moyennes des données INSEE par department
>> (Info) Infos medianes Insee recupérees
>> (Info) Valeurs Manquantes comblées avec les Médianes.
>> (Info) Calculating means for numerical features and preparing for one-hot encoding.
>> (Info) Fitting completed: Means, modes, and one-hot encoders prepared.
>> (Info) Transforming data: Filling missing values and applying one-hot encoding.
>> (Info) Missing values in piezo_station_investigation_depth filled with department means.
>> (Info) One-hot encoding applied to piezo_obtention_mode with missing values filled.
>> (Info) One-hot encoding applied to piezo_status with missing values filled.
>> (Info) One-hot

In [7]:
processed_X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79614 entries, 0 to 79613
Data columns (total 28 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   piezo_station_altitude                                   79614 non-null  float64
 1   piezo_station_longitude                                  79614 non-null  float64
 2   piezo_station_latitude                                   79614 non-null  float64
 3   meteo_date                                               79614 non-null  float64
 4   meteo_rain_height                                        79614 non-null  float64
 5   meteo_temperature_avg                                    79614 non-null  float64
 6   meteo_temperature_avg_threshold                          67380 non-null  float64
 7   meteo__pressure_saturation_avg                           79614 non-null  float64
 8   distance_piezo_meteo      

In [8]:
print("Pipeline ongoing...")
processed_X_train = processing_pipeline.fit_transform(X_train)
processed_X_val = processing_pipeline.transform(X_val)
processed_X_train.describe()

# Save the processed data to CSV
# processed_X_train.to_csv(out_folder_dataset / "X_train.csv", index=False)
# processed_X_val.to_csv(out_folder_dataset / "X_val.csv", index=False)
# y_train.to_csv(out_folder_dataset / "y_train.csv", index=False)
# y_val.to_csv(out_folder_dataset / "y_val.csv", index=False)

# print("Data converted to csv")

Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,meteo_date,meteo_rain_height,meteo_temperature_avg,meteo_temperature_avg_threshold,meteo__pressure_saturation_avg,distance_piezo_meteo,hydro_observation_result_elab,...,prelev_usage_label_0_CANAUX,prelev_usage_label_0_EAU POTABLE,prelev_usage_label_0_EAU TURBINEE (barrage),prelev_usage_label_0_infrequent_sklearn,prelev_volume_obtention_mode_label_0_Mesure directe,prelev_volume_obtention_mode_label_0_Mesure indirecte,prelev_volume_obtention_mode_label_0_Volume mesuré,prelev_volume_obtention_mode_label_0_infrequent_sklearn,hydro_observation_log,hydro_hydro_quantity_elab_infrequent_sklearn,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29
count,79614.0,79614.0,79614.0,79614.0,79614.0,79614.0,67380.0,79614.0,79614.0,79614.0,...,79614.000000,79614.000000,79614.0,79614.0,79614.0,79614.0,79614.0,79614.0,79614.0,79614.0,,,,,,,,
mean,149.999284,2.235861,46.428223,0.924409,1.928721,7.150043,7.514056,8.65342,0.95031,49607.34,...,0.103261,0.399553,0.278293,0.218894,0.492011,0.172281,0.164808,0.1709,8.329262,0.023199,,,,,,,,
std,180.251718,2.721948,2.176467,0.067874,4.603425,3.530557,3.501651,2.228708,0.217305,193351.6,...,0.304301,0.489810,0.448161,0.413499,0.499939,0.377627,0.371009,0.376424,2.141984,0.150537,,,,,,,,
min,0.0,0.72,-4.657129,41.399732,0.772456,0.0,-8.3,-9.2,0.5,0.0,1.000000e+00,...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.5,41.9,-0.078067,44.497506,0.869764,0.0,4.7,5.1,6.971429,1.0,1.062000e+03,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.967909,0.0,,,,,,,
50%,108.9,1.903628,46.399199,0.941397,0.2,7.480601,7.7,8.433333,1.0,3224.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,8.078378,0.0,,,,,,,,
75%,100.0,185.0,4.230897,48.228035,0.98524,2.0,9.645082,10.0,10.1,1.0,1.500950e+04,...,0.000000,1.0,1.0,0.0,1.0,0.0,0.0,0.0,9.616439,0.0,,,,,,,
max,5233.4,2150.0,9.51989,50.971087,1.0,189.0,20.3,20.95,14.9,1.0,3.235001e+06,...,1.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,14.98954,1.0,,,,,,,


In [13]:
processed_X_train.columns

Index(['piezo_station_investigation_depth', 'piezo_station_altitude',
       'piezo_station_longitude', 'piezo_station_latitude', 'meteo_date',
       'meteo_rain_height', 'meteo_temperature_avg',
       'meteo__pressure_saturation_avg', 'distance_piezo_meteo',
       'hydro_observation_result_elab', 'prelev_volume_0',
       'prelev_other_volume_sum', 'insee_%_agri', 'insee_pop_commune',
       'insee_med_living_level', 'insee_%_ind', 'insee_%_const',
       'prelev_usage_label_0_CANAUX', 'prelev_usage_label_0_EAU POTABLE',
       'prelev_usage_label_0_EAU TURBINEE (barrage)',
       'prelev_usage_label_0_infrequent_sklearn',
       'prelev_volume_obtention_mode_label_0_Mesure directe',
       'prelev_volume_obtention_mode_label_0_Mesure indirecte',
       'prelev_volume_obtention_mode_label_0_Volume mesuré',
       'prelev_volume_obtention_mode_label_0_infrequent_sklearn',
       'hydro_observation_log',
       'piezo_obtention_mode_Mode d'obtention inconnu',
       'piezo_obtention_

In [10]:
a

['insee_med_living_level',
 'hydro_observation_result_elab',
 'piezo_station_longitude',
 'insee_%_ind',
 'meteo_temperature_avg_threshold',
 'hydro_hydro_quantity_elab_infrequent_sklearn',
 'meteo_temperature_avg',
 'insee_pop_commune',
 'insee_%_const',
 'hydro_qualification_code',
 'piezo_station_latitude',
 'hydro_observation_log',
 'insee_%_agri',
 'meteo_date',
 'meteo__pressure_saturation_avg',
 'piezo_station_altitude',
 'prelev_volume_0',
 'hydro_status_code',
 'meteo_rain_height']

# Quick KNN

In [None]:
#quick KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=25)  # Default is 5 neighbors
knn.fit(processed_X_train, y_train)

# Make predictions
y_val_pred = knn.predict(processed_X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))


processed_X_train.head()

In [None]:
#quick KNN
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

global_pipeline = Pipeline([
    ('processing', processing_pipeline),
    ('StandardScaler', PartialStandardScaler(columns='all')),
    ('estimator', RandomForestClassifier())
])

In [None]:
global_pipeline.fit(X_train, y_train)

>> (Info) Droped columns : []
Index(['piezo_station_department_code', 'piezo_station_investigation_depth',
       'piezo_station_altitude', 'piezo_station_commune_name',
       'piezo_station_longitude', 'piezo_station_latitude',
       'piezo_measurement_date', 'piezo_obtention_mode', 'piezo_status',
       'piezo_qualification', 'piezo_measure_nature_code', 'meteo_latitude',
       'meteo_longitude', 'meteo_altitude', 'meteo_date', 'meteo_rain_height',
       'meteo_temperature_min', 'meteo_temperature_max',
       'meteo_temperature_avg', 'meteo__pressure_saturation_avg',
       'distance_piezo_meteo', 'hydro_observation_result_elab',
       'prelev_volume_0', 'prelev_usage_label_0',
       'prelev_volume_obtention_mode_label_0', 'prelev_volume_1',
       'prelev_usage_label_1', 'prelev_volume_obtention_mode_label_1',
       'prelev_volume_2', 'prelev_usage_label_2',
       'prelev_volume_obtention_mode_label_2', 'prelev_other_volume_sum',
       'insee_%_agri', 'insee_pop_commune',

In [None]:
y_val_pred = global_pipeline.predict(X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

>> (Info) Valeurs Manquantes comblées avec les Médianes.
>> (Info) Transforming data: Filling missing values and applying one-hot encoding.
>> (Info) Missing values in piezo_station_investigation_depth filled with department means.
>> (Info) One-hot encoding applied to piezo_obtention_mode with missing values filled.
>> (Info) One-hot encoding applied to piezo_status with missing values filled.
>> (Info) One-hot encoding applied to piezo_qualification with missing values filled.
>> (Info) One-hot encoding applied to piezo_measure_nature_code with missing values filled.
>> (Info) Data transformation completed.
>> (INFO - DropCols) columns ['piezo_station_department_name', 'piezo_station_update_date', 'piezo_station_commune_code_insee', 'piezo_station_pe_label', 'piezo_station_bdlisa_codes', 'piezo_station_bss_code', 'piezo_station_bss_id', 'piezo_bss_code', 'piezo_measurement_date', 'piezo_producer_name', 'piezo_measure_nature_code', 'meteo_name', 'meteo_id', 'meteo_latitude', 'meteo_lo

In [None]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

Accuracy: 0.58225

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.56      0.59      1934
           1       0.58      0.50      0.54      3222
           2       0.56      0.56      0.56      4630
           3       0.56      0.60      0.58      5415
           4       0.62      0.65      0.63      4799

    accuracy                           0.58     20000
   macro avg       0.59      0.57      0.58     20000
weighted avg       0.58      0.58      0.58     20000



In [None]:
rf = global_pipeline.named_steps['estimator']

rf.feature_importances_

array([0.05986686, 0.05502813, 0.0547437 , 0.14608801, 0.0411407 ,
       0.0774364 , 0.04493748, 0.00192649, 0.12639811, 0.07688095,
       0.03314264, 0.03700347, 0.03694276, 0.02944164, 0.03252412,
       0.00327626, 0.00201415, 0.00176134, 0.00373462, 0.00302471,
       0.00181662, 0.00179391, 0.00249644, 0.1265805 ])

### Save Pipeline

In [None]:
save = True
if save:
        
    pipeline_name = "pipeline_randomforest_1st"

    # Writing to sample.json
    with open(out_folder_config / Path(pipeline_name + ".pkl"), "wb") as file:
        pickle.dump(global_pipeline, file)