## Application des transformations definies dans "trasnfromers.py"

In [1]:
from transformers import *

In [2]:
import pandas as pd
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pathlib import Path
from sklearn.model_selection import train_test_split

import pickle


In [3]:
columns_to_drop = [
        "piezo_station_department_name",
        "piezo_station_update_date",
        "piezo_station_commune_code_insee",
        "piezo_station_pe_label",
        "piezo_station_bdlisa_codes",
        "piezo_station_bss_code",
        "piezo_station_bss_id", 
        "piezo_bss_code",
        "piezo_measurement_date",
        "piezo_producer_name",
        "piezo_measure_nature_code",
        "meteo_name",
        "meteo_id", 
        "meteo_latitude",
        "meteo_longitude",
        "hydro_station_code",
        "hydro_method_code", 
        "hydro_method_label", 
        "hydro_qualification_label", 
        "prelev_structure_code_0",
        "prelev_structure_code_2",
        "prelev_structure_code_0",
        "prelev_commune_code_insee_0",
        "piezo_station_department_code",
        
        "meteo_DRR", 
        "meteo_temperature_min_ground", 
        "meteo_temperature_min_50cm", 
        "meteo_pressure_avg",
        "meteo_pression_maxi", 
        "meteo_wind_speed_avg_2m", 
        "meteo_wind_max_2m", 
        "meteo_wind_direction_max_inst_2m", 
        "meteo_time_wind_max_2m", 
        "meteo_wetting_duration", 
        "meteo_sunshine_duration", 
        "meteo_radiation", 
        "meteo_radiation_direct", 
        "meteo_sunshine_%", 
        "meteo_radiation_IR", 
        "meteo_radiation_UV_max", 
        "meteo_cloudiness", 
        "meteo_cloudiness_height", 
        "meteo_if_snow", 
        "meteo_if_fog", 
        "meteo_if_thunderstorm", 
        "meteo_if_sleet", 
        "meteo_if_hail", 
        "meteo_if_dew", 
        "meteo_if_black_ice", 
        "meteo_if_snow_ground", 
        "meteo_if_frost", 
        "meteo_if_smoke", 
        "meteo_if_mist",
        "meteo_if_lightning", 
        "meteo_evapotranspiration_Monteith", 
        "meteo_radiation_UV", 
        "meteo_snow_height", 
        "meteo_snow_thickness_max", 
        "meteo_snow_thickness_6h"]

ajouts_drop_yael = ["meteo_altitude","meteo_temperature_min","meteo_temperature_max"]


columns_to_drop+=ajouts_drop_yael

In [4]:
# Specification des colonnes  a garder 

##ANCIENNE VERSION
#all_columns = pd.read_csv(path_src_dataset, nrows=0).columns.tolist()
#columns_to_keep = [col for col in all_columns if col not in columns_to_drop] 


##NOUVELLE VERSION 
#Je précise ci dessous les colonnes nécessaires pour chacun des transformers, et en commentaire l'ordre dans lequel ce transformer sera utilisé
#Date
col_yass = ['meteo_date'] + ['prelev_volume_0', 'prelev_volume_1',
                             'prelev_volume_2', 'prelev_other_volume_sum','piezo_station_commune_name']  # ORDRE = 4 (en dernier car il supprime les dates )
#Altitude
altitude_flo = ["piezo_station_altitude", "meteo_altitude"] # ORDRE 1
prelev_flo = ["prelev_volume_0", "prelev_usage_label_0", "prelev_volume_obtention_mode_label_0", "prelev_volume_1", "prelev_usage_label_1", "prelev_volume_obtention_mode_label_1", "prelev_volume_2", "prelev_usage_label_2", "prelev_volume_obtention_mode_label_2"]
col_flo = altitude_flo + prelev_flo
#Insee & rain "CleanFeatures"
cols_yael_input = ['insee_%_agri', 'meteo_rain_height', 'insee_pop_commune', 'insee_med_living_level', 'insee_%_ind', 'insee_%_const']
cols_yael_need = ["piezo_station_department_code", "meteo_date"]

#Temperature
cols_lucien_need = ['piezo_station_department_code', 'piezo_measurement_date']
cols_lucien_input = ['meteo_temperature_avg','meteo_temperature_min','meteo__pressure_saturation_avg','meteo_temperature_max']
#Lat Long
cols_mat = ["distance_piezo_meteo",'piezo_station_longitude','piezo_station_latitude','meteo_latitude','meteo_longitude', "meteo_temperature_avg", "meteo_temperature_avg_threshold", "meteo_temperature_min", "meteo_temperature_max", "meteo_temperature_min_ground", "hydro_observation_result_elab", "hydro_status_code", "hydro_qualification_code", "hydro_hydro_quantity_elab"]


#Clean pizo
pizo_cols = ['piezo_station_investigation_depth', 'piezo_obtention_mode', 'piezo_status', 'piezo_qualification', 'piezo_measure_nature_code','piezo_station_department_code']
 
#target
target = "piezo_groundwater_level_category"

columns_to_keep = col_yass + cols_yael_input + cols_yael_need + col_flo + cols_lucien_need + cols_lucien_input + cols_mat + [target] + pizo_cols

columns_to_keep

['meteo_date',
 'prelev_volume_0',
 'prelev_volume_1',
 'prelev_volume_2',
 'prelev_other_volume_sum',
 'piezo_station_commune_name',
 'insee_%_agri',
 'meteo_rain_height',
 'insee_pop_commune',
 'insee_med_living_level',
 'insee_%_ind',
 'insee_%_const',
 'piezo_station_department_code',
 'meteo_date',
 'piezo_station_altitude',
 'meteo_altitude',
 'prelev_volume_0',
 'prelev_usage_label_0',
 'prelev_volume_obtention_mode_label_0',
 'prelev_volume_1',
 'prelev_usage_label_1',
 'prelev_volume_obtention_mode_label_1',
 'prelev_volume_2',
 'prelev_usage_label_2',
 'prelev_volume_obtention_mode_label_2',
 'piezo_station_department_code',
 'piezo_measurement_date',
 'meteo_temperature_avg',
 'meteo_temperature_min',
 'meteo__pressure_saturation_avg',
 'meteo_temperature_max',
 'distance_piezo_meteo',
 'piezo_station_longitude',
 'piezo_station_latitude',
 'meteo_latitude',
 'meteo_longitude',
 'meteo_temperature_avg',
 'meteo_temperature_avg_threshold',
 'meteo_temperature_min',
 'meteo_te

In [26]:
path_src_dataset = Path("./data/src/X_train_Hi5.csv")


# Out folders
out_folder_dataset = Path("./data/cleaned")
out_folder_dataset.mkdir(parents=True, exist_ok=True) # Create the folder if it doesn't exist
out_folder_config = Path("./data/processed/pipelines")
out_folder_config.mkdir(parents=True, exist_ok=True)

# Load the CSV file with only the relevant columns
df = pd.read_csv(path_src_dataset, usecols=columns_to_keep,nrows=10e6) #  SI on veut charger moins de lignes : ajouter --> ,nrows=10e4)
df = df.drop_duplicates()



X = df.drop(columns=target)

#Mapping du target 
mapping = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}
y = df[target].map(mapping)

#Test-val split 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Apply the transformers selected
processing_pipeline = Pipeline(steps=[
("DropNaRate", DropNaRate(0.7)),
("Prelevol",PrelevVol()),
("MeteoTimeTnx", TimeTnx(delta=5, clean=False)),
("Prelevement", Prelev(columns=col_flo,usage_label_max_categories=4,mode_label_max_categories=4,scale=1)),
("CleanFeatures", CleanFeatures(cols_yael_input)),
("Altitude", AltitudeTrans(columns=["piezo_station_altitude", "meteo_altitude"])),
('LatLong',CleanLatLon()),
('CleanTemp',CleanTemp()),
('Temp',TemperaturePressionTrans(columns=cols_lucien_input)),
('CleanHydro', CleanHydro()),
('CleanPizo',  CleanPizo(pizo_cols)),
('Dates',DateTransformer()),
('DropCols',DropCols(columns_to_drop))

# ... Add others transformations
])



ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [6]:
print("Pipeline ongoing...")
processed_X_train = processing_pipeline.fit_transform(X_train)
processed_X_val = processing_pipeline.transform(X_val)

Pipeline ongoing...
>> (Info) Droped columns : []
>> (INFO) missing values in columns ['prelev_volume_0', 'prelev_volume_1', 'prelev_volume_2', 'prelev_other_volume_sum'] are filled by the minimum of the column by commune
>> (Info - Prelev) 'prelev_usage_label_0' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_volume_obtention_mode_label_0' has been one-hot-encoded in 4 features
>> (Info) Calculating medians and means for ['insee_%_agri', 'meteo_rain_height', 'insee_pop_commune', 'insee_med_living_level', 'insee_%_ind', 'insee_%_const']
>> (Info) Rainfall means by department and month calculated.
>> (Info) Medians and means successfully calculated.
>> (Info) Filling missing values with calculated medians and means.
>> (Info) Calculating means for numerical features and preparing for one-hot encoding.
>> (Info) Fitting completed: Means, modes, and one-hot encoders prepared.
>> (Info) Transforming data: Filling missing values and applying one-hot encoding.
>> (Info) Mis

In [7]:
processed_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80000 entries, 75220 to 15795
Data columns (total 44 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   piezo_station_investigation_depth                        80000 non-null  float64
 1   piezo_station_altitude                                   80000 non-null  float64
 2   piezo_station_longitude                                  80000 non-null  float64
 3   piezo_station_latitude                                   80000 non-null  float64
 4   meteo_date                                               80000 non-null  float64
 5   meteo_rain_height                                        80000 non-null  float64
 6   meteo_temperature_avg                                    80000 non-null  float64
 7   meteo__pressure_saturation_avg                           80000 non-null  float64
 8   distance_piezo_meteo       

In [8]:
print("Pipeline ongoing...")
processed_X_train = processing_pipeline.fit_transform(X_train)
processed_X_val = processing_pipeline.transform(X_val)
processed_X_train.describe()

# Save the processed data to CSV
# processed_X_train.to_csv(out_folder_dataset / "X_train.csv", index=False)
# processed_X_val.to_csv(out_folder_dataset / "X_val.csv", index=False)
# y_train.to_csv(out_folder_dataset / "y_train.csv", index=False)
# y_val.to_csv(out_folder_dataset / "y_val.csv", index=False)

# print("Data converted to csv")

Pipeline ongoing...
>> (Info) Droped columns : []
>> (INFO) missing values in columns ['prelev_volume_0', 'prelev_volume_1', 'prelev_volume_2', 'prelev_other_volume_sum'] are filled by the minimum of the column by commune
>> (Info - Prelev) 'prelev_usage_label_0' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_volume_obtention_mode_label_0' has been one-hot-encoded in 4 features
>> (Info) Calculating medians and means for ['insee_%_agri', 'meteo_rain_height', 'insee_pop_commune', 'insee_med_living_level', 'insee_%_ind', 'insee_%_const']
>> (Info) Rainfall means by department and month calculated.
>> (Info) Medians and means successfully calculated.
>> (Info) Filling missing values with calculated medians and means.
>> (Info) Calculating means for numerical features and preparing for one-hot encoding.
>> (Info) Fitting completed: Means, modes, and one-hot encoders prepared.
>> (Info) Transforming data: Filling missing values and applying one-hot encoding.
>> (Info) Mis

Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,meteo_date,meteo_rain_height,meteo_temperature_avg,meteo__pressure_saturation_avg,distance_piezo_meteo,hydro_observation_result_elab,...,piezo_status_Donnée contrôlée niveau 2,piezo_status_Donnée interprétée,piezo_qualification_Correcte,piezo_qualification_Incertaine,piezo_qualification_Incorrecte,piezo_qualification_Non qualifié,piezo_measure_nature_code_0,piezo_measure_nature_code_D,piezo_measure_nature_code_I,piezo_measure_nature_code_N
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,...,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,85.672915,149.880498,2.22156,46.422956,0.924389,1.925792,7.158712,8.659683,0.95025,49604.11,...,0.7814,0.02175,0.982513,0.00365,0.0032,0.010637,0.590437,0.005,0.039012,0.36555
std,240.428293,180.345925,2.724138,2.174254,0.067875,4.578783,3.527859,2.229683,0.217429,192372.6,...,0.413299,0.145867,0.13108,0.060305,0.056478,0.102589,0.491756,0.070534,0.193626,0.481587
min,0.0,0.72,-4.657129,41.399732,0.772456,0.0,-8.3,0.5,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.0,41.9,-0.105584,44.499602,0.869764,0.0,4.7,6.985,1.0,1072.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,40.0,108.0,1.897576,46.393226,0.941397,0.2,7.5,8.467031,1.0,3243.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,80.0,185.0,4.216516,48.214196,0.98524,2.0,9.644216,10.1,1.0,15173.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,5233.4,2150.0,9.51989,50.971087,1.0,189.0,20.3,14.9,1.0,3235001.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
processed_X_train.head()

Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,meteo_date,meteo_rain_height,meteo_temperature_avg,meteo__pressure_saturation_avg,distance_piezo_meteo,hydro_observation_result_elab,...,piezo_status_Donnée contrôlée niveau 2,piezo_status_Donnée interprétée,piezo_qualification_Correcte,piezo_qualification_Incertaine,piezo_qualification_Incorrecte,piezo_qualification_Non qualifié,piezo_measure_nature_code_0,piezo_measure_nature_code_D,piezo_measure_nature_code_I,piezo_measure_nature_code_N
75220,102.0,51.74,-0.68218,44.744479,0.869764,3.6,13.7,14.6,1.0,553.0,...,1,0,1,0,0,0,1,0,0,0
48955,166.0,14.54,2.985469,42.584073,0.94706,88.0,5.8,6.2,1.0,3274.0,...,1,0,1,0,0,0,0,0,0,1
44966,140.0,55.0,-0.499435,45.778934,0.952442,0.0,5.154839,6.719672,1.0,86602.0,...,0,0,1,0,0,0,1,0,0,0
13568,372.0,111.32,-0.399286,43.652475,0.996303,0.0,4.6,8.1,1.0,228.0,...,0,0,1,0,0,0,1,0,0,0
92727,36.0,84.91,3.379233,43.539801,0.804187,0.0,9.5,9.3,1.0,27221.0,...,0,1,1,0,0,0,0,0,1,0


In [10]:
# Check for missing values in each column
missing_values = processed_X_train.isna().sum()

# Filter and display columns with missing values
columns_with_missing = missing_values[missing_values > 0]
print(columns_with_missing)


Series([], dtype: int64)


In [11]:
# Identify columns with missing values
missing_values = processed_X_train.isna().sum()
columns_with_missing = missing_values[missing_values > 0]
print(columns_with_missing)

# Display rows where there are missing values in any of these columns
rows_with_missing = processed_X_train[processed_X_train[columns_with_missing.index].isna().any(axis=1)]
rows_with_missing.head()


Series([], dtype: int64)


Unnamed: 0,piezo_station_investigation_depth,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,meteo_date,meteo_rain_height,meteo_temperature_avg,meteo__pressure_saturation_avg,distance_piezo_meteo,hydro_observation_result_elab,...,piezo_status_Donnée contrôlée niveau 2,piezo_status_Donnée interprétée,piezo_qualification_Correcte,piezo_qualification_Incertaine,piezo_qualification_Incorrecte,piezo_qualification_Non qualifié,piezo_measure_nature_code_0,piezo_measure_nature_code_D,piezo_measure_nature_code_I,piezo_measure_nature_code_N


In [12]:
a

NameError: name 'a' is not defined

# Quick KNN

In [None]:
#quick KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=25)  # Default is 5 neighbors
knn.fit(processed_X_train, y_train)

# Make predictions
y_val_pred = knn.predict(processed_X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))


processed_X_train.head()

In [13]:
#quick KNN
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

global_pipeline = Pipeline([
    ('processing', processing_pipeline),
    ('StandardScaler', PartialStandardScaler(columns='all')),
    ('estimator', RandomForestClassifier())
])

In [14]:
global_pipeline.fit(X_train, y_train)

>> (Info) Droped columns : []
>> (INFO) missing values in columns ['prelev_volume_0', 'prelev_volume_1', 'prelev_volume_2', 'prelev_other_volume_sum'] are filled by the minimum of the column by commune
>> (Info - Prelev) 'prelev_usage_label_0' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_volume_obtention_mode_label_0' has been one-hot-encoded in 4 features
>> (Info) Calculating medians and means for ['insee_%_agri', 'meteo_rain_height', 'insee_pop_commune', 'insee_med_living_level', 'insee_%_ind', 'insee_%_const']
>> (Info) Rainfall means by department and month calculated.
>> (Info) Medians and means successfully calculated.
>> (Info) Filling missing values with calculated medians and means.
>> (Info) Calculating means for numerical features and preparing for one-hot encoding.
>> (Info) Fitting completed: Means, modes, and one-hot encoders prepared.
>> (Info) Transforming data: Filling missing values and applying one-hot encoding.
>> (Info) Missing values in piezo

In [15]:
processed_X_train.isna().sum()

piezo_station_investigation_depth                          0
piezo_station_altitude                                     0
piezo_station_longitude                                    0
piezo_station_latitude                                     0
meteo_date                                                 0
meteo_rain_height                                          0
meteo_temperature_avg                                      0
meteo__pressure_saturation_avg                             0
distance_piezo_meteo                                       0
hydro_observation_result_elab                              0
hydro_status_code                                          0
hydro_qualification_code                                   0
prelev_volume_0                                            0
prelev_other_volume_sum                                    0
insee_%_agri                                               0
insee_pop_commune                                          0
insee_med_living_level  

In [16]:
y_val_pred = global_pipeline.predict(X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

>> (Info - Prelev) 'prelev_usage_label_0' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_volume_obtention_mode_label_0' has been one-hot-encoded in 4 features
>> (Info) Filling missing values with calculated medians and means.
>> (Info) Transforming data: Filling missing values and applying one-hot encoding.
>> (Info) Missing values in piezo_station_investigation_depth filled with department means.
>> (Info) One-hot encoding applied to piezo_obtention_mode with missing values filled.
>> (Info) One-hot encoding applied to piezo_status with missing values filled.
>> (Info) One-hot encoding applied to piezo_qualification with missing values filled.
>> (Info) One-hot encoding applied to piezo_measure_nature_code with missing values filled.
>> (Info) Data transformation completed.
>> (INFO - DropCols) columns ['piezo_station_department_name', 'piezo_station_update_date', 'piezo_station_commune_code_insee', 'piezo_station_pe_label', 'piezo_station_bdlisa_codes', 'piezo_sta

In [17]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

Accuracy: 0.91745

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      1934
           1       0.92      0.91      0.91      3222
           2       0.88      0.90      0.89      4630
           3       0.90      0.90      0.90      5415
           4       0.95      0.94      0.95      4799

    accuracy                           0.92     20000
   macro avg       0.92      0.92      0.92     20000
weighted avg       0.92      0.92      0.92     20000



In [18]:
rf = global_pipeline.named_steps['estimator']

rf.feature_importances_

array([0.06982378, 0.06485978, 0.06947398, 0.07543064, 0.07373116,
       0.02295468, 0.03411852, 0.03285143, 0.00301964, 0.07311116,
       0.01115544, 0.00646275, 0.04143633, 0.0539466 , 0.04273882,
       0.04752263, 0.04822685, 0.03856969, 0.04302839, 0.00381183,
       0.00622931, 0.00335751, 0.00614523, 0.00576939, 0.00272543,
       0.00417414, 0.00371956, 0.07271084, 0.00211856, 0.0020731 ,
       0.00234866, 0.00085879, 0.00064865, 0.0049447 , 0.00518335,
       0.0006795 , 0.00179809, 0.00036308, 0.00083997, 0.00056442,
       0.00740885, 0.00066753, 0.00148724, 0.00691   ])

### Save Pipeline

In [None]:
save = False
if save:
        
    pipeline_name = "pipeline_randomforest_1st"

    # Writing to sample.json
    with open(out_folder_config / Path(pipeline_name + ".pkl"), "wb") as file:
        pickle.dump(global_pipeline, file)