## Application des transformations définies dans "transformers.py"

In [1]:
from transformers import *

In [2]:
import pandas as pd
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pathlib import Path
from sklearn.model_selection import train_test_split

import pickle


In [3]:
# Input: source training set.
path_src_dataset = Path("./data/src/X_train_Hi5.csv")

# Outputs: folder for the cleaned datasets and folder for the pickled pipelines.
out_folder_dataset = Path("./data/cleaned")
out_folder_config = Path("./data/processed/pipelines")

# Create both output folders (and any missing parents); no-op when they already exist.
for out_folder in (out_folder_dataset, out_folder_config):
    out_folder.mkdir(parents=True, exist_ok=True)



# Only these columns (features + target) are read from the source CSV.
columns_to_keep = [
    "insee_%_agri",
    "meteo_rain_height",
    "piezo_station_altitude",
    "meteo_altitude",
    "piezo_groundwater_level_category",
    "meteo_date",
]
df = pd.read_csv(path_src_dataset, usecols=columns_to_keep)

target = "piezo_groundwater_level_category"

X = df.drop(columns=target)

# Encode the target categories as integers 0..4.
mapping = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}
y = df[target].map(mapping)

# Series.map() yields NaN for every label that is missing or absent from `mapping`.
# Fail here, listing the offending labels, instead of carrying NaN targets into
# the train/validation split and the saved CSV files.
unmapped = y.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, target].unique().tolist()
    raise ValueError(
        f"{int(unmapped.sum())} rows of '{target}' have missing or unmapped labels: {unexpected}"
    )

# 80/20 train/validation split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Preprocessing steps, run in list order. The transformer classes come from the
# star import of the local `transformers` module.
# NOTE(review): 0.7 is presumably a NaN-rate threshold for dropping columns — confirm in transformers.py.
processing_steps = [
    ("DropNaRate", DropNaRate(0.7)),
    ("CleanFeatures", CleanFeatures(["insee_%_agri", "meteo_rain_height"])),
    ("Altitude", AltitudeTrans(columns=["piezo_station_altitude", "meteo_altitude"])),
    ("Dates", DateTransformer()),
    # ... Add other transformations here
]
processing_pipeline = Pipeline(steps=processing_steps)

In [None]:
# Fit the preprocessing steps on the training split, then apply the already-fitted
# pipeline to the validation split (transform only, no refit).
print("Pipeline ongoing...")
processed_X_train = processing_pipeline.fit_transform(X_train)
processed_X_val = processing_pipeline.transform(X_val)

In [None]:

# Write the four splits as CSV files (row index omitted) into out_folder_dataset.
splits_to_save = {
    "X_train.csv": processed_X_train,
    "X_val.csv": processed_X_val,
    "y_train.csv": y_train,
    "y_val.csv": y_val,
}
for file_name, split in splits_to_save.items():
    split.to_csv(out_folder_dataset / file_name, index=False)

print("Data converted to csv")

In [None]:
#quick KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


# Baseline: KNN with 25 neighbours (scikit-learn's default would be 5), trained
# directly on the preprocessed training features.
knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(processed_X_train, y_train)

# Predict the validation split.
y_val_pred = knn.predict(processed_X_val)

# Report accuracy and the per-class precision/recall/F1 table.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
print("Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)


Accuracy: 0.4380582407642952

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.51      0.50    107538
           1       0.41      0.44      0.43    125818
           2       0.41      0.43      0.42    129952
           3       0.42      0.40      0.41    116354
           4       0.48      0.41      0.44     86402

    accuracy                           0.44    566064
   macro avg       0.44      0.44      0.44    566064
weighted avg       0.44      0.44      0.44    566064



# Quick KNN

In [4]:
#quick KNN
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# End-to-end model: preprocessing -> PartialStandardScaler on all columns -> KNN (25 neighbours).
scaler = PartialStandardScaler(columns='all')
classifier = KNeighborsClassifier(n_neighbors=25)
global_pipeline = Pipeline(steps=[
    ('processing', processing_pipeline),
    ('StandardScaler', scaler),
    ('estimator', classifier),
])

In [5]:
# Fit the preprocessing steps, the scaler and the KNN estimator on the training split.
global_pipeline.fit(X_train, y_train)

>> (Info) Droped columns : []
>> (Info) Column insee_%_agri has been standardized to numeric.
>> (Info) Missing values in insee_%_agri filled with median: 3.4
>> (Info) Missing values in meteo_rain_height filled with median: 0.1
>> (INFO - PartialStandardScaler) columns ['piezo_station_altitude', 'meteo_altitude', 'date', 'meteo_rain_height', 'insee_%_agri'] have bean standardized


In [7]:
# Predict the validation split with the fitted end-to-end pipeline.
# The metrics are printed by the next cell; printing them here as well computed
# the classification report twice on the same predictions.
y_val_pred = global_pipeline.predict(X_val)

>> (Info) Missing values in insee_%_agri filled with median: 3.4
>> (Info) Missing values in meteo_rain_height filled with median: 0.1
>> (INFO - PartialStandardScaler) columns ['piezo_station_altitude', 'meteo_altitude', 'date', 'meteo_rain_height', 'insee_%_agri'] have bean standardized


In [8]:
# Validation metrics for the end-to-end pipeline's predictions.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
print("Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)

Accuracy: 0.4380582407642952

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.51      0.50    107538
           1       0.41      0.44      0.43    125818
           2       0.41      0.43      0.42    129952
           3       0.42      0.40      0.41    116354
           4       0.48      0.41      0.44     86402

    accuracy                           0.44    566064
   macro avg       0.44      0.44      0.44    566064
weighted avg       0.44      0.44      0.44    566064



### Save Pipeline

In [9]:
pipeline_name = "1st_pipeline_12h26"

# Pickle the fitted pipeline to <out_folder_config>/<pipeline_name>.pkl.
# Unpickling later requires the module defining the custom transformers to be importable.
pipeline_path = out_folder_config / f"{pipeline_name}.pkl"
with pipeline_path.open("wb") as pickle_file:
    pickle.dump(global_pipeline, pickle_file)