In [12]:
import pandas as pd
from pipe import *
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import pickle

### Load paths

In [13]:
# Input: raw training CSV
path_src_dataset = Path("./data/src/X_train_Hi5.csv")

# Outputs: cleaned datasets and pickled pipelines
out_folder_dataset = Path("./data/cleaned")
out_folder_config = Path("./data/processed/pipelines")

# Make sure both output folders exist before anything is written to them
for _out_dir in (out_folder_dataset, out_folder_config):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Note: the CSV itself is read in a later cell, restricted to the relevant columns.
# To load fewer rows while iterating, add e.g. nrows=100_000 to that read_csv call.


In [14]:
# Full list of columns requested from the source CSV. The individual name
# lists (and `target`) are defined outside this cell; the concatenation order
# is kept as-is.
columns_to_keep = (
    col_yass
    + cols_yael_input
    + cols_yael_need
    + col_flo
    + cols_lucien_need
    + cols_lucien_input
    + cols_mat
    + [target]
    + pizo_cols
)

### Loading data

In [15]:
# Ordinal encoding of the target labels (lowest level -> 0, highest -> 4)
mapping = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}

# Read only the selected columns, then discard exact duplicate rows.
# drop_duplicates() keeps the original index labels, so the index may have gaps.
df = (
    pd.read_csv(path_src_dataset, usecols=columns_to_keep)
    .drop_duplicates()
)

# Replace the textual target labels by their integer codes
df[target] = df[target].map(mapping)

  df = pd.read_csv(path_src_dataset, usecols=columns_to_keep)


## FUNCTIONS

In [16]:
def summer_train_test_split(df: pd.DataFrame, perc_val: float):
    """
    Split the dataset into a training set and a summer-only validation set.

    Rows whose ``meteo_date`` month is June through September form the
    candidate pool for validation. A random share of that pool (``perc_val``)
    becomes the validation set; every other row of ``df`` -- the summer rows
    that were not drawn and all non-summer rows -- goes to the training set.
    Both sets are shuffled with a fixed seed (42), so the split is
    reproducible.

    Parameters
    ----------
    df : pd.DataFrame
        The dataset to split. It must contain a ``meteo_date`` column in
        ``%Y-%m-%d`` format and the column named by the module-level
        ``target`` variable. ``df`` itself is not modified.
    perc_val : float
        Passed as ``test_size`` to ``train_test_split`` on the summer rows,
        i.e. the share of the *summer* rows (not of the whole dataset) that
        ends up in the validation set.

    Returns
    -------
    X_train : pd.DataFrame
        The training set features.
    X_val : pd.DataFrame
        The validation set features.
    y_train : pd.Series
        The training set target.
    y_val : pd.Series
        The validation set target.
    """
    # Only the month is needed to find the summer rows, so parse the dates
    # into a temporary Series instead of copying the whole frame.
    month = pd.to_datetime(df["meteo_date"], format="%Y-%m-%d").dt.month
    summer_df = df.loc[(month >= 6) & (month <= 9)]

    _, summer_val = train_test_split(
        summer_df, random_state=42, test_size=perc_val, shuffle=True)

    # summer_val.index holds index *labels*. They must be looked up with .loc:
    # with .iloc they would be read as positions, which selects the wrong rows
    # (or raises IndexError) as soon as the index is not 0..n-1, e.g. after
    # drop_duplicates().
    Xy_val: pd.DataFrame = df.loc[summer_val.index].sample(
        frac=1, random_state=42)
    Xy_train = df.drop(index=summer_val.index).sample(frac=1, random_state=42)

    X_train = Xy_train.drop(columns=[target])
    y_train = Xy_train[target]

    X_val = Xy_val.drop(columns=[target])
    y_val = Xy_val[target]

    return X_train, X_val, y_train, y_val

### Fitting the pipeline

In [17]:
# Validation set = a random half of the summer rows; all remaining rows are used for training.
# Alternative kept for reference (plain random 80/20 split):
# X_train, X_val, y_train, y_val = train_test_split(df.drop(columns=[target]), df[target], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = summer_train_test_split(df, perc_val=0.5)

In [None]:
# Columns handed to the partial standard scaler, grouped by column prefix.
# The overall order is significant only insofar as it matches the original list.
continuous_variables = (
    # piezo_station_* columns
    [
        "piezo_station_investigation_depth",
        "piezo_station_altitude",
        "piezo_station_longitude",
        "piezo_station_latitude",
    ]
    # meteo_* columns
    + [
        "meteo_date",
        "meteo_rain_height",
        "meteo_time_tn",
        "meteo_time_tx",
        "meteo_temperature_avg",
        "meteo__pressure_saturation_avg",
    ]
    # hydro_* column (first of two)
    + ["hydro_observation_result_elab"]
    # prelev_* columns
    + [
        "prelev_volume_0",
        "prelev_volume_1",
        "prelev_volume_2",
        "prelev_other_volume_sum",
    ]
    # insee_* columns
    + [
        "insee_%_agri",
        "insee_pop_commune",
        "insee_med_living_level",
        "insee_%_ind",
        "insee_%_const",
    ]
    # hydro_* column (second of two)
    + ["hydro_observation_log"]
)

In [None]:
# Second step: scaler restricted to the columns listed in `continuous_variables`
scaling_step = PartialStandardScaler(columns=continuous_variables)

# Full pipeline: the processing pipeline first, then the partial scaling step
global_pipeline = Pipeline([
    ("processing", processing_pipeline),
    ("StandardScaler", scaling_step),
])

In [18]:
# Fit the pipeline on the training features only and transform them, then apply
# the already-fitted pipeline to the validation features (no refit on X_val).
# NOTE(review): the recorded output of this cell ends with KeyboardInterrupt,
# so it was not run to completion in the saved session.
print("Pipeline ongoing...")
processed_X_train = global_pipeline.fit_transform(X_train)
processed_X_val = global_pipeline.transform(X_val)

Pipeline ongoing...
>> (Info) Droped columns : []


  self.mean = X[self.columns].applymap(self.convert_minute).mean()
  X[self.columns] = X[self.columns].applymap(self.convert_minute)


>> (Info - TimeTnx) fill na avec mean = 474.9644888624465 & 809.8013083514762
>> (INFO) missing values in columns ['prelev_volume_0', 'prelev_volume_1', 'prelev_volume_2', 'prelev_other_volume_sum'] are filled by the minimum of the column by commune


KeyboardInterrupt: 

In [None]:
# Fit the pipeline on the training split, this time passing the target as well.
# When cells are run top to bottom, this fitted object is the one pickled below.
global_pipeline.fit(X_train, y_train)

>> (Info) Droped columns : []


  self.mean = X[self.columns].applymap(self.convert_minute).mean()
  X[self.columns] = X[self.columns].applymap(self.convert_minute)


>> (Info - TimeTnx) fill na avec mean = 474.9644888624465 & 809.8013083514762
>> (INFO) missing values in columns ['prelev_volume_0', 'prelev_volume_1', 'prelev_volume_2', 'prelev_other_volume_sum'] are filled by the minimum of the column by commune
>> (Info - Prelev) 'prelev_usage_label_0' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_volume_obtention_mode_label_0' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_usage_label_1' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_volume_obtention_mode_label_1' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_usage_label_2' has been one-hot-encoded in 4 features
>> (Info - Prelev) 'prelev_volume_obtention_mode_label_2' has been one-hot-encoded in 4 features
>> (Info) Calculating medians and means for ['insee_%_agri', 'meteo_rain_height', 'insee_pop_commune', 'insee_med_living_level', 'insee_%_ind', 'insee_%_const']
>> (Info) Rainfall means by department and month calcul

## Save Pipeline


In [None]:
# Toggle: set to False to skip writing the pipeline to disk
save = True
if save:
    pipeline_name = "pipeline_processing_5"
    pipeline_path = out_folder_config / f"{pipeline_name}.pkl"

    # Serialise the pipeline object to a pickle file in the config folder
    with pipeline_path.open("wb") as pkl_file:
        pickle.dump(global_pipeline, pkl_file)