In [1]:
import pandas as pd
from sklearn import preprocessing

In [2]:
# Load the randomized sample of the full training set into the notebook-wide
# `df`; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("sampled_x_train_t.csv")  # Randomized sample of the big dataset

  df = pd.read_csv("sampled_x_train_t.csv")  # Randomized sample of the big dataset


# Cleaning all redundant values

In [4]:
def cleaning(
    df: pd.DataFrame,
    piezo: bool = False,
    meteo: bool = False,
    hydro: bool = False,
    prelev: bool = False,
    insee: bool = False,
) -> pd.DataFrame:
    """
    Drop the redundant columns of every data family whose flag is enabled.

    Parameters:
        df (pd.DataFrame): The DataFrame to filter; it is left untouched.
        piezo (bool): Drop the piezo station/producer descriptor columns.
        meteo (bool): Drop the meteo identifier, name and IR radiation columns.
        hydro (bool): Drop the hydro station code and label/method columns.
        prelev (bool): Drop the per-structure prelev columns (suffixes 0 to 2)
            and the summed "other" volume.
        insee (bool): Drop the insee indicator columns; this group also lists
            the three prelev_commune_code_insee_* columns.

    Returns:
        pd.DataFrame: A new DataFrame without the selected columns. Listed
        columns that are absent from `df` are silently skipped.
    """
    # One (flag, column names) pair per family, in processing order.
    drop_groups = (
        (
            piezo,
            [
                "piezo_station_department_code",
                "piezo_station_update_date",
                "piezo_station_department_name",
                "piezo_station_commune_code_insee",
                "piezo_station_pe_label",
                "piezo_station_bdlisa_codes",
                "piezo_station_bss_code",
                "piezo_station_commune_name",
                "piezo_station_bss_id",
                "piezo_bss_code",
                "piezo_continuity_name",
                "piezo_producer_code",
                "piezo_producer_name",
                "piezo_measure_nature_name",
            ],
        ),
        (
            meteo,
            ["meteo_id", "meteo_radiation_IR", "meteo_name"],
        ),
        (
            hydro,
            [
                "hydro_station_code",
                "hydro_status_label",
                "hydro_method_code",
                "hydro_method_label",
                "hydro_qualification_label",
            ],
        ),
        (
            prelev,
            [
                "prelev_structure_code_0",
                "prelev_volume_0",
                "prelev_usage_label_0",
                "prelev_volume_obtention_mode_label_0",
                "prelev_longitude_0",
                "prelev_latitude_0",
                "prelev_commune_code_insee_0",
                "prelev_structure_code_1",
                "prelev_volume_1",
                "prelev_usage_label_1",
                "prelev_volume_obtention_mode_label_1",
                "prelev_longitude_1",
                "prelev_latitude_1",
                "prelev_commune_code_insee_1",
                "prelev_structure_code_2",
                "prelev_volume_2",
                "prelev_usage_label_2",
                "prelev_volume_obtention_mode_label_2",
                "prelev_longitude_2",
                "prelev_latitude_2",
                "prelev_commune_code_insee_2",
                "prelev_other_volume_sum",
            ],
        ),
        (
            insee,
            [
                "prelev_commune_code_insee_0",
                "prelev_commune_code_insee_1",
                "prelev_commune_code_insee_2",
                "insee_%_agri",
                "insee_pop_commune",
                "insee_med_living_level",
                "insee_%_ind",
                "insee_%_const",
            ],
        ),
    )

    # Flatten the column names of every enabled family into one drop list.
    to_drop = [
        column
        for enabled, columns in drop_groups
        if enabled
        for column in columns
    ]

    # errors="ignore" keeps the call safe when some listed columns are missing.
    return df.drop(columns=to_drop, errors="ignore")

# Unify longitude and latitude columns

In [5]:
def unify_long_lat(df: pd.DataFrame, distance_threshold: int = 25) -> pd.DataFrame:
    """
    Unifies longitude and latitude into single columns if distance_piezo_hydro is below the threshold.

    For rows whose `distance_piezo_hydro` is strictly below `distance_threshold`,
    `longitude` / `latitude` receive the piezo station coordinates; the other
    rows are left as they were (NaN when the unified columns are newly created).
    The per-source coordinate and distance columns are then removed.

    The input DataFrame is never modified: all work happens on a copy.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        distance_threshold (int): The maximum distance for unification
            (exclusive; same unit as `distance_piezo_hydro`).

    Returns:
        pd.DataFrame: A new DataFrame with unified longitude and latitude columns.
        When `distance_piezo_hydro` (or a piezo coordinate column) is missing,
        the corresponding unified column is simply not created.
    """
    # Columns to exclude after processing
    excluding = [
        "piezo_station_latitude",
        "piezo_station_longitude",
        "hydro_longitude",
        "hydro_latitude",
        "meteo_longitude",
        "meteo_latitude",
        "distance_piezo_hydro",
        "distance_hydro_meteo",
    ]

    # Unified column -> source column it is filled from
    unified_sources = {
        "longitude": "piezo_station_longitude",
        "latitude": "piezo_station_latitude",
    }

    # Work on a copy so the caller's frame does not silently gain new columns.
    result = df.copy()

    if "distance_piezo_hydro" in result.columns:
        # Computed once and shared by both coordinates; NaN distances compare False.
        close_enough = result["distance_piezo_hydro"] < distance_threshold

        for target, source in unified_sources.items():
            # Skip a missing source column instead of failing with a KeyError,
            # in line with the tolerant handling of a missing distance column.
            if source in result.columns:
                result.loc[close_enough, target] = result[source]

    # Drop the excluded columns (missing ones are ignored)
    return result.drop(columns=excluding, errors="ignore")

# Unify dates

In [6]:
def unify_date(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the per-source date columns into a single `date` column.

    `date` is copied from `piezo_measurement_date` when that column exists;
    no other date column is used as a fallback. The three source date columns
    are then removed. The input DataFrame is never modified.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.

    Returns:
        pd.DataFrame: A new DataFrame with the unified `date` column (absent
        when `piezo_measurement_date` is not in `df`) and without the source
        date columns.
    """
    # Columns to exclude after processing
    excluding = ["piezo_measurement_date", "meteo_date", "hydro_observation_date_elab"]

    result = df
    if "piezo_measurement_date" in df.columns:
        # assign() returns a new frame, so the caller's frame keeps its columns.
        result = df.assign(date=df["piezo_measurement_date"])

    # drop() also returns a new frame; missing columns are ignored.
    return result.drop(columns=excluding, errors="ignore")

# Cleaning residual empty values

In [7]:
def keep_essential_data(df: pd.DataFrame, threshold: float = 0.8) -> pd.DataFrame:
    """
    Keep only the columns that are sufficiently populated.

    A column is kept when its fraction of non-null values is greater than or
    equal to `threshold`; sparser columns are removed. The initial and kept
    column lists are printed.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.

        threshold (float): Minimum fraction (0 to 1) of non-null values a
            column needs in order to be kept.

    Returns:
        pd.DataFrame: The selection of `df` restricted to the kept columns.
    """
    # Fraction of non-null values per column
    fill_rate = df.notna().mean()

    # Names of the columns that reach the required fill rate
    kept_columns = fill_rate.loc[lambda rate: rate >= threshold].index
    result = df[kept_columns]

    # Report what was kept
    print("Initals columns :", df.columns.tolist())
    print("Keeped columns :", result.columns.tolist())

    return result

# Cleaning piezo data

In [8]:
def piezzo_clean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill and encode the piezo categorical columns.

    Steps:
        1. Missing values get a default: "Incertaine" for `piezo_qualification`,
           0 for `piezo_measure_nature_code`, "Mode d'obtention inconnu" for
           `piezo_obtention_mode` and "Donnée brute" for `piezo_status`.
        2. `piezo_qualification` labels are mapped to integers (0 to 3); labels
           missing from the mapping become NaN.
        3. `piezo_status` and `piezo_obtention_mode` are one-hot encoded, with
           the first level of each dropped.

    The input DataFrame is never modified; every step runs on new frames.

    Parameters:
        df (pd.DataFrame): The input DataFrame. It must contain the four piezo
            columns listed above.

    Returns:
        pd.DataFrame: A new, preprocessed DataFrame.

    Raises:
        KeyError: If one of the required piezo columns is missing.
    """
    # Integer code for each qualification label
    qualification_label_mapping = {
        "Correcte": 3,
        "Non qualifié": 2,
        "Incorrecte": 0,
        "Incertaine": 1,
    }

    # assign() builds a new frame, so the caller's columns keep their NaN values.
    filled = df.assign(
        piezo_qualification=df["piezo_qualification"]
        .fillna("Incertaine")
        .map(qualification_label_mapping),
        piezo_measure_nature_code=df["piezo_measure_nature_code"].fillna(0),
        piezo_obtention_mode=df["piezo_obtention_mode"].fillna(
            "Mode d'obtention inconnu"
        ),
        piezo_status=df["piezo_status"].fillna("Donnée brute"),
    )

    # One-hot encode status and obtention mode; drop_first removes one level
    # per column to avoid a redundant indicator.
    elements = ["piezo_status", "piezo_obtention_mode"]
    return pd.get_dummies(filled, columns=elements, drop_first=True)

# Filling the other NaN values

In [9]:
from faiss_imputer import FaissImputer
from sklearn.impute import SimpleImputer


def impute_na(df: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values with FaissImputer, then a median SimpleImputer pass.

    Columns of dtype object or datetime64 are removed first, so the returned
    frame only holds the remaining columns.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame with missing values.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the remaining columns imputed.
        NOTE(review): the frames are rebuilt without passing the original
        index, so the result presumably carries a fresh default index; confirm.
    """
    # Remove the columns that are not handed to the imputers
    non_numeric_cols = df.select_dtypes(include=["object", "datetime64"]).columns
    numeric_df = df.drop(columns=non_numeric_cols)

    # First pass: FaissImputer with 10 neighbours; second pass: column medians
    faiss_imputer = FaissImputer(n_neighbors=10)
    median_imputer = SimpleImputer(strategy="median")

    # Fit on the frame that still contains the missing values, then transform it
    faiss_imputer.fit(numeric_df)
    faiss_filled = pd.DataFrame(
        faiss_imputer.transform(numeric_df), columns=numeric_df.columns
    )

    # Median pass over the output of the first imputer
    return pd.DataFrame(
        median_imputer.fit_transform(faiss_filled), columns=faiss_filled.columns
    )

In [10]:
from sklearn.preprocessing import MinMaxScaler


def date_to_timestamp_convertion(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the `date` column to Unix seconds and min-max scale it.

    Values are parsed as "%Y-%m-%d"; values that cannot be parsed become NaN
    in the result instead of raising or turning into a huge negative sentinel
    that would distort the scaling. The input DataFrame is never modified.

    Parameters:
        df (pd.DataFrame): The DataFrame holding a `date` column.

    Returns:
        pd.DataFrame: A copy of `df` whose `date` column is scaled with
        MinMaxScaler's default settings.

    Raises:
        KeyError: If `df` has no `date` column.
    """
    parsed = pd.to_datetime(df["date"], format="%Y-%m-%d", errors="coerce")

    # Subtracting the epoch and floor-dividing by one second gives Unix seconds
    # whatever the datetime resolution is, and keeps NaT as NaN (casting NaT
    # with astype(int) is not a valid conversion).
    epoch_seconds = (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

    # Work on a copy so the caller's `date` column keeps its original values.
    result = df.copy()
    result["date"] = epoch_seconds

    # Normalize for better scaling; MinMaxScaler ignores NaN when fitting and
    # leaves them as NaN when transforming.
    scaler = MinMaxScaler()
    result["date"] = scaler.fit_transform(result[["date"]])

    return result

In [11]:
def drop_na(df):
    """
    Label-encode every object-typed column except the target column.

    Despite its name, this function drops nothing: missing values in the
    encoded columns are replaced by the "Unknown" label, then each column is
    turned into integer codes with a LabelEncoder fitted on that column alone.
    `piezo_groundwater_level_category` is left as is when present.

    The input DataFrame is never modified.

    Parameters:
        df (pd.DataFrame): The DataFrame to encode.

    Returns:
        pd.DataFrame: A copy of `df` with its object columns label-encoded.
    """
    target = "piezo_groundwater_level_category"

    # Object columns to encode. Filtering (instead of list.remove) avoids a
    # ValueError when the target is absent or is not an object column.
    string_features = [
        column
        for column in df.select_dtypes(include=["object"]).columns
        if column != target
    ]

    # Copy first so the caller's frame keeps its original string columns.
    encoded = df.copy()
    if not string_features:
        return encoded

    # Initialize the LabelEncoder (refitted for each column by fit_transform)
    le = preprocessing.LabelEncoder()

    # fillna must run BEFORE astype(str): casting first turns NaN into the
    # string "nan", so a later fillna would never match anything.
    encoded[string_features] = encoded[string_features].apply(
        lambda col: le.fit_transform(col.fillna("Unknown").astype(str))
    )

    return encoded

In [12]:
def pre_process(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the whole preprocessing pipeline on a raw DataFrame.

    The steps are applied in this order: `cleaning` with every column family
    enabled, `unify_long_lat`, `unify_date`, `date_to_timestamp_convertion`,
    `piezzo_clean` and `drop_na`. `impute_na` is not part of the pipeline.

    Parameters:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The preprocessed DataFrame.
    """
    # Each step receives the output of the previous one.
    return (
        cleaning(df, piezo=True, meteo=True, hydro=True, prelev=True, insee=True)
        .pipe(unify_long_lat)
        .pipe(unify_date)
        .pipe(date_to_timestamp_convertion)
        .pipe(piezzo_clean)
        .pipe(drop_na)
    )

In [13]:
# Run the full pipeline on the loaded sample. This rebinds `df` to the processed
# frame, so re-running this cell without reloading the CSV first feeds
# already-processed data back into pre_process.
df = pre_process(df)

In [None]:
# Define a mapping for the target labels to integers, from "Very Low" (0) to
# "Very High" (4)
label_mapping = {"Very Low": 0, "Low": 1, "Average": 2, "High": 3, "Very High": 4}

# Map the labels to integers.
# NOTE(review): Series.map with a dict gives NaN for every value that is not a
# key, so an unexpected label (or re-running this cell on the already-mapped
# integers) turns the target into NaN.
df["piezo_groundwater_level_category"] = df["piezo_groundwater_level_category"].map(
    label_mapping
)

# Persist the processed frame next to the notebook.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# confirm that downstream readers expect it (e.g. read with index_col=0).
df.to_csv("pre-processed_data.csv")