In [1]:
%load_ext watermark
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from myst_nb import glue
from IPython.display import Markdown as md

from plastockconf import name_zones, name_particles, name_frequentation, name_situation
from plastockconf import particle_groups, name_substrate, name_distance, table_css_styles, table_css_styles_top

from plastock import attribute_summary, attribute_summary_test, attribute_summary_grid, add_table_to_page

# Shared display settings for the tables on this page.
a_property = {'color': 'red'}
format_kwargs = {'precision': 2, 'thousands': "'", 'decimal': ","}
glue('blank_caption', " ", display=False)

# Identifiers forwarded to add_table_to_page when tables are registered.
section = 'A'
page = 2

# Survey results in long form; the two column names are given their accented spelling.
work_data = pd.read_csv("data/end_pipe/long_form_micro.csv").rename(
    columns={'echantillon': 'échantillon', 'frequentation': 'fréquentation'}
)
beach_data = pd.read_csv("data/end_pipe/asl_beaches.csv")

In [2]:
# Counters for the tables and figures of this page.
table_no = 1
figure_no = 1

caption = 'Les données d\'analyse. Tous les tableaux et figures sont construits à partir de cet ensemble de données.'
rule = 'Les attributs dont la moyenne des résultats est supérieure à la moyenne du projet sont en rouge.'

# Style the head of the working data, then register it as the first table of the page.
t_0 = work_data.head().style
t_0 = t_0.set_table_styles(table_css_styles)
table_0 = add_table_to_page(
    t_0, table_no, caption, section, page, rule, format_index='columns'
)
glue('tablea21', table_0, display=False)

In [3]:
# ! soft and hard objects are combined !
# Every record that is not a fibre is relabelled 'fragments', so that the
# 'objet' column of work_datai only distinguishes fibres from fragments.
fibers = work_data.loc[work_data.objet == 'fibres'].copy()
not_fibers = work_data.loc[work_data.objet != 'fibres'].assign(objet='fragments')

work_datai = pd.concat([fibers, not_fibers])

In [4]:
# def name_the_new_distance(x, less='<= 500 m', more = '> 500 m'):
#     if x == 1:
#         return less
#     else:
#         return more

# def name_the_new_freq(x, new):
#     if x <= 2:
#         return new
#     else:
#         return 'Elévée'


# # the feature variables are added to the work_data
# ti = work_data.copy()
# features = ['frequentation', 'situation', 'orientation', 'distance']

# beach_datax = pd.read_csv("data/end_pipe/asl_beaches.csv").set_index('Plage')

# # they can be merged on the Plage column and the index
# env_plastock = ti.merge(beach_datax[features], left_on='Plage', right_index=True)

# # ! creation of composite variables !
# t_and_f = env_plastock.loc[:, ['échantillon', 'slug','date','code', 'pcs/m²', 'quantité', 'frequentation', 'situation', 'distance', 'substrat']].copy()


# # the substrat and distance features are being combined
# # the two lowest and the two highest of each group are being combined
# # substrat is a matter of combining different granularities. They are being grouped as
# # sand and gravel.
# # distance is now grouped by locations either less than or equal to 500 meters
# t_and_f.loc[t_and_f.substrat <= 2, 'substrat'] = 1
# t_and_f.loc[t_and_f.substrat > 2, 'substrat'] = 2
# t_and_f.loc[t_and_f.distance <= 2, 'distance'] = 1
# t_and_f.loc[t_and_f.distance > 2, 'distance'] = 2
# t_and_f.loc[t_and_f.frequentation <= 2, 'frequentation'] = 2

# # ! the data used in the models !
# f_combi = t_and_f.copy()



# f_combi.rename(columns={'frequentation':'fréquentation', 'loc_date': 'échantillon'}, inplace=True)

# # the feature variables are combined along the ordinal axis. Going from four catgories
# # to two in the case of distance and substrate. city and country are already binary
# # the values of low and moderate frequentation are combined also.
# f_comb = f_combi.copy()
# f_comb['distance'] = f_comb['distance'].apply(lambda x: name_the_new_distance(x))
# f_comb['fréquentation'] = f_comb['fréquentation'].apply(lambda x: name_the_new_freq(x, 'faible-moyenne'))
# f_comb['situation'] = f_comb['situation'].apply(lambda x: name_situation[x])
# f_comb['substrat'] = f_comb['substrat'].apply(lambda x: name_the_new_distance(x, less='Sable', more='Graviers'))


# # ! no composite variables !
# no_combined = env_plastock.loc[:, ['échantillon', 'slug','date','code', 'pcs/m²', 'frequentation', 'situation', 'distance', 'substrat']].copy()
# no_combined.rename(columns={'frequentation':'fréquentation', 'loc_date': 'échantillon'}, inplace=True)

# no_combined['distance'] = no_combined['distance'].apply(lambda x: name_distance[x])
# no_combined['fréquentation'] = no_combined['fréquentation'].apply(lambda x: name_frequentation[x])
# no_combined['situation'] = no_combined['situation'].apply(lambda x: name_situation[x])
# no_combined['substrat'] = no_combined['substrat'].apply(lambda x: name_substrate[x])

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import resample

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder



def analyze_scenario(scenario_data, func, n_iterations=100, bin_width=0.2):
    """
    Bootstrap a regressor on one scenario and collect predictions and diagnostics.

    The target column ``pcs_m`` is min-max scaled and the four categorical
    columns are one-hot encoded. The data is split once into a training and a
    test set (20% held out, fixed seed). At every iteration the training set is
    resampled, ``func`` is refit on the sample and used to predict the test set.

    :param scenario_data: DataFrame holding the columns 'fréquentation',
        'situation', 'distance', 'substrat' and the target 'pcs_m'.
    :param func: Estimator exposing ``fit``, ``predict`` and
        ``feature_importances_``. The same object is refit at every iteration.
    :param n_iterations: Number of bootstrap iterations. Default is 100.
    :param bin_width: Not used by this function; kept so existing calls stay valid.
    :return: A tuple ``(predictions_flat, feature_importances, cum_mse, cum_r2)``:
        the test-set predictions of every iteration, flattened and expressed in
        the original units of ``pcs_m``; the feature importances averaged over
        the iterations; the mean squared error of each iteration (original
        units); the r² score of each iteration.
    """
    feature_columns = ['fréquentation', 'situation', 'distance', 'substrat']

    # Scale the target to [0, 1]; the scaler is kept to map predictions back.
    y_scaler = MinMaxScaler()
    y_scaled = y_scaler.fit_transform(scenario_data['pcs_m'].values.reshape(-1,1)).flatten()

    # One-hot encode the categorical features.
    encoder = OneHotEncoder(sparse_output=False)
    encoded_data = encoder.fit_transform(scenario_data[feature_columns])
    X_encoded = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(feature_columns))

    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_scaled, test_size=0.2, random_state=42)

    # Observed test values in original units. The predictions are transformed
    # back before the MSE is computed, so the reference values must be on the
    # same scale (previously the scaled y_test was compared to unscaled predictions).
    y_test_observed = y_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()

    # Bootstrap predictions and accumulate feature importances
    bootstrap_predictions = []
    feature_importances_accumulated = np.zeros(X_train.shape[1])

    # Diagnostics collected at each repetition
    cum_mse = []
    cum_r2 = []

    for _ in range(n_iterations):
        X_train_sample, y_train_sample = resample(X_train, y_train)
        rf_model_sample = func
        rf_model_sample.fit(X_train_sample, y_train_sample)

        scaled_pred = rf_model_sample.predict(X_test)

        # r² is computed on the scaled values, both arguments share that scale.
        cum_r2.append(r2_score(y_test, scaled_pred))

        pred = y_scaler.inverse_transform(scaled_pred.reshape(-1, 1)).flatten()
        bootstrap_predictions.append(pred)
        cum_mse.append(mean_squared_error(y_test_observed, pred))

        feature_importances_accumulated += rf_model_sample.feature_importances_

    # Average feature importances
    feature_importances = feature_importances_accumulated / n_iterations

    # Flatten the predictions array
    predictions_flat = np.array(bootstrap_predictions).flatten()

    return predictions_flat, feature_importances, cum_mse, cum_r2

def plot_histogram(predictions, observed, title="", reference='camp-dist-1', display=False, order='predictions'):
    """
    Draw the predicted and observed values as two overlaid histograms and glue the figure.

    :param predictions: Predicted values.
    :param observed: Observed values.
    :param title: Title of the figure, aligned to the left.
    :param reference: Key under which the figure is glued.
    :param display: Forwarded to ``glue``.
    :param order: With 'predictions' the observed histogram gets the higher
        zorder; with any other value the predictions get the higher zorder.
    """
    if order == 'predictions':
        z_predictions, z_observed = 0, 1
    else:
        z_predictions, z_observed = 1, 0

    fig, ax = plt.subplots(figsize=(10, 6))
    shared = dict(bins=20, stat="probability", ax=ax)
    sns.histplot(predictions, label='prédictions', zorder=z_predictions, **shared)
    sns.histplot(observed, label='observée', zorder=z_observed, **shared)

    ax.set_title(title, loc='left')
    ax.set_xlabel('pcs/m')
    ax.set_ylabel('Densité de Probabilité')
    ax.legend()

    glue(reference, fig, display=display)
    plt.close()

def evalutate_model(r2s, mses, label, model='random-forest'):
    """
    Summarise the diagnostics of a model in a one-row DataFrame.

    :param r2s: r² scores, one per repetition.
    :param mses: Mean squared errors, one per repetition.
    :param label: Index label of the returned row.
    :param model: Name stored in the 'model' column.
    :return: DataFrame with the mean of ``r2s`` and the mean of ``mses``,
        both rounded to two decimals, and the model name.
    """
    row = {
        "cross validated error": np.round(np.mean(r2s), 2),
        "mean² error": np.round(np.mean(mses), 2),
        'model': model,
    }
    return pd.DataFrame([row], index=[label])

# Settings for the quantile tables built by makeqdf.
# NOTE: format_kwargs is rebound here; the definition from the first cell
# (precision=2 with a decimal comma) is no longer in effect after this line.
format_kwargs = dict(precision=0, thousands="'")
# quantiles reported for the observed and the predicted values
q_uants = [0.01, 0.25, 0.5, 0.75, 0.99]
# row labels: one per quantile, plus a final row for the mean
index = ['1%', '25%', '50%', '75%', '99%', 'Moyenne']
def makeqdf(observed, predicted, index=index, quants=q_uants, caption=""):
    """
    Build a styled table comparing quantiles and mean of observed and predicted values.

    :param observed: Observed values.
    :param predicted: Predicted values.
    :param index: Row labels: one per quantile followed by one for the mean.
    :param quants: Quantiles to compute.
    :param caption: Caption of the table.
    :return: A pandas Styler with the columns 'observée' and 'prédiction'.
    """
    columns = {}
    for name, values in (('observée', observed), ('prédiction', predicted)):
        # the quantiles first, the mean as last row
        columns[name] = [*np.quantile(values, quants), np.mean(values)]

    styled = pd.DataFrame(columns, index=index).style
    styled = styled.set_table_styles(table_css_styles_top)
    styled = styled.format(**format_kwargs)
    return styled.set_caption(caption)

# grouping keys used by the scenario cells when summing pcs_m
cols = ['échantillon', 'position', 'fréquentation','situation', 'distance', 'substrat']

In [6]:
# model parameters
estimators = 10
iterations = 1000

# The sampled surface is 10 cm x 10 cm (100 cm²);
# the counts are scaled by 10000/100.
test_xt = work_datai.assign(pcs_m=(work_datai['compte']/100)*10000)

# Scenario: records where position == 1
test_xi = test_xt.loc[test_xt['position'] == 1].copy()
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()
test_x = test_x.loc[:, ['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(
    n_estimators=estimators,
    criterion="absolute_error",
    random_state=42,
)
predictions, feature_importance, mse, r2 = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# quantile table for this scenario
caption = 'Ligne d\'eau'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-lignedeau-sa', q_sit_2_freq_3, display=False)

# histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Ligne d\'eau'
plot_histogram(
    predictions,
    test_x.pcs_m.values,
    title=title,
    reference='lignedeau-sa',
    display=False,
    order="observed",
)

In [7]:
# Scenario: all the data, no filter is applied

cols = ['échantillon', 'position', 'fréquentation','situation', 'distance', 'substrat']
test_xi = test_xt.copy()
test_x = test_xi.groupby(cols, as_index=False).pcs_m.sum()

test_x = test_x[['fréquentation', 'situation', 'distance', 'substrat',  'pcs_m']]

func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2 = analyze_scenario(test_x, func,  n_iterations=iterations, bin_width=0.2)

# The caption and the title previously read "Ligne d'eau", copied from the
# preceding cell; this cell is glued under the keys 'q-tous-md-sa' and
# 'tous-md-sa' and covers every record.
caption = 'Toutes les conditions'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-tous-md-sa', q_sit_2_freq_3, display=False)

# the histogram for this scenario:
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Toutes les conditions'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='tous-md-sa', display=False, order="observed")

In [8]:
# Scenario: records where position == 2
test_xi = test_xt.loc[test_xt['position'] == 2].copy()
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()
test_x = test_x.loc[:, ['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(
    n_estimators=estimators,
    criterion="absolute_error",
    random_state=42,
)
predictions, feature_importance, mse, r2 = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# quantile table for this scenario
caption = 'Plage seche'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-plageseche-sa', q_sit_2_freq_3, display=False)

# histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Plage seche'
plot_histogram(
    predictions,
    test_x.pcs_m.values,
    title=title,
    reference='plageseche-sa',
    display=False,
    order="observed",
)

In [9]:
# Scenario: fibres only, among the records where position == 2
cols = ['échantillon', 'position', 'fréquentation','situation', 'distance', 'substrat']
test_xi = test_xt.loc[(test_xt['position'] == 2) & (test_xt.objet == 'fibres')].copy()
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()
test_x = test_x.loc[:, ['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(
    n_estimators=estimators,
    criterion="absolute_error",
    random_state=42,
)
predictions, feature_importance, mse, r2 = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# quantile table for this scenario
caption = 'Plage seche et fibres'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-plagesechefibres-sa', q_sit_2_freq_3, display=False)

# histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Plage seche et fibres'
plot_histogram(
    predictions,
    test_x.pcs_m.values,
    title=title,
    reference='plagesechefibres-sa',
    display=False,
    order="observed",
)

In [10]:
# Scenario: records where substrat == 1
test_xi = test_xt.loc[test_xt['substrat'] == 1].copy()
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()
test_x = test_x.loc[:, ['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(
    n_estimators=estimators,
    criterion="absolute_error",
    random_state=42,
)
predictions, feature_importance, mse, r2 = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# quantile table for this scenario
caption = 'Sables fins'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-sablesfins-sa', q_sit_2_freq_3, display=False)

# histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Sables fins'
plot_histogram(
    predictions,
    test_x.pcs_m.values,
    title=title,
    reference='sablesfins-sa',
    display=False,
    order="observed",
)

In [11]:
# Scenario: records where substrat == 4
test_xi = test_xt.loc[test_xt['substrat'] == 4].copy()
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()
test_x = test_x.loc[:, ['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(
    n_estimators=estimators,
    criterion="absolute_error",
    random_state=42,
)
predictions, feature_importance, mse, r2 = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# quantile table for this scenario
caption = 'Cailloux'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-cailloux-sa', q_sit_2_freq_3, display=False)

# histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Cailloux'
plot_histogram(
    predictions,
    test_x.pcs_m.values,
    title=title,
    reference='cailloux-sa',
    display=False,
    order="observed",
)

In [12]:
# Scenario: records where fréquentation == 3
test_xi = test_xt.loc[test_xt['fréquentation'] == 3].copy()
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()
test_x = test_x.loc[:, ['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(
    n_estimators=estimators,
    criterion="absolute_error",
    random_state=42,
)
predictions, feature_importance, mse, r2 = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# quantile table for this scenario
caption = 'Fréquentation élevée'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-freq3-sa', q_sit_2_freq_3, display=False)

# histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Fréquentation élevée'
plot_histogram(
    predictions,
    test_x.pcs_m.values,
    title=title,
    reference='freq3-sa',
    display=False,
    order="observed",
)

In [13]:
# Scenario: records where fréquentation == 2
test_xi = test_xt[(test_xt['fréquentation'] == 2)].copy()
test_x = test_xi.groupby(cols, as_index=False).pcs_m.sum()

test_x = test_x[['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2 = analyze_scenario(test_x, func,  n_iterations=iterations, bin_width=0.2)

caption = 'Fréquentation moyenne'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-freq2-sa', q_sit_2_freq_3, display=False)

# the histogram for this scenario:
# the title previously read "Sables fins", copied from another scenario cell;
# it now matches the filter and the caption of this cell.
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Fréquentation moyenne'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='freq2-sa', display=False, order="observed")

# Micros particules 

__Format__ 

Le format suit celui de l'annexe pour les [microplastiques](micro_atts). Nous incluons également l'analyse utilisant des variables combinées selon la méthode décrite dans la section [Résultats précédents](previous_results).  

## Résumé des résultats


### Autres campagnes d'étude



### Différences entre les types de plage


## Situation

::::{grid}

:::{grid-item}

Résumé de [microplastiques](micro_atts) : tableau A1-4 et figure 3

avec le détail du tableau A1-5

:::

:::{grid-item}
Dates d'échantillonnage
:::
::::

(random_forest_sa_md)=
### Random Forest 

Source : [scikit-learn random forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

criterion : `absolute_error`

La régression avec forêt aléatoire est une technique d'apprentissage automatique (machine learning) utilisée pour prédire des résultats continus (par opposition aux catégories dans la classification). C'est une méthode d'apprentissage ensembliste, ce qui signifie qu'elle combine les prédictions de plusieurs algorithmes d'apprentissage automatique pour produire des prédictions plus précises.

::::{tab-set}

:::{tab-item} Toutes les conditions
{glue}`tous-md-sa`
:::

:::{tab-item} Fréquentation moyenne
{glue}`freq2-sa`
:::

:::{tab-item} Haute fréquentation
{glue}`freq3-sa`
:::

:::{tab-item} Cailloux
{glue}`cailloux-sa`
:::

:::{tab-item} Sables fins
{glue}`sablesfins-sa`
:::

:::{tab-item} Plage sèche et fibres
{glue}`plagesechefibres-sa`

:::

:::{tab-item} Plage sèche
{glue}`plageseche-sa`

:::

:::{tab-item} Ligne d'eau
{glue}`lignedeau-sa`

:::


:::{tab-item} Résultats
:selected:

````{grid} 1 2 2 2

```{grid-item}
{glue}`q-tous-md-sa`
```

```{grid-item}

Les modèles ont fait l'objet d'un bootstrap, 1000 itérations pour chaque scénario. Les résultats estimés sont la collection de toutes les prédictions de chaque itération.

Par exemple, le tableau intitulé "Cailloux" présente les résultats observés et prévus pour les plages ayant un substrat de 4.

```

```{grid-item}
{glue}`q-freq2-sa`
```

```{grid-item}
{glue}`q-freq3-sa`
```

```{grid-item}
{glue}`q-cailloux-sa`
```

```{grid-item}
{glue}`q-sablesfins-sa`
```

```{grid-item}
{glue}`q-plagesechefibres-sa`
```

```{grid-item}
{glue}`q-plageseche-sa`
```

```{grid-item}
{glue}`q-lignedeau-sa`
```

````
:::

::::
    


## Substrat

Le substrat définit la surface de l'emplacement d'échantillonnage.

In [15]:
%watermark --iversions -b -r

Git repo: https://github.com/hammerdirt-analyst/plastock.git

Git branch: dec20

seaborn   : 0.12.2
numpy     : 1.24.2
pandas    : 2.0.0
matplotlib: 3.7.1

