In [1]:
%load_ext watermark
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from myst_nb import glue
from slugify import slugify

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import resample

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import KFold

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from plastockconf import name_zones, name_frequentation, name_situation
from plastockconf import name_substrate, name_distance, table_css_styles, table_css_styles_top

from plastock import add_table_to_page, capitalize_x_tick_labels, capitalize_x_and_y_axis_labels, capitalize_legend_components, attribute_summary

import reportclass as rc
import setvariables as conf_


import matplotlib as mpl

def translate_describe(x, value_column):
    """Convert the result of ``Series.describe()`` into a one-column table with French labels.

    The ``count`` entry is discarded. ``mean`` and ``std`` are relabelled
    ``moyenne`` and ``écart-type`` and end up as the last two rows.

    :param x: Series of summary statistics; must contain the labels
        ``count``, ``mean`` and ``std``.
    :param value_column: Name given to the single column of values.
    :return: DataFrame indexed by statistic label, with an unnamed index.
    """
    stats = x.to_dict()
    del stats["count"]

    # popping and re-inserting under the French label moves the entry to the end
    for english, french in (("mean", "moyenne"), ("std", "écart-type")):
        stats[french] = stats.pop(english)

    return pd.DataFrame({value_column: list(stats.values())}, index=list(stats.keys()))


# number formatting options (2 decimals, ' as thousands separator, comma as decimal mark)
format_kwargs = dict(precision=2, thousands="'", decimal=",")
def make_exportable(data, file_name, cmap='YlOrBr'):
    """Save a table as an annotated heatmap image.

    Missing values are drawn as 0. The colour scale is fixed to the range
    0-1 and no colour bar is drawn. The figure is closed after saving, so
    nothing is displayed in the notebook.

    :param data: DataFrame of numeric values to draw. It is not modified;
        missing values are filled on a copy.
    :param file_name: Path of the image file to write.
    :param cmap: Name of the matplotlib colour map. Default is 'YlOrBr'.
    :return: None
    """
    # fill on a copy so the caller's table keeps its missing values
    filled = data.fillna(0)
    fig, ax = plt.subplots(figsize=(12,8))
    sns.heatmap(data=filled, vmin=0, vmax=1, cmap=cmap, annot=True, fmt='.2', annot_kws={'size':10}, ax=ax, cbar=False)
    fig.tight_layout()
    ax.tick_params(which='both', axis='both', bottom=False, left=False)
    fig.savefig(file_name, dpi=300)

    # close this figure explicitly rather than whichever figure is current
    plt.close(fig)

# store a single-space string under the glue key 'blank_caption' (not displayed in this cell)
glue('blank_caption', " ", display=False)

# Macros déchets m² 

## Résumé des résultats



### Différences entre les types de plage


### Exigences particulières données plastock

La quantité de déchets sauvages par mètre de plage correspond au nombre total d'objets identifiés divisé par la longueur du rivage. Pour les données Plastock, cela signifie que nous devons considérer la position un (ligne d'eau) et la position deux (plage sèche) ensemble. De plus, il y a __douze échantillons où le substrat était différent entre la position un et la position deux__. Par conséquent, ces 12 échantillons ont tous été classés dans la catégorie Sables grossiers.

#### La distribution de la variable de substrat après la réattribution des 12 échantillons en question.

1. Sables fins : 27%
2. Sables Grossiers : 32%
3. Graviers : 16%
4. Cailloux : 25%

La variable "position" n'est pas prise en compte. Cela signifie que, pour chaque échantillon, le nombre de déchets par mètre carré est calculé en divisant la quantité totale d'objets par la somme de la surface de la position 1 et de la surface de la position 2 [Macro déchets plage et attribut](macro-attributes).

In [2]:
# raw survey records, beach metadata and the object-code definitions
new_data = pd.read_csv("data/end_pipe/macro_current.csv")
beach_data = pd.read_csv("data/pstock_beaches_current.csv")
codes = pd.read_csv('data/end_pipe/codes.csv').set_index('code')

# source column names -> working column names
new_column_names = {
    "Position":"position",
    "Substrat":"substrat",
    "Date":"date",
    "Code":"code",
    "Quantité":"quantité",
    "Aire":"area"
}

length_key = beach_data[["Plage","length"]].drop_duplicates("Plage").set_index("Plage")
work_data = new_data[["Plage", *new_column_names.keys()]].copy()
work_data.rename(columns=new_column_names, inplace=True)
work_data["slug"] = work_data.Plage.apply(lambda x: slugify(x))

# the sample id is the string form of (slug, date as written in the file);
# it is built before the date column is parsed to datetime
work_data['échantillon'] = [str(pair) for pair in zip(work_data.slug, work_data['date'])]
work_data['date'] = pd.to_datetime(work_data["date"], format="mixed", dayfirst=True)
work_data.dropna(inplace=True)
work_data[["position", "substrat"]] = work_data[["position", "substrat"]].astype("int")

# total surveyed area of each sample = sum of the areas of its positions.
# 'position' is part of the de-duplication key: without it, two positions of
# the same sample that happen to have the same area would be counted once.
total_area_dup = work_data.drop_duplicates(['échantillon', 'position', 'area'])
total_area = total_area_dup.groupby('échantillon').area.sum()
work_data['area_c'] = work_data['échantillon'].map(total_area)

# the position is dropped here: quantities are summed per sample and code
work_data = work_data.groupby(['échantillon', 'Plage', 'substrat', 'date', 'area_c','slug', 'code'], as_index=False).agg({'quantité':'sum'})

work_data['pcs/m²'] = work_data['quantité']/work_data.area_c

# total pcs/m² per sample, summarised by substrate. At this stage a sample
# with two different substrates still appears once under each substrate.
test = work_data.groupby(['échantillon', 'substrat'], as_index=False)['pcs/m²'].sum()
dtest = test.groupby('substrat')['pcs/m²'].describe()
dtest.index.name = None

caption = "Les valeurs doivent correspondre au table A4-1 dans l'annexe 'Macro déchets plage et attribut'"

dtest.style.set_table_styles(table_css_styles).format(precision=2).set_caption(caption)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,27.0,1.04,0.88,0.09,0.46,0.7,1.32,3.64
2,31.0,0.45,0.42,0.08,0.18,0.38,0.52,1.93
3,16.0,0.32,0.36,0.03,0.1,0.2,0.3,1.39
4,36.0,0.51,0.56,0.04,0.1,0.21,0.72,1.96


In [3]:
# Step 1: find the samples that carry more than one substrate value.
# Summing per (sample, substrate) gives one row per pair, so a sample id
# that shows up more than once has more than one substrate.
voi = 'substrat'
vals = "pcs/m²"
some_data = work_data.copy()
groupby = ['échantillon', voi]
data = some_data.groupby(groupby, as_index=False)[vals].sum()
# these are the duplicate values that need to be changed
dd = data[data['échantillon'].duplicated()].copy()

duplicated = work_data[work_data['échantillon'].isin(dd['échantillon'].unique())].copy()
# every other sample is left as it is
not_duplicated = work_data[~(work_data['échantillon'].isin(dd['échantillon'].unique()))].copy()
# all samples with mixed substrates get substrate code 2
# (the notebook text says these samples are classed as 'Sables grossiers')
duplicated['substrat'] = 2 

# put it back together again and re-aggregate, so that each sample has
# one row per code and pcs/m² is recomputed from the summed quantity
work_data = pd.concat([duplicated, not_duplicated])
work_data = work_data.groupby(['échantillon', 'Plage', 'substrat', 'date', 'area_c', 'slug', 'code'], as_index=False).agg({'quantité':'sum'})
work_data['pcs/m²'] = work_data['quantité']/work_data['area_c']

# Step 2: account for objects not found at a sample.
# the codes that were identified in at least one sample
codes_ip = work_data.code.unique()
# the unique samples
loc_dates = work_data['échantillon'].unique()

# a copy for iterating
wd = work_data.copy()

# For each sample (échantillon), list the codes that were identified
# somewhere in the data but not at that sample. For each of those codes
# add a row with the sample attributes and the code, with a quantity of
# zero and a pcs/m² of zero.
rows = []
for a_loc in loc_dates:
    r = wd.loc[wd['échantillon'] == a_loc].copy()
    r.reset_index(inplace=True, drop=True)
    
    # the sample attributes are read from the first row of the sample
    t = r.loc[0][['échantillon', 'Plage', 'substrat', 'date', 'area_c', 'slug']].values
    asamp = [x for x in t]
    used_codes = r.code.unique()
    unused = [x for x in codes_ip if x not in used_codes]
    for element in unused:
        # sample attributes, code, quantité = 0, pcs/m² = 0
        arow = [*asamp, element, 0, 0]
        rows.append(arow)
        

work_x = pd.DataFrame(rows, columns=['échantillon', 'Plage', 'substrat', 'date', 'area_c', 'slug', 'code', 'quantité', 'pcs/m²'])
work_data = pd.concat([work_x, work_data])

# Step 3: attach the beach attributes to every survey row.
# they can be merged on the Plage column and the index
features = ['frequentation', 'situation', 'orientation', 'distance']
beach_data = pd.read_csv("data/end_pipe/asl_beaches.csv").set_index('Plage')
ti = work_data.copy()
# NOTE(review): merge is called without `how`; confirm that every Plage in
# work_data is present in asl_beaches.csv so no survey rows are lost here
env_plastock = ti.merge(beach_data[features], left_on='Plage', right_index=True)

In [4]:
# default aggregation: number of distinct samples and the median pcs/m²
operations =  {'échantillon':'nunique', 'pcs/m²':'median'}

def make_categorical_matrix(data: pd.DataFrame = None, feature_columns: list = None, operations: dict = operations):
    """Aggregate ``data`` by the given feature columns.

    :param data: DataFrame holding the feature columns and the columns
        named in ``operations``.
    :param feature_columns: Columns to group by.
    :param operations: Mapping of column name to aggregation, passed to
        ``agg``. Defaults to the module-level ``operations`` as it was
        when this function was defined.
    :return: DataFrame with one row per combination of feature values;
        the group keys are kept as regular columns.
    """
    grouped = data.groupby(feature_columns, as_index=False)
    return grouped.agg(operations)


def name_the_new_distance(x, less='<= 500 m', more = '> 500 m'):
    """Return the label of a two-level code.

    :param x: Code to label.
    :param less: Label returned when ``x`` equals 1.
    :param more: Label returned for any other value.
    :return: ``less`` or ``more``.
    """
    return less if x == 1 else more

def name_the_new_freq(x, new):
    """Return the label of a combined frequentation code.

    :param x: Frequentation code.
    :param new: Label returned when ``x`` is 2 or less.
    :return: ``new`` for codes up to 2, otherwise the fixed label 'Elévée'.
    """
    return new if x <= 2 else 'Elévée'


# the survey columns and beach attributes used for the combined-feature analysis
t_and_f = env_plastock.loc[:, ['échantillon', 'slug','date','code', 'pcs/m²', 'quantité', 'frequentation', 'situation', 'distance', 'substrat']].copy()

# Creation of composite variables: each feature is reduced to two levels.
# substrat: codes 1 and 2 become 1, codes above 2 become 2. The two groups
#   are labelled 'Sable' and 'Graviers' further down.
# distance: codes 1 and 2 become 1, codes above 2 become 2. The two groups
#   are labelled '<= 500 m' and '> 500 m' further down.
# frequentation: codes 1 and 2 are merged into code 2, higher codes are kept.
t_and_f.loc[t_and_f.substrat <= 2, 'substrat'] = 1
t_and_f.loc[t_and_f.substrat > 2, 'substrat'] = 2
t_and_f.loc[t_and_f.distance <= 2, 'distance'] = 1
t_and_f.loc[t_and_f.distance > 2, 'distance'] = 2
t_and_f.loc[t_and_f.frequentation <= 2, 'frequentation'] = 2

# f_combi keeps the numeric codes of the combined features
f_combi = t_and_f.copy()

# no 'loc_date' column was selected above, so only 'frequentation' is renamed here
f_combi.rename(columns={'frequentation':'fréquentation', 'loc_date': 'échantillon'}, inplace=True)

# NOTE(review): mask is not used anywhere in the visible part of the notebook
mask = (f_combi.code == 'G27')
# f_comb is the same data with the codes replaced by readable labels
f_comb = f_combi.copy()
f_comb['distance'] = f_comb['distance'].apply(lambda x: name_the_new_distance(x))
f_comb['fréquentation'] = f_comb['fréquentation'].apply(lambda x: name_the_new_freq(x, 'faible-moyenne'))
f_comb['situation'] = f_comb['situation'].apply(lambda x: name_situation[x])
f_comb['substrat'] = f_comb['substrat'].apply(lambda x: name_the_new_distance(x, less='Sable', more='Graviers'))


# The work data for the GPT. The data of reference: the features keep
# their original (uncombined) codes.
# We will add the orientation column later.
no_combined = env_plastock.loc[:, ['échantillon', 'slug','date','code', 'pcs/m²', 'frequentation', 'situation', 'distance', 'substrat']].copy()
no_combined.rename(columns={'frequentation':'fréquentation', 'loc_date': 'échantillon'}, inplace=True)
# for the GPT: written before the codes are replaced by labels below
# NOTE(review): the file name contains a space ('with _asl') — confirm this is intended
no_combined.to_csv('plastock_with _asl_landuse.csv', index=False)

# NOTE(review): this rebinds the global only; make_categorical_matrix keeps
# the default it captured when it was defined
operations =  {'échantillon':'nunique', 'pcs_m':'median'}

# replace the numeric codes by the labels from plastockconf
no_combined['distance'] = no_combined['distance'].apply(lambda x: name_distance[x])
no_combined['fréquentation'] = no_combined['fréquentation'].apply(lambda x: name_frequentation[x])
no_combined['situation'] = no_combined['situation'].apply(lambda x: name_situation[x])
no_combined['substrat'] = no_combined['substrat'].apply(lambda x: name_substrate[x])

In [5]:
def create_bins(predictions, bin_width=0.2):
    """
    Build fixed-width histogram bins starting at zero and count the predictions in each.

    :param predictions: List or array of prediction values.
    :param bin_width: Width of each bin. Default is 0.2.
    :return: A tuple (bins, bin_counts).
        bins: The bin edges, from 0 up to the first edge at or beyond the
            largest prediction.
        bin_counts: The number of predictions falling in each bin.
    """
    # the upper limit is pushed one bin past the maximum so the maximum is covered
    upper_limit = max(predictions) + bin_width
    edges = np.arange(0, upper_limit, bin_width)

    counts = np.histogram(predictions, bins=edges)[0]

    return edges, counts

def calculate_bin_probabilities(bin_counts):
    """
    Calculate the share of all predictions that falls in each bin.

    :param bin_counts: The count of predictions in each bin (list or array).
    :return: Array of probabilities, one per bin. When the counts sum to
        zero (or there are no bins) every probability is 0 rather than NaN.
    """
    # accept plain lists as well as arrays
    counts = np.asarray(bin_counts, dtype=float)
    total_predictions = counts.sum()

    # nothing was counted: avoid dividing by zero
    if total_predictions == 0:
        return np.zeros_like(counts)

    return counts / total_predictions


def analyze_scenario(scenario_data, func, n_iterations=100, bin_width=0.2, random_state=None):
    """
    Fit a regressor on bootstrap resamples of one scenario and summarise its predictions.

    The target 'pcs_m' is min-max scaled and the four categorical features
    are one-hot encoded. 20 % of the rows are held out. On every iteration
    the training rows are resampled with replacement, ``func`` is re-fitted
    and the held-out rows are predicted. A 5-fold cross validation over all
    rows is then run with the same estimator.

    :param scenario_data: DataFrame with the columns 'fréquentation',
        'situation', 'distance', 'substrat' and the target 'pcs_m'.
    :param func: Estimator with ``fit``, ``predict`` and
        ``feature_importances_``. The same instance is re-fitted on every
        iteration and is left fitted on the last cross-validation fold.
    :param n_iterations: Number of bootstrap iterations. Default is 100.
    :param bin_width: Width of each bin for the histogram of predictions.
        Default is 0.2.
    :param random_state: Optional integer seed for the bootstrap
        resampling. None (default) draws from numpy's global random state,
        as before.
    :return: A tuple (predictions_flat, feature_importances, cum_mse,
        cum_r2, cv_mse, bins, bin_probabilities).
        predictions_flat: all bootstrap predictions, in the original units.
        feature_importances: importances averaged over the iterations.
        cum_mse: per-iteration mean squared error, in the original units.
        cum_r2: per-iteration R² on the held-out rows.
        cv_mse: per-fold mean squared error, in the original units.
        bins, bin_probabilities: histogram of predictions_flat.
    """
    feature_columns = ['fréquentation', 'situation', 'distance', 'substrat']

    # scale the target to [0, 1]
    y_scaler = MinMaxScaler()
    y_scaled = y_scaler.fit_transform(scenario_data['pcs_m'].values.reshape(-1,1)).flatten()

    # one-hot encode the categorical features
    encoder = OneHotEncoder(sparse_output=False)
    encoded_data = encoder.fit_transform(scenario_data[feature_columns])
    X_encoded = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(feature_columns))

    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_scaled, test_size=0.2, random_state=42)

    # held-out target in the original units, so the MSE compares like with like
    y_test_original = y_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()

    # one generator shared by all iterations, so each draws a different sample
    rng = None if random_state is None else np.random.RandomState(random_state)

    # bootstrap predictions, accumulated importances and per-iteration diagnostics
    bootstrap_predictions = []
    feature_importances_accumulated = np.zeros(X_train.shape[1])
    cum_mse = []
    cum_r2 = []

    for _ in range(n_iterations):
        X_train_sample, y_train_sample = resample(X_train, y_train, random_state=rng)
        rf_model_sample = func
        rf_model_sample.fit(X_train_sample, y_train_sample)

        # predictions are always tested against the original held-out rows
        pred_scaled = rf_model_sample.predict(X_test)
        cum_r2.append(r2_score(y_test, pred_scaled))

        pred = y_scaler.inverse_transform(pred_scaled.reshape(-1, 1)).flatten()
        bootstrap_predictions.append(pred)
        cum_mse.append(mean_squared_error(y_test_original, pred))

        feature_importances_accumulated += rf_model_sample.feature_importances_

    # 5-fold cross validation on all rows, error reported in the original units
    cv_mse = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, val_index in kf.split(X_encoded):
        X_train_kfold, X_val_kfold = X_encoded.iloc[train_index], X_encoded.iloc[val_index]
        y_train_kfold, y_val_kfold = y_scaled[train_index], y_scaled[val_index]

        model_kfold = func
        model_kfold.fit(X_train_kfold, y_train_kfold)
        y_pred_kfold = model_kfold.predict(X_val_kfold)

        y_oscale_t = y_scaler.inverse_transform(y_val_kfold.reshape(-1, 1)).flatten()
        y_oscale_p = y_scaler.inverse_transform(y_pred_kfold.reshape(-1,1)).flatten()

        cv_mse.append(mean_squared_error(y_oscale_t, y_oscale_p))

    # average feature importances
    feature_importances = feature_importances_accumulated / n_iterations

    # flatten the predictions array
    predictions_flat = np.array(bootstrap_predictions).flatten()

    # create bins and calculate bin probabilities
    bins, bin_counts = create_bins(predictions_flat, bin_width)
    bin_probabilities = calculate_bin_probabilities(bin_counts)

    return predictions_flat, feature_importances, cum_mse, cum_r2, cv_mse, bins, bin_probabilities

def plot_histogram(predictions, observed, title="", reference='camp-dist-1', display=False, xlabel='pcs/m²'):
    """Draw the predicted and observed values as two overlaid histograms and glue the figure.

    Both histograms use 20 bins and show the proportion of values per bin.
    The figure is stored with ``glue`` under ``reference`` and then closed.

    :param predictions: Predicted values.
    :param observed: Observed values.
    :param title: Figure title, left aligned.
    :param reference: Glue key under which the figure is stored.
    :param display: Passed to ``glue``; whether the figure is shown in the cell.
    :param xlabel: Label of the x axis. Defaults to 'pcs/m²', the unit of
        the values produced in this notebook.
    :return: None
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(predictions, bins=20, stat="probability", ax=ax, label='prédictions', zorder=0)
    sns.histplot(observed, bins=20, stat="probability", label='observée', zorder=1, ax=ax)
    ax.set_title(title, loc='left')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Densité de Probabilité')
    ax.legend()
    glue(reference, fig, display=display)
    # close this figure explicitly rather than whichever figure is current
    plt.close(fig)

def evalutate_model(r2s, mses, label, model='random-forest'):
    """Summarise the diagnostics of a model in a one-row table.

    :param r2s: R² values; their mean, rounded to 2 decimals, is reported
        in the column 'cross validated error'.
    :param mses: Mean squared errors; their mean, rounded to 2 decimals,
        is reported in the column 'mean² error'.
    :param label: Index label of the row.
    :param model: Model name stored in the column 'model'.
    :return: DataFrame with a single row.
    """
    mean_r2, mean_mse = (np.round(np.mean(values), 2) for values in (r2s, mses))
    summary = {"cross validated error": mean_r2, "mean² error": mean_mse, 'model': model}
    return pd.DataFrame(summary, index=[label])

# quantile levels reported for every scenario and the matching row labels;
# the last label is for the mean
q_uants = [0.01, 0.25, 0.5, 0.75, 0.99]
index = ['1%', '25%', '50%', '75%', '99%', 'Moyenne']
def makeqdf(observed, predicted, index=index, quants=q_uants, caption=""):
    """Build a styled table comparing the quantiles and mean of observed and predicted values.

    :param observed: Observed values.
    :param predicted: Predicted values.
    :param index: Row labels: one per quantile, followed by one for the mean.
    :param quants: Quantile levels to compute.
    :param caption: Caption of the styled table.
    :return: pandas Styler with the columns 'observée' and 'prédiction',
        formatted to 2 decimals.
    """
    table_columns = {}
    for column_label, values in (('observée', observed), ('prédiction', predicted)):
        table_columns[column_label] = [*np.quantile(values, quants), np.mean(values)]

    styled = pd.DataFrame(table_columns, index=index).style.set_table_styles(table_css_styles_top)
    return styled.format(precision=2).set_caption(caption)

# the sample id and the four features used to total pcs/m² per sample
cols = ['échantillon', 'fréquentation','situation', 'distance', 'substrat']

## Les objets trouvés en fonction de leur utilisation

Le type d'utilité est basé sur l'utilisation de l'objet avant qu'il ne soit jeté ou sur la description de l'objet si l'utilisation initiale est indéterminée. Les objets identifiés sont classés dans l'une des 260 catégories prédéfinies. Les catégories sont regroupées en fonction de leur utilisation ou de leur description.

- Eaux usées : objets rejetés par les stations d'épuration, y compris les objets susceptibles d'être jetés dans les toilettes.
- Microplastiques (< 5 mm) : plastiques fragmentés et résines plastiques de préproduction.
- Infrastructure : objets liés à la construction et à l'entretien des bâtiments, des routes et des réseaux d'eau et d'électricité.
- Alimentation et boisson : tous les matériaux liés à la consommation de nourriture et de boissons.
- Agriculture : principalement des feuilles industrielles, par exemple, paillis et bâches de culture, serres, fumigation du sol, films d'emballage de balles. Comprend les plastiques durs pour les clôtures agricoles, les pots de fleurs, etc.
- Tabac : principalement des filtres de cigarettes, y compris tous les matériaux liés au tabagisme.
- Loisirs : objets liés au sport et aux loisirs, par exemple, pêche, chasse, randonnée, etc.
- Emballages non alimentaires et non liés au tabac : matériaux d'emballage non identifiés comme étant liés à la nourriture, aux boissons ou au tabac.
- Fragments de plastique : morceaux de plastique d'origine ou d'utilisation indéterminée.
- Objets personnels : accessoires, articles d'hygiène et vêtements.

Pour des informations détaillées sur la composition des groupes, consultez [IQAASL - DE](https://hammerdirt-analyst.github.io/IQAASL-End-0f-Sampling-2021/code_groups.html) ou [IQAASL - EN](https://www.plagespropres.ch/code_groups.html).
<br>

In [6]:
# default arguments for the report class: the language maps and the
# top-level aggregation label (both are assigned again further down)
language_maps = rc.language_maps()
top_label= ['feature_name', 'lac-leman']

# map each location slug to its city
f = pd.read_csv('data/u_pstk.csv')
city_map = f[["slug", "city"]].drop_duplicates()
# align the spelling of this slug before it is used as the index
city_map.loc[city_map.slug == 'savoniere', 'slug'] = 'savonniere'
city_map.set_index('slug', inplace=True)

# !important: formatting data for use with IQAASL.
# When combined with previous data there are duplicate values: the
# locations in plastock data that have the same name as iqaasl data
# need to be changed.
change_names = ['preverenges', 'tolochenaz', 'versoix', 'vidy', 'cully']

# the columns kept for the report class
plastock_cols = ['loc_date', 'date','slug', 'code', 'quantity', 'city', 'feature_name', 'feature_type','parent_boundary', 'pcs_m']
features = ['frequentation', 'situation', 'orientation', 'distance']

# split the survey data into the rows whose slug must change and the rest
changeus = work_data[work_data.slug.isin(change_names)].copy()
donotchange = work_data[~work_data.slug.isin(change_names)].copy()

# replacement slugs: the original slug with a '-p' suffix
new_slug = {
    'cully': 'cully-p',
    'preverenges': 'preverenges-p',
    'tolochenaz': 'tolochenaz-p',
    'versoix':'versoix-p',
    'vidy': 'vidy-p'}

# they have the same name as locations in iqaasl
changeus['new_slug'] = changeus.slug.apply(lambda x: new_slug[x])
changeus['slug'] = changeus.new_slug
changeus.drop('new_slug', inplace=True, axis=1)

# the plastock data with the converted names
wd_nn = pd.concat([changeus, donotchange])

# plastock did not use the same inventory as iqaasl
# here we select only the codes in the plastock inventory
pcodes = work_data.code.unique()

# identify and remove codes for which there is no definition
# if the code is not defined then it can not be used
t = [x for x in pcodes if x not in codes.index]
wd_ni = wd_nn[~wd_nn.code.isin(t)].copy()

# Author's note: some items are not well divided into the composite
# subgroups. For example people often know what a cap is, but whether it
# comes from a drink bottle or another type is not well considered, so
# the subcategories are combined into more comprehensive groups.
# NOTE(review): the grouping itself is done inside rc.use_gfrags_gfoams_gcaps,
# which is not visible here
ti = rc.use_gfrags_gfoams_gcaps(wd_ni, codes)

# the independent variables are in the asl_beaches file
beach_data = pd.read_csv("data/end_pipe/asl_beaches.csv").set_index('Plage')

# !combining with previous results!
# these are the default arguments for the report class
# the language maps give the code definitions in english, german and french
# the top_label asserts the top level aggregation for the set of data defined by
# start, end dates and feature_name. These arguments are for the plastock data
language_maps = rc.language_maps()
top_label= ['feature_name', 'lac-leman']

# the default language is english in the report column class
# there are column names that need to be changed
new_names = {'échantillon': 'loc_date', 'pcs/m': 'pcs_m'}
ti.rename(columns={**new_names,'quantité': 'quantity'}, inplace=True)

# the report class expects a 'pcs_m' column; here it carries the pcs/m² values
ti['pcs_m'] = ti['pcs/m²']

# adding and renaming columns according to reportclass requirements
# these values can be indexed on the IQAASL data
# NOTE(review): city_map is indexed by the slugs read from u_pstk.csv while
# five slugs were given a '-p' suffix above — confirm the file uses the same slugs
ti['city'] = ti.slug.apply(lambda x: city_map.loc[x])
ti['feature_name'] = 'lac-leman'
ti['feature_type'] = 'l'
ti['parent_boundary'] = 'rhone'

ti_work = ti[plastock_cols].copy()

# this data is formatted to work with the reporting structure of IQAASL
# NOTE(review): the group keys include 'quantity' and 'pcs_m'; confirm this
# is what conf_.unit_agg expects
ti_work = ti_work.groupby(plastock_cols, as_index=False).agg(conf_.unit_agg)
ti_work['project']='Plastock'


# a report that includes just plastock data
boundaries = dict(start_date="2021-12-31", end_date="2023-01-01", feature_name="lac-leman", language="fr")
plastock_report = rc.ReportClass(ti_work.copy(), boundaries=boundaries, language="fr", lang_maps=language_maps, top_label=top_label)
most_common, weight = plastock_report.most_common

In [7]:
# the cities are split into two sets of columns: the first twelve and the rest
w_df = plastock_report.w_df.copy()
cities = w_df.city.unique()
cone = cities[:12]
ctwo = cities[12:]

# label every record with the use group of its code
groups_df = plastock_report.w_df.copy()
group_name_map = codes['groupname']
groups_df['groupname'] = groups_df.code.apply(lambda x: group_name_map.loc[x])

# results by use group for the first set of cities
# (caption corrected: the town is spelled "Hermance", as in the table header)
tg1 = rc.a_cumulative_report(groups_df, feature_name='city', object_column='groupname', table_split=cone)
rc.translated_and_style_for_display(tg1, plastock_report.lang_maps[plastock_report.language], plastock_report.language, gradient=True).set_caption("Les résultats des objets par utilisation pour chaque ville du projet: Amphion à Hermance")

Unnamed: 0,Amphion,Anthy,Aubonne,Bouveret,Clarens,Crans,Cully,Excenevex,Genève,Gland,Grangettes,Hermance,Cumulé
Agriculture,0,0,0,0,0,0,0,0,0,0,0,0,0
Nourriture et boissons,15,0,3,18,5,6,0,1,3,1,39,1,4
Infrastructures,11,0,1,20,10,0,0,1,2,33,49,0,3
Micro-plastiques (< 5mm),8,0,2,32,1,0,1,13,21,4,39,0,2
Emballage non alimentaire,16,0,1,9,1,0,1,0,1,0,6,0,1
Articles personnels,1,1,0,1,1,0,1,0,2,0,2,0,1
Morceaux de plastique,58,9,4,47,20,19,4,31,14,32,113,5,16
Loisirs,4,1,0,3,1,0,1,0,2,0,3,0,1
Tabac,3,2,3,9,18,0,1,2,6,6,7,2,3
Non classé,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# save the use-group table of the first set of cities as a heatmap image
file_name = 'resources/images/group_names_one-sa.jpg'
make_exportable(tg1, file_name)

In [9]:
# results by use group for the second set of cities
# (caption corrected: the table runs from Lugrin to Vidy)
tg2 = rc.a_cumulative_report(groups_df, feature_name='city', object_column='groupname', table_split=ctwo)
rc.translated_and_style_for_display(tg2, plastock_report.lang_maps[plastock_report.language], plastock_report.language, gradient=True).set_caption("Les résultats des objets par utilisation pour chaque ville du projet: Lugrin à Vidy")

Unnamed: 0,Lugrin,Lutry,Meillerie,Préverenges,Rolle,Saint-disdille,Savonière,Tolochenaz,Tougues,Versoix,Vevey,Vidy,Cumulé
Agriculture,1,0,0,0,0,10,0,0,0,0,0,0,0
Nourriture et boissons,11,7,0,4,2,24,6,3,3,5,1,12,4
Infrastructures,19,2,10,4,3,9,1,1,1,2,0,2,3
Micro-plastiques (< 5mm),0,0,7,12,9,2,0,2,2,0,1,5,2
Emballage non alimentaire,2,1,0,0,0,10,2,0,0,1,0,2,1
Articles personnels,3,0,0,0,1,2,1,0,0,0,0,0,1
Morceaux de plastique,15,36,8,27,12,58,11,6,6,19,1,23,16
Loisirs,3,1,0,1,1,3,1,0,0,1,0,2,1
Tabac,3,0,4,5,7,10,3,0,11,0,1,22,3
Non classé,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# save the use-group table of the second set of cities as a heatmap image
file_name = 'resources/images/group_names_two-sa.jpg'
make_exportable(tg2, file_name)

## Les objets les plus courants Plastock

### Définition des _objets les plus courants_

Les _objets les plus courants_ peuvent être sélectionnés de plusieurs manières. On peut également les appeler les _objets d'intérêt_. Dans le cadre de ce rapport, nous nous concentrons sur les objets qui représentent une proportion plus importante des résultats que les autres. Nous avons utilisé deux critères de sélection : i. la quantité, ii. le taux d'échec.

1. Quantité: Si un objet a une quantité totale qui le place dans les dix premiers, il est considéré comme 'courant'.
2. Taux d'échec: Si un objet a été trouvé dans au moins la moitié des échantillons, il est ÉGALEMENT considéré comme 'courant'.

Par conséquent, pour cette étude, les 'objets les plus courants' sont ceux qui se trouvent dans les dix premiers en termes de nombre total de pièces de déchets ET/OU qui ont été trouvés dans au moins 50% des enquêtes. Pour Plastock, les objets les plus courants représentent 89% de la quantité totale, soit 24'156/27'493 [Les plus courants](most_common_p).

In [11]:
w_df = plastock_report.w_df.copy()
cities = w_df.city.unique()

# results of the most common objects for the first set of cities
# (caption corrected: the town is spelled "Hermance", as in the table header)
t = rc.a_cumulative_report(w_df[(w_df.code.isin(most_common.index))], feature_name='city', object_column='code', table_split=cone)

rc.translated_and_style_for_display(t, plastock_report.lang_maps[plastock_report.language], plastock_report.language, gradient=True).set_caption("Les résultats des objets les plus courants de Plastock pour chaque ville du projet: Amphion à Hermance")

Unnamed: 0,Amphion,Anthy,Aubonne,Bouveret,Clarens,Crans,Cully,Excenevex,Genève,Gland,Grangettes,Hermance,Cumulé
Fragments de plastique angulaires <5mm,0,0,0,15,0,0,0,5,7,0,6,0,0
Pellets industriels (gpi),0,0,0,9,0,0,0,0,4,4,5,0,0
Mégots et filtres à cigarettes,3,2,3,9,18,0,1,2,6,6,7,2,3
"Emballages de bonbons, de snacks",9,0,1,9,4,6,0,1,3,0,33,0,3
Bâtonnets de sucette,3,0,0,3,1,0,0,0,0,0,0,0,0
Mousse de plastique pour l'isolation thermique,0,0,0,0,5,0,0,0,0,0,13,0,0
Déchets de construction en plastique,0,0,0,11,0,0,0,0,0,0,5,0,0
Coton-tige,15,1,0,4,4,0,0,0,0,1,11,0,0
"Couvercles en plastique bouteille: g21, g22, g23, g24",14,0,1,9,1,0,1,0,1,0,6,0,1
"Fragments de polystyrène expansé: g81, g82, g83",9,0,0,6,3,0,0,1,1,24,26,0,1


In [12]:
# save the most-common-objects table of the first set of cities as a heatmap image
file_name = 'resources/images/most_common_one-sa.jpg'
make_exportable(t, file_name)

<br>
<br>
Résultats de Lugrin à Vidy :

In [13]:
# results of the most common objects for the second set of cities
# (caption corrected: the table runs from Lugrin to Vidy)
t = rc.a_cumulative_report(w_df[(w_df.code.isin(most_common.index))], feature_name='city', object_column='code', table_split=ctwo)

rc.translated_and_style_for_display(t, plastock_report.lang_maps[plastock_report.language], plastock_report.language, gradient=True).set_caption("Les résultats des objets les plus courants de Plastock pour chaque ville du projet: Lugrin à Vidy")

Unnamed: 0,Lugrin,Lutry,Meillerie,Préverenges,Rolle,Saint-disdille,Savonière,Tolochenaz,Tougues,Versoix,Vevey,Vidy,Cumulé
Fragments de plastique angulaires <5mm,0,0,2,0,6,1,0,0,0,0,0,3,0
Pellets industriels (gpi),0,0,0,9,0,0,0,0,0,0,0,2,0
Mégots et filtres à cigarettes,3,0,4,5,7,10,3,0,11,0,1,22,3
"Emballages de bonbons, de snacks",10,1,0,4,2,19,4,3,2,4,1,9,3
Bâtonnets de sucette,1,5,0,0,0,2,1,0,0,0,0,1,0
Mousse de plastique pour l'isolation thermique,0,0,1,0,0,2,0,0,0,1,0,1,0
Déchets de construction en plastique,0,0,0,0,0,0,0,0,0,0,0,0,0
Coton-tige,4,0,1,1,0,7,2,0,0,3,0,4,0
"Couvercles en plastique bouteille: g21, g22, g23, g24",2,1,0,0,0,9,2,0,0,1,0,2,1
"Fragments de polystyrène expansé: g81, g82, g83",14,2,9,4,2,4,0,0,1,0,0,0,1


In [14]:
# save the most-common-objects table of the second set of cities as a heatmap image
file_name = 'resources/images/most_common_two-sa.jpg'
make_exportable(t, file_name)

In [15]:
# Scenario: situation code 2 and fréquentation code 3
# (labelled urban / high frequentation in the caption and title below)
test_xi = f_combi[(f_combi['situation'] == 2) & (f_combi['fréquentation'] == 3)].copy()
test_xi.rename(columns={'pcs/m²':'pcs_m'}, inplace=True)
# total pcs/m² per sample: one row per sample and feature combination
test_x = test_xi.groupby(cols, as_index=False).pcs_m.sum()
# keep only the four features and the target
test_x = test_x[['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

# model parameters; the following scenario cells reuse these two globals
estimators = 10
iterations = 500

func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2, cv_mse, bins, bin_probs = analyze_scenario(test_x, func,  n_iterations=iterations, bin_width=0.2)

# the quantiles for this scenario: observed against predicted
caption = 'Urban, Fréquentation Elévée'
q_sit_2_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-hf-ville-sa', q_sit_2_freq_3, display=False)

# the histogram for this scenario:
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Situation Ville, Haute Fréquentation'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='ville-hf-sa', display=False)

In [16]:
# Scenario: situation code 1 and fréquentation code 3
# (labelled countryside / high frequentation in the caption and title below)
test_xi = f_combi[(f_combi['situation'] == 1) & (f_combi['fréquentation'] == 3)].copy()
test_xi.rename(columns={'pcs/m²':'pcs_m'}, inplace=True)
# total pcs/m² per sample, then keep only the four features and the target
test_x = test_xi.groupby(cols, as_index=False).pcs_m.sum()
test_x = test_x[['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

# estimators and iterations are the globals set in the first scenario cell
func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2, cv_mse, bins, bin_probs = analyze_scenario(test_x, func,  n_iterations=iterations, bin_width=0.2)

# the quantiles for this scenario
# NOTE(review): this caption spells "Eléveé" while the urban scenario uses "Elévée"
caption="Campagne, Fréquentation Eléveé"
q_sit_1_freq_3 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-hf-camp-sa', q_sit_1_freq_3, display=False)

# the histogram for this scenario:
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Situation Campagne, Haute Fréquentation\n'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='camp-hf-sa', display=False)

In [17]:
# Scenario: situation code 1 and combined distance code 1
# (labelled countryside / <= 500 m from parking in the caption below)
test_xi = f_combi[(f_combi['situation'] == 1) & (f_combi['distance'] == 1)].copy()
test_xi.rename(columns={'pcs/m²':'pcs_m'}, inplace=True)
# total pcs/m² per sample, then keep only the four features and the target
test_x = test_xi.groupby(cols, as_index=False).pcs_m.sum()
test_x = test_x[['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]


# estimators and iterations are the globals set in the first scenario cell
func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2, cv_mse, bins, bin_probs = analyze_scenario(test_x, func,  n_iterations=iterations, bin_width=0.2)

# the quantiles for this scenario
caption = 'Campagne, <= 500 m du parking'
q_sit_1_d_1 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-camp-dist_1-sa', q_sit_1_d_1, display=False)

# the histogram for this scenario:
# NOTE(review): the title says "< 500 m" while the caption says "<= 500 m"
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Situation Campagne, distance < 500 m'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='camp-dist-1-sa', display=False)

In [18]:
# Scenario: situation code 2 and combined distance code 1
# (labelled urban / <= 500 m from parking in the caption below)
test_xi = f_combi[(f_combi['situation'] == 2) & (f_combi['distance'] == 1)].copy()
test_xi.rename(columns={'pcs/m²':'pcs_m'}, inplace=True)
# total pcs/m² per sample, then keep only the four features and the target
test_x = test_xi.groupby(cols, as_index=False).pcs_m.sum()
test_x = test_x[['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

# estimators and iterations are the globals set in the first scenario cell
func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2, cv_mse, bins, bin_probs = analyze_scenario(test_x, func,  n_iterations=iterations, bin_width=0.2)

# the quantiles for this scenario
caption = 'Urban, <= 500 m du parking'
q_sit_2_d_1 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-ville-dist_1-sa', q_sit_2_d_1, display=False)

# the histogram for this scenario:
# NOTE(review): the title says "< 500 m" while the caption says "<= 500 m"
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Situation Ville, distance < 500 m'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='ville-dist-1-sa', display=False)

In [19]:
# Scenario: substrat == 1 (labelled "Sables" in the caption below).
scenario_mask = f_combi['substrat'] == 1
test_xi = f_combi[scenario_mask].rename(columns={'pcs/m²': 'pcs_m'})

# Sum pcs_m within each group of `cols`, then keep the four attributes and the target.
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()
test_x = test_x[['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2, cv_mse, bins, bin_probs = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# the quantiles for this scenario
caption = 'Sables'
q_sub_1 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q_subs_1-sa', q_sub_1, display=False)

# the histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Sables'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='subs_1_hist-sa', display=False)

In [20]:
# Scenario: substrat == 2 (labelled "Graviers" in the caption below).
scenario_mask = f_combi['substrat'] == 2
test_xi = f_combi[scenario_mask].rename(columns={'pcs/m²': 'pcs_m'})

# Sum pcs_m within each group of `cols`, then keep the four attributes and the target.
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()
test_x = test_x[['fréquentation', 'situation', 'distance', 'substrat', 'pcs_m']]

func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2, cv_mse, bins, bin_probs = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# the quantiles for this scenario
caption = 'Graviers'
q_sub_2 = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q_subs_2-sa', q_sub_2, display=False)

# the histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions - Graviers'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='subs_2_hist-sa', display=False)

In [21]:
# Scenario: every record, no filter is applied.
# The data is only aggregated: pcs_m is summed within each group of `cols`.
# NOTE(review): unlike the other scenario cells, no column subset is taken
# after the aggregation, so analyze_scenario receives every column of `cols`
# plus pcs_m. Confirm this is intended.
test_xi = f_combi.rename(columns={'pcs/m²': 'pcs_m'})
test_x = test_xi.groupby(cols, as_index=False)['pcs_m'].sum()

func = RandomForestRegressor(n_estimators=estimators, criterion="absolute_error", random_state=42)
predictions, feature_importance, mse, r2, cv_mse, bins, bin_probs = analyze_scenario(
    test_x, func, n_iterations=iterations, bin_width=0.2
)

# the quantiles for this scenario
caption = 'Toutes les conditions'
q_tous = makeqdf(test_x.pcs_m.values, predictions, caption=caption)
glue('q-tous-sa', q_tous, display=False)

# the histogram for this scenario
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions'
plot_histogram(predictions, test_x.pcs_m.values, title=title, reference='tous-sa', display=False)

## Estimation des paramètres et prédictions


(random_forest_sa)=
### Random Forest 

Source : [scikit-learn random forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

criterion : `absolute_error`

La régression avec forêt aléatoire est une technique d'apprentissage automatique (machine learning) utilisée pour prédire des résultats continus (par opposition aux catégories dans la classification). C'est une méthode d'apprentissage ensembliste, ce qui signifie qu'elle combine les prédictions de plusieurs algorithmes d'apprentissage automatique pour produire des prédictions plus précises.

::::{tab-set}

:::{tab-item} Toutes les conditions
{glue}`tous-sa`
:::

:::{tab-item} Graviers
{glue}`subs_2_hist-sa`
:::

:::{tab-item} Sables
{glue}`subs_1_hist-sa`
:::

:::{tab-item} Ville et haute fréquentation
{glue}`ville-hf-sa`
:::

:::{tab-item} Campagne et haute fréquentation
{glue}`camp-hf-sa`
:::

:::{tab-item} Campagne et parking <= 500 m
{glue}`camp-dist-1-sa`

:::

:::{tab-item} Ville et parking <= 500 m
{glue}`ville-dist-1-sa`

:::



:::{tab-item} Résultats
:selected:

````{grid} 1 2 2 2

```{grid-item}
{glue}`q-tous-sa`
```

```{grid-item}

Les modèles ont fait l'objet d'un bootstrap, 100 itérations pour chaque scénario. Les résultats estimés sont la collection de toutes les prédictions de chaque itération.

Par exemple, le tableau intitulé "Graviers" présente les résultats observés et prévus pour les plages ayant un substrat de 3 ou 4.

```

```{grid-item}
{glue}`q_subs_2-sa`
```

```{grid-item}
{glue}`q_subs_1-sa`
```

```{grid-item}
{glue}`q-hf-ville-sa`
```

```{grid-item}
{glue}`q-hf-camp-sa`
```

```{grid-item}
{glue}`q-camp-dist_1-sa`
```

```{grid-item}
{glue}`q-ville-dist_1-sa`
```

````
:::

::::
    

In [22]:
from typing import Type, Optional, Callable
from typing import List, Dict, Union, Tuple

def sum_a_b(zipped):
    """Yield one array of summed Beta parameters per (likelihood row, prior row) pair.

    Each item of ``zipped`` is indexed as ``item[0]`` (the likelihood entries)
    and ``item[1]`` (the prior entries). Every likelihood entry is read as
    ``[successes, tries]`` and turned into ``[successes, tries - successes]``
    before the prior entries are added element-wise.
    """
    for pair in zipped:
        observed, prior = pair[0], pair[1]
        # total success, (total tries - total success)
        observed_params = np.array(
            [np.array([entry[0], entry[1] - entry[0]]) for entry in observed]
        )
        prior_params = np.array(list(prior))
        yield observed_params + prior_params

# Grid approximation

# The grid of candidate values: 600 evenly spaced points from 0 to 5.99 (a step of 0.01).
grid_val_index = np.linspace(0, 5.99, 600)
# Default grouping columns of GridApproximation (class attribute and __init__ default).
groupby_columns = ['sample_id', 'location', 'date', 'city', 'orchards', 'vineyards', 'buildings', 'forest',
                   'undefined', 'public_services', 'streets']
def draw_a_beta_value(generator):
    """Yield one list of Beta draws for the next item of ``generator``.

    The next item is taken from ``generator`` and each of its entries is used
    as the ``(a, b)`` parameters of a Beta distribution, from which a single
    value is drawn with ``rvs(1)`` (an array holding one number). Per the
    original notes, the drawn value is the chance p that a binomial
    distribution will result in True.

    This generator yields exactly once.
    """
    alpha_beta_pairs = next(generator)
    draws = []
    for pair in alpha_beta_pairs:
        draws.append(beta(pair[0], pair[1]).rvs(1))
    yield draws


def binomial_probability_of_failure(generator):
    """Yield the first element of every entry in the next item of ``generator``.

    Per the original notes: in this case failure means exceeding the value,
    for trash a success is never exceeding the value.

    This generator yields exactly once.
    """
    drawn = next(generator)
    first_elements = []
    for entry in drawn:
        first_elements.append(entry[0])
    yield first_elements

def bin_land_use_values(*, data: pd.DataFrame, column: str, num_bins: int = 4) -> pd.DataFrame:
    """
    Bins the specified column's values into equal-width bins and stores the bin label in a new column.

    The new column is named ``f'{column}_bin'`` and is labelled ``1 .. num_bins``.
    The labels are generated from ``num_bins``: a fixed list of four labels
    makes ``pd.cut`` raise for any other number of bins.

    Args:
        data (pd.DataFrame): The DataFrame to modify. It is changed in place.
        column (str): The name of the column to bin.
        num_bins (int, optional): The number of bins to use. Defaults to 4.

    Returns:
        pd.DataFrame: The same DataFrame object, with the additional column of bin labels.
    """
    bin_labels = list(range(1, num_bins + 1))
    data[f'{column}_bin'] = pd.cut(data[column], bins=num_bins, labels=bin_labels, include_lowest=True)
    return data


def calculate_likelihood(*, aggregated_data: pd.DataFrame, bin_density_column: str, pcs_column: str = 'pcs/m',
                         grid_range: np.ndarray = None, bins: list = None) -> pd.DataFrame:
    """
    Counts, for each grid value and each bin, how many samples exceed the grid value.

    Every cell of the result holds a two-element array
    ``[number of samples in the bin above the grid value, number of samples in the bin]``.

    Args:
        aggregated_data (pd.DataFrame): The aggregated data to be used for likelihood calculation.
        bin_density_column (str): The column that holds the bin label of each sample.
        pcs_column (str, optional): The column of observed values. Defaults to 'pcs/m'.
        grid_range (np.ndarray, optional): The grid values. Defaults to the module-level
            ``grid_val_index``, the same default as ``calculate_beta_prior``.
        bins (list, optional): The bin labels to evaluate. Defaults to the sorted distinct
            non-null values found in ``bin_density_column``.

    Returns:
        pd.DataFrame: Indexed by the grid values, with one ``Likelihood_<bin>`` column per bin.
    """
    # Both defaults are resolved here; iterating over None raises a TypeError.
    if grid_range is None:
        grid_range = grid_val_index
    if bins is None:
        bins = sorted(aggregated_data[bin_density_column].dropna().unique())

    likelihood_df = pd.DataFrame(index=grid_range)

    for bin_value in bins:
        bin_data = aggregated_data[aggregated_data[bin_density_column] == bin_value]
        if bin_data.empty:
            # NOTE(review): a bin without samples is recorded as one exceedance out of
            # one try at every grid value - confirm this placeholder is the intended one.
            likelihoods = [np.array([1, 1]) for _ in grid_range]
        else:
            # hoisted out of the comprehension: neither depends on the grid value
            observed = bin_data[pcs_column]
            n_samples = len(bin_data)
            likelihoods = [np.array([(observed > grid_point).sum(), n_samples]) for grid_point in grid_range]
        likelihood_df[f'Likelihood_{bin_value}'] = likelihoods
    return likelihood_df

def calculate_beta_prior(*, grid_range: np.ndarray = grid_val_index,
                         bin_density_numbers: List[int] = list(range(1, 21))) -> pd.DataFrame:
    """
    Builds a table that holds the Beta(1, 1) parameters for every grid value and every bin.

    Args:
        grid_range (np.ndarray, optional): The grid values, used as the index of the result.
            Defaults to the module-level ``grid_val_index``.
        bin_density_numbers (List[int], optional): The bin numbers, one ``Bin_<number>``
            column is created for each. Defaults to 1 through 20.

    Returns:
        pd.DataFrame: Every cell holds the array ``[1, 1]``.
    """
    # Beta(1, 1) is uniform, so the same parameter array is placed in every cell
    uniform_params = np.array([1, 1])
    n_grid_points = len(grid_range)

    prior_df = pd.DataFrame(index=grid_range)
    for bin_number in bin_density_numbers:
        prior_df[f'Bin_{bin_number}'] = [uniform_params] * n_grid_points
    return prior_df

class GridApproximation:
    """Grid approximation of the chance of exceeding each grid value, per land-use bin.

    The samples are summed per group, the land-use column is cut into ``n_bins``
    bins, and for every grid value of ``grid_val_index`` the exceedance counts of
    each bin are combined with a Beta(1, 1) prior. One value is then drawn from
    the resulting Beta distribution for every grid value and bin.

    Every property recomputes its result from ``self.data`` on each access.
    """
    # Not read or written by any method of this class; left in place for
    # code that may reference the attribute.
    posterior = []
    groupby_columns = groupby_columns

    def __init__(self, data: pd.DataFrame, these_codes: list[str] = None, value_column: str = 'pcs/m',
                 land_use_column: str = 'buildings', n_bins: int = 5, groupby_columns: list = groupby_columns):
        """
        Args:
            data: The records; a ``code`` column is required when ``these_codes`` is given.
            these_codes: The codes to keep. ``None`` keeps every record.
            value_column: The column that is summed per group.
            land_use_column: The column that is cut into bins.
            n_bins: The number of bins, labelled ``1 .. n_bins``.
            groupby_columns: The columns that define one sample.
        """
        self.data = data
        self.these_codes = these_codes
        self.value_column = value_column
        self.land_use_column = land_use_column
        self.n_bins = n_bins
        self.groupby_columns = groupby_columns

    @property
    def sample_totals(self):
        """The sum of ``value_column`` per group of ``groupby_columns`` for the selected codes."""
        selected = self.data
        # isin(None) raises a TypeError, so no selection is made when no codes are given
        if self.these_codes is not None:
            selected = selected[selected.code.isin(self.these_codes)]
        return selected.groupby(self.groupby_columns, as_index=False)[self.value_column].sum()

    @property
    def binned_samples(self):
        """The sample totals with the additional ``<land_use_column>_bin`` column."""
        return bin_land_use_values(data=self.sample_totals, column=self.land_use_column, num_bins=self.n_bins)

    @property
    def prior_grid(self, afunc: Callable = calculate_beta_prior):
        """The prior for bins ``1 .. n_bins``.

        A property getter is only ever called with ``self``, so ``afunc`` always
        takes its default value.
        """
        return afunc(bin_density_numbers=list(range(1, self.n_bins + 1)))

    @property
    def likelihood_grid(self):
        """The exceedance counts for bins ``1 .. n_bins`` on the grid ``grid_val_index``.

        The grid and the bin labels are passed explicitly: calculate_likelihood
        declares both with a default of None, and the rows and columns must line
        up with those of ``prior_grid`` when the two tables are combined.
        """
        return calculate_likelihood(aggregated_data=self.binned_samples,
                                    bin_density_column=f'{self.land_use_column}_bin',
                                    pcs_column=self.value_column,
                                    grid_range=grid_val_index,
                                    bins=list(range(1, self.n_bins + 1)))

    @property
    def posterior_grid(self):
        """One Beta draw per grid value and bin, in long format (columns pcs, variable, value)."""
        # each property is evaluated once, every access recomputes it from the data
        prior = self.prior_grid
        likelihood = self.likelihood_grid
        grid_index = prior.index
        bin_columns = list(prior.columns)

        summed_params = sum_a_b(zip(np.array(likelihood.values), np.array(prior.values)))

        # one row of the summed parameters is consumed per grid value
        posteriors = [
            next(binomial_probability_of_failure(draw_a_beta_value(summed_params)))
            for _ in grid_index
        ]

        pts = pd.DataFrame(posteriors, index=grid_index, columns=bin_columns)
        pts['pcs'] = pts.index
        # only the bin columns are melted, 'pcs' is the identifier column
        return pd.melt(pts, id_vars='pcs', value_vars=bin_columns)

In [23]:
from scipy.stats import beta
from scipy.stats import multinomial

def define_posterior(likelihood, prior, grid_val_index: np.array = None):
    """Draw one posterior exceedance probability per grid value and bin.

    For every grid value the likelihood counts and the prior parameters are
    summed (see sum_a_b) and used as the alpha, beta parameters of a Beta
    distribution; one value is drawn from it. Per the original notes this is
    the binomial/bernoulli probability that a sample will exceed the grid value.

    Args:
        likelihood: DataFrame of ``[exceeding, total]`` arrays, one row per grid value.
        prior: DataFrame of ``[alpha, beta]`` arrays with the same shape. Its column
            names become the column names of the result and must include 'Bin_1'.
        grid_val_index: The grid values, one per row of ``likelihood``. Defaults to
            the index of ``likelihood``.

    Returns:
        DataFrame indexed by the grid values with the drawn probabilities, the grid
        value in column 'X' and, in column 'norm', the 'Bin_1' probabilities divided
        by their sum.
    """
    # Iterating over None raises a TypeError; the likelihood index has exactly
    # one entry per row of parameters to consume.
    if grid_val_index is None:
        grid_val_index = likelihood.index

    # the alpha, beta parameters of the likelihood and prior are assembled
    a_b_sum = sum_a_b(zip(likelihood.values, prior.values))

    # one row of the summed parameters is consumed per grid value
    posteriors = [
        next(binomial_probability_of_failure(draw_a_beta_value(a_b_sum)))
        for _ in grid_val_index
    ]

    # posterior probabilities with grid index and column labels
    post_grid_pstock = pd.DataFrame(posteriors, index=grid_val_index, columns=prior.columns)

    # identify the x scale of the grid
    post_grid_pstock['X'] = post_grid_pstock.index

    # the probabilities of 'Bin_1', normalized so that they sum to one
    bin_1 = post_grid_pstock['Bin_1']
    post_grid_pstock['norm'] = bin_1 / bin_1.sum()

    return post_grid_pstock

def non_zero(alist):
    """Return ``(position, value)`` of the first entry that is not equal to 0.

    ``None`` is returned when every entry equals 0 or when there are no entries.
    """
    hits = ((position, value) for position, value in enumerate(alist) if value != 0)
    return next(hits, None)

def draw_sample_from_multinomial(normed, n=100):
    """Draw ``n`` positions with the probabilities given by ``normed``.

    Args:
        normed: The probabilities of a multinomial distribution, read through
            ``normed.values`` (the 'norm' column of the posterior data frame).
        n: The number of draws. Defaults to 100.

    Returns:
        list of int: For every draw, the position in ``normed`` that was selected.
    """
    rv = multinomial(1, normed.values)
    # every draw is a one-hot row: a single trial places a 1 at the selected position
    draws = rv.rvs(n)

    # the position of the 1 in each row, found for all rows at once
    return draws.argmax(axis=1).tolist()


def posterior_predictions(p_g_p):
    """Draw grid values from a posterior table, weighted by its 'norm' column.

    Positions are drawn with draw_sample_from_multinomial using the 'norm'
    column as probabilities. The index of the table is then replaced by
    positions, and the 'X' value found at every drawn position is returned.
    """
    drawn_positions = draw_sample_from_multinomial(p_g_p['norm'])
    by_position = p_g_p.reset_index(drop=True)
    return by_position.loc[drawn_positions, "X"]

# Alternative, informed prior built from the IQAASL surveys (not in use):
# iqaasl_prior = report_iq_pk.w_df[report_iq_pk.w_df.project == "IQAASL"].copy()
# iq_p = iqaasl_prior.groupby(['loc_date', 'project'], as_index=False).pcs_m.sum()
# iq_p['top'] = 1

# iq_prior = calculate_likelihood(aggregated_data=iq_p, bin_density_column='top', pcs_column='pcs_m', grid_range=grid_val_index, bins=[1])
# iq_prior.rename(columns={'Likelihood_1':'Bin_1'}, inplace=True)

# assuming no prior knowledge: a single bin with a Beta(1, 1) prior
beta_prior = calculate_beta_prior(bin_density_numbers=[1])

col = 'top'
pcs_col = 'pcs/m²'
grid_range = grid_val_index
bins = [1]

# Every record is used; the constant column 'top' places all samples in bin 1.
test_x = f_combi.copy().groupby(cols, as_index=False)[pcs_col].sum()
test_x['top'] = 1

grid_pstock = calculate_likelihood(aggregated_data=test_x, bin_density_column=col, pcs_column=pcs_col, grid_range=grid_range, bins=bins)

# posterior uninformed
post_grid_pstock = define_posterior(grid_pstock, beta_prior, grid_val_index=grid_range)

# # posterior informed
# post_grid_iqp =define_posterior(grid_pstock, iq_prior, grid_val_index=grid_range)

# samples
sample_totals = posterior_predictions(post_grid_pstock.copy())
# s_iqp = posterior_predictions(post_grid_iqp.copy())

caption = 'Toutes les conditions'

test_grid_quants = makeqdf(test_x[pcs_col].values, sample_totals, caption=caption)
glue('q-tous-b-sa', test_grid_quants, display=False)

# The samples come from the posterior built with beta_prior, so the title names
# that prior; the IQAASL prior is only present as commented-out code.
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions: toutes les conditions, grid approximation, prior = beta(1,1)'

plot_histogram(sample_totals, test_x[pcs_col].values, title=title, reference='toutes-gapprox-sa', display=False)



In [24]:
# Grid approximation for substrat == 1 (labelled "Sable" in the caption below).
pcs_col = 'pcs/m²'
col, bins = 'substrat', [1]
grid_range = grid_val_index

# the records of the scenario, summed within each group of `cols`
test_xi = f_combi.loc[f_combi['substrat'] == 1].copy()
test_x = test_xi.groupby(cols, as_index=False)[pcs_col].sum()

# the counts of samples exceeding each grid value
grid_pstock = calculate_likelihood(
    aggregated_data=test_x,
    bin_density_column=col,
    pcs_column=pcs_col,
    grid_range=grid_range,
    bins=bins,
)

# posterior uninformed
post_grid_pstock = define_posterior(grid_pstock, beta_prior, grid_val_index=grid_range)

# samples
sample_totals = posterior_predictions(post_grid_pstock.copy())

# the table and the histogram that are glued into the report
caption = 'Sable'
test_grid_quants = makeqdf(test_x[pcs_col].values, sample_totals, caption=caption)
glue('q-sable-b-sa', test_grid_quants, display=False)

title = 'Plastock 2022, Le Léman\nDistribution des Prédictions: Sable, grid approximation'
plot_histogram(sample_totals, test_x[pcs_col].values, title=title, reference='sables-gapprox-sa', display=False)

In [25]:
# Grid approximation for substrat == 2 (labelled "Graviers" in the caption below).
pcs_col = 'pcs/m²'
col, bins = 'substrat', [2]
grid_range = grid_val_index

# the records of the scenario, summed within each group of `cols`
test_xi = f_combi.loc[f_combi['substrat'] == 2].copy()
test_x = test_xi.groupby(cols, as_index=False)[pcs_col].sum()

# the counts of samples exceeding each grid value
grid_pstock = calculate_likelihood(
    aggregated_data=test_x,
    bin_density_column=col,
    pcs_column=pcs_col,
    grid_range=grid_range,
    bins=bins,
)

# posterior uninformed
post_grid_pstock = define_posterior(grid_pstock, beta_prior, grid_val_index=grid_range)

# samples
sample_totals = posterior_predictions(post_grid_pstock.copy())

# the table and the histogram that are glued into the report
caption = 'Graviers'
test_grid_quants = makeqdf(test_x[pcs_col].values, sample_totals, caption=caption)
glue('q-gravier-b-sa', test_grid_quants, display=False)

title = 'Plastock 2022, Le Léman\nDistribution des Prédictions: Graviers, grid approximation'
plot_histogram(sample_totals, test_x[pcs_col].values, title=title, reference='graviers-gapprox-sa', display=False)

In [26]:
# Grid approximation for situation == 2 and fréquentation == 3
# (labelled "Ville et haut fréquentation" in the caption below).
pcs_col = 'pcs/m²'
col, bins = 'fréquentation', [3]
grid_range = grid_val_index

# the records of the scenario, summed within each group of `cols`
scenario_mask = (f_combi['situation'] == 2) & (f_combi['fréquentation'] == 3)
test_xi = f_combi.loc[scenario_mask].copy()
test_x = test_xi.groupby(cols, as_index=False)[pcs_col].sum()

# the counts of samples exceeding each grid value
grid_pstock = calculate_likelihood(
    aggregated_data=test_x,
    bin_density_column=col,
    pcs_column=pcs_col,
    grid_range=grid_range,
    bins=bins,
)

# posterior uninformed
post_grid_pstock = define_posterior(grid_pstock, beta_prior, grid_val_index=grid_range)

# samples
sample_totals = posterior_predictions(post_grid_pstock.copy())

# the table and the histogram that are glued into the report
caption = 'Ville et haut fréquentation'
test_grid_quants = makeqdf(test_x[pcs_col].values, sample_totals, caption=caption)
glue('q-v-hf-b-sa', test_grid_quants, display=False)

title = 'Plastock 2022, Le Léman\nDistribution des Prédictions: Ville et haut fréquentation, grid approximation'
plot_histogram(sample_totals, test_x[pcs_col].values, title=title, reference='v-hf-gapprox-sa', display=False)

In [27]:
# Grid approximation for situation == 1 and fréquentation == 3
# (labelled "Campagne et haut fréquentation" in the caption below).
pcs_col = 'pcs/m²'
col, bins = 'fréquentation', [3]
grid_range = grid_val_index

# the records of the scenario, summed within each group of `cols`
scenario_mask = (f_combi['situation'] == 1) & (f_combi['fréquentation'] == 3)
test_xi = f_combi.loc[scenario_mask].copy()
test_x = test_xi.groupby(cols, as_index=False)[pcs_col].sum()

# the counts of samples exceeding each grid value
grid_pstock = calculate_likelihood(
    aggregated_data=test_x,
    bin_density_column=col,
    pcs_column=pcs_col,
    grid_range=grid_range,
    bins=bins,
)

# posterior uninformed
post_grid_pstock = define_posterior(grid_pstock, beta_prior, grid_val_index=grid_range)

# samples
sample_totals = posterior_predictions(post_grid_pstock.copy())

# the table and the histogram that are glued into the report
caption = 'Campagne et haut fréquentation'
test_grid_quants = makeqdf(test_x[pcs_col].values, sample_totals, caption=caption)
glue('q-cam-hf-b-sa', test_grid_quants, display=False)

title = 'Plastock 2022, Le Léman\nDistribution des Prédictions: Campagne et haut fréquentation, grid approximation'
plot_histogram(sample_totals, test_x[pcs_col].values, title=title, reference='cam-hf-gapprox-sa', display=False)

In [28]:
# Grid approximation for situation == 2 and distance == 1
# (labelled "Ville et distance <= 500 m" in the caption below).
pcs_col = 'pcs/m²'
col, bins = 'situation', [2]
grid_range = grid_val_index

# the records of the scenario, summed within each group of `cols`
scenario_mask = (f_combi['situation'] == 2) & (f_combi['distance'] == 1)
test_xi = f_combi.loc[scenario_mask].copy()
test_x = test_xi.groupby(cols, as_index=False)[pcs_col].sum()

# the counts of samples exceeding each grid value
grid_pstock = calculate_likelihood(
    aggregated_data=test_x,
    bin_density_column=col,
    pcs_column=pcs_col,
    grid_range=grid_range,
    bins=bins,
)

# posterior uninformed
post_grid_pstock = define_posterior(grid_pstock, beta_prior, grid_val_index=grid_range)

# samples
sample_totals = posterior_predictions(post_grid_pstock.copy())

# the table and the histogram that are glued into the report
caption = 'Ville et distance <= 500 m'
test_grid_quants = makeqdf(test_x[pcs_col].values, sample_totals, caption=caption)
glue('q-ville-d1-sa', test_grid_quants, display=False)

title = 'Plastock 2022, Le Léman\nDistribution des Prédictions: Ville et distance <= 500 m, grid approximation, '
plot_histogram(sample_totals, test_x[pcs_col].values, title=title, reference='v-d1-sa', display=False)

In [29]:
# Grid approximation for situation == 1 and distance == 1
# (labelled "Campagne et distance <= 500 m" in the caption below).
pcs_col = 'pcs/m²'
col, bins = 'situation', [1]
grid_range = grid_val_index

# the records of the scenario, summed within each group of `cols`
scenario_mask = (f_combi['situation'] == 1) & (f_combi['distance'] == 1)
test_xi = f_combi.loc[scenario_mask].copy()
test_x = test_xi.groupby(cols, as_index=False)[pcs_col].sum()

# the counts of samples exceeding each grid value
grid_pstock = calculate_likelihood(
    aggregated_data=test_x,
    bin_density_column=col,
    pcs_column=pcs_col,
    grid_range=grid_range,
    bins=bins,
)

# posterior uninformed
post_grid_pstock = define_posterior(grid_pstock, beta_prior, grid_val_index=grid_range)

# samples
sample_totals = posterior_predictions(post_grid_pstock.copy())

# the table and the histogram that are glued into the report
caption = 'Campagne et distance <= 500 m'
test_grid_quants = makeqdf(test_x[pcs_col].values, sample_totals, caption=caption)
glue('q-cam-d1-sa', test_grid_quants, display=False)

# The title names the scenario that is filtered above (distance == 1), the
# same scenario as the caption, and not the high-frequentation scenario.
title = 'Plastock 2022, Le Léman\nDistribution des Prédictions: Campagne et distance <= 500 m, grid approximation'
plot_histogram(sample_totals, test_x[pcs_col].values, title=title, reference='cam-d1-sa', display=False)

(grid_approx_p)=
### Approximation Bayésienne par Grille

Source : [hammerdirt](https://hammerdirt-analyst.github.io/feb_2024/titlepage.html)

application : [solid-waste-team](https://hammerdirt-analyst.github.io/solid-waste-team/grid_approximation.html)

prior : beta(1,1)

Cas d'utilisation : Cette méthode est une approche manuelle de l'inférence Bayésienne. Elle est particulièrement utile lorsque vous souhaitez incorporer des croyances antérieures et mettre à jour ces croyances avec des données observées.

Mise en œuvre : Implique la définition d'une grille de valeurs de paramètres et le calcul de la vraisemblance des données observées à chaque point de cette grille. En multipliant par la probabilité a priori et en normalisant, on obtient la distribution a posteriori. Cela peut être fait pour chaque condition séparément ou pour toutes les conditions ensemble, bien que cela soit plus intensif en termes de calcul.



::::{tab-set}

:::{tab-item} Toutes les conditions
{glue}`toutes-gapprox-sa`
:::

:::{tab-item} Graviers
{glue}`graviers-gapprox-sa`
:::

:::{tab-item} Sables
{glue}`sables-gapprox-sa`

:::

:::{tab-item} Ville et haute fréquentation
{glue}`v-hf-gapprox-sa`

:::

:::{tab-item} Campagne et haute fréquentation
{glue}`cam-hf-gapprox-sa`

:::

:::{tab-item} Campagne et parking <= 500 m
{glue}`cam-d1-sa`

:::

:::{tab-item} Ville et parking <= 500 m
{glue}`v-d1-sa`

:::



:::{tab-item} Résultats
:selected:

````{grid} 1 2 2 2

```{grid-item}
{glue}`q-tous-b-sa`
```

```{grid-item}

Prédictions : Fournit une distribution de valeurs possibles de pcs/m², offrant une idée de la fourchette et de l'incertitude des prédictions. Particulièrement utile lorsque la prise de décision nécessite de comprendre l'incertitude ou la variabilité des prédictions.

```

```{grid-item}
{glue}`q-gravier-b-sa`
```

```{grid-item}
{glue}`q-sable-b-sa`
```

```{grid-item}
{glue}`q-v-hf-b-sa`
```

```{grid-item}
{glue}`q-cam-hf-b-sa`
```

```{grid-item}
{glue}`q-cam-d1-sa`
```

```{grid-item}
{glue}`q-ville-d1-sa`
```

````
:::

::::
    

In [30]:
# def create_bins(predictions, bin_width=0.2):
#     """
#     Create bins from the predictions with a specified width.

#     :param predictions: List or array of prediction values.
#     :param bin_width: Width of each bin. Default is 0.2.
#     :return: A tuple (bins, bin_counts).
#         bins: The edges of the bins.
#         bin_counts: The count of predictions in each bin.
#     """
#     # Determine the range for the bins
#     max_prediction = max(predictions)
#     bins = np.arange(0, max_prediction + bin_width, bin_width)

#     # Count the number of predictions in each bin
#     bin_counts, _ = np.histogram(predictions, bins=bins)

#     return bins, bin_counts

# def calculate_bin_probabilities(bin_counts):
#     """
#     Calculate the probability for each bin.

#     :param bin_counts: The count of predictions in each bin.
#     :return: List of probabilities for each bin.
#     """
#     total_predictions = sum(bin_counts)
#     bin_probabilities = bin_counts / total_predictions
#     return bin_probabilities

# def sample_multinomial(n_samples, bin_probabilities):
#     """
#     Sample from a multinomial distribution using numpy.

#     :param n_samples: Number of samples to draw.
#     :param bin_probabilities: The probabilities of each outcome.
#     :return: Array of counts for each outcome.
#     """
#     # The number of outcomes is the length of bin_probabilities
#     n_outcomes = len(bin_probabilities)

#     # Draw samples
#     samples = np.random.multinomial(n_samples, bin_probabilities, size=1)
#     return samples

# bins, bin_counts = create_bins(predictions_flatxi)
# replace = list(set(bin_counts))[1]
# bin_counts[bin_counts == 0] = replace
# bin_probs = calculate_bin_probabilities(bin_counts)
# pred_samps = sample_multinomial(10000, bin_probs)
# bin_centers = (bins[:-1] + bins[1:]) / 2  # Calculate bin centers
# multinomial_pcs_m = []

# for bin_center, count in zip(bin_centers, pred_samps[0]):
#     multinomial_pcs_m.extend([bin_center] * count)

In [31]:
# t = rc.translate_for_display(plastock_report.inventory, amap=language_maps['fr'], lan='fr')
# t['objet'] = t.code.apply(lambda x: codes.loc[x, 'fr'])
# t = t[[t.columns[0], t.columns[-1], *t.columns[1:-1]]]
# t.set_index(['code', 'objet'], inplace=True)
# t.index.name = None
# t.style.set_table_styles(conf_.table_css_styles).format(**conf_.format_kwargs)

In [32]:
%watermark --iversions -b -r

Git repo: https://github.com/hammerdirt-analyst/plastock.git

Git branch: main

pandas    : 2.0.0
matplotlib: 3.7.1
numpy     : 1.24.2
seaborn   : 0.12.2

