In [1]:
%load_ext watermark
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from myst_nb import glue
from IPython.display import Markdown as md
from slugify import slugify

from plastockmethods import name_the_zones, name_the_particles, frequentation_name, situation_name
from plastockmethods import particle_groups, name_the_substrate, name_the_distance, table_css_styles

# CSS property applied to the summary-table cells that exceed a highlight threshold.
a_property = dict(color='red')

def translate_describe(x, value_column):
    """Turn a ``describe()`` result into a one-column frame with French labels.

    The ``count`` entry is discarded and ``mean`` / ``std`` are relabelled
    ``moyenne`` / ``écart-type``. Because the two entries are re-inserted,
    they end up after the remaining statistics.

    Parameters
    ----------
    x : object exposing ``to_dict()``
        Mapping-like summary holding at least ``count``, ``mean`` and ``std``.
    value_column : str
        Name given to the single column of values.

    Returns
    -------
    pandas.DataFrame
        One row per statistic, indexed by the statistic label (unnamed index).

    Raises
    ------
    KeyError
        If ``count``, ``mean`` or ``std`` is absent from ``x``.
    """
    stats = x.to_dict()
    del stats["count"]
    for english, french in (("mean", "moyenne"), ("std", "écart-type")):
        stats[french] = stats.pop(english)

    return (
        pd.DataFrame(list(stats.items()))
        .set_index(0)
        .rename(columns={1: value_column})
        .rename_axis(None)
    )
def add_labels_display(data, column_name, labels):
    """Swap the codes in ``column_name`` for their labels and return a styled table.

    ``data`` is modified in place: every value of the column is replaced by
    ``labels[value]`` and the column then becomes the (unnamed) index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``column_name``.
    column_name : str
        Column whose values are keys of ``labels``.
    labels : mapping or sequence
        Lookup from column value to display label.

    Returns
    -------
    pandas Styler
        ``data`` formatted with two decimals and ``table_css_styles`` applied.
    """
    def to_label(code):
        return labels[code]

    data[column_name] = data[column_name].apply(to_label)
    data.set_index(column_name, inplace=True)
    data.index.name = None

    styled = data.style.format(precision=2)
    return styled.set_table_styles(table_css_styles)
def normalize_column(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    ``x`` must support ``min()``, ``max()`` and element-wise arithmetic.
    When every value is identical the denominator is zero.
    """
    lowest = x.min()
    span = x.max() - lowest
    return (x - lowest) / span

def attribute_summary(some_data, vals, voi, columns: dict = None, labels: dict = None,
                      mean_threshold: float = 1.45, median_threshold: float = .78):
    """Summarise ``vals`` per sample for each value of the attribute ``voi``.

    The data is first totalled per (``echantillon``, ``voi``), then described
    per ``voi``. Mean and median cells above their threshold receive
    ``a_property`` (red text).

    Parameters
    ----------
    some_data : pandas.DataFrame
        Must contain the columns ``echantillon``, ``voi`` and ``vals``.
    vals : str
        Name of the numeric column to summarise.
    voi : str
        Name of the attribute column ("variable of interest").
    columns : dict, optional
        Renames applied to the ``describe()`` columns (for example
        ``{"mean": "moyenne"}``). ``None`` keeps the pandas names.
    labels : mapping, optional
        Lookup from attribute value to display label. ``None`` keeps the raw
        attribute values.
    mean_threshold : float, default 1.45
        Means strictly above this value are highlighted.
        NOTE(review): presumably the project-wide mean - confirm.
    median_threshold : float, default 0.78
        Medians strictly above this value are highlighted.
        NOTE(review): presumably the project-wide median - confirm.

    Returns
    -------
    pandas Styler
        Summary table indexed by attribute label, two-decimal precision.
    """
    columns = columns or {}

    # One total per sample and attribute value.
    data = some_data.groupby(['echantillon', voi], as_index=False)[vals].sum()
    if labels is not None:
        data[voi] = data[voi].apply(lambda x: labels[x])

    data_summary = data.groupby(voi, as_index=False)[vals].describe()
    # NOTE(review): the integer cast truncates fractional min/max values - confirm intended.
    data_summary[['count', 'max', 'min']] = data_summary[['count', 'max', 'min']].astype('int')
    if columns:
        data_summary = data_summary.rename(columns=columns)
    data_summary = data_summary.set_index(voi, drop=True)
    data_summary.index.name = None

    # The highlighted columns follow whatever display names the caller chose.
    mean_col = columns.get("mean", "mean")
    median_col = columns.get("50%", "50%")

    above_mean = data_summary.loc[data_summary[mean_col] > mean_threshold].index
    above_median = data_summary.loc[data_summary[median_col] > median_threshold].index

    d_sum = data_summary.style.set_table_styles(table_css_styles).format(precision=2)
    d_sum = d_sum.set_properties(subset=pd.IndexSlice[above_mean, [mean_col]], **a_property)

    return d_sum.set_properties(subset=pd.IndexSlice[above_median, [median_col]], **a_property)

def attribute_summary_grid(data, vals, voi, figname, labels: dict=None, stat="probability"):
    """Draw a 2x2 overview of ``vals`` by attribute ``voi`` and glue the figure.

    Panels: scatter against date (top left), histogram with KDE (top right),
    box plot without fliers (bottom left) and ECDF (bottom right). Only the
    histogram keeps its legend.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``date``, ``voi`` and ``vals``.
    vals : str
        Name of the numeric column to plot.
    voi : str
        Name of the attribute column used as hue / category.
    figname : str
        Key under which the figure is glued.
    labels : mapping, optional
        Lookup from attribute value to display label. ``None`` keeps the raw
        attribute values.
    stat : str, default "probability"
        Forwarded to ``sns.histplot``. The histogram's y label is fixed to
        "probabilité" whatever value is passed.

    Returns
    -------
    None
        The figure is glued with ``display=False`` and then closed.
    """
    # NOTE(review): totals are taken per (date, voi), so rows sharing a date and
    # an attribute value are summed into one point even though the x label
    # reads "échantillon" - confirm intended.
    some_data = data.groupby(['date', voi], as_index=False)[vals].sum()
    if labels is not None:
        some_data[voi] = some_data[voi].apply(lambda x: labels[x])

    fig, axs = plt.subplots(2, 2, figsize=(8, 7))
    scatter_ax, hist_ax = axs[0, 0], axs[0, 1]
    box_ax, ecdf_ax = axs[1, 0], axs[1, 1]

    sns.scatterplot(some_data, x="date", y=vals, hue=voi, ax=scatter_ax)
    sns.boxplot(some_data, x=voi, y=vals, hue=voi, showfliers=False, ax=box_ax, dodge=False)
    sns.histplot(some_data, x=vals, hue=voi, ax=hist_ax, stat=stat, kde=True)
    sns.ecdfplot(some_data, x=vals, hue=voi, ax=ecdf_ax)

    scatter_ax.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
    scatter_ax.set_xlabel("échantillon")
    scatter_ax.set_ylabel("pcs/m2")

    box_ax.set_xlabel("")
    box_ax.set_ylabel("pcs/m2")
    box_ax.tick_params(axis="x", rotation=90)

    hist_ax.set_xlabel("")
    hist_ax.set_ylabel("probabilité")

    ecdf_ax.set_xlabel("")

    # get_legend() returns None when no legend was drawn, so guard before removing.
    for ax in (scatter_ax, box_ax, ecdf_ax):
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    fig.subplots_adjust(wspace=.3)
    # Actually call tight_layout (a bare attribute reference does nothing); it
    # recomputes the subplot spacing so the labels fit inside the figure.
    fig.tight_layout()

    glue(figname, fig, display=False)
    # Close this figure explicitly rather than whichever one is current.
    plt.close(fig)
    
format_kwargs = dict(precision=2, thousands="'", decimal=",")
def add_table_to_page(table, table_no, caption, section, page, rule):
    
    caption = f'Table {section}{page}-{table_no}: {caption} {rule}'
    table = table.format_index(str.title, axis=1).format_index(str.title, axis=0).format(**format_kwargs)
    return table.set_caption(caption)
# Glue a single-space string under 'blank_caption'; the {glue:figure}
# directives in this notebook use it as their caption text.
glue('blank_caption', " ", display=False)


# Input tables, read from paths relative to the working directory:
# new_data     - survey records (Plage, Position, Substrat, Date, Code, Quantité, Aire are used below)
# beach_data   - beach attributes (frequentation, situation, distance, orientation) keyed by Plage
# n_beach_data - source of the per-beach "length" value
new_data = pd.read_csv("data/macro_current.csv")
beach_data = pd.read_csv("data/end_pipe/asl_beaches.csv")
n_beach_data = pd.read_csv("data/pstock_beaches_current.csv")

# Macro déchets plage et attribut

## Substrat

In [2]:
# Attach the beach attributes to every survey record. The default inner join
# drops records whose beach is missing from beach_data.
beach_attributes = ["frequentation", "situation", "distance", "orientation"]
new_data = new_data.merge(beach_data[["Plage", *beach_attributes]], on="Plage")

# Source column name -> working column name.
new_column_names = {
    "Position": "position",
    "Substrat": "substrat",
    "Date": "date",
    "Code": "code",
    "Quantité": "quantite",
    "Aire": "area"
}

variables = ["position", "substrat", *beach_attributes]

# One "length" value per beach, indexed by beach name.
length_key = (
    n_beach_data[["Plage", "length"]]
    .drop_duplicates("Plage")
    .set_index("Plage")
)

work_data = new_data[["Plage", *beach_attributes, *new_column_names]].rename(columns=new_column_names)
work_data["length"] = work_data["Plage"].apply(lambda beach: length_key.loc[beach, "length"])
work_data["slug"] = work_data["Plage"].apply(slugify)

# The sample id pairs the beach slug with the date exactly as written in the
# source, so it has to be built before the date column is parsed.
work_data["echantillon"] = list(zip(work_data["slug"], work_data["date"]))
work_data["date"] = pd.to_datetime(work_data["date"], format="mixed", dayfirst=True)

# Drop incomplete rows, then store every attribute except orientation as integer codes.
work_data.dropna(inplace=True)
integer_columns = variables[:-1]
work_data[integer_columns] = work_data[integer_columns].astype("int")
work_data["pcs/m2"] = work_data["quantite"] / work_data["area"]

In [3]:
# NOTE(review): `columns` is not referenced by any visible cell - confirm it is still needed.
columns = ['Plage', 'echantillon', 'orientation', 'position', 'substrat','frequentation', 'situation', 'distance', "fibres", "fdure", "souple", "fragments", "added"]

# Display names for the describe() statistics shown in the summary tables.
column_display = {
    "count":"n échantillons",
    "mean": "moyenne",
    "std": "écart type",
    "echantillon":"échantillon"
}

# analysis of substrat
voi = "substrat"
vals = "pcs/m2"
groupbys = ['echantillon', voi]
labels = name_the_substrate

# One total per sample, beach, date and substrate.
event_total = work_data.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

substrat_summary = attribute_summary(event_total, vals, voi, columns=column_display, labels=labels)

# Caption parts; section, page and rule are reused by the later tables on this page.
table_no = 1
section = 'A'
page = 4
rule = 'Les attributs dont la moyenne des résultats est supérieure à la moyenne du projet sont en rouge.'
caption = 'Le résumé des résultats du nombre de déchets trouvés par m² dans chaque échantillon pour chaque substrat.'

table_one = add_table_to_page(substrat_summary, table_no, caption, section, page, rule)
glue('tablea41', table_one, display=True)

Unnamed: 0,N Échantillons,Moyenne,Écart Type,Min,25%,50%,75%,Max
Galet,36,113,105,0,28,61,192,3
Gravier,16,62,63,0,20,46,64,2
Sable Fin,27,259,244,0,91,156,437,8
Sable Grosssier,31,69,82,0,29,40,63,3


In [4]:
# Glue the four-panel overview for the current attribute under 'fig-a41'.
attribute_summary_grid(data=event_total, vals=vals, voi=voi, figname='fig-a41', labels=labels)

```{glue:figure} fig-a41
---
name: fig-a41
---
{glue:text}`blank_caption` 
```

### Les plages substrat = sable fin

In [5]:
# Beaches with at least one record whose substrat code is 1.
md(', '.join(work_data.loc[work_data["substrat"] == 1, "Plage"].unique()))

Baby Plage, Bouveret, Excenevex, Préverenges, Rolle, Vidy, Grangettes

In [6]:
# fig, ax = plt.subplots()
# data = work_data.groupby(groupbys, as_index=False)[vals].sum()
# data[voi] = data[voi].apply(lambda x: labels[x])

# sns.kdeplot(data, x=vals, hue=voi, ax=ax)
# ax.set_xlim(0, 10)
# plt.show()

## Fréquentation

In [7]:
# analysis of frequentation
voi = "frequentation"
vals = "pcs/m2"
groupbys = ['echantillon', voi]
labels = frequentation_name

# One total per sample, beach, date and frequentation level.
event_total = work_data.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

# Assigned explicitly (not incremented) so re-running this cell keeps the same table number.
table_no = 2
caption = 'Le résumé des résultats du nombre de déchets trouvés par m² dans chaque échantillon pour chaque niveau de fréquentation.'

frequentation_summary = attribute_summary(event_total, vals, voi, columns=column_display, labels=labels)
table_two = add_table_to_page(frequentation_summary, table_no, caption, section, page, rule)
glue('tablea42', table_two, display=True)

Unnamed: 0,N Échantillons,Moyenne,Écart Type,Min,25%,50%,75%,Max
Faible,15,37,16,0,25,38,49,0
Moyenne,28,187,206,0,34,105,262,8
Élévée,55,153,161,0,64,94,189,8


In [8]:
# Glue the four-panel overview for the current attribute under 'fig-a42'.
attribute_summary_grid(data=event_total, vals=vals, voi=voi, figname='fig-a42', labels=labels)

```{glue:figure} fig-a42
---
name: fig-a42
---
{glue:text}`blank_caption` 
```

### Les plages fréquentation = moyenne

In [9]:
# Beaches with at least one record whose frequentation code is 2.
md(', '.join(work_data.loc[work_data["frequentation"] == 2, "Plage"].unique()))

Aubonne, Lugrin, Saint-Disdille, Grangettes, Anthy, Gland, Pichette

### Les plages fréquentation = élevée

In [10]:
# Beaches with at least one record whose frequentation code is 3.
md(', '.join(work_data.loc[work_data["frequentation"] == 3, "Plage"].unique()))

Amphion, Baby Plage, Bouveret, Clarens, Excenevex, Préverenges, Rolle, Savonnière, Vidy, Tougues, Versoix, Port Choiseul, Hermance, Lutry

In [11]:
# fig, ax = plt.subplots()

# data = work_data.groupby(groupbys, as_index=False)[vals].sum()
# data[voi] = data[voi].apply(lambda x: labels[x])

# sns.kdeplot(data, x=vals, hue=voi, ax=ax)
# ax.set_xlim(0, 10)
# plt.show()

## Situation

In [12]:
# analysis of situation
voi = "situation"
vals = "pcs/m2"
groupby = ['echantillon', voi]
labels = situation_name

# One total per sample, beach, date and situation.
event_total = work_data.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

# Assigned explicitly (not incremented) so re-running this cell keeps the same table number.
table_no = 3
caption = 'Le résumé des résultats du nombre de déchets trouvés par m² dans chaque échantillon pour chaque situation: urbain, campagne.'

sit = attribute_summary(event_total, vals, voi, columns=column_display, labels=labels)
table_three = add_table_to_page(sit, table_no, caption, section, page, rule)
glue('tablea43', table_three, display=True)

Unnamed: 0,N Échantillons,Moyenne,Écart Type,Min,25%,50%,75%,Max
Campagne,67,159,194,0,36,70,215,8
Urbain,31,114,93,0,57,94,147,4


In [13]:
# Glue the four-panel overview for the current attribute under 'fig-a43'.
attribute_summary_grid(data=event_total, vals=vals, voi=voi, figname='fig-a43', labels=labels)

```{glue:figure} fig-a43
---
name: fig-a43
---
{glue:text}`blank_caption` 
```

### Les plages situation = campagne

In [14]:
# Beaches with at least one record whose situation code is 1.
md(', '.join(work_data.loc[work_data["situation"] == 1, "Plage"].unique()))

Amphion, Aubonne, Bouveret, Excenevex, Lugrin, Préverenges, Saint-Disdille, Savonnière, Grangettes, Tougues, Versoix, Tolochenaz, Anthy, Crans, Gland, Cully, Pichette

### Les plages situation = urbain

In [15]:
# Beaches with at least one record whose situation code is 2.
md(', '.join(work_data.loc[work_data["situation"] == 2, "Plage"].unique()))

Baby Plage, Clarens, Rolle, Vidy, Port Choiseul, Hermance, Lutry, Meillerie

In [16]:
# fig, ax = plt.subplots()

# data = work_data.groupby(groupby, as_index=False)[vals].sum()
# data[voi] = data[voi].apply(lambda x: situation_name[x])

# sns.kdeplot(data, x=vals, hue=voi, ax=ax)
# ax.set_xlim(0, 10)
# plt.show()

## Distance

In [17]:
# analysis of distance
voi = "distance"
vals = "pcs/m2"
groupby = ['echantillon', voi]
labels = name_the_distance

# One total per sample, beach, date and distance class.
event_total = work_data.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

# Assigned explicitly (not incremented) so re-running this cell keeps the same table number.
table_no = 4
caption = 'Le résumé des résultats du nombre de déchets trouvés par m² dans chaque échantillon selon la distance du parking.'

dist = attribute_summary(event_total, vals, voi, columns=column_display, labels=labels)
# Stored under its own name so the situation table (table_three) is not overwritten.
table_four = add_table_to_page(dist, table_no, caption, section, page, rule)
glue('tablea44', table_four, display=True)

Unnamed: 0,N Échantillons,Moyenne,Écart Type,Min,25%,50%,75%,Max
100 - 500M,44,184,204,0,67,111,194,8
500 - 1000M,7,43,15,0,38,40,52,0
< 100M,39,142,140,0,38,78,215,5
> 1000M,8,32,14,0,21,28,43,0


In [18]:
# Glue the four-panel overview for the current attribute under 'fig-a44'.
attribute_summary_grid(data=event_total, vals=vals, voi=voi, figname='fig-a44', labels=labels)

```{glue:figure} fig-a44
---
name: fig-a44
---
{glue:text}`blank_caption` 
```

### Les plages distance = 100 - 500m 

In [19]:
# Beaches with at least one record whose distance code is 2.
md(', '.join(work_data.loc[work_data["distance"] == 2, "Plage"].unique()))

Baby Plage, Bouveret, Lugrin, Préverenges, Vidy, Grangettes, Tougues, Versoix, Port Choiseul, Hermance, Lutry

### Les plages distance = < 100m  

In [20]:
# Beaches with at least one record whose distance code is 1.
md(', '.join(work_data.loc[work_data["distance"] == 1, "Plage"].unique()))

Amphion, Clarens, Excenevex, Rolle, Saint-Disdille, Savonnière, Anthy, Gland, Cully, Pichette

In [21]:
# fig, ax = plt.subplots()

# data = work_data.groupby(groupby, as_index=False)[vals].sum()
# data[voi] = data[voi].apply(lambda x: labels[x])

# sns.kdeplot(data, x=vals, hue=voi, ax=ax)
# ax.set_xlim(0, 10)
# plt.show()

## Orientation

In [22]:
# analysis of orientation
voi = "orientation"
vals = "pcs/m2"
groupby = ['echantillon', voi]

# One total per sample, beach, date and orientation.
event_total = work_data.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

# The summary is built inline, with the orientation values used as they are
# (no label lookup is applied).
data_summary = event_total.groupby(voi, as_index=False)[vals].describe()
data_summary[['count', 'max', 'min']] = data_summary[['count', 'max', 'min']].astype('int')
data_summary.rename(columns=column_display, inplace=True)
data_summary.set_index(voi, inplace=True, drop=True)
data_summary.index.name = None

# Rows whose mean / median exceed the highlight thresholds.
select_values = data_summary["moyenne"] > 1.45
test_one = data_summary.loc[select_values].index

select_values = data_summary["50%"] > .78
test_two = data_summary.loc[select_values].index

d_sum = data_summary.style.set_table_styles(table_css_styles)
d_sum = d_sum.set_properties(subset = pd.IndexSlice[test_one,["moyenne"]], **a_property)
d_sum = d_sum.set_properties(subset = pd.IndexSlice[test_two,["50%"]], **a_property)

# Assigned explicitly (not incremented) so re-running this cell keeps the same table number.
table_no = 5

caption = 'Le résumé des résultats du nombre de déchets trouvés par m² dans chaque échantillon selon l\'orientation de la plage'

table_five = add_table_to_page(d_sum, table_no, caption, section, page, rule)
glue('tablea45', table_five, display=True)

Unnamed: 0,N Échantillons,Moyenne,Écart Type,Min,25%,50%,75%,Max
E,4,232,138,0,142,228,319,3
Ese,4,60,8,0,56,59,64,0
N,8,150,117,0,45,144,246,3
Ne,12,215,153,0,136,194,262,5
Nne,8,126,49,0,93,111,151,2
Nno,4,32,13,0,26,32,38,0
No,12,235,277,0,51,77,475,8
Ono,8,256,318,0,37,68,442,8
Oso,4,26,31,0,9,14,31,0
Se,4,30,18,0,18,25,36,0


In [23]:
# Print the versions of the imported packages together with the git branch and repository.
%watermark --iversions -b -r

Git repo: https://github.com/hammerdirt-analyst/plastock.git

Git branch: main

pandas    : 2.0.0
numpy     : 1.24.2
seaborn   : 0.12.2
matplotlib: 3.7.1

