In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from scipy.stats import tukey_hsd

from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm

import json

# Seeded random generator and the seaborn theme shared by every figure.
rng = np.random.default_rng(seed=42)
sns.set_style(style="whitegrid")

# CSS rules handed to Styler.set_table_styles for every displayed table.
header_row = dict(selector='th:nth-child(1)', props='background-color: #FFF;')
even_rows = dict(selector='tr:nth-child(even)', props='background-color: rgba(139, 69, 19, 0.08);')
odd_rows = dict(selector='tr:nth-child(odd)', props='background: #FFF;')
table_font = dict(selector='tr', props='font-size: 12px;')
table_css_styles = [even_rows, odd_rows, table_font, header_row]

## Plastock micro

_doc version: 2, doc type: proposal, doc title: plastock micro: Roger Erismann, roger@hammerdirt.ch_

Ce document est destiné à tester la mise en œuvre de méthodes statistiques sur des données de comptage. Les données sont nettoyées dans une étape préalable. Le problème se résout autour des quantités relatives de 18 types de particules différentes (3 catégories de matériaux et 6 catégories de couleurs par matériau). Il y a des échantillons provenant de 25 plages différentes du Lac Léman.

__Nous comptons trois catégories de particules:__

1. fibres
2. fragments durs
3. fragments souples

In [2]:
# Load the cleaned particle counts (one row per sample).
c = pd.read_csv("data/end_pipe/micro_results.csv")

# Count columns grouped by material: a two-letter material prefix
# followed by a one-letter colour code.
colour_codes = "rbjtna"
fibres = [f"fb{code}" for code in colour_codes]
fdure = [f"fr{code}" for code in colour_codes]
souple = [f"fs{code}" for code in colour_codes]
fragments = fdure + souple

# Display labels for the integer-coded survey variables (codes start at 1).
name_the_zones = dict(enumerate(["ligne-d'eau", "plage-seche"], start=1))

# Particle column code -> label, built as material prefix x colour suffix.
_material_labels = {"fb": "fibre", "fr": "fragment rigide", "fs": "fragment souple"}
_colour_labels = {
    "r": "rouge",
    "b": "bleu",
    "j": "jaune",
    "t": "transparent",
    "n": "noire",
    "a": "autre",
}
name_the_particles = {
    f"{material}{colour}": f"{material_label} {colour_label}"
    for material, material_label in _material_labels.items()
    for colour, colour_label in _colour_labels.items()
}

frequentation_name = dict(enumerate(["faible", "moyenne", "élévée"], start=1))

situation_name = dict(enumerate(["campagne", "urbain"], start=1))

# Aggregated column name -> material label.
particle_groups = dict(fibres="fibre", fdure="fragment rigide", souple="fragment souple")

name_the_substrate = dict(
    enumerate(["sable fin", "sable grosssier", "gravier", "galet"], start=1)
)

# Plot colour assigned to each substrate code.
substrate = dict(
    enumerate(["wheat", "burlywood", "lightslategrey", "darkslategrey"], start=1)
)

# Per-sample totals: "added" is every particle, the others are one material group each.
column_groups = {
    "added": [*fragments, *fibres],
    "fibres": fibres,
    "fdure": fdure,
    "souple": souple,
    "fragments": fragments,
}
for total_name, member_columns in column_groups.items():
    c[total_name] = c[member_columns].sum(axis=1)

c = c.rename(columns={"Echantillon": "echantillon"})

# Keep only the survey descriptors and the group totals for the analysis.
survey_columns = ['Plage', 'echantillon', 'orientation', 'position', 'substrat', 'frequentation', 'situation', 'distance']
cAdded = c.loc[:, [*survey_columns, "fibres", "fdure", "souple", "fragments"]].copy()

# Identifier columns kept on every row, and the group totals that get stacked.
id_vars = ['Plage', 'echantillon', 'orientation', 'position', 'substrat', 'frequentation', 'situation', 'distance']
value_vars = ["fibres", "fdure", "souple"]

# Long form: one row per (sample, material group) with an integer count.
work_data = (
    cAdded
    .melt(id_vars=id_vars, value_vars=value_vars, var_name="particle", value_name="compte")
    .astype({"compte": "int"})
)

# Two-colour scheme shared by the beach-zone and the region palettes.
_first_colour, _second_colour = "darkgoldenrod", "lightseagreen"
zone_palette = {"plage-seche": _first_colour, "ligne-d'eau": _second_colour}
region_palette = dict(GE=_first_colour, VD=_second_colour)

In [3]:
# Survey size: distinct beaches, distinct samples and the total particle count.
locations = c["Plage"].unique()
samples = c["echantillon"].unique()
nlocations, nsamples = len(locations), len(samples)
tquantity = work_data["compte"].sum()

print(nlocations, nsamples, tquantity)

25 217 39089


In [4]:
# Preview the first rows of the long-format table (one row per sample and material group).
work_data.head()

Unnamed: 0,Plage,echantillon,orientation,position,substrat,frequentation,situation,distance,particle,compte
0,Amphion,74_Amp_1,NE,1,4,3.0,1.0,1.0,fibres,97
1,Amphion,74_Amp_10,NNE,2,4,3.0,1.0,1.0,fibres,140
2,Amphion,74_Amp_2,NNE,1,4,3.0,1.0,1.0,fibres,121
3,Amphion,74_Amp_3,NE,1,4,3.0,1.0,1.0,fibres,31
4,Amphion,74_Amp_4,NNE,1,4,3.0,1.0,1.0,fibres,179


In [14]:
# Aggregation applied to every grouping: total count and number of distinct samples.
agg_ = dict(compte="sum", echantillon="nunique")


def add_weight_of_samples(data, samps: int=nsamples, tquantity: int=tquantity):
    """Add each row's share of the total count and of the total number of samples.

    Parameters
    ----------
    data : pandas.DataFrame
        Aggregated table with a ``compte`` column and an ``echantillon``
        column. It is modified in place.
    samps : int
        Denominator for the ``echantillon`` column. Defaults to the value of
        ``nsamples`` at the time this function is defined.
    tquantity : int
        Denominator for the ``compte`` column. Defaults to the value of the
        module-level ``tquantity`` at the time this function is defined.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the columns ``poids compte`` and
        ``poids echantillon`` added.
    """
    data["poids compte"] = data["compte"] / tquantity
    # Divide by the ``samps`` argument: the previous version silently ignored
    # it and always used the global ``nsamples``.
    data["poids echantillon"] = data["echantillon"] / samps

    return data

def add_labels_display(data, column_name, labels):
    """Return a styled table indexed by the readable labels of ``column_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding ``column_name``. It is left untouched.
    column_name : str
        Column whose coded values are replaced by labels and moved to the index.
    labels : dict
        Mapping from coded value to display label.

    Returns
    -------
    pandas.io.formats.style.Styler
        The relabelled table, formatted to two decimals with the notebook's
        table styles.

    Raises
    ------
    KeyError
        If ``column_name`` holds a value that is missing from ``labels``.
    """
    # Work on a copy: the previous in-place version moved the column into the
    # index of the caller's frame, so re-running a cell that calls this
    # function failed with KeyError.
    labelled = data.copy()
    labelled[column_name] = labelled[column_name].apply(lambda x: labels[x])
    labelled = labelled.set_index(column_name, drop=True)
    labelled.index.name = None

    return labelled.style.format(precision=2).set_table_styles(table_css_styles)

# One aggregated table (total count, number of samples) per grouping variable.
grouping_columns = ("position", "substrat", "frequentation", "situation", "orientation", "particle")
(
    position_total,
    substrat_total,
    frequentation_total,
    situation_total,
    orient_total,
    particle_type_total,
) = (work_data.groupby(column, as_index=False).agg(agg_) for column in grouping_columns)

# Totals by beach zone, with weights and readable labels.
pos_total = add_labels_display(add_weight_of_samples(position_total), "position", name_the_zones)
pos_total

Unnamed: 0,compte,echantillon,poids compte,poids echantillon
ligne-d'eau,12969,111,0.33,0.51
plage-seche,26120,106,0.67,0.49


In [15]:
# Totals by substrate, with weights and readable labels.
sub_total = add_weight_of_samples(data=substrat_total)
add_labels_display(data=sub_total, column_name="substrat", labels=name_the_substrate)



Unnamed: 0,compte,echantillon,poids compte,poids echantillon
sable fin,19345,114,0.49,0.53
sable grosssier,4253,28,0.11,0.13
gravier,9732,29,0.25,0.13
galet,5759,46,0.15,0.21


In [16]:
# Totals by level of use of the beach, with weights and readable labels.
freq_total = add_weight_of_samples(data=frequentation_total)
add_labels_display(data=freq_total, column_name="frequentation", labels=frequentation_name)

Unnamed: 0,compte,echantillon,poids compte,poids echantillon
faible,587,8,0.02,0.04
moyenne,9910,37,0.25,0.17
élévée,28592,172,0.73,0.79


In [17]:
# Totals by situation code, with weights and readable labels.
sit_total = add_weight_of_samples(data=situation_total)
add_labels_display(data=sit_total, column_name="situation", labels=situation_name)

Unnamed: 0,compte,echantillon,poids compte,poids echantillon
campagne,26098,140,0.67,0.65
urbain,12991,77,0.33,0.35


In [18]:
# Totals by orientation. The index is set on a new frame rather than in place:
# the in-place version removed the "orientation" column from `orient_total`,
# so running this cell a second time raised KeyError.
ori_total = add_weight_of_samples(orient_total).set_index("orientation", drop=True)
ori_total.index.name = None
ori_total.style.format(precision=2).set_table_styles(table_css_styles)

Unnamed: 0,compte,echantillon,poids compte,poids echantillon
E,40,1,0.0,0.0
ENE,271,2,0.01,0.01
ESE,399,4,0.01,0.02
N,2380,14,0.06,0.06
NE,8158,42,0.21,0.19
NNE,1104,9,0.03,0.04
NNO,1339,10,0.03,0.05
NO,5675,25,0.15,0.12
O,1487,7,0.04,0.03
ONO,724,6,0.02,0.03


In [22]:
# Totals by material group, with weights and readable labels.
par_total = add_weight_of_samples(data=particle_type_total)
add_labels_display(data=par_total, column_name="particle", labels=particle_groups)

Unnamed: 0,compte,echantillon,poids compte,poids echantillon
fragment rigide,5105,217,0.13,1.0
fibre,29909,217,0.77,1.0
fragment souple,4075,217,0.1,1.0


In [34]:
# Total count of each colour/material column over all samples.
p_t_c = c[list(name_the_particles)].sum(axis=0)

# One row per particle label: the count and its share of the grand total.
# The share is computed before the count is cast to integer.
p_t_cdf = (
    p_t_c
    .rename(index=name_the_particles)
    .to_frame(name="compte")
    .assign(**{"poids compte": lambda frame: frame["compte"] / tquantity})
    .astype({"compte": "int"})
)
p_t_cdf.index.name = None
p_t_cdf.style.format(precision=2).set_table_styles(table_css_styles)

Unnamed: 0,compte,poids compte
fibre rouge,2364,0.06
fibre bleu,4060,0.1
fibre jaune,400,0.01
fibre transparent,16920,0.43
fibre noire,5892,0.15
fibre autre,273,0.01
fragment rigide rouge,955,0.02
fragment rigide bleu,1400,0.04
fragment rigide jaune,508,0.01
fragment rigide transparent,1202,0.03


In [29]:
# Display the unstyled colour/material table.
# NOTE(review): this cell's execution count (29) is lower than that of the cell
# building `p_t_cdf` (34), so the stored output may predate the current code.
p_t_cdf

Unnamed: 0_level_0,compte,poids compte
couleur et type,Unnamed: 1_level_1,Unnamed: 2_level_1
fibre rouge,2364.0,0.060477
fibre bleu,4060.0,0.103866
fibre jaune,400.0,0.010233
fibre transparent,16920.0,0.432858
fibre noire,5892.0,0.150733
fibre autre,273.0,0.006984
fragment rigide rouge,955.0,0.024431
fragment rigide bleu,1400.0,0.035816
fragment rigide jaune,508.0,0.012996
fragment rigide transparent,1202.0,0.03075


In [None]:
# Preview of the wide table. NOTE(review): unexecuted inspection cell.
c.head()

In [None]:
# Preview of the long table. NOTE(review): unexecuted inspection cell; the same preview is shown near the top.
work_data.head()

In [None]:
# Preview of the wide table. NOTE(review): unexecuted inspection cell, repeated.
c.head()

In [None]:
# One-off pipeline step: attach the beach descriptors (frequentation, situation,
# distance, orientation) to the counts and write the result to disk.
# WARNING: the output path is the very file this notebook loads as `c`, so
# running this cell overwrites the notebook's own input.
freq_d_to_parking = pd.read_csv("data/ignorethese/asl_plages_d.csv")
f_d_t = freq_d_to_parking[["echantillon", "frequentation", "situation", "distance", "orientation"]].set_index("echantillon")

col_edit = ['Plage', 'Echantillon', 'position', 'substrat', 'fbr',
       'fbb', 'fbj', 'fbt', 'fbn', 'fba', 'frr', 'frb', 'frj', 'frt', 'frn',
       'fra', 'fsr', 'fsb', 'fsj', 'fst', 'fsn', 'fsa', 'added', 'fibres',
       'fdure', 'souple', 'fragments']

# `c` has its "Echantillon" column renamed to "echantillon" right after loading,
# so selecting `col_edit` from it directly raised KeyError. Restore the original
# name on a temporary copy; the written file keeps the schema it had before.
end_this = (
    c.rename(columns={"echantillon": "Echantillon"})[col_edit]
    .merge(f_d_t, right_index=True, left_on="Echantillon", validate="one_to_one")
)
end_this.to_csv("data/end_pipe/micro_results.csv", index=False)

(section-one-01)=
### Comptage moyen par événement par plage

#### Toutes variantes confondues. 

Le nombre moyen pour chaque événement variait de 19 à 180 fragments par échantillon de 5 X 10 X 10 cm. Les événements pour lesquels aucun compte n'a été rapporté ont été retirés de ce test.

**Note** Ces chiffres sont-ils complets ? Certains semblent très bas : toutes les particules ont-elles déjà été comptées ?

In [None]:
# `work_data` names its columns echantillon / particle / compte, while this cell
# and the ones reading `a_df` below expect Echantillon / variable / value.
# Restore the legacy names on a temporary copy (a no-op if they are already there).
legacy_names = {"echantillon": "Echantillon", "compte": "value", "particle": "variable"}
a_df = (
    work_data.rename(columns=legacy_names)
    .groupby(['Echantillon', 'Plage'], as_index=False)["value"].sum()
)

fig, ax = plt.subplots()

# One box per beach: distribution of the total particle count per sample.
sns.boxplot(data=a_df, x="Plage", y="value", hue="Plage", width=.8, ax=ax, dodge=False, showfliers=False)
ax.tick_params(axis="x", which="both", labelrotation=90)
ax.set_ylabel("Particules", fontsize=12, labelpad=20)
# hue duplicates the x axis, so the legend carries no information
ax.legend().remove()
ax.set_title("La distribution du nombre de fragments par événement et par plage.\n", fontsize=14, pad=20)
ax.set_xlabel(" ")
ax.margins(x=.9, y=None)
plt.tight_layout()

plt.show()

In [None]:
# Descriptive statistics of the per-sample totals, printed under a short legend.
value_summary = a_df["value"].describe()
f = "\n".join([
    "",
    "Le nombre moyen de particules par événement tous les sites et matériaux",
    "",
    "NOTE: mean = moyen, count=nombre d'événements, std=écart type",
    "",
    f"{value_summary} ",
    "",
    "",
])
print(f)

In [None]:
# Mean per-sample total for each beach.
mean_per_beach = a_df.groupby("Plage")["value"].mean()
f = "\n".join([
    " ",
    "Le nombre moyen de particules par événement et par plage",
    "",
    f"{mean_per_beach}",
    "",
    "",
])
print(f)


(section-two-01)=
### Comparaison du nombre de particules trouvées par matériau et par emplacement sur la plage.

Même s'il semble qu'il y ait plus de particules sur la plage-sèche, cela peut être dû à quelques événements seulement. Le matériau souple a été identifié le moins souvent, et cela semble se généraliser à tous les événements.

In [None]:
# `work_data` names its columns echantillon / particle / compte, while this cell
# and the summary below expect Echantillon / variable / value: restore the legacy
# names on a temporary copy. `position` is stored as a code (1, 2) but
# `zone_palette` is keyed by zone label, so translate the codes first; seaborn
# rejects a palette dict that lacks a hue level. Values already labelled are kept.
legacy_names = {"echantillon": "Echantillon", "compte": "value", "particle": "variable"}
a_df = (
    work_data.rename(columns=legacy_names)
    .assign(position=lambda df: df["position"].map(lambda p: name_the_zones.get(p, p)))
    .groupby(["Echantillon", "position", "variable"], as_index=False)["value"].sum()
)

fig, axs = plt.subplots(1, 2, figsize=(10,6), sharey=True)

# Left: one marker per sample and material, coloured by beach zone.
ax = sns.scatterplot(data=a_df, x="Echantillon", y="value", hue="position", palette=zone_palette, ax=axs[0], style="variable")
ax.xaxis.set_ticklabels([])
ax.grid(axis='x')
ax.set_xlabel("Événement")
ax.set_ylabel("Fragments", fontsize=12, labelpad=14)
ax.legend(loc="upper left")

# Right: distribution per material, split by beach zone.
axOne = axs[1]
sns.boxplot(data=a_df, x="variable", y="value", hue="position", palette=zone_palette, width=.8, ax=axOne, dodge=True)
axOne.tick_params(axis="x", which="both", labelrotation=90)
axOne.set_ylabel(" ")
axOne.set_xlabel(" ")
axOne.margins(x=.9, y=None)

plt.suptitle("Comparaison entre le type de matériel le ligne d'eau et le fond de la plage. ", fontsize=14, y=.98)
plt.tight_layout()

# the y axis is shared, so this limit applies to both panels
ax.set_ylim(-1, 1000)

plt.show()

In [None]:
# Mean count for each (zone, material) pair.
mean_per_zone_material = a_df.groupby(["position", "variable"], as_index=False)["value"].mean()
f = "\n".join([
    " ",
    "Le nombre moyen de particules par zone et matériel",
    "",
    f"{mean_per_zone_material}",
    "",
    "",
])
print(f)

(section-three-01)=
### Ligne d'eau et arrière de la plage

On peut voir ici que les valeurs de la plage-sèche ne proviennent que de quelques endroits. Nous allons attendre jusqu'à ce que les comptages arrivent pour ces endroits.

Cependant, les méthodes d'analyse peuvent toujours être testées sur les données de la ligne d'eau. Nous allons également combiner le matériel fragmenté en un seul groupe "fragments".

In [None]:
# `work_data` names its columns echantillon / particle / compte, while this cell
# and the next one expect Echantillon / value: restore the legacy names on a
# temporary copy. `position` is stored as a code (1, 2) but `zone_palette` is
# keyed by zone label, so translate the codes first; values already labelled are kept.
legacy_names = {"echantillon": "Echantillon", "compte": "value", "particle": "variable"}
a_df = (
    work_data.rename(columns=legacy_names)
    .assign(position=lambda df: df["position"].map(lambda p: name_the_zones.get(p, p)))
    .groupby(["Echantillon", "Plage", "position"], as_index=False)["value"].sum()
)

fig, ax = plt.subplots(figsize=(8,6))

# One pair of boxes per beach: per-sample totals, split by beach zone.
sns.boxplot(data=a_df, x="Plage", y="value", hue="position", palette=zone_palette, width=0.8, ax=ax, dodge=True)
ax.tick_params(axis="x", which="both", labelrotation=90)
ax.set_ylabel("Fibres + fragments", fontsize=12, labelpad=20)
ax.legend(loc="upper center")
ax.set_title("La distribution du nombre de fragments (fibres + fragments) de chaque plage.\nComparaison entre la ligne d'eau et le fond de la plage. ", fontsize=14, pad=20)
ax.set_xlabel(" ")
ax.margins(x=.9, y=None)
plt.tight_layout()
plt.show()

### Ligne d'eau comparée à la plage-sèche tous types de matériaux et plages confondus.

In [None]:
# Per-sample totals by beach zone, all beaches pooled; outliers are hidden.
zone_values = a_df.loc[:, ["position", "value"]]

fig, ax = plt.subplots()
sns.boxplot(
    x="position",
    y="value",
    hue="position",
    data=zone_values,
    palette=zone_palette,
    width=.6,
    dodge=False,
    showfliers=False,
    ax=ax,
)
ax.tick_params(axis="x", which="both", labelrotation=90)
ax.set_title("La distribution du nombre de fragments (fibres + fragments).\nComparaison entre la ligne d'eau et le fond de la plage.", fontsize=14, pad=20)
ax.set_xlabel(" ")
ax.set_ylabel("Fibres + fragments", fontsize=12, labelpad=20)
ax.legend(loc="upper center")
ax.margins(x=.9, y=None)
plt.tight_layout()
plt.show()

(section-four-01)=
### La fonction de répartition

Même s'il y a un nombre différent d'échantillons et quelques valeurs extrêmes, les deux zones de plage ont des distributions cumulatives très similaires.

In [None]:
# `work_data` stores the count in "compte" (not "value") and the beach zone as a
# code (1, 2), while `zone_palette` is keyed by zone label. Plot from a temporary
# copy with the legacy column names and the zone labels; values already labelled
# are kept.
legacy_names = {"echantillon": "Echantillon", "compte": "value", "particle": "variable"}
dist_data = (
    work_data.rename(columns=legacy_names)
    .assign(position=lambda df: df["position"].map(lambda p: name_the_zones.get(p, p)))
)

fig, axs = plt.subplots(1,2, sharey=False)

# Right: empirical cumulative distribution; left: histogram as probabilities.
sns.ecdfplot(data=dist_data, x="value",  ax=axs[1], hue= "position", palette=zone_palette)
sns.histplot(data=dist_data, x="value",  ax=axs[0], hue= "position", stat="probability", palette=zone_palette)
axs[0].set_ylabel("probabilité", labelpad=15)
axs[1].set_ylabel("probabilité", labelpad=15)
axs[0].set_xlabel("Fibres + fragments", labelpad=15)
axs[1].set_xlabel("Fibres + fragments", labelpad=15)
plt.tight_layout()
plt.show()

(section-five-01)=
## Analyse de variance: position, substrat

### Total cumulé de tous les types de fragments

In [None]:
# Total particle count per sample, keeping the descriptors used by the models.
# `work_data` names its columns echantillon / compte, while this cell and the
# plots and models below expect Echantillon / value: restore the legacy names on
# a temporary copy. Zone codes are translated to labels so that `zone_palette`
# (keyed by label) matches the hue levels; values already labelled are kept.
legacy_names = {"echantillon": "Echantillon", "compte": "value", "particle": "variable"}
di_data = (
    work_data.rename(columns=legacy_names)
    .assign(position=lambda df: df["position"].map(lambda p: name_the_zones.get(p, p)))
    .groupby(["Plage", "Echantillon", "orientation", "position", "substrat"], as_index=False)["value"].sum()
)

In [None]:
# Per-sample totals: one group of boxes per substrate code, split by beach zone.
fig, ax = plt.subplots()
sns.boxplot(x="substrat", y="value", hue="position", data=di_data, palette=zone_palette)
ax.set_ylim(bottom=-1, top=2000)
plt.show()

In [None]:
# Levels of the zone factor used in the models below.
di_data["position"].unique()

#### Résultats moindres carrés ordinaires

In [None]:
# Ordinary least squares: per-sample total explained by beach zone (categorical).
position_formula = 'value ~ C(position)'
model = ols(formula=position_formula, data=di_data).fit()

model.summary()


In [None]:
# ANOVA table for the model fitted in the previous cell.
anova_lm(model)

### Total cumulé de fibres

In [None]:
# Fibre count per sample. `work_data` names its columns echantillon / particle /
# compte, while this cell and the model below expect Echantillon / variable /
# value (`work_data.variable` no longer exists): restore the legacy names on a
# temporary copy. Zone codes are translated to labels so that `zone_palette`
# (keyed by label) matches the hue levels; values already labelled are kept.
legacy_names = {"echantillon": "Echantillon", "compte": "value", "particle": "variable"}
long_data = (
    work_data.rename(columns=legacy_names)
    .assign(position=lambda df: df["position"].map(lambda p: name_the_zones.get(p, p)))
)
di = long_data[long_data["variable"] == "fibres"].groupby(["Plage", "Echantillon", "orientation", "position", "substrat"], as_index=False)["value"].sum()
fig, ax = plt.subplots()
sns.boxplot(data=di, x="substrat", y="value",  hue="position", palette=zone_palette)
ax.set_ylim(-1, 1000)
plt.show()

#### Résultats moindres carrés ordinaires

In [None]:
# Ordinary least squares: fibre count explained by beach zone and substrate (both categorical).
fibre_formula = 'value ~ C(position) + C(substrat)'
model = ols(formula=fibre_formula, data=di).fit()

model.summary()


In [None]:
# ANOVA table for the model fitted in the previous cell.
anova_lm(model)

### Total cumulé de fragments durs

In [None]:
# Rigid-fragment count per sample. `work_data` names its columns echantillon /
# particle / compte, while this cell and the model below expect Echantillon /
# variable / value (`work_data.variable` no longer exists): restore the legacy
# names on a temporary copy. Zone codes are translated to labels so that
# `zone_palette` (keyed by label) matches the hue levels; values already
# labelled are kept.
legacy_names = {"echantillon": "Echantillon", "compte": "value", "particle": "variable"}
long_data = (
    work_data.rename(columns=legacy_names)
    .assign(position=lambda df: df["position"].map(lambda p: name_the_zones.get(p, p)))
)
di = long_data[long_data["variable"] == "fdure"].groupby(["Plage", "Echantillon", "orientation", "position", "substrat"], as_index=False)["value"].sum()
fig, ax = plt.subplots()
sns.boxplot(data=di, x="substrat", y="value",  hue="position", palette=zone_palette)
ax.set_ylim(-1, 500)
plt.show()

#### Résultats moindres carrés ordinaires

In [None]:
# Ordinary least squares: rigid-fragment count explained by beach zone and substrate (both categorical).
rigid_formula = 'value ~ C(position) + C(substrat)'
model = ols(formula=rigid_formula, data=di).fit()

model.summary()

In [None]:
# ANOVA table for the model fitted in the previous cell.
anova_lm(model)

### Total cumulé de fragments souples

In [None]:
# Soft-fragment count per sample. `work_data` names its columns echantillon /
# particle / compte, while this cell and the model below expect Echantillon /
# variable / value (`work_data.variable` no longer exists): restore the legacy
# names on a temporary copy. Zone codes are translated to labels so that
# `zone_palette` (keyed by label) matches the hue levels; values already
# labelled are kept.
legacy_names = {"echantillon": "Echantillon", "compte": "value", "particle": "variable"}
long_data = (
    work_data.rename(columns=legacy_names)
    .assign(position=lambda df: df["position"].map(lambda p: name_the_zones.get(p, p)))
)
di = long_data[long_data["variable"] == "souple"].groupby(["Plage", "Echantillon", "orientation", "position", "substrat"], as_index=False)["value"].sum()
fig, ax = plt.subplots()
sns.boxplot(data=di, x="substrat", y="value",  hue="position", palette=zone_palette)
ax.set_ylim(-1, 200)
plt.show()

#### Résultats moindres carrés ordinaires

In [None]:
# Ordinary least squares: soft-fragment count explained by beach zone and substrate (both categorical).
soft_formula = 'value ~ C(position) + C(substrat)'
model = ols(formula=soft_formula, data=di).fit()

model.summary()

In [None]:
# ANOVA table for the model fitted in the previous cell.
anova_lm(model)

### Lieux avec un substrat de classe 3

In [None]:
# Comma-separated names of the beaches that have samples with substrate code 3.
substrate_three_rows = work_data.loc[work_data["substrat"] == 3]
', '.join(substrate_three_rows["Plage"].unique())