In [1]:
%load_ext watermark
import unittest
from typing import List, Dict, Union, Tuple, Callable

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from myst_nb import glue
from slugify import slugify

from plastockconf import name_zones, name_particles, name_frequentation, name_situation
from plastockconf import particle_groups, name_substrate, name_distance, table_css_styles

def aggregate_dataframe(df: pd.DataFrame,
                        groupby_columns: List[str],
                        aggregation_functions: Dict[str, Union[str, Callable]],
                        index: bool = False) -> pd.DataFrame:
    """
    Group a DataFrame and aggregate selected columns.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        groupby_columns (List[str]): Column names that define the groups.
        aggregation_functions (Dict[str, Union[str, Callable]]):
            Maps each column to aggregate to either the name of a pandas
            aggregation ('sum', 'mean', 'max', 'min', ...) or a callable
            that reduces a group to a single value.
        index (bool, optional): Passed to ``groupby`` as ``as_index``.
            When False (the default) the group keys are returned as
            ordinary columns; when True they become the result's index.

    Returns:
        pd.DataFrame: One row per group holding the aggregated values.
    """
    return df.groupby(groupby_columns, as_index=index).agg(aggregation_functions)

# micro data
work_data = pd.read_csv("data/end_pipe/long_form_micro.csv")
# Bound to its own name so the macro beach table loaded below does not
# overwrite it (both were previously assigned to `beach_data`).
micro_beach_data = pd.read_csv("data/end_pipe/asl_beaches.csv")

# macro data
new_data = pd.read_csv("data/macro_current.csv")
beach_data = pd.read_csv("data/pstock_beaches_current.csv")

# Maps the macro CSV column names to the names used in this notebook.
new_column_names = {
    "Position":"position",
    "Substrat":"substrat",
    "Date":"date",
    "Code":"code",
    "Quantité":"quantité",
    "Aire":"area"
}

# One "length" value per beach, indexed by beach name ("Plage").
length_key = beach_data[["Plage","length"]].drop_duplicates("Plage").set_index("Plage")

work_datad = (
    new_data[["Plage", *new_column_names.keys()]]
    .rename(columns=new_column_names)
    .copy()
)
# A beach missing from `length_key` raises KeyError here instead of yielding NaN.
work_datad["length"] = work_datad["Plage"].apply(lambda x: length_key.loc[x, "length"])
work_datad["slug"] = work_datad["Plage"].apply(slugify)
# The sample id pairs the slug with the raw date string, i.e. it is built
# before the datetime conversion on the next line.
work_datad["echantillon"] = list(zip(work_datad["slug"], work_datad["date"]))
work_datad["date"] = pd.to_datetime(work_datad["date"], format="mixed", dayfirst=True)
# Rows with any missing value are dropped so the integer cast below cannot fail on NaN.
work_datad.dropna(inplace=True)
work_datad[["position", "substrat"]] = work_datad[["position", "substrat"]].astype("int")

# Total quantity per sample, beach, date, position and surveyed area,
# then the density as quantity divided by area.
event_total = work_datad.groupby(
    ["echantillon", "Plage", "date", "position", "area"], as_index=False
)["quantité"].sum()
event_total["pcs/m²"] = event_total["quantité"] / event_total["area"]

NameError: name 'slugify' is not defined

# location summary methods

Cela concerne: location_summary.ipynb, macro_summary.ipynb

## Tests

### Test `aggregate_dataframe`

In [None]:
class TestAggregateDataFrame(unittest.TestCase):

    def test_aggregate_dataframe(self):
        # Sample data
        data = pd.DataFrame({'code': ['A', 'B', 'C', 'D', 'E', 'F'],
                             'sample_id': [1, 2, 1, 2, 1, 1],
                             'density': [1.5, 0.5, 1.5, 0.5, 1.5, 1],
                             'quantity': [2, 1, 2, 1, 2, 1],
                             'prop a': ['s1', 's2','s1','s1','s2', 's3'],
                             'prop b': ['x' ,'x', 'z','z','z', 'q']})
        group_by_columns = ['sample_id','prop a']
        aggregation_functions = {'quantity': 'sum','density': 'median'}

        # Expected result
        expected_result = pd.DataFrame({
            'sample_id': {0: 1, 1: 1, 2: 1, 3: 2, 4: 2},
            'prop a': {0: 's1', 1: 's2', 2: 's3', 3: 's1', 4: 's2'},
            'quantity': {0: 4, 1: 2, 2: 1, 3: 1, 4: 1},
            'density': {0: 1.5, 1: 1.5, 2: 1.0, 3: 0.5, 4: 0.5}})

        # Call the function
        result = aggregate_dataframe(data, groupby_columns=group_by_columns, aggregation_functions=aggregation_functions)

        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected_result)

# Collect every test from the case class and run it with verbose reporting.
test_suite = unittest.TestLoader().loadTestsFromTestCase(
    testCaseClass=TestAggregateDataFrame
)
test_runner = unittest.TextTestRunner(verbosity=3)
test_result = test_runner.run(test=test_suite)

## Données

### micro

In [None]:
# Preview the first rows of the micro data table.
work_data.head()

### macro

In [None]:
# Preview the first rows of the macro totals, including the derived pcs/m² column.
event_total.head()

### Compte moyen par échantillon et plage

Cela concerne: Table A1-4, A1-5, A3-3, A3-4

#### Micro table A1-4

In [None]:
# Table A1-4: total particles per beach and sample.
# "particules" is a copy of the "compte" column under the name used downstream.
work_data["particules"] = work_data["compte"]
sample_totals = (
    work_data
    .groupby(["Plage", "echantillon"], as_index=False)["particules"]
    .sum()
)
df2 = sample_totals.copy()
sample_totals["particules"].describe()

In [None]:
# Same summary as the previous cell, computed through aggregate_dataframe.
v_o_i = "particules"
groupby_columns = ["Plage", "echantillon"]
agg_function = {v_o_i: "sum"}

st = aggregate_dataframe(
    work_data,
    groupby_columns=groupby_columns,
    aggregation_functions=agg_function,
)
st[v_o_i].describe()

#### Macro table A3-3

In [None]:
# Table A3-3: pcs/m² summed per sample, date and beach
# (collapses the position and area levels of event_total).
sample_totalsd = (
    event_total
    .groupby(["echantillon", "date", "Plage"], as_index=False)["pcs/m²"]
    .sum()
)
df2d = sample_totalsd.copy()
value_column = "moyenne"

sample_totalsd["pcs/m²"].describe()

In [None]:
# Same summary as the previous cell, computed through aggregate_dataframe.
v_o_i = "pcs/m²"
groupby_columns = ["echantillon", "date", "Plage"]
agg_function = {v_o_i: "sum"}

std = aggregate_dataframe(
    event_total,
    groupby_columns=groupby_columns,
    aggregation_functions=agg_function,
)
std[v_o_i].describe()

### Compte moyen par position

#### Micro table A1-5

In [None]:
# Table A1-5: particle totals per beach, sample and position.
position_totals = (
    work_data
    .groupby(["Plage", "echantillon", "position"], as_index=False)["particules"]
    .sum()
)
# Regrouped on the same three keys in a different order; every group already
# holds a single row, so the sums are unchanged.
position_sample_totals = (
    position_totals
    .groupby(["echantillon", "Plage", "position"], as_index=False)["particules"]
    .sum()
)

# position == 1 feeds the "ligne d'eau" summary, position == 2 the "plage seche" one.
summary_ligne_deau = position_sample_totals.loc[
    position_sample_totals["position"] == 1, "particules"
].describe()
summary_plage_seche = position_sample_totals.loc[
    position_sample_totals["position"] == 2, "particules"
].describe()

summary_ligne_deau

In [None]:
# Display the position == 2 summary computed in the previous cell.
summary_plage_seche

#### Macro table A3-4

In [None]:
# Table A3-4: pcs/m² per beach, sample, date and position.
position_totalsd = (
    event_total
    .groupby(["Plage", "echantillon", "date", "position"], as_index=False)["pcs/m²"]
    .sum()
)
# Mean density per beach and position across samples.
df4d = (
    position_totalsd
    .groupby(["Plage", "position"], as_index=False)["pcs/m²"]
    .mean()
)

summary_ligne_deaud = position_totalsd.loc[
    position_totalsd["position"] == 1, "pcs/m²"
].describe()
summary_plage_seched = position_totalsd.loc[
    position_totalsd["position"] == 2, "pcs/m²"
].describe()

summary_ligne_deaud

In [None]:
# Display the position == 2 macro summary computed in the previous cell.
summary_plage_seched

### Nombre moyen par forme et plage

Concernant table A1-6

In [None]:
# Particle totals per beach, sample and "objet" value.
forme_totals = (
    work_data
    .groupby(["Plage", "echantillon", "objet"], as_index=False)["particules"]
    .sum()
)
df3 = forme_totals.copy()
df3.head()

#### dur

In [None]:
# Summary statistics of the per-sample totals where objet == "fdure".
tex_dure = forme_totals.loc[
    forme_totals["objet"] == "fdure", "particules"
].describe()

tex_dure

#### souple

In [None]:
# Summary statistics of the per-sample totals where objet == "souple".
tex_souple = forme_totals.loc[
    forme_totals["objet"] == "souple", "particules"
].describe()
tex_souple

#### fibres

In [None]:
# Summary statistics of the per-sample totals where objet == "fibres".
tex_fibres = forme_totals.loc[
    forme_totals["objet"] == "fibres", "particules"
].describe()
tex_fibres

### Nombre moyen par forme et position

Concernant table A1-8 et A1-9

#### ligne d'eau

__dur__

In [None]:
# Particle totals per beach, sample, position and "objet" value.
# Note: this rebinds position_totals, which an earlier cell defined without "objet".
position_totals = (
    work_data
    .groupby(["Plage", "echantillon", "position", "objet"], as_index=False)["particules"]
    .sum()
)
position_totals.head()

In [None]:
# Summaries for position == 1, one per "objet" value.
leau_dure = position_totals.loc[
    (position_totals["position"] == 1) & (position_totals["objet"] == "fdure"),
    "particules",
].describe()
leau_souple = position_totals.loc[
    (position_totals["position"] == 1) & (position_totals["objet"] == "souple"),
    "particules",
].describe()
leau_fibre = position_totals.loc[
    (position_totals["position"] == 1) & (position_totals["objet"] == "fibres"),
    "particules",
].describe()

leau_dure

__souple__

In [None]:
# Display the position == 1, objet == "souple" summary computed above.
leau_souple

__fibres__

In [None]:
# Display the position == 1, objet == "fibres" summary computed above.
leau_fibre

#### plage sèche

__dur__

In [None]:
# Summaries for position == 2, one per "objet" value.
plage_dure = position_totals.loc[
    (position_totals["position"] == 2) & (position_totals["objet"] == "fdure"),
    "particules",
].describe()
plage_souple = position_totals.loc[
    (position_totals["position"] == 2) & (position_totals["objet"] == "souple"),
    "particules",
].describe()
plage_fibre = position_totals.loc[
    (position_totals["position"] == 2) & (position_totals["objet"] == "fibres"),
    "particules",
].describe()

plage_dure

__souple__

In [None]:
# Display the position == 2, objet == "souple" summary computed above.
plage_souple

__fibres__

In [None]:
# Display the position == 2, objet == "fibres" summary computed above.
plage_fibre

In [None]:
%watermark --iversions -b -r