In [1]:
%load_ext watermark
import numpy as np
import pandas as pd
import datetime as dt
from slugify import slugify
import unittest

from plastockmethods import name_zones, name_particles, name_frequentation, name_situation
from plastockmethods import particle_groups, name_substrate, name_distance, table_css_styles

def attribute_summary(some_data: pd.DataFrame, vals: str, voi: str):
    """
    Résume les attributs d'un DataFrame basé sur des groupes spécifiques.

    Cette fonction regroupe les données de 'some_data' par 'echantillon' et une variable d'intérêt 'voi',
    calcule la somme des valeurs de la colonne spécifiée par 'vals' pour chaque groupe, puis fournit
    des statistiques descriptives pour chaque groupe.

    Args:
        some_data (pd.DataFrame): Le DataFrame contenant les données à analyser.
        vals (str): Le nom de la colonne dont les valeurs sont résumées.
        voi (str): Le nom de la variable d'intérêt utilisée pour le groupement.

    Returns:
        pd.DataFrame: Un DataFrame contenant les statistiques descriptives pour chaque valeur de 'voi',
        incluant le nombre d'observations, la valeur maximale, la valeur minimale, la moyenne, l'écart-type, etc.

    Le DataFrame résultant est indexé par 'voi' avec le nom de l'index supprimé pour une présentation plus claire.
    """
    
    groupby = ['echantillon', voi]
    data = some_data.groupby(groupby, as_index=False)[vals].sum()
    
    data_summary = data.groupby(voi, as_index=False)[vals].describe()
    data_summary[['count', 'max', 'min']] = data_summary[['count', 'max', 'min']].astype('int')
    data_summary.set_index(voi, inplace=True, drop=True)
    data_summary.index.name = None
    
    return data_summary

work_data = pd.read_csv("data/end_pipe/long_form_micro.csv")
beach_data = pd.read_csv("data/end_pipe/asl_beaches.csv")

# Attribute summary methods

Cela concerne: location_attributes.ipynb, macro_attributes.ipynb

## Tests

### test `attribute_summary` 

In [2]:
class TestAttributeSummary(unittest.TestCase):

    def test_attribute_summary(self):
        # Create a sample DataFrame
        data = {
            'echantillon': ['e1', 'e1', 'e2', 'e2', 'e3'],
            'category': ['A', 'B', 'A', 'C', 'B'],
            'quantity': [10, 20, 15, 5, 25]
        }
        df = pd.DataFrame(data)

        # Define the parameters for the function
        vals = 'quantity'
        voi = 'category'

        # Call the function
        result = attribute_summary(df, vals, voi)

        # Define the expected output
        expected_data = {
            'count': [2, 2, 1],
            'mean': [12.5, 22.5, 5.0],
            'std': [3.535534, 3.535534, pd.NA],
            'min': [10, 20, 5],
            '25%': [11.25, 21.25, 5.0],
            '50%': [12.5, 22.5, 5.0],
            '75%': [13.75, 23.75, 5.0],
            'max': [15, 25, 5]
        }
        expected_output = pd.DataFrame(expected_data, index=['A', 'B', 'C'])

        # Check if the result matches the expected output
        pd.testing.assert_frame_equal(result, expected_output, check_dtype=False)

# Create a test suite and add the test case
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestAttributeSummary)
test_runner = unittest.TextTestRunner(verbosity=3)
test_result = test_runner.run(test_suite)

test_attribute_summary (__main__.TestAttributeSummary) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.016s

OK


## Données

### micro

In [3]:
work_data.head()

Unnamed: 0,Plage,echantillon,orientation,position,substrat,frequentation,situation,distance,objet,compte
0,Amphion,74_Amp_1,NE,1,4,3,1,1,fibres,97
1,Amphion,74_Amp_10,NNE,2,4,3,1,1,fibres,140
2,Amphion,74_Amp_2,NNE,1,4,3,1,1,fibres,121
3,Amphion,74_Amp_3,NE,1,4,3,1,1,fibres,31
4,Amphion,74_Amp_4,NNE,1,4,3,1,1,fibres,179


### macro

In [4]:
new_data = pd.read_csv("data/macro_current.csv")
beach_data = pd.read_csv("data/end_pipe/asl_beaches.csv")
n_beach_data = pd.read_csv("data/pstock_beaches_current.csv")

new_data = new_data.merge(beach_data[["Plage", "frequentation", "situation", "distance", "orientation"]], left_on = "Plage", right_on="Plage")

new_column_names = {
    "Position":"position",
    "Substrat":"substrat",
    "Date":"date",
    "Code":"code",
    "Quantité":"quantite",
    "Aire":"area"
}

variables = ["position", "substrat", "frequentation", "situation", "distance", "orientation"]

length_key = n_beach_data[["Plage","length"]].drop_duplicates("Plage").set_index("Plage")
work_datad = new_data[["Plage",  'frequentation', 'situation', 'distance', 'orientation', *new_column_names.keys()]].copy()
work_datad.rename(columns=new_column_names, inplace=True)
work_datad["length"] = work_datad.Plage.apply(lambda x: length_key.loc[x, "length"])
work_datad["slug"] = work_datad.Plage.apply(lambda x: slugify(x))
work_datad["echantillon"] = list(zip(work_datad.slug, work_datad['date']))
work_datad['date'] = pd.to_datetime(work_datad["date"], format="mixed", dayfirst=True)
work_datad.dropna(inplace=True)
work_datad[variables[:-1]] = work_datad[variables[:-1]].astype("int")
work_datad["pcs/m2"] = work_datad.quantite/work_datad.area

work_datad.head()

Unnamed: 0,Plage,frequentation,situation,distance,orientation,position,substrat,date,code,quantite,area,length,slug,echantillon,pcs/m2
0,Amphion,3,1,1,NE,1,4,2022-02-01,G24,2,98,91,amphion,"(amphion, 01.02.2022)",0.020408
1,Amphion,3,1,1,NE,2,4,2022-02-01,G24,42,342,91,amphion,"(amphion, 01.02.2022)",0.122807
2,Amphion,3,1,1,NE,2,4,2022-05-03,G24,5,342,91,amphion,"(amphion, 03.05.2022)",0.01462
3,Amphion,3,1,1,NE,1,4,2022-07-19,G24,5,98,91,amphion,"(amphion, 19.07.2022)",0.05102
4,Amphion,3,1,1,NE,2,4,2022-07-19,G24,11,342,91,amphion,"(amphion, 19.07.2022)",0.032164


## Substrat

Concernant tables A2-2, A4-1

### micro table A2-2

In [5]:
# ananlysis of substrat
voi = "substrat"
vals = "compte"

substrat_summary = attribute_summary(work_data, vals, voi)
substrat_summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,114,169.692982,155.163809,0,82.75,132.0,200.0,971
2,28,151.892857,136.763472,19,56.5,122.0,174.5,550
3,29,335.586207,622.587201,0,62.0,90.0,230.0,2991
4,46,125.195652,107.162933,26,57.5,92.5,155.75,665


### macro table A4-1

In [6]:
# ananlysis of substrat
voi = "substrat"
vals = "pcs/m2"

event_total = work_datad.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

substrat_summaryd = attribute_summary(event_total, vals, voi)
substrat_summaryd

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,27,2.59251,2.442611,0,0.910233,1.55506,4.374276,8
2,31,0.685954,0.82373,0,0.288767,0.396226,0.632386,3
3,16,0.623042,0.632783,0,0.197133,0.457894,0.635391,2
4,36,1.129098,1.051481,0,0.284012,0.61194,1.923513,3


## Frequentation

Concernant tables A2-3, A4-2

### micro table A2-3

In [7]:
# ananlysis of frequentation
voi = "frequentation"
vals = "compte"
groupbys = ['echantillon', voi]

freq = attribute_summary(work_data, vals, voi)

freq

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,8,73.375,58.848322,0,46.0,57.0,98.75,182
2,37,267.837838,497.48593,20,72.0,117.0,209.0,2991
3,172,166.232558,189.288923,0,71.5,124.5,194.0,1492


### macro table A4-2

In [8]:
voi = "frequentation"
vals = "pcs/m2"

event_total = work_datad.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

freqd = attribute_summary(event_total, vals, voi)
freqd

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,15,0.371521,0.155429,0,0.250897,0.375,0.494823,0
2,28,1.866299,2.060383,0,0.336952,1.053459,2.624421,8
3,55,1.52817,1.609684,0,0.63667,0.940848,1.887766,8


## Situation

Concernant tables A2-4, A4-3

### micro table A2-4

In [9]:
# ananlysis of situation
voi = "situation"
vals = "compte"

sit = attribute_summary(work_data, vals, voi)

sit

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,140,186.414286,286.48633,0,75.75,128.0,200.0,2991
2,77,168.714286,230.732749,0,59.0,104.0,192.0,1492


### macro table A4-3

In [10]:
# ananlysis of situation
voi = "situation"
vals = "pcs/m2"

event_total = work_datad.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

sitd = attribute_summary(event_total, vals, voi)
sitd

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,67,1.589544,1.935867,0,0.363926,0.701884,2.150119,8
2,31,1.14126,0.929372,0,0.566038,0.940848,1.471335,4


## Distance

Concernant tables A2-5, A4-4

### micro table A2-5

In [11]:
# ananlysis of distance
voi = "distance"
vals = "compte"

dist = attribute_summary(work_data, vals, voi)

dist

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,93,176.806452,312.846304,0,78.0,124.0,192.0,2991
2,110,194.818182,240.050499,0,70.5,126.0,210.5,1492
3,2,38.0,26.870058,19,28.5,38.0,47.5,57
4,12,95.0,53.746882,44,56.5,77.0,105.75,200


### macro table A4-4

In [12]:
# ananlysis of distance
voi = "distance"
vals = "pcs/m2"

event_total = work_datad.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

distd = attribute_summary(event_total, vals, voi)
distd

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,39,1.422208,1.398758,0,0.37746,0.7835,2.150119,5
2,44,1.836804,2.039244,0,0.672559,1.110467,1.942738,8
3,7,0.432783,0.151515,0,0.375,0.396226,0.517099,0
4,8,0.320446,0.144612,0,0.208744,0.282305,0.428016,0


## Orientation

Concernant tables A2-6, A4-5

### micro table A2-6

In [13]:
# ananlysis of substrat
voi = "orientation"
vals = "compte"
groupby = ['echantillon', voi]

data = work_data.groupby(groupby, as_index=False)[vals].sum()
# data[voi] = data[voi].apply(lambda x: name_situation[x])

data_summary = data.groupby(voi, as_index=False)[vals].describe()
data_summary[['count', 'max', 'min']] = data_summary[['count', 'max', 'min']].astype('int')
# data_summary.rename(columns=column_display, inplace=True)
data_summary.set_index("orientation", inplace=True, drop=True)
data_summary.index.name = None
data_summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
E,1,40.0,,40,40.0,40.0,40.0,40
ENE,2,135.5,65.760931,89,112.25,135.5,158.75,182
ESE,4,99.75,34.932077,62,83.0,95.5,112.25,146
N,14,170.0,158.502609,27,80.25,136.5,203.0,665
NE,42,194.238095,290.857841,31,71.5,115.0,182.0,1492
NNE,9,122.666667,70.809604,23,59.0,126.0,182.0,225
NNO,10,133.9,89.98327,38,71.25,116.0,182.75,334
NO,25,227.0,200.156189,25,99.0,155.0,288.0,697
O,7,212.428571,344.636938,0,47.0,59.0,181.5,971
ONO,6,120.666667,74.001802,44,83.25,98.0,137.5,255


### macro table A4-5

In [14]:
# ananlysis of substrat
voi = "orientation"
vals = "pcs/m2"
groupby = ['echantillon', voi]

event_total = work_datad.groupby(["echantillon", "Plage","date", voi], as_index=False)[vals].sum()

data_summaryd = event_total.groupby(voi, as_index=False)[vals].describe()
data_summaryd[['count', 'max', 'min']] = data_summaryd[['count', 'max', 'min']].astype('int')
data_summaryd.set_index("orientation", inplace=True, drop=True)
data_summaryd.index.name = None
data_summaryd

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
E,4,2.324111,1.377206,0,1.417984,2.282609,3.188735,3
ESE,4,0.600788,0.07886,0,0.559173,0.593776,0.635391,0
N,8,1.496605,1.172829,0,0.45283,1.438477,2.463037,3
NE,12,2.145841,1.525577,0,1.361732,1.941857,2.617405,5
NNE,8,1.255157,0.491909,0,0.932248,1.110467,1.510227,2
NNO,4,0.316064,0.125951,0,0.257013,0.321052,0.380102,0
NO,12,2.346572,2.772267,0,0.506435,0.773161,4.750937,8
ONO,8,2.556354,3.175594,0,0.373549,0.680792,4.422963,8
OSO,4,0.262938,0.31443,0,0.088814,0.137107,0.311231,0
SE,4,0.298333,0.178737,0,0.182179,0.246538,0.362692,0


In [15]:
%watermark --iversions -b -r

Git repo: https://github.com/hammerdirt-analyst/plastock.git

Git branch: main

pandas: 2.0.0
numpy : 1.24.2

