# Temporo spatial join

Method:
1. Temporo-spatial join: for each year, spatial join between Senf & Seidl and the other datasets.
2. Group creation: groupby on the Senf & Seidl index (one group per index).
3. Group work:
    - Compute weights for each row
    - Compute score per disturbance type
    - Save group to a dict with the index as key
    - Create row with year, class, score, tree_type, essence

In [134]:
# Loading 
import geopandas as gpd
# Common CRS: every dataset below is reprojected to it right after loading.
# NOTE(review): presumably a metric projection for France, since the spatial
# buffer used later is expressed in meters — confirm.
epsg = 'epsg:2154'
# Reference dataset (Senf & Seidl) followed by the three datasets joined to it.
senfseidl = gpd.read_parquet('../data/processed_datasets/SenfSeidl_joined_EPSG4326_FR.parquet').to_crs(epsg)
nfi = gpd.read_parquet('../data/processed_datasets/NFI_2003-2021_EPSG4326_FR.parquet').to_crs(epsg)
hm = gpd.read_parquet('../data/processed_datasets/health-monitoring_2007-2023_EPSG4326_FR.parquet').to_crs(epsg)
dfde = gpd.read_parquet('../data/processed_datasets/DFDE_1984_2021_EPSG4326_FR.parquet').to_crs(epsg)

# Preprocessing 

Geometric computations are performed here so that we can rely entirely on dask.dataframe later for big-data processing.


In [135]:
# Senf & Seidl: normalise the year, translate the numeric cause code into a
# class label, keep only the columns used by the join, then clean the rows.
senfseidl.year = senfseidl.year.astype(int)

# Numeric `cause` code -> class label. Codes missing from this mapping become
# NaN through `.map` and are removed by the `dropna` below.
number_to_class = {
    1:'Storm,Biotic', 
    2:'Fire',
    3:'Other'
}

senfseidl['class'] = senfseidl['cause'].map(number_to_class)
senf_seidl_col = ['year', 'geometry', 'class', 'tree_type', 'essence']
senfseidl = senfseidl[senf_seidl_col]
# NOTE(review): in-place operations on a column selection may emit pandas'
# SettingWithCopyWarning depending on the pandas version — confirm.
senfseidl.drop_duplicates(inplace=True)
senfseidl.dropna(inplace=True)

In [163]:
# DFDE

#class
# Harmonised disturbance class -> raw DFDE `cause` labels that belong to it.
dict_class = {
    'Fire': ['Fire'],
    'Storm': ['Wind'],
    'Drought': ['Summer drought', 'Frost'],
    'Biotic': [
        'Ips typographus',
        'Pissodes spp.',
        'Bark beetles',
        'Bombix desparate',
        'Zeiraphera diniana',
        'Biotic',
        'Insects',
        'Other insects',
        'Biotic;Abiotic',
        'Pityogenes chalcographus',
        'Tetropium luridum;Tetropium fuscum',
        'Ips acuminatus',
        'Tomicus piniperda;Tomicus minor',
        'Phaenops cyanea',
        'Pissodes pini',
        'Ips cembrae',
        'Tetropium gabrieli',
        'Agrilus biguttatus',
        'Agrilus viridis',
        'Xyloterus lineatus',
        'Erannis defoliaria',
        'Operophtera brumata;Operophtera fagata',
        'Lymantria dispar',
        'Thaumetopoea processionea',
        'Hylobius abietis',
        'Melolontha hippocastani;Melolontha melolontha',
        'Microtus agrestis;Microtus arvalis;Clethrionomys glareolus',
        'Arvicola terrestris',
        'Lophodermium seditiosum',
        'Sphaeropsis sapinea',
        'Heterobasidion annosum',
        'Armillaria mellea',
        'Chalara fraxinea;Hymenoscyphus fraxineus',
        'Beech decline',
        'Oak decline',
        'Viscum album',
        'Ips sexdentatus',
    ],
    'Tree-logging': [],
    'Other': ['Accident'],
}


def get_class(x):
    """Return the harmonised class whose cause list contains `x`.

    Classes are scanned in the declaration order of `dict_class`; a cause
    that appears in none of the lists falls back to 'Other'.
    """
    candidates = (label for label, causes in dict_class.items() if x in causes)
    return next(candidates, 'Other')

# Harmonise the raw DFDE cause into one of the shared disturbance classes.
dfde['class'] = dfde['cause'].apply(get_class)

#geometry
# One geometry per `name`: when a name occurs several times, the last
# geometry seen wins because of the dict construction.
dname_geom = {k:v for k,v in zip(dfde['name'].tolist(), dfde['geometry'].tolist())}
# Buffer then simplify each geometry (both with a value of 5000, in CRS units).
dname_geom = {k:v.buffer(5000).simplify(5000) for k,v in dname_geom.items()}
# Area is taken from the buffered + simplified geometry, not the original one.
dname_area = {k:v.area for k,v in dname_geom.items()}

#drop duplicates 
dfde.drop_duplicates(subset=['name', 'start_date', 'end_date', 'essence', 'cause', 'notes'], inplace=True)
# Replace each row's geometry by the processed geometry of its `name`.
dfde['geometry'] = dfde['name'].apply(lambda x: dname_geom[x]) 

#compute area here !
dfde['area'] = dfde['name'].apply(lambda x: dname_area[x])

#clean date
import pandas as pd
dfde['start_date'] = pd.to_datetime(dfde['start_date'])
dfde['end_date'] = pd.to_datetime(dfde['end_date'])

#keep_col
dfde_col = ['start_date', 'end_date', 'geometry', 'class', 'tree_type', 'essence', 'cause', 'notes', 'area']
dfde = dfde[dfde_col]

# Rows with a missing value in any kept column are discarded.
dfde.dropna(inplace=True)
dfde.drop_duplicates(inplace=True)


In [138]:
#nfi 

#filtering
# Keep rows whose `probability` is at least 0.1 ...
nfi = nfi[ (nfi['probability'] >= 0.1) ]
# ... and drop 'Tree-logging' rows whose `intensity` is 0.
nfi = nfi[ ~((nfi['class'] == 'Tree-logging')&(nfi['intensity']==0)) ]

#correct start_date
from datetime import timedelta
import pandas as pd
def get_start_date(row):
    """Return the row's start date, back-filling it when it is missing.

    A null `start_date` is replaced by `end_date` minus 5 * 365.25 days.
    That span is not a whole number of days, so the back-filled value can
    carry a time-of-day component.
    """
    start = row['start_date']
    if pd.isnull(start):
        return row['end_date'] - timedelta(days=5*365.25)
    return start
    

# Back-fill missing start dates row by row.
nfi['start_date'] = nfi.apply(get_start_date, axis=1)

#keep col 
nfi_col = ['start_date', 'end_date', 'geometry', 'class', 'tree_type', 'essence']
nfi = nfi[nfi_col]

# Rows with a missing value in any kept column are discarded, then exact
# duplicates are removed.
nfi.dropna(inplace=True)
nfi.drop_duplicates(inplace=True)

In [139]:
#hm
def get_class(x):
    """Map a health-monitoring class to 'Biotic' or 'Other'.

    Only the exact value 'biotic-factor' maps to 'Biotic'.

    NOTE: this reuses the name of the DFDE `get_class` helper; whichever
    cell ran last determines which definition is active.
    """
    return 'Biotic' if x == 'biotic-factor' else 'Other'
    
# Collapse the health-monitoring classes to 'Biotic' / 'Other'.
hm['class'] = hm['class'].apply(get_class)
hm['year'] = hm['year'].astype(int)
hm.drop_duplicates(inplace=True)
# Drops every row with a missing value in any column; this runs before the
# rename below, so it also applies to the columns about to be renamed.
hm.dropna(inplace=True)

# Align column names with the other datasets (`cause`, `notes`).
hm.rename(columns={'LIB_Problème principal':'cause', 'Remarques':'notes'}, inplace=True)

# Joining

In [164]:
#full Dask -> 20s 
# with sjoin_nearest and sd computation -> 8s  
import dask_geopandas as dgpd
import dask.dataframe as dd

# All years present in the reference dataset. This cell does not iterate over
# them: it processes the single `year` fixed below.
years = senfseidl['year'].unique()

temporal_buffer = 5 #years 
spatial_buffer = 5000 #meters
# nfi.geometry = nfi.geometry.buffer(spatial_buffer)
# hm.geometry = hm.geometry.buffer(spatial_buffer)

# Tag every row with its source dataset; the weighting functions branch on it.
nfi['dataset'] = 'nfi'
hm['dataset'] = 'hm'
dfde['dataset'] = 'dfde'
senfseidl['dataset'] = 'senfseidl'

year = 2010

# Temporal selection: reference rows of the chosen year, and rows of the other
# datasets lying within `temporal_buffer` years of it. For interval datasets
# (nfi, dfde) the whole [start_date, end_date] interval must fit in the window.
senfseidl_year = senfseidl[senfseidl['year'] == year]
nfi_year = nfi[(nfi['start_date'].dt.year >= year - temporal_buffer) & (nfi['end_date'].dt.year <= year + temporal_buffer)]
hm_year = hm[(hm['year'] >= year - temporal_buffer) & (hm['year'] <= year + temporal_buffer)]
dfde_year = dfde[(dfde['start_date'].dt.year >= year - temporal_buffer) & (dfde['end_date'].dt.year <= year + temporal_buffer)]

# Only the reference and DFDE frames are converted to dask-geopandas; nfi and
# hm stay plain GeoDataFrames and are joined with `sjoin_nearest` below.
senfseidl_year_ = dgpd.from_geopandas(senfseidl_year, npartitions=10)
# nfi_year = dgpd.from_geopandas(nfi_year, npartitions=10)
# hm_year = dgpd.from_geopandas(hm_year, npartitions=10)
dfde_year_ = dgpd.from_geopandas(dfde_year, npartitions=10)

# senfseidl_nfi_year = nfi_year.sjoin(senfseidl_year)
# senfseidl_hm_year = hm_year.sjoin(senfseidl_year)
# Nearest-neighbour joins limited to `spatial_buffer`; the distance to the
# matched reference geometry is stored in the `sd` column.
senfseidl_hm_year = hm_year.sjoin_nearest(senfseidl_year, max_distance=spatial_buffer, distance_col='sd')
senfseidl_nfi_year = nfi_year.sjoin_nearest(senfseidl_year, max_distance=spatial_buffer, distance_col='sd')
# DFDE geometries were buffered during preprocessing, so a plain spatial join
# is used here (no `sd` column is produced for these rows).
senfseidl_dfde_year = dfde_year_.sjoin(senfseidl_year_)

#concat with dask_geopandas
import dask.dataframe as dd
# Stack the three join results and materialise them as one GeoDataFrame.
concatenation = gpd.GeoDataFrame(dd.concat([senfseidl_nfi_year, senfseidl_hm_year, senfseidl_dfde_year], axis=0).compute(), crs=epsg)

In [165]:
concatenation

Unnamed: 0,start_date,end_date,geometry,class_left,tree_type_left,essence_left,dataset_left,index_right,year,class_right,tree_type_right,essence_right,dataset_right,sd,year_left,cause,notes,year_right,area
24647,2005-12-27 18:00:00,2010-12-28,POINT (383445.932 6345590.073),Tree-logging,Broadleaf,Pedunculate Oak,nfi,4042259,2010.0,"Storm,Biotic",Mixed,"nr,maritime pine",senfseidl,374.502488,,,,,
24647,2005-12-27 18:00:00,2010-12-28,POINT (383445.932 6345590.073),Tree-logging,Broadleaf,Pedunculate Oak,nfi,2441906,2010.0,Other,Mixed,"nr,maritime pine",senfseidl,374.502488,,,,,
24649,2005-12-27 18:00:00,2010-12-28,POINT (392125.967 6429437.274),Tree-logging,Conifer,Maritime Pine,nfi,2460281,2010.0,Other,Conifer,maritime pine,senfseidl,295.931920,,,,,
24650,2005-12-27 18:00:00,2010-12-28,POINT (701431.392 6263028.476),Storm,Broadleaf,Holly Oak,nfi,2523287,2010.0,Other,Broadleaf,broadleaf,senfseidl,2118.659759,,,,,
24650,2005-12-27 18:00:00,2010-12-28,POINT (701431.392 6263028.476),Storm,Broadleaf,Holly Oak,nfi,5468311,2010.0,Fire,Broadleaf,broadleaf,senfseidl,2118.659759,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,2015-01-01 00:00:00,2015-12-28,"POLYGON ((1016482.342 6707283.553, 996492.098 ...",Biotic,mixed,conifers;broadleaves,dfde,4092912,2010.0,"Storm,Biotic",Conifer,conifers,senfseidl,,,Biotic;Abiotic,state forests (only barked wood for the Alsace...,,1.107470e+10
735,2015-01-01 00:00:00,2015-12-28,"POLYGON ((1016482.342 6707283.553, 996492.098 ...",Biotic,mixed,conifers;broadleaves,dfde,4092918,2010.0,"Storm,Biotic",Mixed,mixed,senfseidl,,,Biotic;Abiotic,state forests (only barked wood for the Alsace...,,1.107470e+10
735,2015-01-01 00:00:00,2015-12-28,"POLYGON ((1016482.342 6707283.553, 996492.098 ...",Biotic,mixed,conifers;broadleaves,dfde,4092919,2010.0,"Storm,Biotic",Mixed,mixed,senfseidl,,,Biotic;Abiotic,state forests (only barked wood for the Alsace...,,1.107470e+10
735,2015-01-01 00:00:00,2015-12-28,"POLYGON ((1016482.342 6707283.553, 996492.098 ...",Biotic,mixed,conifers;broadleaves,dfde,4092911,2010.0,"Storm,Biotic",Mixed,mixed,senfseidl,,,Biotic;Abiotic,state forests (only barked wood for the Alsace...,,1.107470e+10


In [166]:
#compute weights
def spatial_weight(x) -> float:
    """Return the spatial weight of a distance `x`, always within [0, 1].

    The weight is 1 up to a distance of 1, then decreases linearly and
    reaches 0 at a distance of 10. Beyond 10 it stays at 0: without that
    clamp the linear formula yields negative weights, which would flip the
    sign of any product they enter.

    NaN inputs propagate as NaN (both comparisons are False for NaN).
    """
    if x <= 1:
        return 1.0
    if x >= 10:
        return 0.0
    return 1 - (x - 1) / 9
    
def temporal_weight(x) -> float:
    """Return the temporal weight of a distance `x`, never below 0.

    Two linear segments: from 1 at x = 0 down to 0.75 at x = 3, then from
    0.75 down to 0 at x = 6. Beyond 6 the weight stays at 0: without that
    clamp the second segment yields negative weights, which would flip the
    sign of any product they enter.

    NaN inputs propagate as NaN (both comparisons are False for NaN).
    """
    if x <= 3:
        return 1 - x / 12
    if x >= 6:
        return 0.0
    return 0.75 * (1 - (x - 3) / 3)

from thefuzz import fuzz

def compute_tree_coherence(row_tt, row_e, ref_tt, ref_e) -> float:
    """Score how well a row's trees agree with the reference's trees.

    Args:
        row_tt: tree type of the row being scored.
        row_e: essence of the row being scored.
        ref_tt: tree type of the reference row.
        ref_e: comma-separated essences of the reference row.

    Returns:
        1 when the row essence fuzzy-matches (token-set ratio above 80,
        case-insensitive) any reference essence; otherwise 0.75 when the
        tree types are equal (case-insensitive), 0.5 when either tree type
        is 'mixed', and 0.25 in every other case.
    """
    candidate = row_e.lower()
    essence_match = any(
        fuzz.token_set_ratio(candidate, ref_essence.lower()) > 80
        for ref_essence in ref_e.split(',')
    )
    if essence_match:
        return 1

    # Tree types are only inspected once the essence comparison has failed.
    row_type, ref_type = row_tt.lower(), ref_tt.lower()
    if row_type == ref_type:
        return 0.75
    if 'mixed' in (row_type, ref_type):
        return 0.5
    return 0.25

def compute_weight(row, reference):
    """Compute the weights of one joined row against its reference row.

    Args:
        row: mapping-like row with a 'dataset' key. Depending on the dataset
            it also needs 'area' (dfde), 'sd' (hm, nfi), 'year' (hm),
            'start_date' / 'end_date' (dfde, nfi), plus 'tree_type' and
            'essence'.
        reference: the Senf & Seidl row of the group; 'year', 'tree_type'
            and 'essence' are read from it.

    Returns:
        A 7-tuple `(sd, sw, td, tw, tc, oa, p)`: spatial distance, spatial
        weight, temporal distance, temporal weight, tree coherence, overall
        accuracy, and the final score `p = sw * tw * tc * oa`.

    Raises:
        ValueError: if `row['dataset']` is not one of 'senfseidl', 'dfde',
            'hm' or 'nfi'. Previously such a row failed later with an
            unrelated unbound-variable error.
    """
    dataset = row['dataset']

    # The reference row itself gets fixed values.
    if dataset == 'senfseidl':
        return 0, 1, 0, 1, 1, 0.91, 0.91

    if dataset == 'dfde':
        # Spatial distance is derived from the polygon area for DFDE rows.
        sd = (row['area'] / 1e6) ** (1 / 2) / 35
        oa = 0.95
    elif dataset in ('hm', 'nfi'):
        # `sd` comes from the nearest-neighbour join; it is rescaled by 1e3.
        sd = row['sd'] / 1e3
        oa = 0.9
    else:
        raise ValueError(f"Unknown dataset: {dataset!r}")

    ref_year = reference['year']
    if dataset == 'hm':
        # Point-in-time dataset: plain year difference.
        td = abs(ref_year - row['year'])
    else:
        # Interval datasets: distance to the closest interval endpoint.
        # NOTE(review): a reference year strictly inside the interval still
        # gets a non-zero distance — confirm this is intended.
        td = min(abs(ref_year - row['start_date'].year), abs(row['end_date'].year - ref_year))

    tc = compute_tree_coherence(row['tree_type'], row['essence'], reference['tree_type'], reference['essence'])
    sw = spatial_weight(sd)
    tw = temporal_weight(td)

    return sd, sw, td, tw, tc, oa, sw * tw * tc * oa

#compute proba per class
# For each output class, the row labels counted as evidence in its favour.
dict_isin = {
    'Fire': ['Fire'],
    'Storm': ['Storm', 'Storm,Biotic'],
    'Drought': ['Drought'],
    'Biotic-dieback': ['Biotic-dieback', 'Biotic', 'Storm,Biotic', 'Other'],
    'Biotic-mortality': ['Biotic-mortality', 'Biotic', 'Storm,Biotic', 'Other'],
    'Tree-logging': ['Tree-logging', 'Other'],
    'Other': ['Other'],
}


def compute_proba_per_class(gdf):
    """Score every output class from the rows of one group.

    For each class, the `p` values of rows whose `class` supports it (per
    `dict_isin`) are summed, the `p` values of all other rows are
    subtracted, and the difference is divided by the number of rows.

    Args:
        gdf: frame with a 'class' column and a 'p' column.

    Returns:
        dict mapping class name to score, in a fixed class order.
    """
    class_order = (
        'Fire', 'Storm', 'Biotic-dieback', 'Biotic-mortality',
        'Drought', 'Tree-logging', 'Other',
    )
    scores = {}
    for label in class_order:
        supports = gdf['class'].isin(dict_isin[label])
        in_favour = gdf.loc[supports, 'p'].sum()
        opposed = gdf.loc[~supports, 'p'].sum()
        scores[label] = (in_favour - opposed) / len(gdf)
    return scores

def compute_class_p_spread(d):
    """Return the best class, its score, and its lead over the runner-up.

    The input mapping is left untouched (the previous implementation deleted
    the winning key from the caller's dict).

    Args:
        d: mapping of class name to score, with at least two entries.

    Returns:
        `(max_key, max_value, spread)` where `spread` is the difference
        between the highest and the second-highest score. On ties the first
        key in iteration order wins.

    Raises:
        ValueError: if `d` has fewer than two entries, since no spread can
            be computed.
    """
    if len(d) < 2:
        raise ValueError("compute_class_p_spread needs at least two classes to compute a spread")

    max_key = max(d, key=d.get)
    max_value = d[max_key]
    # Runner-up: best score among every other key.
    second_max_value = max(value for key, value in d.items() if key != max_key)

    return max_key, max_value, max_value - second_max_value

def wrappper_weight_group(group):
    """Add the weight columns to `group` in place and return it.

    The first row of the group is used as the reference for every row.
    """
    weight_columns = ['sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p']
    first_row = group.iloc[0]
    weights = group.apply(
        lambda x: compute_weight(x, first_row),
        axis=1,
        result_type='expand',
    )
    group[weight_columns] = weights
    return group

In [167]:
import numpy as np

#vectorized with numpy spatial_weight, temporal_weight, tree_coherence, 

# Element-wise wrappers around the scalar helpers so they accept arrays.
# `otypes` fixes the output dtype to float64.
# NOTE: np.vectorize still calls the Python function once per element; it is
# a convenience, not a speed-up.
vectorized_spatial_weight = np.vectorize(spatial_weight, otypes=[np.float64])
vectorized_temporal_weight = np.vectorize(temporal_weight, otypes=[np.float64])
vectorized_tree_coherence = np.vectorize(compute_tree_coherence, otypes=[np.float64])

def vectorized_compute_weight(df):
    """Column-wise counterpart of `compute_weight` for one whole group.

    The first row of `df` is the reference. Fixes over the previous version:

    * the year-based temporal distance is applied to 'hm' rows only (it used
      to overwrite the interval-based distance of 'nfi' rows as well);
    * `p` is the product `sw * tw * tc * oa`, as in `compute_weight`
      (it used to be a mean of the three weights times `oa`);
    * 'senfseidl' rows get the same fixed values as in `compute_weight`;
    * values are assigned positionally, so duplicate index labels inside a
      group no longer break the assignments;
    * an existing 'sd' column is overwritten instead of being duplicated.

    Args:
        df: one group, reference row first. Needs 'dataset', 'area', 'sd',
            'year', 'start_date', 'end_date', 'tree_type' and 'essence'.

    Returns:
        A copy of `df` carrying the columns
        'sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p'.
    """
    weight_cols = ['sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p']
    reference = df.iloc[0]
    ref_year = reference['year']

    result = pd.DataFrame(0., index=df.index, columns=weight_cols)

    # Boolean masks as plain arrays: selection is positional, independent of
    # the index labels.
    dataset = df['dataset']
    mask_senfseidl = (dataset == 'senfseidl').to_numpy()
    mask_dfde = (dataset == 'dfde').to_numpy()
    mask_hm = (dataset == 'hm').to_numpy()
    mask_hm_or_nfi = dataset.isin(['hm', 'nfi']).to_numpy()
    mask_dfde_or_nfi = dataset.isin(['dfde', 'nfi']).to_numpy()

    # Spatial distance and overall accuracy per dataset.
    result.loc[mask_dfde, 'sd'] = (df.loc[mask_dfde, 'area'].to_numpy(dtype=float) / 1e6) ** (1 / 2) / 35
    result.loc[mask_dfde, 'oa'] = 0.95
    result.loc[mask_hm_or_nfi, 'sd'] = df.loc[mask_hm_or_nfi, 'sd'].to_numpy(dtype=float) / 1e3
    result.loc[mask_hm_or_nfi, 'oa'] = 0.9

    # Temporal distance: closest interval endpoint for dfde / nfi ...
    start_year = df.loc[mask_dfde_or_nfi, 'start_date'].dt.year.to_numpy()
    end_year = df.loc[mask_dfde_or_nfi, 'end_date'].dt.year.to_numpy()
    result.loc[mask_dfde_or_nfi, 'td'] = np.minimum(np.abs(ref_year - start_year),
                                                   np.abs(end_year - ref_year))
    # ... and plain year difference for hm only.
    result.loc[mask_hm, 'td'] = np.abs(ref_year - df.loc[mask_hm, 'year'].to_numpy(dtype=float))

    # Tree coherence of every row against the reference row.
    result['tc'] = vectorized_tree_coherence(df['tree_type'], df['essence'], reference['tree_type'], reference['essence'])

    # Weights and final score.
    result['sw'] = vectorized_spatial_weight(result['sd'].to_numpy())
    result['tw'] = vectorized_temporal_weight(result['td'].to_numpy())
    result['p'] = result['sw'] * result['tw'] * result['tc'] * result['oa']

    # Reference rows carry fixed values, exactly as in `compute_weight`.
    result.loc[mask_senfseidl, weight_cols] = [0., 1., 0., 1., 1., 0.91, 0.91]

    out = df.copy()
    for col in weight_cols:
        out[col] = result[col].to_numpy()
    return out


In [168]:
#simplifying code : using concat and groupby for appending the reference (senfseidl) row. 

#entire dataset -> 2.3s
co = concatenation
print(co.shape)

# Columns kept from the join result; the `_left` suffix is stripped below so
# they line up with the column names of the reference dataset.
col = ['start_date', 'end_date', 'geometry', 'year_left', 'class_left', 'tree_type_left', 'essence_left', 'dataset_left', 'cause', 'notes', 'area', 'sd']
# Reference (Senf & Seidl) indices that received at least one match.
all_index_right = co['index_right'].unique()
co = co[['index_right']+col]
rename = {c: c.split('_left')[0] for c in col}
co = co.rename(columns=rename)

# Give the reference rows the same grouping key as the joined rows.
senfseidl_year['index_right'] = senfseidl_year.index

#concat
# groups = dd.concat([concatenation, senfseidl_year.loc[all_index_right]], axis=0).reset_index().groupby(by='index')
# groups = dd.concat([co, senfseidl_year.loc[all_index_right]], axis=0).compute().groupby(by='index_right')
# Expected output schema for dask `meta` (not used by the groupby below).
dtypes = {'year': 'float64',
 'geometry': 'geometry',
 'class': 'object',
 'tree_type': 'object',
 'essence': 'object',
 'dataset': 'object',
 'start_date': 'datetime64[ns]',
 'end_date': 'datetime64[ns]',
 'cause': 'string',
 'notes': 'string',
 'sd': 'float64',
 'sw': 'float64',
 'td': 'float64',
 'tw': 'float64',
 'tc': 'float64',
 'oa': 'float64',
 'p': 'float64'}

meta = pd.DataFrame(columns=list(dtypes.keys())).astype(dtypes)

# The order of `senfseidl_year` and `co` in the concat matters: the weighting
# code uses `iloc[0]` on each group to retrieve the Senf & Seidl row, so the
# reference rows must come first.
from dask.diagnostics import ProgressBar

# with ProgressBar():
# One group per reference index: the reference row followed by its matches.
groups = gpd.GeoDataFrame(dd.concat([senfseidl_year.loc[all_index_right], co], axis=0).compute(),crs=epsg).groupby(by='index_right')
len(groups)

(354418, 19)


92236

The vectorized version is not faster because the groups are small (2 to 50 rows).

In [169]:
# Benchmark of the vectorized weighting on the first 1000 groups; results are
# discarded. Recorded timings:
#1000 -> 14.5s 
#10000 -> 2min 14s
from itertools import islice
for name, group in islice(groups, 1000):
    vectorized_compute_weight(group)


In [170]:
# Benchmark of the row-wise weighting on the first 1000 groups, using the
# first row of each group as reference. Recorded timings:
#1000 -> 6.6s 
#10000 -> 1mins 
#130 000 -> 13min
from itertools import islice
for name, group in islice(groups, 1000):
    group[['sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p']] = group.apply(lambda x: compute_weight(x, group.iloc[0]), axis=1, result_type='expand')

In [None]:
from joblib import Parallel, delayed
def wrapper(group):
    """Add the weight columns to `group` in place and return it.

    The first row of the group is the reference. It is looked up once and
    reused for every row (it used to be re-read for each row while the
    `reference` local sat unused).
    """
    reference = group.iloc[0]
    group[['sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p']] = group.apply(lambda x: compute_weight(x, reference), axis=1, result_type='expand')
    return group

# Thread-based fan-out of `wrapper` over the first 10000 groups.
# Recorded timing for 1000 groups (slower than the 6.6 s recorded for the
# sequential loop on the same number of groups):
#1000 -> 9.7s 
#10000 -> 
results = Parallel(n_jobs=8, prefer='threads')(delayed(wrapper)(group) for _, group in islice(groups, 10000))

In [175]:
gr = [group for _,group in islice(groups, 1000)]

In [177]:
# Output schema handed to dask so it does not have to infer it by running
# `wrapper` on dummy data.
# NOTE(review): the recorded run of this cell ended with a ValueError about a
# column mismatch between the computed data and the provided metadata — the
# schema and the actual output need to be reconciled.
meta = {'year': 'float64',
 'geometry': 'geometry',
 'class': 'object',
 'tree_type': 'object',
 'essence': 'object',
 'dataset': 'object',
 'start_date': 'datetime64[ns]',
 'end_date': 'datetime64[ns]',
 'cause': 'string',
 'notes': 'string',
 'area': 'float64',
 'index_right': 'int64',
 'sd': 'float64',
 'sw': 'float64',
 'td': 'float64',
 'tw': 'float64',
 'tc': 'float64',
 'oa': 'float64',
 'p': 'float64'}


from dask.diagnostics import ProgressBar
# Reference rows are concatenated first so that `wrapper` can take the first
# row of each group as reference.
with ProgressBar():
    gr = dd.concat([senfseidl_year.loc[all_index_right], co], axis=0).groupby(by='index_right').apply(wrapper, meta=meta).compute()

[#################################       ] | 84% Completed | 18m 4ssss


ValueError: The columns in the computed data do not match the columns in the provided metadata
  Extra:   ['area', 'index_right']
  Missing: []

In [28]:
(groups.index_right.value_counts() == 2).sum()

9285

In [35]:
from itertools import islice
for name, group in islice(groups, 1):
    print(name)

2425008


In [63]:
#10 -> 1.7s
#100 -> 8.4s
#208 -> 12.9s 

# Columns taken from the join result, and columns of the final per-reference row.
col = ['start_date', 'end_date', 'geometry', 'year_left', 'class_left', 'tree_type_left', 'essence_left', 'dataset_left', 'cause', 'notes']
col_row = ['year', 'geometry', 'class', 'tree_type', 'essence']
groups = concatenation.groupby('index_right')

dict_disturbances = dict()
list_rows = []
from itertools import islice
# Only the first group is processed here; the classification steps are
# commented out.
# NOTE(review): `create_disturbance_group` is not defined in the visible part
# of this notebook — confirm where it comes from.
for name, group in islice(groups, 1):
    disturbance_group = create_disturbance_group(group, senfseidl_year_, col)
    disturbance_group[['sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p']] = disturbance_group.apply(lambda x: compute_weight(x, disturbance_group.iloc[0]), axis=1, result_type='expand')
    # dclasses = compute_proba_per_class(disturbance_group)
    # index_senfseidl = disturbance_group.index[0] 
    # dict_disturbances[index_senfseidl] = disturbance_group
    # row = disturbance_group.iloc[[0]][col_row]
    # row[['class', 'probability', 'spread']] = compute_class_p_spread(dclasses)
    # list_rows.append(row)


In [10]:
len(concatenation.iloc[:65000].groupby('index_right'))

47837

In [23]:
#remove future warning
import warnings
# Silences every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Column schema of the frames returned by `create_disturbance_group_`,
# passed to dask as `meta`.
dtypes = {'year': 'float64',
 'geometry': 'geometry',
 'class': 'object',
 'tree_type': 'object',
 'essence': 'object',
 'dataset': 'object',
 'start_date': 'datetime64[ns]',
 'end_date': 'datetime64[ns]',
 'cause': 'string',
 'notes': 'string'}

# Empty frame carrying only the schema above.
meta = pd.DataFrame(columns=list(dtypes.keys())).astype(dtypes)

#create group
def create_disturbance_group_(geodataframe, reference):
    """Prepend the matching reference row to one group of joined rows.

    Args:
        geodataframe: rows of a single group; the 'index_right' value of its
            first row identifies the reference row.
        reference: frame indexed by the values found in 'index_right'.

    Returns:
        The reference row followed by the group's rows, with the
        'index_right' column removed from the latter.
    """
    reference_label = geodataframe['index_right'].iloc[0]
    reference_row = reference.loc[[reference_label]]
    members = geodataframe.drop(columns=['index_right'])
    return pd.concat([reference_row, members], axis=0)

# Recorded timings for building the groups through dask:
#208 groups -> 3.3s (/4)
#967 groups -> 12.3s ()
#10473 groups -> 2min
#ALL groups -> out of memory 'IOStream.flushed timed out'
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Only the first 65000 joined rows are used, split into 10 partitions.
dd_concattenation = dd.from_pandas(concatenation.iloc[:65000], npartitions=10)
# `reference` is forwarded to `create_disturbance_group_`; `meta` declares the
# output schema so dask does not have to infer it.
with ProgressBar():
    groups_disturbances = dd_concattenation.groupby('index_right').apply(create_disturbance_group_, \
                                                                     reference=senfseidl_year, \
                                                                            meta=meta).compute()

[##############################          ] | 77% Completed | 287.55 ss


KeyboardInterrupt: 

In [100]:
groups_disturbances

Unnamed: 0_level_0,Unnamed: 1_level_0,year,geometry,class,tree_type,essence,dataset,start_date,end_date,cause,notes
index_right,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2433867,2433867,2010.0,"POLYGON ((374508.785 6329044.108, 374538.659 6...",Other,Conifer,maritime pine,senfseidl,NaT,NaT,,
2433867,24647,,"POLYGON ((402431.478 6346178.885, 402437.501 6...",Tree-logging,Broadleaf,Pedunculate Oak,nfi,2005-12-27 18:00:00,2010-12-28,,
2433867,25582,,"POLYGON ((388430.082 6344295.535, 388436.105 6...",Tree-logging,Conifer,Maritime Pine,nfi,2005-12-27 18:00:00,2010-12-28,,
2433867,25583,,"POLYGON ((388430.082 6344295.535, 388436.105 6...",Tree-logging,Conifer,Maritime Pine,nfi,2010-12-27 18:00:00,2015-12-28,,
2433867,26583,,"POLYGON ((392360.272 6336271.479, 392366.295 6...",Tree-logging,Conifer,Maritime Pine,nfi,2005-12-27 18:00:00,2010-12-28,,
...,...,...,...,...,...,...,...,...,...,...,...
2444153,25583,,"POLYGON ((388430.082 6344295.535, 388436.105 6...",Tree-logging,Conifer,Maritime Pine,nfi,2010-12-27 18:00:00,2015-12-28,,
2444153,26105,,"POLYGON ((420363.073 6340037.856, 420369.096 6...",Storm,Broadleaf,Pedunculate Oak,nfi,2005-12-27 18:00:00,2010-12-28,,
2444153,26583,,"POLYGON ((392360.272 6336271.479, 392366.295 6...",Tree-logging,Conifer,Maritime Pine,nfi,2005-12-27 18:00:00,2010-12-28,,
2444153,26769,,"POLYGON ((394407.273 6342248.628, 394413.295 6...",Tree-logging,Conifer,Maritime Pine,nfi,2005-12-27 18:00:00,2010-12-28,,


In [47]:
#10 -> 1.8s
#100 -> 3.7s
#200 -> 5.6s 
#1000 -> 27.6s 
#// version
col = ['start_date', 'end_date', 'geometry', 'year_left', 'class_left', 'tree_type_left', 'essence_left', 'dataset_left', 'cause', 'notes']
def classify_disturbance_group(group, reference, col):
    # Build the group (reference row + matches), weight every row against the
    # first row, score each class, and return the weighted group.
    # NOTE(review): `create_disturbance_group` is not defined in the visible
    # part of this notebook — confirm where it comes from.
    disturbance_group = create_disturbance_group(group, reference, col)
    disturbance_group[['sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p']] = disturbance_group.apply(lambda x: compute_weight(x, disturbance_group.iloc[0]), axis=1, result_type='expand')
    dclasses = compute_proba_per_class(disturbance_group)
    index_senfseidl = disturbance_group.index[0] 
    # Side effect: stores the group in the module-level `dict_disturbances`.
    # NOTE(review): this function is run from several threads below; confirm
    # that concurrent writes to the shared dict are acceptable.
    dict_disturbances[index_senfseidl] = disturbance_group
    # NOTE(review): `row` (the classified reference row) is computed but
    # neither returned nor stored — it is lost when the function returns.
    row = disturbance_group.iloc[[0]][col_row]
    row[['class', 'probability', 'spread']] = compute_class_p_spread(dclasses)
    return disturbance_group

from joblib import Parallel, delayed
# Thread-based run over the first 200 groups.
disturbance_groups = Parallel(n_jobs=8, prefer='threads')(delayed(classify_disturbance_group)(group, senfseidl, col) for name, group in islice(groups, 200))

In [43]:
import dask.dataframe as dd

# Only the first 100000 joined rows are used, split into 10 partitions.
dd_concattenation = dd.from_pandas(concatenation.iloc[:100000], npartitions=10)
groups = dd_concattenation.groupby('index_right')

#aplly classify_disturbance_group
# No `meta=` is given, so dask infers the output schema by calling the
# function on dummy data; the recorded run failed during that inference.
dd_disturbance_groups = groups.apply(classify_disturbance_group, reference=senfseidl, col=col)

ValueError: Metadata inference failed in `groupby.apply(classify_disturbance_group)`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
KeyError("None of [Index([1], dtype='int64')] are in the [index]")

Traceback:
---------
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/dask/dataframe/utils.py", line 193, in raise_on_meta_error
    yield
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/dask/dataframe/groupby.py", line 2483, in apply
    meta = self._meta_nonempty.apply(func, *meta_args, **meta_kwargs)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/groupby/groupby.py", line 1765, in apply
    result = self._python_apply_general(f, self._selected_obj)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/groupby/groupby.py", line 1814, in _python_apply_general
    values, mutated = self.grouper.apply_groupwise(f, data, self.axis)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/groupby/ops.py", line 905, in apply_groupwise
    res = f(group)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/groupby/groupby.py", line 1753, in f
    return func(g, *args, **kwargs)
  File "/var/folders/1c/h8t9_vd53gsgz_wm9kswm8wm0000gp/T/ipykernel_2805/100303142.py", line 7, in classify_disturbance_group
    disturbance_group = create_disturbance_group(group, reference, col)
  File "/var/folders/1c/h8t9_vd53gsgz_wm9kswm8wm0000gp/T/ipykernel_2805/4035822556.py", line 3, in create_disturbance_group
    row_reference = reference.loc[[index_reference]]
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/indexing.py", line 1153, in __getitem__
    return self._getitem_axis(maybe_callable, axis=axis)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/indexing.py", line 1382, in _getitem_axis
    return self._getitem_iterable(key, axis=axis)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/indexing.py", line 1322, in _getitem_iterable
    keyarr, indexer = self._get_listlike_indexer(key, axis)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/indexing.py", line 1520, in _get_listlike_indexer
    keyarr, indexer = ax._get_indexer_strict(key, axis_name)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 6114, in _get_indexer_strict
    self._raise_if_missing(keyarr, indexer, axis_name)
  File "/Users/arthurcalvi/Venv/DiAtDaJo/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 6175, in _raise_if_missing
    raise KeyError(f"None of [{key}] are in the [{axis_name}]")


In [16]:
col = ['start_date', 'end_date', 'geometry', 'year_left', 'class_left', 'tree_type_left', 'essence_left', 'dataset_left', 'cause', 'notes']
# Build one group by hand, reusing the `group` variable left over from an
# earlier loop.
a = create_disturbance_group(group, senfseidl, col)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf.rename(columns=rename, inplace=True)


In [32]:
a

Unnamed: 0,year,geometry,class,tree_type,essence,dataset,start_date,end_date,cause,notes
2424928,2010.0,"POLYGON ((318846.967 6257104.615, 318876.859 6...",Other,Mixed,larch,senfseidl,NaT,NaT,,
38816,,"POLYGON ((323783.633 6256228.629, 323749.926 6...",Biotic-dieback,Broadleaf,Pedunculate Oak,nfi,2007-12-28 18:00:00,2012-12-28,,
39202,,"POLYGON ((331807.127 6260158.896, 331773.420 6...",Biotic-dieback,Broadleaf,Pedunculate Oak,nfi,2007-12-28 18:00:00,2012-12-28,,
65892,,"POLYGON ((323840.672 6263220.681, 323806.965 6...",Biotic-dieback,Broadleaf,Pedunculate Oak,nfi,2011-12-28 18:00:00,2016-12-28,,
67527,,"POLYGON ((325789.502 6257211.198, 325755.795 6...",Tree-logging,Broadleaf,Ash-leaved Willow,nfi,2011-12-28 18:00:00,2016-12-28,,
14149,2011.0,"POLYGON ((331902.236 6256978.361, 331868.530 6...",Biotic,conifer,Other Conifers,hm,NaT,NaT,Chancre des cyprès forestiers,grandes plages de houppier desséchés. Semis na...
20492,2012.0,"POLYGON ((328581.033 6257619.350, 328547.326 6...",Biotic,conifer,Larches,hm,NaT,NaT,Phéole de Schweinitz,"mortalité diffuse sans cause déterminée, prése..."
23019,2013.0,"POLYGON ((322380.417 6262788.823, 322346.710 6...",Biotic,broadleaf,Oaks,hm,NaT,NaT,Botrytis sp.,"feuilles nécrosées, limbe et nervures, chute p..."
26142,2014.0,"POLYGON ((327215.725 6256096.673, 327182.018 6...",Biotic,conifer,Pines,hm,NaT,NaT,Maladie des taches brunes du pin,
32603,2016.0,"POLYGON ((327118.083 6257917.628, 327084.376 6...",Biotic,broadleaf,Oaks,hm,NaT,NaT,Oïdium = 'blanc' du chene,CHP très atteint par l'oïdium déjà affaibli pa...


In [69]:
# Weight every row of the hand-built group against its first row, then
# display the rows ordered by decreasing score (the sort is not stored).
a[['sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p']] = a.apply(lambda x: compute_weight(x, a.iloc[0]), axis=1, result_type='expand')
a.sort_values(by='p', ascending=False)

In [70]:
a[['year', 'start_date', 'end_date', 'dataset', 'essence', 'cause', 'class', 'p']]

Unnamed: 0,year,start_date,end_date,dataset,essence,cause,class,p
2424928,2010.0,NaT,NaT,senfseidl,larch,,Other,0.91
38816,,2007-12-28 18:00:00,2012-12-28,nfi,Pedunculate Oak,,Biotic-dieback,0.322861
39202,,2007-12-28 18:00:00,2012-12-28,nfi,Pedunculate Oak,,Biotic-dieback,0.137963
65892,,2011-12-28 18:00:00,2016-12-28,nfi,Pedunculate Oak,,Biotic-dieback,0.162496
67527,,2011-12-28 18:00:00,2016-12-28,nfi,Ash-leaved Willow,,Tree-logging,0.4125
14149,2011.0,NaT,NaT,hm,Other Conifers,Chancre des cyprès forestiers,Biotic,0.181496
20492,2012.0,NaT,NaT,hm,Larches,Phéole de Schweinitz,Biotic,0.60261
23019,2013.0,NaT,NaT,hm,Oaks,Botrytis sp.,Biotic,0.124581
26142,2014.0,NaT,NaT,hm,Pines,Maladie des taches brunes du pin,Biotic,0.208031
32603,2016.0,NaT,NaT,hm,Oaks,Oïdium = 'blanc' du chene,Biotic,0.0


In [83]:
hm['class'].unique()

array(['Biotic', 'Other'], dtype=object)

In [84]:
senfseidl['class'].unique()

array(['Other', 'Storm,Biotic', 'Fire'], dtype=object)

In [85]:
nfi['class'].unique()

array(['Biotic-dieback', 'Tree-logging', 'Storm', 'Fire', 'Drought',
       'Biotic-mortality'], dtype=object)

In [86]:
dfde['class'].unique()

array(['Storm', 'Biotic', 'Drought'], dtype=object)

In [97]:
compute_proba_per_class(a)

{'Fire': -0.22260482969450945,
 'Storm': -0.16021243972642737,
 'Biotic-dieback': 0.10864993972642736,
 'Biotic-mortality': 0.030734979053987005,
 'Drought': -0.22260482969450945,
 'Tree-logging': -0.05729232969450944,
 'Other': -0.10885482969450938}

In [7]:
compute_class_p_spread(d)

('Biotic-dieback', 0.10864993972642736, 0.07791496067244036)