# Temporo spatial join

Method : 
1. Temporo Spatial Join : for each year - Spatial join between Senf & Seidl and the others datasets 
3. Group creation : groupby on the Senf & Seidl index. (One group per index)
2. Group work : 
    - Computing weights for each row
    - Computing score per disturbance type
    - Save group to a dict with the index as key
    - Create row with year, class, score, tree_type, essence  

In [1]:
# Loading 
import geopandas as gpd

senfseidl = gpd.read_parquet('../data/processed_datasets/SenfSeidl_joined_EPSG4326_FR.parquet').to_crs('epsg:2154')
nfi = gpd.read_parquet('../data/processed_datasets/NFI_2003-2021_EPSG4326_FR.parquet').to_crs('epsg:2154')
hm = gpd.read_parquet('../data/processed_datasets/health-monitoring_2007-2023_EPSG4326_FR.parquet').to_crs('epsg:2154')
dfde = gpd.read_parquet('../data/processed_datasets/DFDE_1984_2021_EPSG4326_FR.parquet').to_crs('epsg:2154')

# Preprocessing 



In [2]:
#Senf & Seidl
senfseidl.year = senfseidl.year.astype(int)

number_to_class = {
    1:'Storm,Biotic', 
    2:'Fire',
    3:'Other'
}

senfseidl['class'] = senfseidl['cause'].map(number_to_class)
senf_seidl_col = ['year', 'geometry', 'class', 'tree_type', 'essence']
senfseidl = senfseidl[senf_seidl_col]
senfseidl.drop_duplicates(inplace=True)
senfseidl.dropna(inplace=True)



In [88]:
# DFDE

#class
dict_class = {
    'Fire': ['Fire'],
    'Storm': ['Wind'],
    'Drought': ['Summer drought', 'Frost'],
    'Biotic': [
        'Ips typographus', 'Pissodes spp.', 'Bark beetles', 'Bombix desparate',
        'Zeiraphera diniana', 'Biotic', 'Insects', 'Other insects', 'Biotic;Abiotic',
        'Pityogenes chalcographus', 'Tetropium luridum;Tetropium fuscum',
        'Ips acuminatus', 'Tomicus piniperda;Tomicus minor',
        'Phaenops cyanea', 'Pissodes pini', 'Ips cembrae',
        'Tetropium gabrieli', 'Agrilus biguttatus', 'Agrilus viridis',
        'Xyloterus lineatus', 'Erannis defoliaria',
        'Operophtera brumata;Operophtera fagata', 'Lymantria dispar',
        'Thaumetopoea processionea', 'Hylobius abietis',
        'Melolontha hippocastani;Melolontha melolontha',
        'Microtus agrestis;Microtus arvalis;Clethrionomys glareolus',
        'Arvicola terrestris', 'Lophodermium seditiosum',
        'Sphaeropsis sapinea', 'Heterobasidion annosum',
        'Armillaria mellea', 'Chalara fraxinea;Hymenoscyphus fraxineus',
        'Beech decline', 'Oak decline', 'Viscum album', 'Ips sexdentatus'
    ],
    'Tree-logging': [],
    'Other': ['Accident']
}

def get_class(x):
    for key, values in dict_class.items():
        if x in values:
            return key
    return 'Other'

dfde['class'] = dfde['cause'].apply(get_class)

#geometry
dname_geom = {k:v for k,v in zip(dfde['name'].tolist(), dfde['geometry'].tolist())}
dname_geom = {k:v.buffer(5000).simplify(5000) for k,v in dname_geom.items()}

#drop duplicates 
dfde.drop_duplicates(subset=['name', 'start_date', 'end_date', 'essence', 'cause', 'notes'], inplace=True)
dfde['geometry'] = dfde['name'].apply(lambda x: dname_geom[x]) #change

#clean date
import pandas as pd
dfde['start_date'] = pd.to_datetime(dfde['start_date'])
dfde['end_date'] = pd.to_datetime(dfde['end_date'])

#keep_col
dfde_col = ['start_date', 'end_date', 'geometry', 'class', 'tree_type', 'essence', 'cause', 'notes']
dfde = dfde[dfde_col]

dfde.dropna(inplace=True)
dfde.drop_duplicates(inplace=True)


In [4]:
#nfi 

#filtering
nfi = nfi[ (nfi['probability'] >= 0.1) ]
nfi = nfi[ ~((nfi['class'] == 'Tree-logging')&(nfi['intensity']==0)) ]

#correct start_date
from datetime import timedelta
import pandas as pd
def get_start_date(row):
    if not pd.isnull(row['start_date']):
        return row['start_date']
    else:
        return row['end_date'] - timedelta(days=5*365.25)
    

nfi['start_date'] = nfi.apply(get_start_date, axis=1)

#keep col 
nfi_col = ['start_date', 'end_date', 'geometry', 'class', 'tree_type', 'essence']
nfi = nfi[nfi_col]

nfi.dropna(inplace=True)
nfi.drop_duplicates(inplace=True)

In [81]:
#hm
def get_class(x):
    if x  == 'biotic-factor':
        return 'Biotic'
    else :
        return 'Other'
    
hm['class'] = hm['class'].apply(get_class)
hm['year'] = hm['year'].astype(int)
hm.drop_duplicates(inplace=True)
hm.dropna(inplace=True)

hm.rename(columns={'LIB_Problème principal':'cause', 'Remarques':'notes'}, inplace=True)

# Joining

In [82]:
import dask_geopandas as dgpd
import dask.dataframe as dd
from tqdm import tqdm

years = senfseidl['year'].unique()

temporal_buffer = 6 #years 
spatial_buffer = 7000 #meters
nfi.geometry = nfi.geometry.buffer(spatial_buffer)
hm.geometry = hm.geometry.buffer(spatial_buffer)

nfi['dataset'] = 'nfi'
hm['dataset'] = 'hm'
dfde['dataset'] = 'dfde'

for year in tqdm(years):
    senfseidl_year = senfseidl[senfseidl['year'] == year]
    nfi_year = nfi[(nfi['start_date'].dt.year >= year - temporal_buffer) & (nfi['end_date'].dt.year <= year + temporal_buffer)]
    hm_year = hm[(hm['year'] >= year - temporal_buffer) & (hm['year'] <= year + temporal_buffer)]
    dfde_year = dfde[(dfde['start_date'].dt.year >= year - temporal_buffer) & (dfde['end_date'].dt.year <= year + temporal_buffer)]
    
    senfseidl_year = dgpd.from_geopandas(senfseidl_year, npartitions=10)
    nfi_year = dgpd.from_geopandas(nfi_year, npartitions=10)
    hm_year = dgpd.from_geopandas(hm_year, npartitions=10)
    dfde_year = dgpd.from_geopandas(dfde_year, npartitions=10)
    
    senfseidl_nfi_year = senfseidl_year.sjoin(nfi_year, how='left', op='intersects').compute()
    senfseidl_hm_year = senfseidl_year.sjoin(hm_year, how='left', op='intersects').compute()
    senfseidl_dfde_year = senfseidl_year.sjoin(dfde_year, how='left', op='intersects').compute()

    #concat with dask_geopandas
    senfseidl_year = pd.concat([senfseidl_nfi_year, senfseidl_hm_year, senfseidl_dfde_year], axis=0)


    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


TypeError: 'module' object is not callable

In [6]:
import dask_geopandas as dgpd
import dask.dataframe as dd

years = senfseidl['year'].unique()

temporal_buffer = 6 #years 
spatial_buffer = 7000 #meters
nfi.geometry = nfi.geometry.buffer(spatial_buffer)
hm.geometry = hm.geometry.buffer(spatial_buffer)

nfi['dataset'] = 'nfi'
hm['dataset'] = 'hm'
dfde['dataset'] = 'dfde'
senfseidl['dataset'] = 'senfseidl'

year = 2010

senfseidl_year = senfseidl[senfseidl['year'] == year]
nfi_year = nfi[(nfi['start_date'].dt.year >= year - temporal_buffer) & (nfi['end_date'].dt.year <= year + temporal_buffer)]
hm_year = hm[(hm['year'] >= year - temporal_buffer) & (hm['year'] <= year + temporal_buffer)]
dfde_year = dfde[(dfde['start_date'].dt.year >= year - temporal_buffer) & (dfde['end_date'].dt.year <= year + temporal_buffer)]

senfseidl_year = dgpd.from_geopandas(senfseidl_year, npartitions=10)
nfi_year = dgpd.from_geopandas(nfi_year, npartitions=10)
hm_year = dgpd.from_geopandas(hm_year, npartitions=10)
dfde_year = dgpd.from_geopandas(dfde_year, npartitions=10)

senfseidl_nfi_year = nfi_year.sjoin(senfseidl_year).compute()
senfseidl_hm_year = hm_year.sjoin(senfseidl_year).compute()
senfseidl_dfde_year = dfde_year.sjoin(senfseidl_year).compute()

#concat with dask_geopandas
concatenation = pd.concat([senfseidl_nfi_year, senfseidl_hm_year, senfseidl_dfde_year], axis=0)

In [7]:
concatenation

Unnamed: 0,start_date,end_date,geometry,class_left,tree_type_left,essence_left,dataset_left,index_right,year,class_right,tree_type_right,essence_right,dataset_right,year_left,cause,notes,year_right
19308,2004-12-27 18:00:00,2009-12-28,"POLYGON ((395391.390 6339555.569, 395357.684 6...",Tree-logging,Broadleaf,Pedunculate Oak,nfi,2441859,2010.0,Other,Conifer,maritime pine,senfseidl,,,,
19308,2004-12-27 18:00:00,2009-12-28,"POLYGON ((395391.390 6339555.569, 395357.684 6...",Tree-logging,Broadleaf,Pedunculate Oak,nfi,2440737,2010.0,Other,Conifer,maritime pine,senfseidl,,,,
19308,2004-12-27 18:00:00,2009-12-28,"POLYGON ((395391.390 6339555.569, 395357.684 6...",Tree-logging,Broadleaf,Pedunculate Oak,nfi,2441864,2010.0,Other,Conifer,maritime pine,senfseidl,,,,
19308,2004-12-27 18:00:00,2009-12-28,"POLYGON ((395391.390 6339555.569, 395357.684 6...",Tree-logging,Broadleaf,Pedunculate Oak,nfi,2440757,2010.0,Other,Mixed,"maritime pine,nr",senfseidl,,,,
19308,2004-12-27 18:00:00,2009-12-28,"POLYGON ((395391.390 6339555.569, 395357.684 6...",Tree-logging,Broadleaf,Pedunculate Oak,nfi,2441870,2010.0,Other,Mixed,"maritime pine,nr",senfseidl,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,2015-01-01 00:00:00,2015-12-28,"POLYGON ((1016482.342 6707283.553, 996492.098 ...",Biotic,mixed,conifers;broadleaves,dfde,4092912,2010.0,"Storm,Biotic",Conifer,conifers,senfseidl,,Biotic;Abiotic,state forests (only barked wood for the Alsace...,
735,2015-01-01 00:00:00,2015-12-28,"POLYGON ((1016482.342 6707283.553, 996492.098 ...",Biotic,mixed,conifers;broadleaves,dfde,4092918,2010.0,"Storm,Biotic",Mixed,mixed,senfseidl,,Biotic;Abiotic,state forests (only barked wood for the Alsace...,
735,2015-01-01 00:00:00,2015-12-28,"POLYGON ((1016482.342 6707283.553, 996492.098 ...",Biotic,mixed,conifers;broadleaves,dfde,4092919,2010.0,"Storm,Biotic",Mixed,mixed,senfseidl,,Biotic;Abiotic,state forests (only barked wood for the Alsace...,
735,2015-01-01 00:00:00,2015-12-28,"POLYGON ((1016482.342 6707283.553, 996492.098 ...",Biotic,mixed,conifers;broadleaves,dfde,4092911,2010.0,"Storm,Biotic",Mixed,mixed,senfseidl,,Biotic;Abiotic,state forests (only barked wood for the Alsace...,


In [8]:
groups = concatenation.groupby('index_right')

from itertools import islice

for name, group in islice(groups, 2):
    print(f"Index: {name}")
    print(group, "\n")

Index: 2424927
               start_date   end_date  \
38816 2007-12-28 18:00:00 2012-12-28   
39202 2007-12-28 18:00:00 2012-12-28   
65892 2011-12-28 18:00:00 2016-12-28   
67527 2011-12-28 18:00:00 2016-12-28   
14149                 NaT        NaT   
20492                 NaT        NaT   
23019                 NaT        NaT   
26142                 NaT        NaT   
32603                 NaT        NaT   
32606                 NaT        NaT   
32695                 NaT        NaT   
716   2009-01-24 00:00:00 2009-01-24   
717   2009-01-24 00:00:00 2009-01-24   
718   2009-01-24 00:00:00 2009-01-24   
719   2009-01-24 00:00:00 2009-01-24   

                                                geometry      class_left  \
38816  POLYGON ((323783.633 6256228.629, 323749.926 6...  Biotic-dieback   
39202  POLYGON ((331807.127 6260158.896, 331773.420 6...  Biotic-dieback   
65892  POLYGON ((323840.672 6263220.681, 323806.965 6...  Biotic-dieback   
67527  POLYGON ((325789.502 6257211.198,

In [9]:
def create_disturbance_group(geodataframe, reference, col):
    index_reference = geodataframe.index_right.iloc[0]
    print(index_reference)
    row_reference = reference.loc[[index_reference]]

    gdf = geodataframe[col]
    rename = {c: c.split('_left')[0] for c in col}
    gdf.rename(columns=rename, inplace=True)

    return gpd.GeoDataFrame(pd.concat([row_reference, gdf], axis=0), crs=geodataframe.crs)

In [31]:
col = ['start_date', 'end_date', 'geometry', 'year_left', 'class_left', 'tree_type_left', 'essence_left', 'dataset_left', 'cause', 'notes']
a = create_disturbance_group(group, senfseidl, col)

2424928


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf.rename(columns=rename, inplace=True)


In [32]:
a

Unnamed: 0,year,geometry,class,tree_type,essence,dataset,start_date,end_date,cause,notes
2424928,2010.0,"POLYGON ((318846.967 6257104.615, 318876.859 6...",Other,Mixed,larch,senfseidl,NaT,NaT,,
38816,,"POLYGON ((323783.633 6256228.629, 323749.926 6...",Biotic-dieback,Broadleaf,Pedunculate Oak,nfi,2007-12-28 18:00:00,2012-12-28,,
39202,,"POLYGON ((331807.127 6260158.896, 331773.420 6...",Biotic-dieback,Broadleaf,Pedunculate Oak,nfi,2007-12-28 18:00:00,2012-12-28,,
65892,,"POLYGON ((323840.672 6263220.681, 323806.965 6...",Biotic-dieback,Broadleaf,Pedunculate Oak,nfi,2011-12-28 18:00:00,2016-12-28,,
67527,,"POLYGON ((325789.502 6257211.198, 325755.795 6...",Tree-logging,Broadleaf,Ash-leaved Willow,nfi,2011-12-28 18:00:00,2016-12-28,,
14149,2011.0,"POLYGON ((331902.236 6256978.361, 331868.530 6...",Biotic,conifer,Other Conifers,hm,NaT,NaT,Chancre des cyprès forestiers,grandes plages de houppier desséchés. Semis na...
20492,2012.0,"POLYGON ((328581.033 6257619.350, 328547.326 6...",Biotic,conifer,Larches,hm,NaT,NaT,Phéole de Schweinitz,"mortalité diffuse sans cause déterminée, prése..."
23019,2013.0,"POLYGON ((322380.417 6262788.823, 322346.710 6...",Biotic,broadleaf,Oaks,hm,NaT,NaT,Botrytis sp.,"feuilles nécrosées, limbe et nervures, chute p..."
26142,2014.0,"POLYGON ((327215.725 6256096.673, 327182.018 6...",Biotic,conifer,Pines,hm,NaT,NaT,Maladie des taches brunes du pin,
32603,2016.0,"POLYGON ((327118.083 6257917.628, 327084.376 6...",Biotic,broadleaf,Oaks,hm,NaT,NaT,Oïdium = 'blanc' du chene,CHP très atteint par l'oïdium déjà affaibli pa...


In [74]:
#compute weights
def spatial_weight(x):
    if x <= 1:
        return 1
    else: 
        return 1 - (x-1)/9 
    
def temporal_weight(x):
    if x <= 3:
        return 1 - x/12
    else: 
        return 0.75 * (1 - (x-3)/3)

from thefuzz import fuzz

def compute_tree_coherence(row, reference):

    for essence in reference['essence'].split(','):
        if fuzz.token_set_ratio(row['essence'].lower(), essence.lower()) > 80:
            return 1
        
    if row['tree_type'].lower() == reference['tree_type'].lower():
        return 0.75 
    
    if row['tree_type'].lower() == 'mixed' or reference['tree_type'].lower() == 'mixed':
        return 0.5

    return 0.25 

def compute_weight(row, reference):
    # spatial distance, spatial weight, temporal distance, temporal weight, tree correspondance weight, overall accuracy
    if row['dataset'] == 'senfseidl':
        return 0, 1, 0, 1, 1, 0.91, 0.91  
    elif row['dataset'] == 'dfde':
        sd = (row['geometry'].area / 1e6)** (1/2) / 35
        oa = 0.95
    elif row['dataset'] in ['hm', 'nfi']:
        sd = row['geometry'].centroid.distance(reference['geometry'].centroid) / 1e3
        oa = 0.9

    if row['dataset'] in ['dfde', 'nfi']:
        td = min(abs(reference['year'] - row['start_date'].year), abs(row['end_date'].year - reference['year']))
    elif row['dataset'] == 'hm':
        td = min(abs(reference['year'] - row['year']), abs(row['year'] - reference['year']))
    
    tc = compute_tree_coherence(row[['tree_type', 'essence']], reference[['tree_type', 'essence']])
    sw = spatial_weight(sd)
    
    tw = temporal_weight(td)

    return sd, sw, td, tw, tc, oa, sw * tw * tc * oa



In [69]:
a[['sd', 'sw', 'td', 'tw', 'tc', 'oa', 'p']] = a.apply(lambda x: compute_weight(x, a.iloc[0]), axis=1, result_type='expand')
a.sort_values(by='p', ascending=False)

In [70]:
a[['year', 'start_date', 'end_date', 'dataset', 'essence', 'cause', 'class', 'p']]

Unnamed: 0,year,start_date,end_date,dataset,essence,cause,class,p
2424928,2010.0,NaT,NaT,senfseidl,larch,,Other,0.91
38816,,2007-12-28 18:00:00,2012-12-28,nfi,Pedunculate Oak,,Biotic-dieback,0.322861
39202,,2007-12-28 18:00:00,2012-12-28,nfi,Pedunculate Oak,,Biotic-dieback,0.137963
65892,,2011-12-28 18:00:00,2016-12-28,nfi,Pedunculate Oak,,Biotic-dieback,0.162496
67527,,2011-12-28 18:00:00,2016-12-28,nfi,Ash-leaved Willow,,Tree-logging,0.4125
14149,2011.0,NaT,NaT,hm,Other Conifers,Chancre des cyprès forestiers,Biotic,0.181496
20492,2012.0,NaT,NaT,hm,Larches,Phéole de Schweinitz,Biotic,0.60261
23019,2013.0,NaT,NaT,hm,Oaks,Botrytis sp.,Biotic,0.124581
26142,2014.0,NaT,NaT,hm,Pines,Maladie des taches brunes du pin,Biotic,0.208031
32603,2016.0,NaT,NaT,hm,Oaks,Oïdium = 'blanc' du chene,Biotic,0.0


In [83]:
hm['class'].unique()

array(['Biotic', 'Other'], dtype=object)

In [84]:
senfseidl['class'].unique()

array(['Other', 'Storm,Biotic', 'Fire'], dtype=object)

In [85]:
nfi['class'].unique()

array(['Biotic-dieback', 'Tree-logging', 'Storm', 'Fire', 'Drought',
       'Biotic-mortality'], dtype=object)

In [86]:
dfde['class'].unique()

array(['Storm', 'Biotic', 'Drought'], dtype=object)

In [96]:
#compute proba per class
dict_isin = {
    'Fire': ['Fire'],
    'Storm': ['Storm', 'Storm,Biotic'],
    'Drought': ['Drought'],
    'Biotic-dieback': ['Biotic-dieback', 'Biotic', 'Storm,Biotic', 'Other'],
    'Biotic-mortality': ['Biotic-mortality', 'Biotic', 'Storm,Biotic', 'Other'],
    'Tree-logging': ['Tree-logging', 'Other'],
    'Other': ['Other']
}
def compute_proba_per_class(gdf):
    dclasses = {}
    for c in ['Fire', 'Storm', 'Biotic-dieback', 'Biotic-mortality','Drought', 'Tree-logging', 'Other']:
        cond = gdf['class'].isin(dict_isin[c])
        for_ = gdf[cond]['p'].sum()
        against_ = gdf[~cond]['p'].sum()
        dclasses[c] = (for_ - against_) / len(gdf)
    return dclasses 

In [97]:
compute_proba_per_class(a)

{'Fire': -0.22260482969450945,
 'Storm': -0.16021243972642737,
 'Biotic-dieback': 0.10864993972642736,
 'Biotic-mortality': 0.030734979053987005,
 'Drought': -0.22260482969450945,
 'Tree-logging': -0.05729232969450944,
 'Other': -0.10885482969450938}