In [None]:
#load datasets
import geopandas as gpd
import pandas as pd

# Read every disturbance dataset in one pass; the names on the left are
# matched positionally with the paths listed below.
dfde, hm, nfi, senfseidl, bdiff, cdi, forms = (
    gpd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/processed_datasets/simplified_refined_DFDE_1984_2021_EPSG2154_FR.parquet',  # dfde - ok
        '../data/processed_datasets/simplified_health-monitoring_2007-2023_EPSG2154_FR.parquet',  # hm - ok
        '../data/processed_datasets/simplified_NFI_2003-2021_EPSG2154_FR.parquet',  # nfi - ok
        "../data/processed_datasets/simplified_SenfSeidl_joined_EPSG2154_FR.parquet",  # senfseidl - ok
        '../data/processed_datasets/simplified_merge_lilan_bdiff_2012_2022_FR_EPSG2154.parquet',  # bdiff - ok
        '../data/processed_datasets/simplified_CDI_2012_2023_EPSG2154_FR.parquet',  # cdi - TODO: add tree species
        '../data/processed_datasets/simplified_FORMS_clearcut_2017_2020_EPSG2154.parquet',  # forms - ok
    )
)


# Pre-processing of bdiff: keep only the records whose `forest_area_m2`
# exceeds 5000, then tag every remaining record with the 'Fire' class.
# The explicit .copy() detaches the filtered frame from the one read from disk,
# so the column assignments made on it afterwards do not raise pandas'
# SettingWithCopyWarning.
bdiff = bdiff[bdiff.forest_area_m2 > 5000].copy()
bdiff['class'] = 'Fire'
def to_datetime_safe(row):
    """Return the ``(start_date, end_date)`` timestamps of one record.

    ``row['start_date']`` is parsed with the strict ``'%Y-%m-%d'`` format.
    When parsing succeeds, that day is used for both the start and the end.
    When it fails (for instance because the field only holds a time of day)
    or when the field is missing, the pair falls back to the first and the
    last day of ``row['year']``.

    Parameters
    ----------
    row : pandas.Series or mapping
        One record providing the ``'start_date'`` and ``'year'`` entries.

    Returns
    -------
    tuple
        ``(start, end)`` as ``pandas.Timestamp`` values, or ``(NaT, NaT)``
        when neither the date nor the year is usable.
    """
    try:
        parsed = pd.to_datetime(row['start_date'], format='%Y-%m-%d')
    except (ValueError, TypeError):
        parsed = pd.NaT

    # pd.to_datetime gives back None/NaT for missing input without raising,
    # so a missing date has to be detected explicitly.
    if not pd.isna(parsed):
        return parsed, parsed

    year = row['year']
    if pd.isna(year):
        return pd.NaT, pd.NaT

    # int() accepts years stored as int, float (2015.0) or digit strings.
    first_day = pd.Timestamp(year=int(year), month=1, day=1)
    return first_day, first_day + pd.offsets.YearEnd(0)


# Compute one (start, end) pair per bdiff row and store it in two columns.
bdiff[['start_date', 'end_date']] = bdiff.apply(
    to_datetime_safe,
    axis=1,
    result_type='expand',
)

# Pre-processing of tcl (passed to Attribution as the reference further down)
tcl = gpd.read_parquet(
    '../data/processed_datasets/simplified_TreeCoverLoss_2001-2022_EPSG2154_FR.parquet'
)
# Shift the stored year values by 2000
# (presumably they are offsets from the year 2000 - TODO confirm).
tcl['year'] = tcl['year'].add(2000)
tcl['class'] = None

#autoreload 
from attribution2 import Attribution
from constants import DCLASS_SCORE

temporal_buffer = 2

# Subset of datasets handed to the attribution, keyed by their short name
ddataset = dict(
    dfde=dfde,
    hm=hm,
    nfi=nfi,
    senfseidl=senfseidl,
    bdiff=bdiff,
    cdi=cdi,
    forms=forms,
)
# Geometry kind declared for each of those datasets
dtypes = dict(
    dfde='polygon',
    hm='point',
    nfi='point',
    senfseidl='point',
    bdiff='polygon',
    cdi='polygon',
    forms='point',
)

# Spatial and temporal profile of every dataset.
# Each profile is a pair: (profile function name, keyword parameters).
ddataset_profile = dict(
    dfde=dict(
        # original author's note: offset = sqrt(min(area) / pi),
        # k = sqrt(median(area) / pi) - TODO confirm which parameter `k` denotes
        spatial=('offset_gaussian', dict(offset=150, decrease=5000)),
        temporal=('step', dict(start=0, end=365)),
    ),
    hm=dict(
        spatial=('offset_gaussian', dict(offset=10, decrease=100)),
        temporal=('step', dict(start=0, end=365)),
    ),
    nfi=dict(
        spatial=('offset_gaussian', dict(offset=600, decrease=25)),
        temporal=('step', dict(start=0, end=5 * 365)),
    ),
    bdiff=dict(
        spatial=('weighting_function', dict(x0=500, k=500)),
        temporal=('gaussian', dict(mean=0, std=30)),
    ),
    senfseidl=dict(
        spatial=('offset_gaussian', dict(offset=30, decrease=5 * 30)),
        temporal=('offset_gaussian', dict(offset=1 * 365, decrease=1.5 * 365)),
    ),
    forms=dict(
        spatial=('offset_gaussian', dict(offset=10, decrease=5 * 10)),
        temporal=('step', dict(start=0, end=365)),
    ),
    reference=dict(
        spatial=('offset_gaussian', dict(offset=0, decrease=3 * 50)),
        temporal=('step', dict(start=0, end=365)),
    ),
    cdi=dict(
        spatial=('offset_gaussian', dict(offset=5000, decrease=500)),
        temporal=('offset_gaussian', dict(offset=1 * 365, decrease=365)),
    ),
)

# Visible disturbances: every class gets a zero-mean gaussian profile in space
# and in time; only the two standard deviations differ between classes.
ddisturbance_profile = {
    disturbance: {
        'spatial': ('gaussian', {'mean': 0, 'std': spatial_std}),
        'temporal': ('gaussian', {'mean': 0, 'std': temporal_std}),
    }
    for disturbance, spatial_std, temporal_std in (
        # (class, spatial std, temporal std)
        ('fire', 500, 3 * 365),
        ('storm', 2000, 1.5 * 365),
        ('biotic-dieback', 1000, 365),
        ('drought-dieback', 2500, 2 * 365),
        ('biotic-mortality', 250, 3 * 365),
        ('anthropogenic', 500, 1 * 365),
    )
}

# Per-dataset values handed to Attribution as `doa` and `dsbuffer`
doa = dict(dfde=1.0, hm=1.0, nfi=1.0, senfseidl=0.75, bdiff=1.0, cdi=0.75, forms=0.75)
dsbuffer = dict(dfde=None, hm=5000, nfi=7000, senfseidl=100, bdiff=None, cdi=100, forms=100)

# Build the attribution object from the datasets, with tcl as the reference
attribution = Attribution(
    ddataset,
    reference=tcl,
    doa=doa,
    dtypes=dtypes,
    temporal_buffer=temporal_buffer,
    dsbuffer=dsbuffer,
    dclass_score=DCLASS_SCORE,
    granularity=5,
    ddataset_profile=ddataset_profile,
    ddisturbance_profile=ddisturbance_profile,
    start_year=2017,
)

In [None]:
#get louvain communities
from utils import get_temporal_range
from shapely import box
from shapely.geometry import shape 
import numpy as np
import rasterio 
from rasterio.warp import transform_geom

# Study window given as (minx, miny, maxx, maxy) in EPSG:4326
bounds_4326 = (4.75, 47.10, 6.82, 48.51)
pol_4326 = box(*bounds_4326)
# Same window re-projected into the CRS of the attribution dataset
pol_2154 = shape(transform_geom('epsg:4326', attribution.dataset.crs, pol_4326))

# Restrict both tables to the study window
_dataset = attribution.dataset.clip(pol_2154)
_spatial_entity_dataset = attribution.spatial_entity_dataset.clip(pol_2154)

# Percentage of rows of each table that survives the clip
n = len(_dataset) / len(attribution.dataset) * 100
m = len(_spatial_entity_dataset) / len(attribution.spatial_entity_dataset) * 100

# Thresholds used to build the graph, and resolution of the community detection
temporal_threshold, spatial_threshold = 180 * 2, 600
resolution = 100
print(n, m)
import os 
from tqdm import tqdm 
import networkx as nx 
# Tag used in the cache file names: the bounding-box coordinates joined by '_'
b = "_".join([str(x) for x in bounds_4326])

# Keep the initial thresholds. The working copies are doubled while the graph
# is being built, but the cache file names must be identical when a result is
# written and when it is looked up again, so they are derived from these.
temporal_threshold_ = temporal_threshold 
spatial_threshold_ = spatial_threshold

# Suffix shared by every cache file produced by this cell
_cache_tag = (
    f"g{attribution.granularity}_{spatial_threshold_}m_{temporal_threshold_}d"
    f"_v{attribution.version}_b{b}_n{n:.0f}_m{m:.0f}"
)
communities_path = f"../data/results/clusters/communities_r{resolution}_{_cache_tag}.parquet"
graph_path = f"../data/results/graph/graph_{_cache_tag}.gml"

dataset = _dataset[['geometry', 'dataset', 'class', 'centroid_date']]
spatial_entity_dataset = _spatial_entity_dataset[['geometry', 'centroid_date']]

if os.path.isfile(communities_path):
    # Communities already computed for this configuration: reuse them
    all_clusters_gdf = gpd.read_parquet(communities_path)
else:
    sindex = dataset.sindex
    spatial_entity_sindex = spatial_entity_dataset.sindex

    if os.path.isfile(graph_path):
        # write_gml stores node labels as quoted strings; destringizer=int turns
        # them back into the integers used below to index the dataset.
        G = nx.read_gml(graph_path, destringizer=int)
    else:
        print('Building graph...')
        G = nx.Graph()
        dataset_loc = dataset
        # While the graph is not connected, rebuild it with doubled thresholds.
        # NOTE(review): `build_graph` is not imported in the visible cells - make
        # sure it is in scope (presumably it lives in `utils`).
        # NOTE(review): with `or`, N only bounds the loop once every event is in
        # the graph; `i < N and len(dataset_loc) > 0` may have been intended.
        N = 3
        i = 0
        while (len(G) == 0 or not nx.is_connected(G)) and (i < N or len(dataset_loc) > 0):
            G = build_graph(dataset_loc, dataset, sindex, spatial_threshold, temporal_threshold, attribution, G=G)
            spatial_threshold *= 2
            temporal_threshold *= 2
            print(f'graph not connected, new thresholds : {spatial_threshold}m, {temporal_threshold}d')
            # Next pass only handles the events that are not in the graph yet
            dataset_loc = dataset[~dataset.index.isin(G.nodes())]
            i += 1

        # Written under the very name that is looked up above, so that the
        # graph is actually reused on the next run.
        nx.write_gml(G, graph_path)

    communities = nx.community.louvain_communities(G, seed=0, resolution=resolution)

    print('Building islands...')
    # One sub-GeoDataFrame per community.
    # NOTE(review): node ids are used as positions (.iloc) here but as index
    # labels in `dataset.index.isin(G.nodes())` above; both only agree when
    # attribution.dataset carries a default 0..n-1 index - confirm.
    island_gdfs = [attribution.dataset.iloc[list(island)] for island in tqdm(communities)]

    for i, island_gdf in tqdm(enumerate(island_gdfs)):
        # Bounding box and temporal range of the cluster
        envelope = island_gdf.unary_union.envelope
        cluster_start, cluster_end = get_temporal_range(island_gdf)

        # Candidate spatial entities, found through the spatial index
        possible_matches_index = list(spatial_entity_sindex.intersection(envelope.bounds))
        possible_matches = spatial_entity_dataset.iloc[possible_matches_index]

        # FIXME(review): this `break` leaves the loop at the first island that
        # has candidates, and islands without candidates iterate over nothing,
        # so no spatial entity is ever added to a cluster. It looks like a
        # debugging leftover (`continue` on an empty candidate list was probably
        # meant). Behaviour is kept as is because the code below cannot run
        # unchanged: `spatial_entity_dataset` only carries 'geometry' and
        # 'centroid_date' (no start_date / end_date), and `temporal_threshold`
        # is a plain number added to the cluster dates.
        if len(possible_matches_index) > 0 :
            break

        events_to_add = []
        for event in possible_matches.itertuples(index=True):
            spatial_condition = envelope.intersects(event.geometry)
            event_start = event.start_date
            event_end = event.end_date
            # The event overlaps a window of +/- temporal_threshold around the
            # cluster end or around the cluster start
            temporal_condition = (
                ((event_start <= cluster_end + temporal_threshold) and (event_end >= cluster_end - temporal_threshold))
                or ((event_start <= cluster_start + temporal_threshold) and (event_end >= cluster_start - temporal_threshold))
            )

            if spatial_condition and temporal_condition:
                events_to_add.append(event.Index)

        # Append the retained events to the cluster
        if len(events_to_add) > 0:
            additional_events = attribution.spatial_entity_dataset.loc[events_to_add]
            island_gdfs[i] = gpd.GeoDataFrame(pd.concat([island_gdf, additional_events]), geometry='geometry', crs=island_gdf.crs)

    # Label every row with the index of its cluster. `.assign` returns a new
    # frame, which avoids writing into a slice of attribution.dataset.
    modified_gdfs = [island_gdf.assign(cluster=i) for i, island_gdf in enumerate(island_gdfs)]

    all_clusters_gdf = gpd.GeoDataFrame(pd.concat(modified_gdfs), geometry='geometry').drop(columns=['year'])

    # Saved under the name that is checked at the top of this block
    all_clusters_gdf.to_parquet(communities_path)

# Rows present in the clusters relative to the rows of both clipped tables
c = all_clusters_gdf.shape[0] / (_spatial_entity_dataset.shape[0] + _dataset.shape[0] )


In [None]:
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.affinity import translate

def translate_polygon(df, polygon_column, std=100, rng=None):
    """Return a copy of ``df`` whose geometries are randomly shifted.

    Every geometry of ``polygon_column`` is moved by its own offset, drawn
    independently along x and along y from a zero-mean normal distribution.
    Offsets are expressed in the units of the geometry coordinates.

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Table holding the geometries; it is left untouched.
    polygon_column : str
        Name of the column holding the geometries.
    std : float, default 100
        Standard deviation of the offsets.
    rng : numpy.random.Generator, optional
        Source of randomness. Defaults to numpy's global random state, so
        ``np.random.seed`` keeps controlling reproducibility.

    Returns
    -------
    Same type as ``df``
        A copy of ``df`` with the shifted geometries.
    """
    sampler = np.random if rng is None else rng

    def shift_polygon(polygon):
        # Independent offsets along the x and y axes
        x_shift = sampler.normal(0, std)
        y_shift = sampler.normal(0, std)
        return translate(polygon, xoff=x_shift, yoff=y_shift)

    # Work on a copy so that the caller's table keeps its original geometries
    shifted = df.copy()
    shifted[polygon_column] = shifted[polygon_column].apply(shift_polygon)
    return shifted

import pandas as pd
import numpy as np
from datetime import timedelta

def translate_time(df, start_date_column, end_date_column, std=90, rng=None):
    """Return a copy of ``df`` whose start and end dates are randomly shifted.

    Both dates receive their own offset in days, drawn from a zero-mean normal
    distribution. The two offsets are independent, so a shifted end date may
    fall before the shifted start date.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Either a whole table (one offset pair is drawn per row) or a single
        record such as the rows produced by ``DataFrame.apply(..., axis=1)``
        (one offset pair is drawn). It is left untouched.
    start_date_column, end_date_column : str
        Names of the two date entries.
    std : float, default 90
        Standard deviation of the offsets, in days.
    rng : numpy.random.Generator, optional
        Source of randomness. Defaults to numpy's global random state.

    Returns
    -------
    Same type as ``df``
        A copy of ``df`` with the shifted dates.
    """
    sampler = np.random if rng is None else rng

    # A table needs one draw per row. For a single record, len() would be its
    # number of fields, so a scalar draw is requested instead.
    size = len(df) if isinstance(df, pd.DataFrame) else None
    disturbance_start = sampler.normal(0, std, size=size)
    disturbance_end = sampler.normal(0, std, size=size)

    # Work on a copy so that the caller's data keeps its original dates
    shifted = df.copy()
    shifted[start_date_column] = df[start_date_column] + pd.to_timedelta(disturbance_start, unit='d')
    shifted[end_date_column] = df[end_date_column] + pd.to_timedelta(disturbance_end, unit='d')
    return shifted



In [None]:
#define clusters
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score
import warnings
from joblib import Parallel, delayed 
import pandas as pd 
from utils import compute_tree_coherence, compute_class_similarity, get_cluster
from constants import DCLASS_SCORE
import time 

# Number of random perturbations evaluated for each clustering method
disturbances = 10
methods = ['DBSCAN', 'SpectralClustering'] 
dcustom_similarity_function = {'tree specie relatedness': (compute_tree_coherence, {}, 1.0), 'class relatedness': (compute_class_similarity, {'dclass_score': DCLASS_SCORE}, 1.0)}
weights = [0.40759976, 0.23017731, 0.20566699, 0.15655594]

from collections import defaultdict
# method name -> list of (ARI, AMI) pairs, one pair per perturbation
dict_method_metric = defaultdict(list)


def _cluster_membership(clusters):
    """Flatten a cluster table into an ``(id, label)`` frame sorted by id.

    Every row of ``clusters`` is one cluster: its index value is the label and
    its ``Indexes`` entry lists the ids of the members.
    """
    pairs = [
        (id_, row.Index)
        for row in tqdm(clusters.itertuples())
        for id_ in row.Indexes
    ]
    return pd.DataFrame(pairs, columns=['id', 'label']).sort_values(by='id')


for method in methods:
    # Reference clustering saved for this method
    ref = gpd.read_parquet(f"../data/results/clusters/clusters_m{method}_r{resolution}_g{attribution.granularity}_{spatial_threshold_}m_{temporal_threshold_}d_v{attribution.version}_b{b}_n{n :.0f}_m{m :.0f}.parquet")
    df_reflabel = _cluster_membership(ref)

    for j in range(disturbances):
        # Perturb geometries and dates of a fresh copy, so that perturbations
        # do not pile up from one repetition (or one method) to the next.
        perturbed = translate_polygon(all_clusters_gdf.copy(), 'geometry')
        # translate_time draws one offset per row of the table it receives,
        # so it is given the whole table rather than being applied row by row.
        perturbed = translate_time(perturbed, 'start_date', 'end_date')

        # One sub-table per community, clustered in parallel by chunks of 1000
        group_frames = [data for _, data in perturbed.groupby('cluster')]
        start = time.time()
        with warnings.catch_warnings():
            list_gdf = []
            list_matrices = []
            list_cluster_labels = []
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            for i in tqdm(range(0, len(group_frames), 1000)):
                r = Parallel(n_jobs=-1, prefer='processes', verbose=0)(
                    delayed(get_cluster)(
                        data=data,
                        dtypes_=attribution.dtypes_,
                        dcustom_similarity_function=dcustom_similarity_function,
                        doa=attribution.doa,
                        dclass_score=DCLASS_SCORE,
                        final_weighting_dict=attribution.final_weighting_dict,
                        weights=weights,
                        method=method,
                    )
                    for data in group_frames[i:i + 1000]
                )
                for x in r:
                    list_gdf.append(x[0])
                    list_matrices.append(x[1][0])
                    list_cluster_labels.append(x[1][1])

        end = time.time()
        print(f'elapsed time : {end - start :.2f}s')

        df = pd.concat(list_gdf)
        gdf = gpd.GeoDataFrame(df, geometry='geometry', crs=attribution.dataset.crs)

        df_label = _cluster_membership(gdf)

        # Both scores compare the labels position by position, which is only
        # meaningful when the two tables list exactly the same ids.
        if df_reflabel['id'].tolist() != df_label['id'].tolist():
            raise ValueError(
                f"reference and perturbed clusterings of '{method}' do not cover the same ids"
            )

        ARI = adjusted_rand_score(df_reflabel['label'].to_numpy(), df_label['label'].to_numpy())
        AMI = adjusted_mutual_info_score(df_reflabel['label'].to_numpy(), df_label['label'].to_numpy())
        dict_method_metric[method].append((ARI, AMI))
