In [5]:
import geopandas as gpd
import pandas as pd
import numpy as np

import openmeteo_requests
import requests_cache
from retry_requests import retry
from datetime import datetime, timedelta
import time
import os

from huggingface_hub import HfApi, create_repo, hf_hub_download

import plotly.express as px

import joblib
import json

In [2]:
gdf = gpd.read_file("../5 region geometry/output/europe_admin.geojson")
gdf.head(1)

Unnamed: 0,COUNTRY,NAME_1,NAME_2,area,representative_point_lat,representative_point_lon,Sea latitude,Sea longitude,Sea distance,geometry
0,Albania,Albania,Albania,28727.645536,41.146004,20.057685,41.202353,19.377711,57517.760212,"MULTIPOLYGON (((20.05408 39.69171, 19.98178 39..."


In [2]:
gdf = gpd.read_file("output/europe_admin.geojson")

In [3]:
delta_37_days = timedelta(days=37)
delta_30_days = timedelta(days=30)
delta_7_days = timedelta(days=7)
delta_1_day = timedelta(days=1)
def Create_df(data, start_date, end_date, day_time=24):
    result_df = pd.DataFrame()
    day = start_date
    i = 0
    while day <= end_date :
        # Define offsets in hours
        start_30 = i * day_time
        end = start_30 + 31 * day_time # 31 because open meteo end date is inclusive
        start_5 = end - 5 * day_time  # Last 5 days of the 30-day window
        start_1 = end - 1 * day_time  # Last 1 day of the 30-day window
        
        slice_30 = slice(start_30, end)
        slice_5 = slice(start_5, end)
        slice_1 = slice(start_1, end)

        new_row = {
            f"median_{key}_30": np.nanmedian(value[slice_30])
            for key, value in data.items()
        } | {
            f"mean_{key}_{suffix}": np.nanmean(value[slice_])
            for suffix, slice_ in [
                ("30", slice_30),
                ("5", slice_5),
                ("1", slice_1)
            ]
            for key, value in data.items()
        } | {
            f"max_{key}_1": np.nanmax(value[slice_1])
            for key, value in data.items()
        } 
        new_row["date"]= day
        new_row["date_id"]= i
        if len(result_df) == 0 :
            result_df = pd.DataFrame([new_row])
        else :
            result_df.loc[len(result_df)] = new_row
        day = day + delta_1_day
        i += 1
    return result_df


def Get_previous_month_weather(lat, lon, end_date) :

    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
    retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)
    start_date = end_date - delta_37_days
    # Make sure all required weather variables are listed here
    # The order of variables in hourly or daily is important to assign them correctly below
    url = "https://api.open-meteo.com/v1/forecast"
    params = {
        "latitude": lat,
        "longitude": lon,
        "hourly":  ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", "et0_fao_evapotranspiration", "vapour_pressure_deficit", "wind_speed_10m", "wind_gusts_10m"],
        "timezone": "GMT",
        "start_date": start_date.strftime('%Y-%m-%d'),
        "end_date": end_date.strftime('%Y-%m-%d'),
    }

    responses = openmeteo.weather_api(url, params=params, method="POST")
    complete_result = []
    for j in range(len(responses)) :
        response = responses[j]
        elevation = response.Elevation()

        # Process daily data. The order of variables needs to be the same as requested.
        hourly = response.Hourly()
        hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
        hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
        hourly_dew_point_2m = hourly.Variables(2).ValuesAsNumpy()
        hourly_precipitation = hourly.Variables(3).ValuesAsNumpy()
        hourly_et0_fao_evapotranspiration = hourly.Variables(4).ValuesAsNumpy()
        hourly_vapour_pressure_deficit = hourly.Variables(5).ValuesAsNumpy()
        hourly_wind_speed_10m = hourly.Variables(6).ValuesAsNumpy()
        hourly_wind_gusts_10m = hourly.Variables(7).ValuesAsNumpy()

        weather_data = {}
        weather_data["temperature_2m"] = hourly_temperature_2m
        weather_data["relative_humidity_2m"] = hourly_relative_humidity_2m
        weather_data["dew_point_2m"] = hourly_dew_point_2m
        weather_data["precipitation"] = hourly_precipitation
        weather_data["et0_fao_evapotranspiration"] = hourly_et0_fao_evapotranspiration
        weather_data["vapour_pressure_deficit"] = hourly_vapour_pressure_deficit
        weather_data["wind_speed_10m"] = hourly_wind_speed_10m
        weather_data["wind_gusts_10m"] = hourly_wind_gusts_10m

        result_df = Create_df(weather_data, start_date + delta_30_days,end_date)

        result_df["elevation"] = elevation    
        result_df["lat"] = lat[j]
        result_df["lon"] = lon[j]
        
        complete_result.append(result_df)
    return complete_result

def Get_soil_moisture(lat, lon, end_date) :

    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
    retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)
    start_date = end_date - delta_37_days
    # Make sure all required weather variables are listed here
    # The order of variables in hourly or daily is important to assign them correctly below
    url = "https://api.open-meteo.com/v1/forecast"
    params = {
        "latitude": lat,
        "longitude": lon,
        "hourly": ["soil_moisture_0_to_7cm", "soil_moisture_7_to_28cm", "soil_moisture_28_to_100cm", "soil_moisture_100_to_255cm"],
        "models": "ecmwf_ifs025",
        "timezone": "GMT",
        "start_date": start_date.strftime('%Y-%m-%d'),
        "end_date": end_date.strftime('%Y-%m-%d'),
    }
    responses = openmeteo.weather_api(url, params=params, method="POST")
    complete_result = []
    for j in range(len(responses)) :
        response = responses[j]

        # Process daily data. The order of variables needs to be the same as requested.
        hourly = response.Hourly()
        hourly_soil_moisture_0_to_7cm = hourly.Variables(0).ValuesAsNumpy()
        hourly_soil_moisture_7_to_28cm = hourly.Variables(1).ValuesAsNumpy()
        hourly_soil_moisture_28_to_100cm = hourly.Variables(2).ValuesAsNumpy()
        hourly_soil_moisture_100_to_255cm = hourly.Variables(3).ValuesAsNumpy()


        weather_data = {}

        weather_data["soil_moisture_0_to_7cm"] = hourly_soil_moisture_0_to_7cm
        weather_data["soil_moisture_7_to_28cm"] = hourly_soil_moisture_7_to_28cm
        weather_data["soil_moisture_28_to_100cm"] = hourly_soil_moisture_28_to_100cm
        weather_data["soil_moisture_100_to_255cm"] = hourly_soil_moisture_100_to_255cm
        
        result_df = Create_df(weather_data, start_date + delta_30_days,end_date)
        result_df["lat"] = lat[j]
        result_df["lon"] = lon[j]
        complete_result.append(result_df)
    return complete_result

# Function to get river flow data from USGS API
def get_river_discharge(lat,lon, end_date):
    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
    retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)
    start_date = end_date - delta_37_days
    # Make sure all required weather variables are listed here
    # The order of variables in hourly or daily is important to assign them correctly below
    url = "https://flood-api.open-meteo.com/v1/flood"
    params = {
        "latitude": lat,
        "longitude": lon,
        "daily": "river_discharge",
        "start_date": start_date.strftime('%Y-%m-%d'),
        "end_date": end_date.strftime('%Y-%m-%d'),
        "models": "seamless_v4",
        "timezone": "GMT"
    }
    responses = openmeteo.weather_api(url, params=params, method="POST")
    complete_result = []
    # Process first location. Add a for-loop for multiple locations or weather models
    for j in range(len(responses)) :
        response = responses[j]
        # Process daily data. The order of variables needs to be the same as requested.
        daily = response.Daily()
        daily_river_discharge = daily.Variables(0).ValuesAsNumpy()
        daily_data = {}
        
        daily_data["river_discharge"] = daily_river_discharge
        result_df = Create_df(daily_data, start_date + delta_30_days,end_date,1)
        result_df["lat"] = lat[j]
        result_df["lon"] = lon[j]
        complete_result.append(result_df)
    return complete_result


def Get_marine_weather(lat, lon, sea_lat, sea_lon, sea_distance, end_date):
    cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
    retry_session = retry(cache_session, retries=2, backoff_factor=0.1)  
    openmeteo = openmeteo_requests.Client(session=retry_session)
    start_date = end_date - delta_37_days
    url = "https://marine-api.open-meteo.com/v1/marine"
    params = {
                "latitude": sea_lat,
                "longitude": sea_lon,
                "hourly": ["wave_height", "sea_level_height_msl"],
                "timezone": "GMT",
                "start_date": start_date.strftime('%Y-%m-%d'),
                "end_date": end_date.strftime('%Y-%m-%d'),
            }
    responses = openmeteo.weather_api(url, params=params, method="POST")
    complete_result = []
    
    for j in range(len(responses)) :
        response = responses[j]
        # Process daily data. The order of variables needs to be the same as requested.
        hourly = response.Hourly()
        hourly_wave_height = hourly.Variables(0).ValuesAsNumpy()
        hourly_sea_level_height_msl = hourly.Variables(1).ValuesAsNumpy()


        hourly_data  = {}
        hourly_data["wave_height"] = hourly_wave_height
        hourly_data["sea_level_height_msl"] = hourly_sea_level_height_msl
        result_df = Create_df(hourly_data, start_date + delta_30_days,end_date)
        result_df["lat"] = lat[j]
        result_df["lon"] = lon[j]
        result_df["Sea distance"] = sea_distance[j]
        complete_result.append(result_df)
    return complete_result


In [10]:
test_gdf = gdf.iloc[:2]
lat = test_gdf["representative_point_lat"].tolist()
lon = test_gdf["representative_point_lon"].tolist()
sea_lat = test_gdf["Sea latitude"].tolist()
sea_lon = test_gdf["Sea longitude"].tolist()
sea_distance = test_gdf["Sea distance"].tolist()
end_date = datetime.now() + delta_7_days

weather_data = Get_previous_month_weather(lat,lon, end_date)
soil_moisture_data = Get_soil_moisture(lat, lon, end_date)
river_data = get_river_discharge(lat, lon, end_date)
marine_weather_data = Get_marine_weather(lat, lon, sea_lat, sea_lon, sea_distance, end_date)
weather_df = pd.concat(weather_data)
soil_moisture_df = pd.concat(soil_moisture_data)
river_df = pd.concat(river_data)
marine_df = pd.concat(marine_weather_data)

  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_

In [11]:
complete_df = pd.merge(weather_df, soil_moisture_df,on=["date", "lat", "lon", "date_id"])
complete_df = pd.merge(complete_df, river_df,on=["date", "lat", "lon", "date_id"])
complete_df = pd.merge(complete_df, marine_df,on=["date", "lat", "lon", "date_id"])
complete_df.head()

Unnamed: 0,median_temperature_2m_30,median_relative_humidity_2m_30,median_dew_point_2m_30,median_precipitation_30,median_et0_fao_evapotranspiration_30,median_vapour_pressure_deficit_30,median_wind_speed_10m_30,median_wind_gusts_10m_30,mean_temperature_2m_30,mean_relative_humidity_2m_30,...,median_sea_level_height_msl_30,mean_wave_height_30,mean_sea_level_height_msl_30,mean_wave_height_5,mean_sea_level_height_msl_5,mean_wave_height_1,mean_sea_level_height_msl_1,max_wave_height_1,max_sea_level_height_msl_1,Sea distance
0,12.453501,58.0,4.666291,0.0,0.045737,0.556161,6.618519,13.32,12.234549,57.538979,...,-0.48,0.740995,-0.463347,0.479833,-0.363833,0.41,-0.404167,0.58,-0.3,57517.760212
1,12.678501,60.0,5.039626,0.0,0.043593,0.548238,6.61362,12.959999,12.339119,58.623657,...,-0.48,0.728011,-0.460323,0.490167,-0.395333,0.4125,-0.483333,0.58,-0.4,57517.760212
2,12.703501,62.0,5.425471,0.0,0.043593,0.534841,6.489992,12.959999,12.386094,60.067204,...,-0.47,0.728629,-0.457137,0.520167,-0.4185,0.525833,-0.48375,0.62,-0.41,57517.760212
3,12.6285,63.0,5.579628,0.0,0.042727,0.515142,6.489992,12.959999,12.348796,61.563171,...,-0.46,0.727446,-0.450806,0.5035,-0.429167,0.355,-0.424583,0.48,-0.33,57517.760212
4,12.453501,65.0,5.744045,0.0,0.041464,0.489516,6.394417,12.959999,12.20424,63.200268,...,-0.46,0.737688,-0.442231,0.436,-0.436083,0.476667,-0.384583,0.62,-0.32,57517.760212


In [105]:
gdf.columns

Index(['COUNTRY', 'NAME_1', 'NAME_2', 'area', 'representative_point_lat',
       'representative_point_lon', 'Sea latitude', 'Sea longitude',
       'Sea distance', 'geometry'],
      dtype='object')

In [106]:
lat = gdf.iloc[:100]["representative_point_lat"].tolist()
lon = gdf[:100]["representative_point_lon"].tolist()
sea_lat = gdf[:100]["Sea latitude"].tolist()
sea_lon = gdf[:100]["Sea longitude"].tolist()
sea_distance = gdf[:100]["Sea distance"].tolist()

In [None]:
weather_data = Get_previous_month_weather(lat,lon, end_date)
soil_moisture_data = Get_soil_moisture(lat, lon, end_date)
river_data = get_river_discharge(lat, lon, end_date)
marine_weather_data = Get_marine_weather(lat, lon, sea_lat, sea_lon, sea_distance, end_date)
weather_df = pd.concat(weather_data)
soil_moisture_df = pd.concat(soil_moisture_data)
river_df = pd.concat(river_data)
marine_df = pd.concat(marine_weather_data)
complete_df = pd.merge(weather_df, soil_moisture_df,on=["date", "lat", "lon", "date_id"])
complete_df = pd.merge(complete_df, river_df,on=["date", "lat", "lon", "date_id"])
complete_df = pd.merge(complete_df, marine_df,on=["date", "lat", "lon", "date_id"])


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice encountered


All-NaN slice encountered


Mean of empty slice


All-NaN slice

In [108]:
complete_df

Unnamed: 0,median_temperature_2m_30,median_relative_humidity_2m_30,median_dew_point_2m_30,median_precipitation_30,median_et0_fao_evapotranspiration_30,median_vapour_pressure_deficit_30,median_wind_speed_10m_30,median_wind_gusts_10m_30,mean_temperature_2m_30,mean_relative_humidity_2m_30,...,median_sea_level_height_msl_30,mean_wave_height_30,mean_sea_level_height_msl_30,mean_wave_height_5,mean_sea_level_height_msl_5,mean_wave_height_1,mean_sea_level_height_msl_1,max_wave_height_1,max_sea_level_height_msl_1,Sea distance
0,12.453501,58.0,4.666291,0.0,0.045737,0.556161,6.618519,13.320000,12.234549,57.538979,...,-0.480,0.740995,-0.463347,0.479833,-0.363833,0.410000,-0.404167,0.58,-0.30,57517.760212
1,12.678501,60.0,5.039626,0.0,0.043593,0.548238,6.613620,12.959999,12.339119,58.623657,...,-0.480,0.728011,-0.460323,0.490167,-0.395333,0.412500,-0.483333,0.58,-0.40,57517.760212
2,12.703501,62.0,5.425471,0.0,0.043593,0.534841,6.489992,12.959999,12.386094,60.067204,...,-0.470,0.728629,-0.457137,0.520167,-0.418500,0.525833,-0.483750,0.62,-0.41,57517.760212
3,12.628500,63.0,5.579628,0.0,0.042727,0.515142,6.489992,12.959999,12.348796,61.563171,...,-0.460,0.727446,-0.450806,0.503500,-0.429167,0.355000,-0.424583,0.48,-0.33,57517.760212
4,12.453501,65.0,5.744045,0.0,0.041464,0.489516,6.394417,12.959999,12.204240,63.200268,...,-0.460,0.737688,-0.442231,0.436000,-0.436083,0.476667,-0.384583,0.62,-0.32,57517.760212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,7.215500,70.0,1.621307,0.0,0.032655,0.294345,7.993298,17.639999,8.054274,68.028229,...,-0.465,0.548602,-0.377392,0.665667,-0.521167,0.926667,-0.479583,1.14,1.12,334574.369138
796,7.215500,69.0,1.612732,0.0,0.037104,0.308963,8.361762,18.719999,8.038083,67.248657,...,-0.480,0.571425,-0.389785,0.716333,-0.534917,0.894167,-0.645833,1.10,0.50,334574.369138
797,7.190500,68.0,1.533976,0.0,0.039105,0.322086,8.707237,19.440001,7.977420,66.536293,...,-0.490,0.581801,-0.399503,0.674833,-0.506333,0.560833,-0.480833,0.72,0.80,334574.369138
798,7.215500,67.0,1.482906,0.0,0.039727,0.322828,8.873086,19.440001,7.984684,66.129036,...,-0.505,0.592043,-0.406492,0.690167,-0.509333,0.591667,-0.566667,0.64,0.97,334574.369138


In [109]:
complete_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 81 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   median_temperature_2m_30              800 non-null    float32       
 1   median_relative_humidity_2m_30        800 non-null    float32       
 2   median_dew_point_2m_30                800 non-null    float32       
 3   median_precipitation_30               800 non-null    float32       
 4   median_et0_fao_evapotranspiration_30  800 non-null    float32       
 5   median_vapour_pressure_deficit_30     800 non-null    float32       
 6   median_wind_speed_10m_30              800 non-null    float32       
 7   median_wind_gusts_10m_30              800 non-null    float32       
 8   mean_temperature_2m_30                800 non-null    float32       
 9   mean_relative_humidity_2m_30          800 non-null    float32       
 10  me

In [None]:
# TO DO :Handle missing values using the mean of data from the closest regions.

In [13]:
complete_df["month"] = complete_df['date'].dt.month

In [111]:
complete_df.to_csv("output/updated_weather.csv", index=False)

In [14]:
model = joblib.load("../4 flood predict ml/output/model_XGBC_predict_flood.pkl")

In [15]:
ordered_features = model.feature_names_in_

In [16]:
X = complete_df[ordered_features]

In [115]:
X.head()

Unnamed: 0,median_sea_level_height_msl_30,mean_sea_level_height_msl_30,mean_sea_level_height_msl_5,mean_sea_level_height_msl_1,max_sea_level_height_msl_1,Sea distance,median_river_discharge_30,mean_river_discharge_30,mean_river_discharge_5,mean_river_discharge_1,...,max_soil_moisture_7_to_28cm_1,max_soil_moisture_28_to_100cm_1,max_soil_moisture_100_to_255cm_1,elevation,median_wave_height_30,mean_wave_height_30,mean_wave_height_5,mean_wave_height_1,max_wave_height_1,month
0,-0.48,-0.463347,-0.363833,-0.404167,-0.3,57517.760212,2.155005,2.864853,3.037495,2.715352,...,0.414,0.401,0.391,281.0,0.64,0.740995,0.479833,0.41,0.58,4
1,-0.48,-0.460323,-0.395333,-0.483333,-0.4,57517.760212,2.184198,2.927431,2.73785,2.689776,...,0.414,0.402,0.391,281.0,0.58,0.728011,0.490167,0.4125,0.58,4
2,-0.47,-0.457137,-0.4185,-0.48375,-0.41,57517.760212,2.467369,2.976816,2.63694,2.467369,...,0.421,0.404,0.391,281.0,0.58,0.728629,0.520167,0.525833,0.62,4
3,-0.46,-0.450806,-0.429167,-0.424583,-0.33,57517.760212,2.467369,3.024138,2.580845,2.34195,...,0.421,0.406,0.392,281.0,0.58,0.727446,0.5035,0.355,0.48,4
4,-0.46,-0.442231,-0.436083,-0.384583,-0.32,57517.760212,2.467369,3.072524,2.505152,2.311311,...,0.426,0.409,0.392,281.0,0.59,0.737688,0.436,0.476667,0.62,4


In [17]:
predicted_proba = model.predict_proba(X)

In [18]:
complete_df["flood_proba"] = np.round(predicted_proba[:,1] * 100)

In [126]:
px.scatter_map(complete_df[complete_df["date_id"] == 0], lat="lat", lon="lon", color="flood_proba")

In [19]:
predict_type_model = joblib.load("../4 flood predict ml/output/model_XGBC_flood_type.pkl")
predicted_type = predict_type_model.predict(X)
complete_df["flood_type"] = predicted_type

In [20]:
gdf["last_update"] = datetime.today() - delta_1_day

In [4]:
def update_gdf(row , df):
    df_location = df[(df["lat"] == row["representative_point_lat"]) & (df["lon"] == row["representative_point_lon"])]
    date_ids = df_location["date_id"].unique()

    row["mode_flood_type"] = df_location[f"flood_type"].mode().iloc[0]
    for date_id in date_ids :
        row[f"flood_type_{date_id}"] = df_location[(df_location["date_id"] == date_id)]["flood_type"].iloc[0]

    row["max_flood_proba"] = df_location[f"flood_proba"].max()
    row["mean_flood_proba"] = df_location[f"flood_proba"].mean()
    row["median_flood_proba"] = df_location[f"flood_proba"].median()
    for date_id in date_ids :
        row[f"flood_proba_{date_id}"] = df_location[(df_location["date_id"] == date_id)]["flood_proba"].iloc[0]
    return row

In [22]:
update = gdf.loc[:1,:].apply(lambda x : update_gdf(x,complete_df), axis=1)

In [24]:
new_columns = update.columns.difference(gdf.columns)

In [26]:
for col in new_columns:
    gdf[col] = pd.NA  

In [7]:
one_min_ago

datetime.datetime(2025, 4, 3, 4, 22, 16, 269386)

In [8]:
CHUNK_SIZE = 100
TOTAL_ROWS = len(gdf)

predict_flood_model = joblib.load("../4 flood predict ml/output/model_XGBC_predict_flood.pkl")
predict_type_model = joblib.load("../4 flood predict ml/output/model_XGBC_flood_type.pkl")
for start_idx in range(0, TOTAL_ROWS, CHUNK_SIZE):
    end_idx = min(start_idx + CHUNK_SIZE, TOTAL_ROWS)
    chunk = gdf.iloc[start_idx:end_idx].copy()
    
    # Get current time once per chunk to ensure consistency
    now = datetime.now()
    end_date = now + delta_7_days
    one_min_ago = now - timedelta(minutes=1)
    
    # Check conditions
    date_condition = (chunk['last_update'].dt.date != now.date()).any()
    time_condition = (gdf['last_update'] < one_min_ago).all()
    while not time_condition :
        time.sleep(65)
        now = datetime.now()
        one_min_ago = now - timedelta(minutes=1)
        time_condition = (gdf['last_update'] < one_min_ago).all()
        


    if date_condition:
        try:
            lat = chunk["representative_point_lat"].tolist()
            lon = chunk["representative_point_lon"].tolist()
            sea_lat = chunk["Sea latitude"].tolist()
            sea_lon = chunk["Sea longitude"].tolist()
            sea_distance = chunk["Sea distance"].tolist()

            weather_data = Get_previous_month_weather(lat,lon, end_date)
            soil_moisture_data = Get_soil_moisture(lat, lon, end_date)
            river_data = get_river_discharge(lat, lon, end_date)
            marine_weather_data = Get_marine_weather(lat, lon, sea_lat, sea_lon, sea_distance, end_date)
            weather_df = pd.concat(weather_data)
            soil_moisture_df = pd.concat(soil_moisture_data)
            river_df = pd.concat(river_data)
            marine_df = pd.concat(marine_weather_data)
            complete_df = pd.merge(weather_df, soil_moisture_df,on=["date", "lat", "lon", "date_id"])
            complete_df = pd.merge(complete_df, river_df,on=["date", "lat", "lon", "date_id"])
            complete_df = pd.merge(complete_df, marine_df,on=["date", "lat", "lon", "date_id"])
            complete_df["month"] = complete_df['date'].dt.month

            ordered_features = predict_flood_model.feature_names_in_
            X = complete_df[ordered_features]
            predicted_flood_proba = predict_flood_model.predict_proba(X)
            predicted_type = predict_type_model.predict(X)

            complete_df["flood_proba"] = np.round(predicted_flood_proba[:,1] * 100)
            complete_df["flood_type"] = predicted_type

            gdf.loc[chunk.index,:] = gdf.loc[chunk.index,:].apply(lambda x : update_gdf(x,complete_df), axis=1)
            gdf.loc[chunk.index, 'last_update'] = now
            
            print(f"Processed rows {start_idx}-{end_idx-1} at {now}")

            
        except Exception as e:
            print(f"Error processing chunk {start_idx}-{end_idx-1}: {str(e)}")
            time.sleep(65)
    else:
        print(f"Skipping chunk {start_idx}-{end_idx-1} - already up to date")

Skipping chunk 0-99 - already up to date


  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_

Processed rows 100-199 at 2025-04-03 04:25:50.470186


  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_

Processed rows 200-299 at 2025-04-03 04:27:02.674475


  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_{key}_1": np.nanmax(value[slice_1])
  f"median_{key}_30": np.nanmedian(value[slice_30])
  f"mean_{key}_{suffix}": np.nanmean(value[slice_])
  f"max_

Processed rows 300-373 at 2025-04-03 04:28:14.299503


In [9]:
flood_types = ["Côtière", "Éclair", "Fluviale", "Fluviale/Côtière"]
gdf["mode_flood_type_name"] = gdf["mode_flood_type"].apply(lambda x : flood_types[x])

In [10]:
gdf.to_file("output/europe_admin.geojson", driver="GeoJSON")

In [None]:
# Access the Hugging Face token from the environment variable
hf_token = os.getenv("HF_TOKEN")

In [None]:
# Create a repository (if it doesn't exist)
create_repo(
    repo_id="AdrienD-Skep/geo_flood_data",  # Repository name
    repo_type="dataset",                # Type of repository
    token=hf_token,                        # Your Hugging Face token
)

RepoUrl('https://huggingface.co/datasets/AdrienD-Skep/geo_flood_data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='AdrienD-Skep/geo_flood_data')

In [12]:
# Upload files
api = HfApi(token=hf_token)
api.upload_file(
    path_or_fileobj="output/europe_admin.geojson",  # Path to the local file
    path_in_repo="europe_admin.geojson",     # Path in the repository
    repo_id="AdrienD-Skep/geo_flood_data",       # Repository name
    repo_type="dataset",                     # Type of repository
)

CommitInfo(commit_url='https://huggingface.co/datasets/AdrienD-Skep/geo_flood_data/commit/e201da3b1ddcb2b3c42bca31c491b3dd667a36ea', commit_message='Upload europe_admin.geojson with huggingface_hub', commit_description='', oid='e201da3b1ddcb2b3c42bca31c491b3dd667a36ea', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/AdrienD-Skep/geo_flood_data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='AdrienD-Skep/geo_flood_data'), pr_revision=None, pr_num=None)

In [6]:
geojson_path = hf_hub_download(
    repo_id="AdrienD-Skep/geo_flood_data",  # Repository name
    filename="europe_admin.geojson",    # File name in the repository
    repo_type="dataset",                # Type of repository
    token=hf_token,                        # Your Hugging Face token
)


europe_admin.geojson:   0%|          | 0.00/6.95M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
with open(geojson_path) as f:
        geojson_data = json.load(f)

In [11]:
geojson_data["features"][0]["properties"]["last_update"]

'2025-04-11T13:01:46.869'

In [16]:
t = gpd.read_file(geojson_path)

In [17]:
t["last_update"].value_counts()

last_update
2025-04-11 13:01:46.869    100
2025-04-11 13:03:10.001    100
2025-04-03 04:27:02.674    100
2025-04-03 04:28:14.299     74
Name: count, dtype: int64