In [None]:
#from urllib.request import urlopen
import pandas as pd
import plotly.express as px

import xarray as xr
import numpy as np

In [None]:
# Start a Dask distributed client before the ERA5 files are opened with
# parallel=True in the next cell.
# NOTE(review): Client() without arguments presumably starts a local cluster
# with default worker settings — confirm that this suits the host machine.
from dask.distributed import Client
client = Client()
# Last expression of the cell: shows the client summary as cell output
client

In [None]:
# Open every monthly ERA5 2 m temperature file matching the glob as a single
# dataset; parallel=True lets xarray open the files in parallel through dask.
xds = xr.open_mfdataset("/home/susannaioni/s3/data/era5/monthly/2m_temperature*", parallel=True)
# The data carries an `expver` coordinate with (at least) the values 1 and 5.
# combine_first gives priority to the dataset it is called on, so expver=1 is
# used wherever it has values and expver=5 only fills the cells where expver=1
# is missing.
# NOTE(review): presumably expver=1 is the final ERA5 release and expver=5 the
# preliminary one — confirm against the data provider's documentation.
xds = xds.sel(expver=1).combine_first(xds.sel(expver=5))

In [None]:
# Re-express longitude on the [-180, 180) range instead of [0, 360) and put
# the grid back into ascending longitude order afterwards.
wrapped_longitude = (xds.coords["longitude"] + 180) % 360 - 180
xds = xds.assign_coords(longitude=wrapped_longitude).sortby("longitude")

In [None]:
# Make latitude ascending (south -> north). The label-based slice used for the
# ERA5 plot subset further down only returns data on an ascending index.
# The reversal is guarded so that re-running this cell is a no-op: an
# unconditional [::-1] would flip the axis back to descending on every
# second execution.
if xds.indexes["latitude"].is_monotonic_decreasing:
    xds = xds.isel(latitude=slice(None, None, -1))

# GBIF data

Load only the first two rows of the data to see which columns we need.

In [None]:
# Show every column when a DataFrame is displayed (applies notebook-wide)
pd.options.display.max_columns = None

# Read just the first two rows of the tab-separated occurrence file so the
# available columns can be inspected before loading the whole dataset
pdf_gbif = pd.read_csv(
    "/home/susannaioni/s3/data/gbif/R_Ferr/occurrence.txt",
    sep="\t",
    nrows=2,
)
pdf_gbif

Load the whole dataset, now that we know which columns and data types we need.

In [None]:
# Reference: https://docs.dask.org/en/stable/dataframe-best-practices.html
# Load the occurrence dataset, reading only the columns needed below and
# fixing their data types up front.

# Columns to read; gbifID is used as the index of the resulting frame
gbif_columns = [
    "gbifID",
    "collectionCode",
    "basisOfRecord",
    "eventDate",
    "year",
    "month",
    "day",
    "decimalLatitude",
    "decimalLongitude",
    "coordinateUncertaintyInMeters",
    "scientificName",
]

# Capitalised "Int32"/"Float32" are pandas' nullable dtypes, so rows with
# missing values can still be read into integer/float columns
gbif_dtypes = {
    "year": "Int32",
    "month": "Int32",
    "day": "Int32",
    "decimalLatitude": "Float32",
    "decimalLongitude": "Float32",
    "coordinateUncertaintyInMeters": "Float32",
    "scientificName": "str",
}

pdf_gbif = pd.read_csv(
    "/home/susannaioni/s3/data/gbif/R_Ferr/occurrence.txt",
    sep="\t",
    index_col="gbifID",
    usecols=gbif_columns,
    dtype=gbif_dtypes,
)

In [None]:
# Keep only the records that can be matched against the ERA5 grid:
#  - year and month are needed to build the monthly timestamp below
#  - latitude and longitude are needed for the nearest-neighbour lookup in
#    the "Combine" section; a record without coordinates has no nearest grid
#    cell, so it is dropped here instead of failing (or matching nonsense)
#    later on
pdf_gbif = pdf_gbif.dropna(
    subset=["year", "month", "decimalLatitude", "decimalLongitude"]
)

# Restrict to the years 1940-2023 (both inclusive) and order chronologically
# NOTE(review): the range presumably mirrors the period covered by the ERA5
# files — confirm if the ERA5 download changes.
pdf_gbif = pdf_gbif[pdf_gbif["year"].between(1940, 2023)].sort_values(
    by=["year", "month", "day"]
)

# Monthly timestamp built from year and month ("YYYY-M" parsed with %Y-%m,
# which yields the first day of that month); the day column is not used
pdf_gbif["date"] = pd.to_datetime(
    pdf_gbif["year"].astype(str) + "-" + pdf_gbif["month"].astype(str),
    format="%Y-%m",
)

# Combine era5 and gbif data

In [None]:
# For every GBIF occurrence, look up the ERA5 grid cell and time step closest
# to the record's position and date. The indexers are built from the
# occurrence frame, so the result is aligned with its index.
occurrence_indexers = {
    "latitude": pdf_gbif["decimalLatitude"].to_xarray(),
    "longitude": pdf_gbif["decimalLongitude"].to_xarray(),
    "time": pdf_gbif["date"].to_xarray(),
}
xds_nearest = xds.sel(occurrence_indexers, method="nearest")

In [None]:
# Turn the selected ERA5 values into a pandas DataFrame and mark every column
# with an "era5_" prefix so they can be told apart from the GBIF columns.
# (Alternative kept for reference: xds_nearest.to_dask_dataframe(set_index="gbifID"))
era5_values = xds_nearest.to_dataframe()
pdf_era5 = era5_values.rename(columns=lambda column: f"era5_{column}")

In [None]:
# Attach the ERA5 values to the occurrence records; the inner join on gbifID
# keeps only the records present in both frames
merged_df = pdf_gbif.merge(pdf_era5, how="inner", on=["gbifID"])

In [None]:
# Write the merged dataframe to disk.
# `index` is a boolean switch in DataFrame.to_parquet, not a column name:
# index=True stores the frame's index (gbifID) in the file.
merged_df.to_parquet("/home/susannaioni/s3/data/merged/gbif_era5_t2m.parquet", index=True)

### Plotting Example Data

#### ERA5 subset

In [None]:
# Spatial window (latitude/longitude bounds) and period shown in the example
# plot; label slices include both end points
lat_window = slice(33.431441, 72.816074)
lon_window = slice(-26.191406, 39.375000)
time_window = slice("1940", "1945")

# Pick the 2 m temperature variable and cut it down to the window above
xda_t2m = xds["t2m"].sel(
    latitude=lat_window,
    longitude=lon_window,
    time=time_window,
)

In [None]:
# Animated heatmap of the ERA5 subset with one frame per time step.
# NOTE(review): zmin/zmax of 250-300 presumably are Kelvin — confirm the
# units of t2m.
heatmap_options = dict(
    animation_frame="time",
    zmin=250,
    zmax=300,
    color_continuous_scale="RdBu_r",
    width=800,
    height=800,
)
fig = px.imshow(xda_t2m, **heatmap_options)

# Switch off automatic ranging of the y axis
# NOTE(review): presumably this undoes the reversed y axis that imshow uses
# by default — confirm the map is drawn north-up.
fig.update_layout(yaxis_autorange=False)
fig.show()

#### GBIF Subset

In [None]:
# Occurrences recorded from 1940 to 1950 (both years inclusive)
gbif_1940_1950 = pdf_gbif[pdf_gbif["year"].between(1940, 1950)]

# Plot them on a map that is zoomed to the extent of the plotted points
fig = px.scatter_geo(
    gbif_1940_1950,
    lat="decimalLatitude",
    lon="decimalLongitude",
    hover_data=["scientificName", "coordinateUncertaintyInMeters"],
    fitbounds="locations",
    width=800,
    height=400,
)
fig.show()