In [1]:
!pip install basemap --quiet

import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
import xarray as xr
from mpl_toolkits.basemap import Basemap

# Descriptive approach for the data

There are 3 different files.

In [2]:
# Root folder of the sample data; every file path below is built from it.
PATH = './data/'
aggregated_path = f'{PATH}aggregated-values.csv'
probability_of_occurrence_path = f'{PATH}probability-of-occurrence.csv'
time_serie_path = f'{PATH}time-series.csv'

In [3]:
# `header=N` makes the (0-based) row N the column header and skips the rows
# before it -- presumably a metadata preamble of different length per file.
df_agg, df_prob = (
    pd.read_csv(csv_path, header=header_row)
    for csv_path, header_row in (
        (aggregated_path, 14),
        (probability_of_occurrence_path, 15),
    )
)
# The time-series file additionally uses its first column as the row index.
df_series = pd.read_csv(time_serie_path, header=17, index_col=0)

In [4]:
#Let's have a look at the data

In [5]:
def print_box(df, key_lon='xlong', key_lat='xlat'):
    """Print the longitude/latitude bounding box of the points in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one coordinate pair per row.
    key_lon : str
        Name of the column holding the longitudes.
    key_lat : str
        Name of the column holding the latitudes.

    Returns
    -------
    None
        The two summary lines are written to standard output.
    """
    print('Min lon {} and Max lon {}'.format(df[key_lon].min(),
                                             df[key_lon].max()))
    # Fix: the maximum latitude used to be read from the longitude column.
    print('Min lat {} and Max lat {}'.format(df[key_lat].min(),
                                             df[key_lat].max()))

# Report the bounding box of both point datasets, in the same order as before.
for frame in (df_prob, df_agg):
    print_box(frame)

In [6]:
# Summary statistics for the probability-of-occurrence table
df_prob.describe()

In [7]:
# Column dtypes and non-null counts for the probability-of-occurrence table
df_prob.info()

In [8]:
# Summary statistics for the aggregated-values table
df_agg.describe()

In [9]:
# Display the aggregated-values table itself
df_agg

In [10]:
# Let's consider that if sst_min is 0, it is a marker that the data point is on land.
# The mean of the boolean mask is the fraction of flagged rows; '{:.2%}' renders it
# as an actual percentage (the previous output was a raw 0-1 fraction labelled
# "Percentage").
land_share = (df_agg['sst_min'] == 0).mean()
print('Percentage of aggregated data on land is {:.2%}'.format(land_share))

In [11]:
# Map of the sample locations: probability-of-occurrence points in red,
# aggregated points in green, saved to a JPEG next to the notebook.
# NOTE(review): width/height look unused when the corner lon/lat are given
# with the default projection -- confirm against the Basemap documentation.
m = Basemap(width=12000000, height=9000000,
                resolution='i', 
                llcrnrlon=1, urcrnrlon=36,
                llcrnrlat=24, urcrnrlat=57
               )
# Background imagery, then coastlines and continent fill.
# NOTE(review): fillcontinents() is drawn after bluemarble() and may paint
# over the imagery on land -- confirm this is intended.
m.bluemarble()
m.drawcoastlines()
m.fillcontinents()
# Calling the Basemap instance converts lon/lat to map coordinates.
# NOTE: x and y stay in the notebook namespace after this cell; later cells
# must not pick them up by accident.
x, y = m(df_prob['xlong'], df_prob['xlat'])
x2, y2 = m(df_agg['xlong'], df_agg['xlat'])
m.scatter(x, y, 10, marker='o', color='r', label='Prob_Occur')
m.scatter(x2, y2, 10, marker='o', color='g', label='Aggregated')
plt.legend()
plt.title('Sample Example Location for Hackathon')
plt.savefig('Location_agg_and_prob_data.jpeg', dpi=399)

In [12]:
# Unpacking a DataFrame iterates over its column labels: this lists every column name
[*df_series]


In [13]:
# List the variables (column labels) available in the time-series table
df_series.columns

# Suggestions

- Provide data samples at the same location, on land if possible.
- Create a section in the data description docx with metadata such as "wdir_* represents the wind direction in [degrees] at level *". Or will a description be included in the API?
- A NetCDF/Zarr/GRIB file would be easier to handle: it would allow extracting variables at a specific height with simple indexing.
- These formats are self-describing, so the variable metadata would be available directly within a notebook.


# Play a bit with data

In [14]:
# Name fragments used to sort the time-series columns into one frame per variable.
var1, var2, var3 = 'wsp', 'wdir', 'rho'

# Bucket the column labels; the first matching fragment wins (var1, then
# var2, then var3), and columns matching none of them are ignored.
buckets = {var1: [], var2: [], var3: []}
for name in df_series.columns:
    tag = next((frag for frag in (var1, var2, var3) if frag in name), None)
    if tag is not None:
        buckets[tag].append(name)

# One frame per variable, sharing the index of df_series.
df, df2, df3 = (df_series[buckets[frag]].copy() for frag in (var1, var2, var3))

# The level is whatever remains of a var1 column name once the 'wsp_' prefix is removed.
altitude = [name.replace(var1 + '_', "") for name in buckets[var1]]
height = np.array(altitude, dtype=float)

In [15]:
# Time/height cross-section of wind speed: time on x, level on y.
x_time = df_series.index
y = height

# Keep one tick every 60 time steps so the labels stay legible.
# Fix: the ticks must come from this plot's own time axis. `x` is a leftover
# from the map cell (map coordinates of df_prob) and is unrelated to time.
xticks = x_time[::60]

fig, ax = plt.subplots()
# df is (time, level); contourf expects Z with shape (len(y), len(x)), hence the transpose.
contours = ax.contourf(x_time, y, df.to_numpy().T)
ax.set_xticks(xticks)
ax.set_yticks(height)
# The mappable is passed explicitly because ax.contourf does not register a
# "current image" the way plt.contourf does.
fig.colorbar(contours, ax=ax)
ax.set_title("Temporal evolution of wind speed in a vertical section")

In [16]:
# NumPy arrays of the three per-variable frames (one row per index entry,
# one column per selected column).
wsp = df.to_numpy()
wdir = df2.to_numpy()
rho = df3.to_numpy()
wsp.shape

In [17]:
# Bundle the three variables into one labelled dataset indexed by time and altitude.
# Each entry is (dimension names, data array, attributes).
ds = xr.Dataset(
    {
        "wsp": (["time", "altitude"], wsp,
               {'units': 'Meter per Second', 'long_name': 'Wind Speed'}),
        "wdir": (["time", "altitude"], wdir,
                {'units': 'Degrees', 'long_name': 'Wind Direction'}),
        "rho": (["time", "altitude"], rho,
               {'units': 'kg/m3', 'long_name': 'Density'})
    },
    coords={
        # The time axis is rebuilt from the first index entry with a fixed hourly step.
        # NOTE(review): this assumes the series is strictly hourly with no gaps -- confirm.
        # 'h' replaces the 'H' alias, which is deprecated since pandas 2.2.
        "time": pd.date_range(x_time[0], periods=len(x_time), freq='h'),
        "altitude": height
    }
)

ds

In [18]:
# Example: extract the full time series of every variable at a single altitude level.
# NOTE(review): .sel without `method` looks for an exact coordinate match, so
# 100.0 must be one of the altitude values -- confirm.
ds_sel = ds.sel(altitude=100.0)
ds_sel