# Correlation Analysis
Notebook to perform a preliminary analysis on BigQuery data.<br>
The notebook focuses on the correlation between meteo and pollen data, both for daily-aggregated and weekly-aggregated values.<br>
The data considered cover the last 10 years.

### Data Description

Correlation is studied on 4 different tables representing datasets:
- <b>ALL_METEO_FEATS</b>: it contains all the features extracted from all_meteo.
- <b>ALL_METEO_FEATS_POL_DAT</b>: it joins the extracted features with the pol_data values.
- <b>ALL_METEO_WEEK_FEATS</b>: it contains all the features aggregated by week from all_meteo.
- <b>ALL_METEO_WEEK_FEATS_POL_DAT</b>: it joins the features aggregated by week with the pol_data values.

<h3>Import</h3>

In [1]:
from tqdm.auto import tqdm
import json
import math
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
from google.cloud import bigquery
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import ipywidgets as widgets
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error
from plotly.offline import init_notebook_mode, iplot

import warnings
warnings.filterwarnings('ignore')

my_cmap = plt.get_cmap("Paired")
init_notebook_mode(connected=True)  
tqdm.pandas()

<h3>Config</h3>

In [2]:
# Config
# Notebook-wide constants: GCP project, BigQuery datasets, analysis period and feature names.

# GCP project passed to bigquery.Client and used to qualify the table names in the queries.
PROJECT_ID = 'arpae-prod-ml'

# BigQuery
# BQ_DATASET is not referenced by the cells shown in this notebook; the reads use JOINED_BQ_DATASET.
BQ_DATASET = 'SAMPLE_DATA'
JOINED_BQ_DATASET = 'JOINED_DATA'

# Paths
# NOTE(review): this file is never loaded in the visible cells, while _set_var_descr expects a
# global `b_codes` mapping - presumably it should be read from this path; confirm.
BCODES_PATHS = "../data/b_codes.json"

# Const
# Period bounds handed to _read_table_delta, which compares with strict '>' and '<',
# so rows dated exactly on either bound are not returned.
COMMON_PERIOD_INIT = '2011-01-01'
COMMON_PERIOD_END = '2021-12-31' 
#COMMON_PERIOD_YEAR = '2021'

# Feats
# METEO_VARS x METEO_FEATS are combined as "<var>_<feat>" column names by _get_species_corr.
METEO_FEATS = ['min', 'max', 'mean', 'std']
METEO_VARS = ['B13011', 'B14198', 'PREC', 'TEMP']
# Feature columns correlated against the pollen value in the daily analysis.
FEATS = ['B13011_min', 'B13011_max', 'B13011_mean', 'B13011_std', 
         'B14198_min', 'B14198_max', 'B14198_mean', 'B14198_std', 
         'TEMP_min', 'TEMP_max', 'TEMP_mean', 'TEMP_std', 'PREC']
# Feature columns used in the weekly analysis (FEATS plus 'PREC_sum').
WEEK_FEATS = ['B13011_min', 'B13011_max', 'B13011_mean', 'B13011_std', 
              'B14198_min', 'B14198_max', 'B14198_mean', 'B14198_std', 
              'TEMP_min', 'TEMP_max', 'TEMP_mean', 'TEMP_std', 'PREC', 'PREC_sum']
# Day offsets between the meteo features and the pollen label evaluated by _windows_corr.
SHIFT_DAYS_LIST = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Layout
# Discrete colour sequence passed to the plotly express bar charts.
COLOR_PALETTE = px.colors.qualitative.Prism


<h3>Methods</h3>

In [3]:
# Read Methods

def _run_query(client, query): 
    df = client.query(query).to_dataframe()
    return df

def _read_table(client, project_id, dataset, table):
    query = "SELECT * FROM `{}.{}.{}` ".format(project_id, dataset, table)
    df = _run_query(client, query)
    return df

def _read_table_delta(client, project_id, dataset, table, date_col, init, end):
    query = "SELECT * FROM `{}.{}.{}` WHERE {} > '{}' AND {} < '{}' ".format(project_id, dataset, table, date_col, init, date_col, end)
    df = _run_query(client, query)
    if 'reftime' in df.columns:
        df.sort_values(by='reftime', inplace=True)
    elif date_col in df.columns:
        df.sort_values(by=date_col, inplace=True)
    else:
        return None
    return df

def _extract_date(x):
    return str(x)[:10]

def _set_var_descr(x):
    if x in b_codes:
        return b_codes[x]
    else:
        return x

def _create_station_init_end_widget(df):
    """Build the station dropdown plus the init/end date pickers.

    The dropdown lists the sorted unique ``station_id`` values of ``df``; the
    pickers start at 2021-05-01 and 2021-05-05 respectively.

    Returns the tuple ``(station_wdgt, init_wdgt, end_wdgt)``.
    """
    stations = df.station_id.sort_values().unique()
    default_init = datetime.date(2021, 5, 1)
    default_end = datetime.date(2021, 5, 5)
    return (
        widgets.Dropdown(options=stations, description='Station Id:', layout={"width": "50%"}),
        widgets.DatePicker(description='Init Date', value=default_init),
        widgets.DatePicker(description='End Date', value=default_end),
    )

def _create_pol_var_id_widget(df):
    """Return a dropdown listing the sorted unique ``pol_var_id`` values of ``df``."""
    choices = df.pol_var_id.sort_values().unique()
    return widgets.Dropdown(options=choices, description='Pol var id:', layout={"width": "50%"})

def _create_station_id_widget(df):
    """Return a dropdown listing the sorted unique ``station_id`` values of ``df``."""
    choices = df.station_id.sort_values().unique()
    return widgets.Dropdown(options=choices, description='Station Id:', layout={"width": "50%"})


In [4]:
# Comp Methods

def _shift_date(x, shift_days):
    date = pd.to_datetime(x)
    date = date - timedelta(days=shift_days)
    return date.strftime("%Y-%m-%d")

def _shift_week(x, shift_week):
    return x-shift_week

def _corr(x, y):
    #return scipy.stats.pearsonr(x, y)[0]
    #return scipy.stats.spearmanr(x, y).statistic
    return scipy.stats.spearmanr(x, y).correlation

def _get_species_corr(df, meteo_vars, meteo_feats, label_col):
    species_corr = {}
    for pol_var_id in tqdm(df.pol_var_id.unique()):
        # get species df
        species_dataset_df = df[df['pol_var_id']==pol_var_id].dropna()
        # get mean corr for current species
        for var in meteo_vars:
            corrs = []
            for feat in meteo_feats:
                if var+"_"+feat in species_dataset_df.columns:
                    corr = _corr(species_dataset_df[var+"_"+feat], species_dataset_df['POL_sum'])
                    corrs.append(corr)
            species_corr[pol_var_id] = np.round(np.mean([np.abs(c) for c in corrs]), 2)
    # create df
    species_corr_df = pd.DataFrame([species_corr]).T.reset_index()
    species_corr_df.rename(columns={'index':'pol_var_id', 0:'corr'}, inplace=True)
    species_corr_df.sort_values(by='corr', inplace=True)
    return species_corr_df

def _windows_corr(df, shift_days_list, feats):    
    rows = []
    for shift_days in shift_days_list:
        row = []
        row.append(shift_days)
        # Get features & label
        df_feats = df[['station_id', 'date', 'pol_var_id'] + feats]
        df_label = df[['station_id', 'date', 'pol_var_id', 'pol_value']]
        # Shift label
        df_label['date'] = df_label['date'].apply(lambda x: _shift_date(x, shift_days))
        # Join features with shifted labels
        df_data = pd.merge(df_feats, df_label, on=['station_id', 'date', 'pol_var_id'])
        df_data.dropna(inplace=True)
        # Remove where there is no pollen
        df_data = df_data[df_data['pol_value']!=0]
        # Get correlation for each feat
        corrs = []
        for feat in feats:
            corr = np.round(_corr(df_data[feat], df_data['pol_value']), 2)
            row.append(corr)
            corrs.append(corr)
        # Get mean correlation
        mean_corr = np.round(np.mean([np.abs(c) for c in corrs if not math.isnan(c)]), 2)
        row.append(mean_corr)
        rows.append(row)
    df_corr = pd.DataFrame(rows, columns=['shift_days'] + feats + ['mean'])
    return df_corr

def _week_windows_corr(df, shift, week_feats):    
    # Get features & label
    df_feats = df[['station_id', 'year', 'week', 'pol_var_id'] + week_feats]
    df_label = df[['station_id', 'year', 'week', 'pol_var_id', 'POL_mean', 'POL_sum']]
    # Shift label (nb: error on last week of the year)
    df_label['week'] = df_label['week'].apply(lambda x: _shift_week(x, shift))
    # Join features with shifted labels
    df_data = pd.merge(df_feats, df_label, on=['station_id', 'year', 'week', 'pol_var_id'])
    df_data.dropna(inplace=True)
    # Remove where there is no pollen
    df_data = df_data[df_data['POL_sum']!=0]
    # Get correlation for each feat
    corrs = []
    for feat in week_feats:
        corr = np.round(_corr(df_data[feat], df_data['POL_sum']), 2)
        corrs.append(corr)
    # Get mean correlation
    mean_corr = np.round(np.mean([np.abs(c) for c in corrs if not math.isnan(c)]), 2)
    df_corr = pd.DataFrame([corrs + [mean_corr]], columns=week_feats + ['mean'])
    return df_corr

def _regression(df, pol_var_id):
    """Fit a linear regression of ``pol_value`` on the daily meteo features for one species.

    Rows of ``pol_var_id`` are selected, restricted to the feature columns plus
    ``pol_value``, and rows with NaN are dropped. The data is split 80/20 by
    ``train_test_split`` (no fixed seed, so the split differs between runs), the
    model is fitted on the training part and evaluated on the test part.

    Returns a DataFrame with columns ``index``, ``truth`` and ``pred`` holding
    one row per test sample.
    """
    target = 'pol_value'
    predictors = [
        'B13011_min', 'B13011_max', 'B13011_mean', 'B13011_std',
        'B14198_min', 'B14198_max', 'B14198_mean', 'B14198_std',
        'TEMP_min', 'TEMP_max', 'TEMP_mean', 'TEMP_std',
        'PREC_min', 'PREC_max', 'PREC_mean', 'PREC_std',
    ]
    # Get data of selected species
    species_df = (
        df.loc[df['pol_var_id'] == pol_var_id, predictors + [target]]
        .dropna()
        .reset_index(drop=True)
    )
    # Split train-test
    train_df, test_df = train_test_split(species_df, test_size=0.2)
    # Create Model
    model = LinearRegression().fit(train_df.drop(columns=[target]), train_df[target])
    # Predict
    predicted = model.predict(test_df.drop(columns=[target]))
    return (
        pd.DataFrame([test_df[target].values, predicted])
        .T
        .rename(columns={0: 'truth', 1: 'pred'})
        .reset_index()
    )


In [5]:
# Plot Methods

def _plot_value_distr(df_name, df, var_id, var_col):
    """Show a bar chart of ``var_col`` against ``var_id`` for the given frame.

    Parameters
    ----------
    df_name : title of the figure.
    df : DataFrame holding the ``var_id`` and ``var_col`` columns.
    var_id : column plotted on the x axis.
    var_col : column plotted on the y axis.
    """
    # Plot the frame passed by the caller, not a notebook-level global
    fig = px.bar(title=df_name,
                 data_frame=df,
                 x=var_id,
                 y=var_col,
                 color_discrete_sequence=COLOR_PALETTE,
                 template="simple_white")
    fig.update_xaxes(title_text='b_code', showgrid=True)
    fig.update_yaxes(title_text='correlation', showgrid=True)
    fig.show()
    
def _plot_windows_correlation(df, feats, feat_name=""):
    """Show two charts of the correlation against ``shift_days``.

    The first chart plots the ``mean`` column, the second one plots every
    column listed in ``feats``. ``feat_name`` is prepended to both titles.
    """
    def _style_and_show(figure):
        # Styling shared by both charts
        figure.update_traces(marker=dict(size=6), mode="markers+lines")
        figure.update_xaxes(title_text='shift_days', showgrid=True, tickmode='linear')
        figure.update_yaxes(title_text='val', showgrid=True)
        figure.show()

    # mean corr
    mean_fig = px.scatter(title="{} Windows Mean Correlation".format(feat_name),
                          data_frame=df,
                          x='shift_days',
                          y='mean',
                          color='shift_days',
                          hover_name='mean',
                          template="simple_white")
    _style_and_show(mean_fig)

    # feats corr
    feats_fig = px.scatter(title="{} Windows Feats Correlation".format(feat_name),
                           data_frame=df,
                           x='shift_days',
                           y=feats,
                           template="simple_white")
    _style_and_show(feats_fig)
    
def _plot_all_species_corr(df_corr_list, df_corr_names):
    """Overlay the ``mean`` correlation curve of several correlation tables.

    Parameters
    ----------
    df_corr_list : iterable of DataFrames with ``shift_days`` and ``mean`` columns.
    df_corr_names : iterable of trace names, paired positionally with ``df_corr_list``.
    """
    fig = go.Figure()
    for df, name in zip(df_corr_list, df_corr_names):
        fig.add_scatter(x=df['shift_days'],
                        y=df['mean'],
                        name=name)
    # Styling does not depend on the individual trace: apply it once, after all
    # traces exist, so it is also applied when the input lists are empty.
    fig.update_traces(marker=dict(size=6), mode="markers+lines")
    fig.update_xaxes(title_text='shift_days', showgrid=True, tickmode='linear')
    fig.update_yaxes(title_text='mean', showgrid=True)
    fig.update_layout(template='simple_white', title="Windows Mean Correlation")
    fig.show()

def _plot_week_corr(df_corr):
    """Show a bar chart with one bar per column of ``df_corr``.

    Bar heights are the values of the row labelled ``0``.
    """
    feat_names = df_corr.to_dict().keys()
    first_row_values = df_corr.loc[0].values
    fig = px.bar(title='Week Feats Correlations',
                 x=feat_names,
                 y=first_row_values,
                 color_discrete_sequence=COLOR_PALETTE,
                 template="simple_white")
    fig.update_xaxes(title_text='feats', showgrid=True)
    fig.update_yaxes(title_text='val', showgrid=True)
    fig.show()
    
def _plot_preds(df):
    """Plot the ``pred`` and ``truth`` columns of ``df`` against its ``index`` column."""
    fig = go.Figure()
    # Same x values for both curves; prediction first, ground truth second
    for series_name in ('pred', 'truth'):
        fig.add_scatter(x=df['index'],
                        y=df[series_name],
                        name=series_name)
    fig.update_traces(marker=dict(size=6), mode="markers+lines")
    fig.update_xaxes(title_text='datetime', showgrid=True)
    fig.update_yaxes(title_text='val', showgrid=True)
    fig.update_layout(template='simple_white', title='Pred VS Ground Truth')
    fig.show()
    

<h3>1. Config</h3>

<h4>1.1 Config BigQuery</h4>

In [6]:
# Setup Client
# BigQuery client bound to PROJECT_ID; it is reused by every table read below.

bq_client = bigquery.Client(project=PROJECT_ID)
# Last expression of the cell: displays the client repr as the cell output.
bq_client

<google.cloud.bigquery.client.Client at 0x1069f9640>

<h3>2. Read Data</h3>

<h4>2.1 Read Tables</h4>

<b>ALL_METEO_FEATS</b> contains features extracted by meteo values on daily aggregation, such as: min, max, mean, std values for B13, B14, TEMP and PREC.

In [7]:
# Read ALL_METEO_FEATS
# Rows are filtered on "date" with strict bounds, so the two bound dates themselves are excluded.

all_meteo_feats_df = _read_table_delta(bq_client, PROJECT_ID, JOINED_BQ_DATASET, 
                                       "ALL_METEO_FEATS", "date",
                                       COMMON_PERIOD_INIT, COMMON_PERIOD_END)
# Dates are kept as strings for the rest of the notebook.
all_meteo_feats_df['date'] = all_meteo_feats_df['date'].astype("str")
print(all_meteo_feats_df.shape)
all_meteo_feats_df.head(3)

(40666, 28)


Unnamed: 0,station_id,date,B13011_min,B13011_max,B13011_mean,B13011_std,B13011_sum,B14198_min,B14198_max,B14198_mean,...,id_gepo,station_lat,station_lon,station_nome,station_H_piano_strada,station_H_mslm,arkimet_id,arkimet_lat,arkimet_lon,meteo_id
10968,4,2011-01-02,0.0,0.0,0.0,0.0,0.0,-9.0,306.0,49.6,...,4182,44.8103,11.5876,Ferrara,8.0,29.0,Ferrara urbana,44.8325,11.6211,1573
12915,13,2011-01-02,0.0,0.0,0.0,0.0,0.0,-3.0,204.0,31.88,...,4190,44.6552,11.6231,San Pietro Capofiume,10.0,31.0,San Pietro Capofiume,4.65378,11.6226,1618
24156,2,2011-01-02,0.0,0.0,0.0,0.0,0.0,-5.0,225.0,31.92,...,4187,44.1346,12.2587,Cesena,31.0,52.0,Cesena urbana,44.1382,12.2436,1989


<b>ALL_METEO_FEATS_POL_DAT</b> joins features ALL_METEO_FEATS with POL_DAT.

In [8]:
# Read ALL_METEO_FEATS_POL_DAT
# Rows are filtered on "date" with strict bounds, so the two bound dates themselves are excluded.

all_meteo_feats_pol_dat_df = _read_table_delta(bq_client, PROJECT_ID, JOINED_BQ_DATASET, 
                                               "ALL_METEO_FEATS_POL_DAT", "date",
                                               COMMON_PERIOD_INIT, COMMON_PERIOD_END)
# Dates must be strings: _windows_corr joins them with shifted dates formatted as YYYY-MM-DD text.
all_meteo_feats_pol_dat_df['date'] = all_meteo_feats_pol_dat_df['date'].astype("str")
print(all_meteo_feats_pol_dat_df.shape)
all_meteo_feats_pol_dat_df.head(3)

(920108, 33)


Unnamed: 0,station_id,date,B13011_min,B13011_max,B13011_mean,B13011_std,B13011_sum,B14198_min,B14198_max,B14198_mean,...,arkimet_id,arkimet_lat,arkimet_lon,meteo_id,pol_var_id,pol_value,modified,pol_var_descr,week,year
340130,1,2011-01-02,0.0,0.0,0.0,0.0,0.0,-8.0,149.0,27.48,...,Bologna urbana,44.5008,11.3288,1421,B48008,0.0,1,"Corilacee_Nocciolo, POLLEN/M**3",1,2011
547444,11,2011-01-02,0.0,0.2,0.001176,0.015339,0.2,-5.0,289.0,29.72,...,Rimini urbana,44.0592,12.5735,2231,B48003,0.0,1,"Betulacee_Betulla, POLLEN/M**3",1,2011
252376,10,2011-01-02,0.0,0.0,0.0,0.0,0.0,-7.0,141.0,24.08,...,Reggio nell'Emilia urbana,44.6978,10.6337,977,B48003,0.0,1,"Betulacee_Betulla, POLLEN/M**3",1,2011


<b>ALL_METEO_WEEK_FEATS</b> contains features extracted by meteo values on week aggregation, such as: min, max, mean, std, sum values for B13, B14, TEMP and PREC.

In [9]:
# Read ALL_METEO_WEEK_FEATS
# Weekly table: rows are filtered on "init_date" with strict bounds (bound dates excluded).

all_meteo_week_feats_df = _read_table_delta(bq_client, PROJECT_ID, JOINED_BQ_DATASET, 
                                            "ALL_METEO_WEEK_FEATS", "init_date",
                                            COMMON_PERIOD_INIT, COMMON_PERIOD_END)
print(all_meteo_week_feats_df.shape)
all_meteo_week_feats_df.head(3)

(5938, 29)


Unnamed: 0,station_id,week,year,init_date,end_date,B13011_min,B13011_max,B13011_mean,B13011_std,B14198_min,...,id_gepo,station_lat,station_lon,station_nome,station_H_piano_strada,station_H_mslm,arkimet_id,arkimet_lat,arkimet_lon,meteo_id
3337,5,1,2011,2011-01-02,2011-01-08,0.0,0.0,0.0,0.0,-10.0,...,4186,44.218,12.0328,Forlì,30.0,51.0,Forli' urbana,44.2204,12.0418,1867
5351,8,1,2011,2011-01-02,2011-01-08,0.0,1.0,0.004874,0.043518,-13.0,...,4176,45.0575,9.67732,Piacenza,55.0,76.0,Piacenza urbana,45.0549,9.67965,369
715,11,1,2011,2011-01-02,2011-01-08,0.0,0.2,0.000168,0.005798,-9.0,...,4188,44.045,12.5914,Rimini,7.0,28.0,Rimini urbana,44.0592,12.5735,2231


<b>ALL_METEO_WEEK_FEATS_POL_DAT</b> joins features ALL_METEO_WEEK_FEATS with POL_DAT, aggregated by week.

In [10]:
# Read ALL_METEO_WEEK_FEATS_POL_DAT
# Weekly table joined with pollen: rows are filtered on "init_date" with strict bounds (bound dates excluded).

all_meteo_week_feats_pol_dat_df = _read_table_delta(bq_client, PROJECT_ID, JOINED_BQ_DATASET, 
                                                    "ALL_METEO_WEEK_FEATS_POL_DAT", "init_date",
                                                    COMMON_PERIOD_INIT, COMMON_PERIOD_END)
print(all_meteo_week_feats_pol_dat_df.shape)
all_meteo_week_feats_pol_dat_df.head(3)

(140149, 32)


Unnamed: 0,station_id,week,year,init_date,end_date,B13011_min,B13011_max,B13011_mean,B13011_std,B14198_min,...,station_H_piano_strada,station_H_mslm,arkimet_id,arkimet_lat,arkimet_lon,meteo_id,pol_var_id,pol_var_descr,POL_mean,POL_sum
13210,6,1,2011,2011-01-02,2011-01-08,0.0,0.6,0.002521,0.032195,-13.0,...,28.0,49.0,Modena urbana,44.6564,10.917,1138,B48021,Cupressacee - Taxacee indistinte_Cupressacee -...,0.691429,4.84
86844,2,1,2011,2011-01-02,2011-01-08,0.0,0.0,0.0,0.0,-7.0,...,31.0,52.0,Cesena urbana,44.1382,12.2436,1989,B48016,"Oleacee_Frassino, POLLEN/M**3",0.0,0.0
24186,9,1,2011,2011-01-02,2011-01-08,0.0,0.0,0.0,0.0,-7.0,...,2.0,23.0,Ravenna urbana,44.415,12.2,1983,B48016,"Oleacee_Frassino, POLLEN/M**3",0.048571,0.34


<h3>3. Day Windows Correlation Analysis</h3>

The consequences of meteo features are not immediately visible on pollen data: the <b>effect should be seen in the next 1-10 days</b>.<br>
We analyze the <b>correlation</b> between meteo features and pollen concentration over <b>different time ranges</b> (1 day, 2 days, ...) to understand which period has the highest correlation.<br>
Of course we can estimate both a <b>generic</b> and a <b>per-species period</b>, since each species potentially behaves differently from the others.

<h4>3.1 Generic Correlation Analysis</h4>

The first attempt simply considers the mean correlation between features columns and label column.

In [11]:
# Get windows correlation
# All stations and species are pooled together: one correlation row per day shift in SHIFT_DAYS_LIST.
df_corr = _windows_corr(all_meteo_feats_pol_dat_df, SHIFT_DAYS_LIST, FEATS)

# Plot windows Mean & Feats correlation
_plot_windows_correlation(df_corr, FEATS)

<h4>3.2 Per Station-Species Correlation Analysis</h4>

The second attempt computes the correlation for each station and each species at every day shift, and then takes the mean value of each feature.

In [12]:
# Plot Mean Feat corr & Mean corr for each station_id and pol_var_id

# One correlation table per (station_id, pol_var_id) pair. groupby visits each
# existing pair once instead of re-scanning the whole table for every combination.
df_corr_list = []
station_species_groups = all_meteo_feats_pol_dat_df.groupby(['station_id', 'pol_var_id'], sort=False)
for _pair, df in tqdm(station_species_groups):
    df_corr_list.append(_windows_corr(df, SHIFT_DAYS_LIST, FEATS))

# Mean of the absolute correlations for each day shift (NaN entries are skipped).
# Column-wise groupby().mean() is used instead of np.mean on a DataFrame, whose
# result depends on the pandas version; as_index=False keeps 'shift_days' as a
# plain column for the plot.
df_corr = (pd.concat(df_corr_list, ignore_index=True)
             .abs()
             .groupby('shift_days', as_index=False)
             .mean()
             .round(2))

_plot_windows_correlation(df_corr, FEATS)

  0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
# Selected Species

#species_ids = ['B48039', 'B48041', 'B48037']
#for species_id in species_ids:
#    species_df = all_meteo_feats_pol_dat_df[all_meteo_feats_pol_dat_df['pol_var_id']==species_id]
#    df_corr = _windows_corr(species_df, SHIFT_DAYS_LIST, FEATS)
#    _plot_windows_correlation(df_corr, FEATS, species_id)

<h3>4. Week Correlation Analysis</h3>

Starting from ALL_METEO_WEEK_FEATS_POL_DAT we can study the <b>correlation</b> between <b>meteo data aggregated by week</b> and the <b>pollen data aggregated over the following week</b>.

<h4>4.1 Generic Week Correlation Analysis</h4>

The first attempt simply computes correlation between meteo & pollen data columns.

In [16]:
# Get corr
# shift=1: features of a week are compared with the pollen of the following week (all stations/species pooled).
df_corr = _week_windows_corr(all_meteo_week_feats_pol_dat_df, shift=1, week_feats=WEEK_FEATS)

# Plot corr
_plot_week_corr(df_corr)

<h4>4.2 Station_id/Pol_var_id Week Correlation Analysis</h4>

The second attempt computes the correlation for each station_id and pol_var_id combination; then it takes the mean of each feature and the mean of the per-combination mean values.

In [27]:
# For each (station_id, pol_var_id) pair present in the data
df_corr_list = []
station_species_groups = all_meteo_week_feats_pol_dat_df.groupby(['station_id', 'pol_var_id'], sort=False)
for _pair, df in station_species_groups:
    # get corr
    df_corr_list.append(_week_windows_corr(df, shift=1, week_feats=WEEK_FEATS))

# Column-wise mean of the absolute correlations (NaN entries are skipped), kept
# as a one-row frame labelled 0 because _plot_week_corr reads df_corr.loc[0].
# DataFrame.mean() is used instead of np.mean on a DataFrame, whose result
# depends on the pandas version.
df_corr = (pd.concat(df_corr_list, ignore_index=True)
             .abs()
             .mean()
             .round(2)
             .to_frame()
             .T)

# plot corr
_plot_week_corr(df_corr)

<h4>4.3 Station_id/Pol_var_id Species Mean Week Correlation Analysis</h4>

The graph shows the mean correlation of each species with the label. The mean values are computed from the features.

In [28]:
# Plot mean corr for each species
# One bar per pol_var_id: mean absolute correlation between the weekly meteo features and POL_sum.

species_corr_df = _get_species_corr(all_meteo_week_feats_pol_dat_df, METEO_VARS, METEO_FEATS, 'POL_sum')
_plot_value_distr('Species Week Mean Corr', species_corr_df, 'pol_var_id', 'corr')

  0%|          | 0/33 [00:00<?, ?it/s]

<h4>4.4 Focus on single Pollen B_Code</h4>

In [29]:
# Select Station_id & Pol_var_id 
# The dropdown values chosen here are read by the next cells, so those cells
# must be re-run after changing the selection.

pol_wdgt = _create_pol_var_id_widget(all_meteo_week_feats_pol_dat_df)
station_wdgt = _create_station_id_widget(all_meteo_week_feats_pol_dat_df)
display(station_wdgt, pol_wdgt)

Dropdown(description='Station Id:', layout=Layout(width='50%'), options=(1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13), …

Dropdown(description='Pol var id:', layout=Layout(width='50%'), options=('B48001', 'B48002', 'B48003', 'B48005…

In [30]:
# Weekly rows of the station / pollen code currently selected in the dropdowns.
df = all_meteo_week_feats_pol_dat_df
# Feature columns scattered against POL_sum in the next cell.
feats = FEATS
df = df[(df['pol_var_id']==pol_wdgt.value) & (df['station_id']==station_wdgt.value)]

In [31]:
# Grid of scatter plots: one panel per feature, feature value vs weekly pollen sum.
n_col = 4
figures = [px.scatter(title=feat,
                      data_frame=df,
                      x=feat,
                      y='POL_sum',
                      color='pol_var_id',
                      hover_name='pol_var_id')
           for feat in feats]

# Ceiling division: exactly as many rows as needed (no empty extra row when the
# number of features is a multiple of n_col), and at least one row.
n_row = max(1, math.ceil(len(figures) / n_col))
# Only traces are copied into the grid, so the per-figure titles are passed as
# subplot titles to keep each panel labelled with its feature.
fig = make_subplots(rows=n_row, cols=n_col, subplot_titles=list(feats))
for i, figure in enumerate(figures):
    row, col = divmod(i, n_col)
    for trace in figure["data"]:
        fig.add_trace(trace, row=row + 1, col=col + 1)

# Axis/layout styling applies to the whole figure: do it once, after all traces exist
fig.update_yaxes(showgrid=True)
fig.update_xaxes(showgrid=True)
fig.update_layout(title_text="Features Correlation for station_id:{} & pol_var_id: {}".format(station_wdgt.value, pol_wdgt.value), 
                  template="simple_white", height=1200)
fig.update_traces(marker=dict(size=6), mode="markers", showlegend=False)
fig.show()


<h4>4.5 Weekly Aggregation for meteo VS next-day aggregation for pollen</h4>

In order to create a model, we expect a more interesting study to be one that combines:
- a week-aggregation for meteo features
- the next-day aggregation for pollen

The result will reveal if there is some correlation between data collected in the previous week and the forecasting for the next day, that could be a realistic scenario.

In [32]:
# For each station_id: weekly meteo features (dated at the week's end_date) are
# correlated with the pollen value measured the day after the week ends.
label_cols = ['station_id', 'date', 'pol_var_id', 'pol_value']
df_corr_list = []
for station_id in tqdm(all_meteo_week_feats_pol_dat_df.station_id.unique()):
    # filter meteo for current station; loc + rename build a new frame, so the
    # date cast below does not write into a slice of the source table
    df_feats = (all_meteo_week_feats_df
                .loc[all_meteo_week_feats_df['station_id'] == station_id,
                     ['station_id', 'end_date'] + FEATS]
                .rename(columns={'end_date': 'date'}))
    df_feats['date'] = df_feats['date'].astype(str)
    # daily pollen of the station with dates moved back by one day, computed
    # once per station instead of once per species
    station_labels = all_meteo_feats_pol_dat_df.loc[
        all_meteo_feats_pol_dat_df['station_id'] == station_id, label_cols]
    shifted_dates = (pd.to_datetime(station_labels['date'])
                     - pd.Timedelta(days=1)).dt.strftime("%Y-%m-%d")
    station_labels = station_labels.assign(date=shifted_dates)
    # for each pol_var_id
    for pol_var_id in all_meteo_week_feats_pol_dat_df.pol_var_id.unique():
        # filter pollen of current pol_var_id
        df_label = station_labels[station_labels['pol_var_id'] == pol_var_id]
        # merge with meteo df
        df_data = pd.merge(df_feats, df_label, on=['station_id', 'date']).dropna()
        # get corr
        corrs = [np.round(_corr(df_data[feat], df_data['pol_value']), 2) for feat in FEATS]
        # Get mean correlation (guarding the all-NaN case)
        valid = [np.abs(c) for c in corrs if not math.isnan(c)]
        mean_corr = np.round(np.mean(valid), 2) if valid else np.nan
        df_corr_list.append(pd.DataFrame([corrs + [mean_corr]], columns=FEATS + ['mean']))

# Column-wise mean of the absolute correlations (NaN entries are skipped), kept
# as a one-row frame labelled 0 because _plot_week_corr reads df_corr.loc[0].
# DataFrame.mean() is used instead of np.mean on a DataFrame, whose result
# depends on the pandas version.
df_corr = (pd.concat(df_corr_list, ignore_index=True)
             .abs()
             .mean()
             .round(2)
             .to_frame()
             .T)

# plot corr
_plot_week_corr(df_corr)

  0%|          | 0/11 [00:00<?, ?it/s]