# Overall deviation from Baseline

This notebook contains the code used to load a subset of fire events from the database and calculate the deviation from the baseline for each of the selected fire events. Next, some descriptive statistics regarding the deviation from the baseline for all the selected fire events are visualized in a line graph. 

In [None]:
# load the required packages
import os
import math
import sys
import psycopg2
import pandas as pd
from pathlib import Path

import plotly.io as pio
import plotly.graph_objects as go

# make the local package importable: the directory three levels above the current
# working directory is appended to the Python path, unless it is already listed there
package_root = os.getcwd()
for _ in range(3):
    package_root = os.path.dirname(package_root)

if package_root not in sys.path:
    sys.path.append(package_root)

# local imports 
from wp4.constants import POLLUTANTS, DB_HOST, DB_NAME, DB_USER, DB_PASS, DATA_DIR_CAMS, DATA_DIR_PLOTS
from wp4.baseline.spatial import get_spatial_baseline
from wp4.baseline.temporal import get_temporal_baseline

## Initiate the connection to the database and load the information for the group of fire events to analyse from the fire event table.

In [None]:
# initiate connection to database
conn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASS, host=DB_HOST)

# Query to retrieve fire event info from the database.
# This particular example loads all fire events picked up by the Aqua/Terra satellites.

query = """  
    SELECT id, datetime, ST_X(geometry), ST_Y(geometry), source, location, reference, type, info
    FROM public.fire_events
    WHERE reference = 'Aqua' OR reference = 'Terra'
"""

try:
    # Load as a dataframe; the ST_X/ST_Y result columns ('st_x'/'st_y') are renamed
    # to 'longitude'/'latitude', the names used by the rest of the notebook
    df_fire_events = pd.read_sql_query(query, con=conn).rename(
        columns={'st_x': 'longitude', 'st_y': 'latitude'}
    )
finally:
    # close the connection, also when the query raised an exception
    conn.close()

In [None]:
def create_dataframe(main_df, df_to_join, index, new_col_name):
    """
    Collect the baseline deviation of one fire event as an extra column of a shared dataframe
    :param main_df: the dataframe accumulating the deviations so far, or None when nothing was collected yet
    :param df_to_join: the deviation values to add as a new column
    :param index: the hour_from_event values belonging to df_to_join, stored in the 'hours_from_fe' join column
    :param new_col_name: name of the new column, storing the baseline deviation information
    :return: a dataframe holding only 'hours_from_fe' and the new column when main_df is None, otherwise
        main_df merged with the new column on 'hours_from_fe' (pandas' default inner join, so only
        the hours present in both dataframes are kept)
    """

    # both cases start from the same two-column frame
    event_frame = pd.DataFrame({'hours_from_fe': index, new_col_name: df_to_join})

    if main_df is None:
        return event_frame

    return main_df.merge(event_frame, on='hours_from_fe')

## Next, for each of the fire events in the database, the difference and percent difference between the concentration levels during a time window around the fire event and the baselines are calculated and combined into a single dataframe.

This can take a while to run

In [None]:
# --- settings for this step ---
POLLUTANTS_TO_PROCESS = ['CO', 'O3', 'NO', 'NO2', 'PM25', 'PM10', 'SO2']
# window size handed to both baseline functions (the `days` keyword of the temporal baseline;
# presumably the same meaning for the 4th positional argument of the spatial baseline - TODO confirm)
BASELINE_DAYS = 5
# None processes every loaded fire event; set to a small integer (e.g. 1) for a quick test run
MAX_FIRE_EVENTS = None


def _baseline_deviation(df_baseline, baseline_col):
    """
    Calculate the deviation of the fire event concentrations from a baseline column
    :param df_baseline: dataframe holding a 'fire_event' column and the baseline column
    :param baseline_col: name of the column holding the baseline values
    :return: tuple of (absolute difference, difference in percent of the baseline)
    """
    difference = df_baseline['fire_event'] - df_baseline[baseline_col]
    # a baseline of exactly 0 would give +/-inf percentages, which would corrupt the mean and
    # quantiles calculated later on; such hours are treated as missing (NaN) instead
    denominator = df_baseline[baseline_col].replace(0, float('nan'))
    return difference, (difference / denominator) * 100


# final deviation dataframes per pollutant (tp = temporal baseline, sp = spatial baseline)
pollutant_difference_pol_tp = {}
pollutant_difference_percent_pol_tp = {}

pollutant_difference_pol_sp = {}
pollutant_difference_percent_pol_sp = {}

# the subset of fire events to process
fire_events_to_process = df_fire_events if MAX_FIRE_EVENTS is None else df_fire_events.head(MAX_FIRE_EVENTS)

for pol in POLLUTANTS_TO_PROCESS:

    # some variables to store the deviation dataframes in
    pollutant_difference_tp = None
    pollutant_difference_percent_tp = None

    pollutant_difference_sp = None
    pollutant_difference_percent_sp = None

    for ind, fe in fire_events_to_process.iterrows():  # iterate over the fire event dataframe

        # get the baseline information
        df_temporal_baseline = get_temporal_baseline(
            fe_lat=fe['latitude'],
            fe_long=fe['longitude'],
            timestamp=fe['datetime'],
            days=BASELINE_DAYS,
            pollutant=pol,
        )

        # only the first element of the returned tuple (the baseline dataframe) is needed here;
        # indexing avoids binding the unused elements to `_`, which IPython uses for the last output
        df_spatial_baseline = get_spatial_baseline(
            fe['latitude'],
            fe['longitude'],
            fe['datetime'],
            BASELINE_DAYS,
            pol,
            meteo_dataset='MERA',
            min_distance_km=30,
            max_distance_km=200,
            number_of_neighbours=50,
            mask_ocean=True,
        )[0]

        if df_spatial_baseline is None or df_temporal_baseline is None:
            continue  # skip the fire event in case no baseline could be retrieved

        # Calculate difference between baseline and concentration levels during fe
        difference_tp, difference_percent_tp = _baseline_deviation(
            df_temporal_baseline, 'temporal_baseline_median'
        )
        difference_sp, difference_percent_sp = _baseline_deviation(
            df_spatial_baseline, 'spatial_baseline_median'
        )

        hours_tp = list(df_temporal_baseline['hour_from_event'])
        hours_sp = list(df_spatial_baseline['hour_from_event'])

        # create/combine dataframe containing the deviation for each fire event as a column,
        # the column is named after the index of the fire event in the fire event dataframe
        pollutant_difference_tp = create_dataframe(
            pollutant_difference_tp, difference_tp, hours_tp, ind
        )
        pollutant_difference_percent_tp = create_dataframe(
            pollutant_difference_percent_tp, difference_percent_tp, hours_tp, ind
        )
        pollutant_difference_sp = create_dataframe(
            pollutant_difference_sp, difference_sp, hours_sp, ind
        )
        pollutant_difference_percent_sp = create_dataframe(
            pollutant_difference_percent_sp, difference_percent_sp, hours_sp, ind
        )

    # store the final dataframe with deviation info for each pollutant in a dict for later use
    # (stays None when no baseline could be retrieved for any fire event)
    pollutant_difference_pol_tp[pol] = pollutant_difference_tp
    pollutant_difference_percent_pol_tp[pol] = pollutant_difference_percent_tp
    pollutant_difference_pol_sp[pol] = pollutant_difference_sp
    pollutant_difference_percent_pol_sp[pol] = pollutant_difference_percent_sp

## Write the dataframes to csv, just in case

In [None]:
# temporal baseline: one percentage and one concentration csv file per pollutant
csv_loc = Path(DATA_DIR_PLOTS).joinpath('notebooks/baseline_deviation/temporal/csv')

for pol, df_percent in pollutant_difference_percent_pol_tp.items():

    if df_percent is None:
        continue  # no deviation data was gathered for this pollutant

    # creates missing parent directories, no error when the directory already exists
    csv_loc.mkdir(parents=True, exist_ok=True)

    # transpose before writing, so each fire event column becomes a row in the csv
    df_perc = df_percent.transpose()
    df_conc = pollutant_difference_pol_tp[pol].transpose()

    df_perc.to_csv(csv_loc / f'{pol}_percentage.csv')
    df_conc.to_csv(csv_loc / f'{pol}_concentration.csv')

In [None]:
# spatial baseline: one percentage and one concentration csv file per pollutant
csv_loc = Path(DATA_DIR_PLOTS).joinpath('notebooks/baseline_deviation/spatial/csv')

for pol, df_percent in pollutant_difference_percent_pol_sp.items():

    if df_percent is None:
        continue  # no deviation data was gathered for this pollutant

    # creates missing parent directories, no error when the directory already exists
    csv_loc.mkdir(parents=True, exist_ok=True)

    # transpose before writing, so each fire event column becomes a row in the csv
    df_perc = df_percent.transpose()
    df_conc = pollutant_difference_pol_sp[pol].transpose()

    df_perc.to_csv(csv_loc / f'{pol}_percentage.csv')
    df_conc.to_csv(csv_loc / f'{pol}_concentration.csv')

## Create a plotly graph visualising some group statistics  

In [None]:
TEMPLATE = pio.templates["ggplot2"]  # layout template to use for the figure styling
BASELINE = 'TP' # SP or TP

dict_baseline = {
    'SP': pollutant_difference_percent_pol_sp,
    'TP': pollutant_difference_percent_pol_tp,
}

# output sub-directory per baseline type, same naming as used for the csv exports
dict_baseline_dir = {
    'SP': 'spatial',
    'TP': 'temporal',
}


def _tick_label(hours):
    """
    Create the x axis label for a tick placed at the given offset from the fire event
    :param hours: offset from the fire event in hours, negative values lie before the fire event
    :return: 'N Days Before', 'Fire Event' (first day from the fire event onwards) or 'N Days After'
    """
    days = int(hours / 24)  # int() truncates towards zero
    if hours < 0:
        return f'{abs(days)} Days Before'
    if days == 0:
        return 'Fire Event'
    return f'{days} Days After'


# the figures are stored in the directory belonging to the selected baseline
output_loc = Path(DATA_DIR_PLOTS).joinpath(
    f'notebooks/baseline_deviation/{dict_baseline_dir[BASELINE]}/figures'
)

for pol, df_deviation in dict_baseline[BASELINE].items():

    if df_deviation is None:
        continue # if no data, skip to the next pollutant

    # after the transpose each row is a fire event and each column an hour relative to the fire event
    df = df_deviation.set_index('hours_from_fe').transpose() # select the data gathered
    pol_df = df.quantile([.25, .5, .75]) # calculate percentiles
    mean_df = df.mean() # mean

    cols = [int(col) for col in df.columns] # get the value range for the x axis

    if not cols:
        continue # no hours left to plot (e.g. the fire events share no common hours)

    # one tick every 24 hours, from the first up to and including the last hour
    tick_values = list(range(cols[0], cols[-1] + 1, 24))

    # create plotly figure and add the traces
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=cols,
        y=pol_df.loc[0.25],
        fill=None,
        mode='lines',
        line_color='#ffd6a5',
        name='25th percentile'
        ))
    fig.add_trace(go.Scatter(
        x=cols,
        y=pol_df.loc[0.75],
        fill='tonexty', # fill area between the 25th and the 75th percentile traces
        mode='lines',
        line_color='#ffd6a5',
        name='75th percentile',
    ))
    fig.add_trace(go.Scatter(
        x=cols,
        y=pol_df.loc[0.5],
        mode='lines',
        line_color='red',
        name='Median'
    ))

    fig.add_trace(go.Scatter(
        x=cols,
        y=mean_df,
        mode='lines',
        line_color='blue',
        name='Mean'
    ))

    fig.update_layout(
        title=f'{POLLUTANTS[pol]["FULL_NAME"]} ({POLLUTANTS[pol]["FORMULA_HTML"]}) deviation from baseline (in %)',
        xaxis_title="Days from Fire Event",
        yaxis_title="Deviation from Baseline in %",
        legend_title="Legend",
    )

    fig.update_layout(
        legend={'font_size': 14},
        template=TEMPLATE,
        # with tickmode 'array' the ticks are fully defined by tickvals/ticktext
        xaxis=dict(
            tickmode='array',
            tickvals=tick_values,
            ticktext=[_tick_label(x) for x in tick_values],
        ),
    )

    fig.show()

    # creates missing parent directories, no error when the directory already exists
    output_loc.mkdir(parents=True, exist_ok=True)

    fig.write_html(output_loc.joinpath(f"{pol}.html"))