In [None]:
import os, sys, glob
from pathlib import Path

# adds the package path to the Python path to make sure all the local imports work fine 
if os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))) not in sys.path:
    sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))))

from wp4.constants import POLLUTANTS, DATA_DIR_CAMS_AN, DB_HOST, DB_NAME, DB_USER, DB_PASS, EXTENTS, DATA_DIR_PLOTS
from wp4.baseline.spatial import get_spatial_baseline
from wp4.baseline.spatiotemporal import get_spatiotemporal_baseline
from wp4.processing.ground_stations import get_closest_active_epa_ground_station, get_epa_data
from wp4.processing.helpers import create_dataset

import pandas as pd
import xarray as xr
from datetime import datetime, timedelta
import geopandas
import json

import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# For transforming xarray data to images
from matplotlib import cm
import matplotlib.pyplot as plt
import datashader.transfer_functions as tf
import datashader as ds

from scipy.stats import pearsonr, spearmanr

# Folder that receives the HTML figures written by the plotting helpers in this notebook.
OUTPUT_DIR = Path(DATA_DIR_PLOTS).joinpath('ground_measurements')

# Create the folder (and any missing parents) in a single call; exist_ok avoids the
# check-then-create race of `if not exists: makedirs`.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Line / fill colours used by the plotting helpers, keyed by the role of the trace.
colors = {
        'temporal_median':'#38b000',
        'temporal_range':'#b5e48c',
        'spatial_median':'#822160',
        'spatial_range':'#F881C6',
        'spatiotemporal_median':'#38b000',
        'spatiotemporal_range':'#b5e48c',
        'fire_event':'red',
        'ground':'#ff7f00',
        'ground_hourly':'#33a02c',
        'ground_station_1':'#1f78b4',
        'ground_station_2':'#ffadad',
        'ground_station_3':'#ffd6a5',
        'ground_station_4':'#fdffb6',
        'ground_station_5':'#caffbf',
        'ground_station_6':'#9bf6ff',
        'ground_station_7':'#a0c4ff',
        'ground_station_8':'#bdb2ff',
        'ground_station_9':'#ffc6ff',
        'ground_station_10':'#fffffc',
    }

# Mapbox access token. Set the MAPBOX_TOKEN environment variable to use your own token;
# the literal below is only the fallback so existing runs keep working.
# NOTE(review): credentials should not live in the notebook - consider removing the fallback.
mapbox_token = os.environ.get(
    'MAPBOX_TOKEN',
    'pk.eyJ1IjoidGhlZG1lciIsImEiOiJja2luZ213cDUxMzFzMzJxamtsN29lcml4In0.44D6tYS0kX8VEverXMJY_Q',
)

# Variable / column names of each pollutant in the different data sources:
#   CAMS    - variable name in the CAMS dataset
#   GROUND  - column name in the ground-measurement dataframe
#   STATION - column name in the ground-station dataframes
POL_VAR_NAMES = {
    'PM25':{
        'CAMS':POLLUTANTS['PM25']['CAMS'],
        'GROUND':'pm2_5_atm',
        'STATION':'PM2.5',
    },
    'PM10':{
        'CAMS':POLLUTANTS['PM10']['CAMS'],
        'GROUND':'pm10_0_atm',
        'STATION':'PM10',
    },
    'PMWF':{
        'CAMS':'pmwf_conc',
    },
}

In [None]:
# Fire events, later filtered on their 'Source' column when drawn on the map.
gdf_fireevents = geopandas.read_file("data/fires_wick_field.geojson")

# Burn-scar polygons, both as a GeoDataFrame and as a plain GeoJSON dictionary.
gdf_burnscars = geopandas.read_file("data/burnscars.geojson")
burnscars = json.loads(gdf_burnscars.to_json())

In [None]:
# Functions for graphs

def calc_correlations(df_gs, df_hourly, dataset_var):
    """Correlate a ground-station series with the hourly ground measurements.

    Parameters
    ----------
    df_gs : pandas.DataFrame
        Ground-station data; its index holds the timestamps and its first
        column the values to correlate.
    df_hourly : pandas.DataFrame
        Hourly ground measurements with a ``UTCDateTime`` column and a
        ``dataset_var`` column.
    dataset_var : str
        Name of the measurement column of ``df_hourly`` to correlate.

    Returns
    -------
    tuple
        ``(pearson, spearman)`` exactly as returned by ``scipy.stats.pearsonr``
        and ``scipy.stats.spearmanr`` for the time-aligned, NaN-free pairs.
    """
    station_col = df_gs.columns[0]

    # Bring both inputs to a common shape: indexed by a level called 'time'.
    station = (
        df_gs
        .reset_index()
        .rename(columns={'index': 'time'})
        .set_index('time')
    )
    hourly = df_hourly.rename(columns={'UTCDateTime': 'time'}).set_index('time')
    measured = hourly[dataset_var]

    # As-of join on 'time', then drop incomplete pairs.
    paired = pd.merge_asof(station, measured, on="time").dropna()

    x, y = paired[dataset_var], paired[station_col]
    return pearsonr(x, y), spearmanr(x, y)

def process_ground_data(df):
    """Return a copy of ``df`` whose ``UTCDateTime`` column is parsed to datetimes.

    The ``_x001A_`` marker is stripped from the timestamps, rows whose
    timestamp is empty or missing are dropped, and the remaining strings are
    parsed with the format ``%Y/%m/%dT%H:%M:%Sz``. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``UTCDateTime``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If a remaining timestamp does not match the expected format.
    """
    df = df.copy()

    # The .str accessor leaves missing cells as NaN instead of raising on them.
    cleaned = df['UTCDateTime'].str.replace('_x001A_', '', regex=False)

    # Empty strings become NaN so one dropna removes both empty and missing timestamps.
    df['UTCDateTime'] = cleaned.where(cleaned != "")
    df = df.dropna(subset=['UTCDateTime'])

    df['UTCDateTime'] = pd.to_datetime(df['UTCDateTime'], format='%Y/%m/%dT%H:%M:%Sz')

    return df


def get_cams_data(pollutant, start_time, end_time, lat, long):
    """Return the CAMS time series of ``pollutant`` at the grid cell nearest to a location.

    Parameters
    ----------
    pollutant : str
        Key into ``POLLUTANTS``.
    start_time, end_time :
        Bounds of the time slice; both must expose a ``.year`` attribute.
    lat, long : float
        Location; the nearest latitude / longitude of the dataset is selected.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``time`` and the CAMS variable of the pollutant.
    """
    name_pollutant_var = POLLUTANTS[pollutant]['CAMS']

    # Every calendar year touched by the window, including years strictly between
    # the start and end year (the previous set of the two endpoints skipped those).
    first_year, last_year = sorted((start_time.year, end_time.year))
    years = list(range(first_year, last_year + 1))

    # Named `dataset` rather than `ds`, which is the datashader alias in this notebook.
    dataset = create_dataset(pollutant, years=years)

    ds_loc = (
        dataset
        .sel(time=slice(start_time, end_time))
        .sel(latitude=lat, longitude=long, method='nearest')
    )

    return ds_loc.to_dataframe().reset_index()[['time', name_pollutant_var]]
    

def create_multiplot(pollutant, dataset_var, df, df_cams, df_spatial, df_sptp, gs, name=None, title=None):
    """Show, and optionally save, three figures comparing the ground measurements with other data.

    1. Hourly-mean ground measurements vs. the CAMS analysis.
    2. Raw ground measurements and CAMS analysis vs. the spatial and
       spatiotemporal baselines (quartile bands and medians).
    3. Raw ground measurements and CAMS analysis vs. the ground stations in ``gs``.

    Parameters
    ----------
    pollutant : str
        Key into ``POLLUTANTS``.
    dataset_var : str
        Column of ``df`` holding the ground measurements.
    df : pandas.DataFrame
        Ground measurements with a ``UTCDateTime`` column.
    df_cams : pandas.DataFrame
        CAMS series with a ``time`` column and the CAMS variable column.
    df_spatial, df_sptp : pandas.DataFrame
        Baselines with ``time`` and ``<kind>_baseline_{lower_quartile,upper_quartile,median}``
        columns, where ``<kind>`` is ``spatial`` / ``spatiotemporal``.
    gs : dict or None
        Maps a station name to ``(dataframe, distance, pearson, spearman)``;
        ``None`` skips the station traces. The distance is shown in KM above
        1000 and in M otherwise.
    name : str, optional
        When given, the figures are written to ``OUTPUT_DIR`` as
        ``{name}_vs_cams.html``, ``{name}_vs_baselines.html`` and
        ``{name}_vs_groundstations.html``.
    title : str, optional
        Title applied to all three figures.
    """
    name_pollutant_html = POLLUTANTS[pollutant]['FORMULA_HTML']
    name_pollutant_var = POLLUTANTS[pollutant]['CAMS']

    # Hourly means of the ground measurements ('h' replaces the deprecated 'H' alias).
    df_hourly = df.set_index('UTCDateTime').resample('h').mean().reset_index()

    def _new_figure():
        # Empty figure with the title, range slider and axis labels shared by all three plots.
        fig = make_subplots()
        if title is not None:
            fig.update_layout(title=title)
        fig.update_layout(xaxis=dict(rangeslider=dict(visible=True), type="date"))
        fig.update_xaxes(title_text="Date")
        fig.update_yaxes(title_text=f"{name_pollutant_html} Concentration (µg m<sup>-3</sup>)")
        return fig

    def _ground_trace(frame, trace_name, color_key):
        # Ground measurements, either raw (`df`) or hourly (`df_hourly`).
        return go.Scatter(
            x=frame['UTCDateTime'],
            y=frame[dataset_var],
            mode='lines',
            name=trace_name,
            line={'color': colors[color_key]},
        )

    def _cams_trace():
        # CAMS analysis series.
        return go.Scatter(
            x=df_cams['time'],
            y=df_cams[name_pollutant_var],
            mode='lines',
            name='CAMS Analysis',
            line={'color': colors['fire_event']},
        )

    def _baseline_traces(frame, kind, label):
        # Lower quartile, upper quartile (filled down to the lower one) and median of a baseline.
        lower = go.Scatter(
            x=frame['time'],
            y=frame[f'{kind}_baseline_lower_quartile'],
            mode='lines',
            name=f'25th percentile ({label})',
            line={'color': colors[f'{kind}_range']},
        )
        upper = go.Scatter(
            x=frame['time'],
            y=frame[f'{kind}_baseline_upper_quartile'],
            fill='tonexty',  # fill the area between this trace and the previous one
            mode='lines',
            name=f'75th percentile ({label})',
            line={'color': colors[f'{kind}_range']},
        )
        median = go.Scatter(
            x=frame['time'],
            y=frame[f'{kind}_baseline_median'],
            mode='lines',
            name=f'Median ({label})',
            line={'color': colors[f'{kind}_median']},
        )
        return [lower, upper, median]

    def _station_trace(gs_name, info):
        # 3-sample rolling mean of a ground station, labelled with distance and correlations.
        df_gs, distance, pearson, spearman = info[0], info[1], info[2], info[3]

        if distance > 1000:
            distance_label = f'{round(distance / 1000)}KM'
        else:
            distance_label = f'{round(distance)}M'

        # The correlation figures are shown for every station; previously the label of
        # stations within 1000 m stopped after a dangling " - ".
        trace_name = (
            f'{gs_name} - distance: {distance_label} - '
            f'Pearson R: {round(pearson[0], 3)}(p:{round(pearson[1], 3)}) - '
            f'Spearman R: {round(spearman[0], 3)}(p:{round(spearman[1], 3)})'
        )

        return go.Scatter(
            x=df_gs.index,
            y=df_gs[df_gs.columns[0]].rolling(window=3, min_periods=1).mean(),
            mode='lines',
            name=trace_name,
        )

    def _finish(fig, suffix):
        # Place the legend below the plot, display the figure and optionally save it.
        fig.update_layout(legend=dict(
            yanchor="top",
            y=-0.6,
            xanchor="left",
            x=0.01)
        )
        fig.show()
        if name is not None:
            fig.write_html(OUTPUT_DIR.joinpath(f'{name}_{suffix}.html'))

    # Figure 1: hourly ground measurements vs. CAMS
    fig = _new_figure()
    fig.add_trace(_ground_trace(df_hourly, 'Ground Measurements - Hourly', 'ground_hourly'))
    fig.add_trace(_cams_trace())
    _finish(fig, 'vs_cams')

    # Figure 2: ground measurements and CAMS vs. the baselines
    fig = _new_figure()
    for trace in _baseline_traces(df_spatial, 'spatial', 'Spatial'):
        fig.add_trace(trace)
    for trace in _baseline_traces(df_sptp, 'spatiotemporal', 'Spatiotemporal'):
        fig.add_trace(trace)
    fig.add_trace(_ground_trace(df, 'Ground Measurements', 'ground'))
    fig.add_trace(_cams_trace())
    _finish(fig, 'vs_baselines')

    # Figure 3: ground measurements and CAMS vs. the ground stations
    fig = _new_figure()
    fig.add_trace(_ground_trace(df, 'Ground Measurements', 'ground'))
    fig.add_trace(_cams_trace())
    if gs is not None:
        for gs_name, info in gs.items():
            fig.add_trace(_station_trace(gs_name, info))
    _finish(fig, 'vs_groundstations')


In [None]:
# Moving Window Correlation Plot

def preprocess_and_calc_correlation(pollutant, df_hourly, df_cams, df_best_gs, window=12):
    """Align CAMS and ground-station data with the ground measurements and add rolling correlations.

    Parameters
    ----------
    pollutant : str
        Key into ``POL_VAR_NAMES``; its GROUND, CAMS and STATION names are used.
    df_hourly : pandas.DataFrame
        Hourly ground measurements with a ``UTCDateTime`` column.
    df_cams : pandas.DataFrame
        CAMS series with a ``time`` column.
    df_best_gs : pandas.DataFrame
        Ground-station series with a ``time`` column.
    window : int, default 12
        Number of rows in the rolling correlation window.

    Returns
    -------
    tuple
        ``(df_ground, df_cams_merged, df_best_gs_merged)``. ``df_ground`` holds
        ``time`` and the ground column; the two merged frames are as-of joined
        with it, stripped of rows containing NaN and extended with a
        ``rolling_corr_pearson`` column.
    """
    var_names = POL_VAR_NAMES[pollutant]
    ground_var, cams_var, station_var = var_names['GROUND'], var_names['CAMS'], var_names['STATION']

    df_ground = df_hourly[['UTCDateTime', ground_var]].rename(columns={'UTCDateTime': 'time'})

    def _merge_and_correlate(frame, other_var):
        # As-of join with the ground data, drop incomplete rows, then correlate over the window.
        merged = pd.merge_asof(frame, df_ground, on="time").dropna()
        merged['rolling_corr_pearson'] = merged[ground_var].rolling(window).corr(merged[other_var])
        return merged

    df_cams_merged = _merge_and_correlate(df_cams[['time', cams_var]], cams_var)
    df_best_gs_merged = _merge_and_correlate(df_best_gs, station_var)

    return df_ground, df_cams_merged, df_best_gs_merged

def create_rolling_correlation_plot(df_ground, df_cams, df_gs, start_date, pollutant,
                                    best_gs_name='Best Ground Station', name=None):
    """Build a two-row figure: concentrations on top, rolling Pearson correlation below.

    The function only reads its arguments; it previously read the notebook
    globals ``df_hourly``, ``df_best_gs``, ``df_cams_merged`` and
    ``df_best_gs_merged`` and ignored ``df_ground`` and ``df_gs``.

    Parameters
    ----------
    df_ground : pandas.DataFrame
        Ground measurements with ``time`` and the GROUND column of the pollutant.
    df_cams : pandas.DataFrame
        CAMS data with ``time``, the CAMS column and ``rolling_corr_pearson``.
    df_gs : pandas.DataFrame
        Ground-station data with ``time``, the STATION column and ``rolling_corr_pearson``.
        (These three frames match what ``preprocess_and_calc_correlation`` returns.)
    start_date :
        Initial x position of the vertical marker line in both rows.
    pollutant : str
        Key into ``POL_VAR_NAMES`` and ``POLLUTANTS``.
    best_gs_name : str
        Legend name of the ground-station traces.
    name : str, optional
        When given, the figure is written to ``OUTPUT_DIR`` as ``{name}_{pollutant}.html``.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    var_names = POL_VAR_NAMES[pollutant]
    name_pollutant_html = POLLUTANTS[pollutant]['FORMULA_HTML']

    fig = make_subplots(
        rows=2,
        cols=1,
        specs=[
            [{"type": "xy"}],
            [{"type": "xy"}],
        ],
    )

    ## GROUND MEASUREMENTS vs. CAMS & BEST GROUND STATION

    fig.add_trace(
        go.Scatter(
            x=df_ground['time'],
            y=df_ground[var_names['GROUND']],
            mode='lines',
            name='Ground Measurements - Hourly',
            line={'color': colors['ground_hourly']}
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(
            x=df_cams['time'],
            y=df_cams[var_names['CAMS']],
            mode='lines',
            name='CAMS Analysis',
            line={'color': colors['fire_event']}
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(
            x=df_gs['time'],
            y=df_gs[var_names['STATION']],
            mode='lines',
            name=best_gs_name,
            line={'color': colors['ground_station_1']}
        ),
        row=1, col=1
    )

    # Vertical marker of the selected date (shapes[0], moved by the slider)
    fig.add_shape(
        type="line",
        x0=start_date,
        x1=start_date,
        y0=0,
        y1=100,
        line=dict(color="Black", width=1),
        row=1,
        col=1
    )

    ## CORRELATION

    fig.add_trace(
        go.Scatter(
            x=df_cams['time'],
            y=df_cams['rolling_corr_pearson'],
            mode='lines',
            name='CAMS',
        ),
        row=2,
        col=1
    )

    fig.add_trace(
        go.Scatter(
            x=df_gs['time'],
            y=df_gs['rolling_corr_pearson'],
            mode='lines',
            name=best_gs_name,
        ),
        row=2,
        col=1
    )

    # Vertical marker of the selected date (shapes[1], moved by the slider)
    fig.add_shape(
        type="line",
        x0=start_date,
        x1=start_date,
        y0=-1.1,
        y1=1.1,
        line=dict(color="Black", width=1),
        row=2,
        col=1
    )

    # Annotation box whose text is replaced by the slider (annotations[0])
    fig.add_annotation(text="Pearson R CAMS: <br>Pearson R GS:",
                       align='left',
                       showarrow=False,
                       xref='paper',
                       yref='paper',
                       x=1.15,
                       y=0.2,
                       bordercolor='black',
                       borderwidth=1)

    # Correlation per timestamp; on duplicated timestamps the first row wins.
    cams_corr_by_time = df_cams.drop_duplicates('time').set_index('time')['rolling_corr_pearson']
    gs_corr_by_time = df_gs.drop_duplicates('time').set_index('time')['rolling_corr_pearson']

    def _corr_label(corr_by_time, stamp):
        # Rounded correlation at `stamp`, or 'No Data' when it is absent or NaN.
        value = corr_by_time.get(stamp)
        if value is None or pd.isna(value):
            return 'No Data'
        return round(value, 2)

    # Create and add slider: one step per CAMS timestamp
    steps = []
    for stamp in list(df_cams.time):
        cams_corr = _corr_label(cams_corr_by_time, stamp)
        gs_corr = _corr_label(gs_corr_by_time, stamp)
        label = f"R CAMS: {cams_corr} <br>R GS: {gs_corr}"

        steps.append(dict(
            method="relayout",
            label=pd.Timestamp(stamp).strftime('%d %b %H:%M'),
            args=[{
                # update the line based on the date in the slider
                "shapes[0].x0": pd.Timestamp(stamp),
                "shapes[0].x1": pd.Timestamp(stamp),
                "shapes[1].x0": pd.Timestamp(stamp),
                "shapes[1].x1": pd.Timestamp(stamp),
                # update the correlation values in the annotation box based on the slider date
                "annotations[0].text": label,
            }],
        ))

    # set the slider parameters
    sliders = [dict(
        steps=steps,
        xanchor='left',
        x=0,
        ticklen=0,
        tickwidth=0,
        pad={'t': 30, 'b': 10}
    )]

    fig.update_layout(
        title=f'Correlation {name_pollutant_html} Ground Measurements vs. CAMS & Best EPA Ground Station',
        sliders=sliders,
    )

    # Update yaxis properties
    fig.update_yaxes(
        title_text=f"{name_pollutant_html} Concentration (µg m<sup>-3</sup>)",
        title_font_size=10,
        row=1,
        col=1,
    )

    # Update yaxis properties Correlation Graph
    fig.update_yaxes(
        title_text="Pearson Correlation",
        title_font_size=10,
        row=2,
        col=1
    )

    if name is not None:
        fig.write_html(OUTPUT_DIR.joinpath(f'{name}_{pollutant}.html'))

    return fig

In [None]:
# Function for the Graph vs CAMS maps & Burnscar plots

def create_dataset_images(ds_pol, pollutant, colormap=None, span=None):
    """Render every time step of a dataset as an image layer for a mapbox figure.

    Parameters
    ----------
    ds_pol : xarray.Dataset
        Dataset with ``time``, ``latitude`` and ``longitude`` coordinates and
        the CAMS variable of ``pollutant``.
    pollutant : str
        Key into ``POL_VAR_NAMES``; its CAMS entry names the variable to render.
    colormap : optional
        Colour map passed to ``tf.shade``. Defaults to the notebook-level
        ``cmap``, which was previously read implicitly.
    span : sequence of two numbers, optional
        Value range mapped onto the colour map. Defaults to the notebook-level
        ``[min_val, max_val]``, which was previously read implicitly.

    Returns
    -------
    list of dict
        One mapbox image-layer specification per time step, in time order.
    """
    if colormap is None:
        colormap = cmap
    if span is None:
        span = [min_val, max_val]

    data = ds_pol[POL_VAR_NAMES[pollutant]['CAMS']]

    # The canvas aggregation is only used to obtain the coordinates of the image corners.
    dff = data.to_dataframe().reset_index()
    cvs = ds.Canvas(plot_width=len(ds_pol.longitude), plot_height=len(ds_pol.latitude))
    agg = cvs.points(dff, x='longitude', y='latitude')
    coords_lat, coords_lon = agg.coords['latitude'].values, agg.coords['longitude'].values

    # Corners of the image, which need to be passed to mapbox
    coordinates = [[coords_lon[0], coords_lat[0]],
                   [coords_lon[-1], coords_lat[0]],
                   [coords_lon[-1], coords_lat[-1]],
                   [coords_lon[0], coords_lat[-1]]]

    images = []
    for ind in range(len(ds_pol.time)):
        # [::-1] reverses the row order of the shaded image before converting it to PIL
        img = tf.shade(data[ind], cmap=colormap, span=span, how='linear')[::-1].to_pil()
        images.append({
            "sourcetype": "image",
            "source": img,
            "coordinates": coordinates,
            "name": ind,
            "visible": True,
            "opacity": 0.85,
        })

    return images

def create_map_vs_graph_plot(pollutant, df_hourly, df_cams, start_date, lat, long,
                             min_val, max_val, images, gdf_burnscars, gdf_fireevents, name=None,
                             colormap=None, timestamps=None):
    """Build a figure with a CAMS map (burn scars, fire events) above a time-series graph.

    A slider swaps the map image layer and moves the vertical date marker of
    the graph.

    Parameters
    ----------
    pollutant : str
        Key into ``POL_VAR_NAMES``.
    df_hourly : pandas.DataFrame
        Hourly ground measurements with ``UTCDateTime`` and the GROUND column.
    df_cams : pandas.DataFrame
        CAMS series with ``time`` and the CAMS column.
    start_date :
        Initial x position of the date marker.
    lat, long : float
        Measurement site; also the centre of the map.
    min_val, max_val : float
        Range of the colour bar.
    images : list of dict
        Mapbox image layers, one per time step (see ``create_dataset_images``).
    gdf_burnscars : geopandas.GeoDataFrame
        Burn-scar (Multi)Polygons with a ``Date`` column formatted ``%Y%m%d``.
    gdf_fireevents : geopandas.GeoDataFrame
        Fire events with ``Source``, ``Date``, ``Reference``, ``Latitude`` and
        ``Longitude`` columns.
    name : str, optional
        When given, the figure is written to ``OUTPUT_DIR`` as ``{name}_{pollutant}.html``.
    colormap : optional
        Matplotlib colour map of the colour bar. Defaults to the notebook-level
        ``cmap``, which was previously read implicitly.
    timestamps : optional
        Time stamps of ``images``, in the same order. Defaults to the
        notebook-level ``ds_pol.time``, which was previously read implicitly.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If a burn-scar geometry is neither a Polygon nor a MultiPolygon.
    """
    if colormap is None:
        colormap = cmap
    if timestamps is None:
        timestamps = ds_pol.time

    def _rgb(position):
        # Colour of the map at `position` (0..1) as a CSS rgb() string. bytes=True yields
        # 0-255 components; the 0-1 floats returned by default are not valid rgb() components.
        red, green, blue = colormap(position, bytes=True)[:3]
        return f"rgb({int(red)},{int(green)},{int(blue)})"

    def _outline(geometry):
        # Longitudes and latitudes of the exterior rings, each ring closed by a None pair.
        feature = geometry.__geo_interface__
        if feature['type'] == 'Polygon':
            rings = [feature['coordinates'][0]]
        elif feature['type'] == 'MultiPolygon':
            rings = [polygon[0] for polygon in feature['coordinates']]
        else:
            raise ValueError("geometry type irrelevant for map")

        points = []
        for ring in rings:
            points.extend(ring)
            points.append([None, None])  # mark the end of a polygon
        lons, lats = zip(*points)
        return lons, lats

    def _fire_event_trace(sources, verb, trace_name, color):
        # Marker trace of the fire events whose 'Source' is one of `sources`.
        events = gdf_fireevents[gdf_fireevents['Source'].isin(sources)]
        labels = [f'{verb} on {date}<br>by: {reference}'
                  for date, reference in events[['Date', 'Reference']].values.tolist()]
        return go.Scattermapbox(
            lat=events.Latitude,
            lon=events.Longitude,
            text=labels,
            hoverinfo='text',
            mode='markers',
            name=trace_name,
            showlegend=True,
            marker=go.scattermapbox.Marker(
                symbol='circle',
                color=color,
                allowoverlap=True,
                size=10,
                opacity=1
            )
        )

    fig = make_subplots(
        rows=3,
        cols=2,
        specs=[
            [None, {"type": "mapbox", "rowspan": 2}],
            [None, None],
            [{"type": "xy", "colspan": 2}, None]
        ],
    )

    fig.add_trace(
        go.Scatter(
            x=df_hourly['UTCDateTime'],
            y=df_hourly[POL_VAR_NAMES[pollutant]['GROUND']],
            mode='lines',
            name='Ground Measurements - Hourly',
            line={'color': colors['ground_hourly']}
        ),
        row=3, col=1
    )

    # NOTE(review): left empty in the original, so the axis title and figure title carry
    # no pollutant name - confirm whether that is intended.
    name_pollutant_html = ""

    # Update axis properties
    fig.update_yaxes(title_text=f"{name_pollutant_html} Concentration (µg m<sup>-3</sup>)", title_font_size=9)
    fig.update_xaxes(tickfont={'size': 9})

    # Vertical date marker of the graph (shapes[0], moved by the slider)
    fig.add_shape(
        type="line",
        x0=start_date,
        x1=start_date,
        y0=0,
        y1=100,
        line=dict(color="Black", width=1),
        name='Slider Date',
        row=3,
        col=1
    )

    fig.add_trace(
        go.Scatter(
            x=df_cams['time'],
            y=df_cams[POL_VAR_NAMES[pollutant]['CAMS']],
            mode='lines',
            name='CAMS Analysis',
            line={'color': colors['fire_event']}),
        row=3, col=1
    )

    # Data-less map trace, kept from the original
    fig.add_trace(go.Scattermapbox(
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=14
        ),
    ), row=1, col=2)

    fig.update_layout(
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        mapbox=go.layout.Mapbox(
            accesstoken=mapbox_token,
            zoom=5,
            domain={'x': [0.4, 1], 'y': [0.4, 1]},
            center=go.layout.mapbox.Center(
                lat=lat,
                lon=long
            ),
            layers=[images[0]],
        )
    )

    # One legend-toggled trace per burn scar, ordered by date
    gdf_burnscars = gdf_burnscars.sort_values(by='Date')

    for _, scar in gdf_burnscars.iterrows():
        lons, lats = _outline(scar.geometry)
        detection_date = datetime.strptime(scar['Date'], '%Y%m%d').strftime('%B %d %Y')

        fig.add_trace(
            go.Scattermapbox(
                lat=lats,
                lon=lons,
                mode='lines',
                name=f'Burnscar - {detection_date}',
                hoverinfo='text',
                text=f'Burnscar detected on: {detection_date}',
                showlegend=True,
                fill='toself',
                fillcolor='#04e762',
                visible='legendonly',
                line=go.scattermapbox.Line(width=0, color='#04e762')
            ),
            row=1,
            col=2,
        )

    # Slider steps: each one swaps the map layer and moves the date marker
    steps = []
    for ind, stamp in enumerate(timestamps):
        # xarray items expose the raw value through `.values`; plain timestamps are used as is
        when = pd.Timestamp(getattr(stamp, 'values', stamp))
        layer = {**images[ind], 'visible': True}

        steps.append(dict(
            method="relayout",
            label=when.strftime('%d %b %H:%M'),
            args=[{
                "mapbox.layers": [layer],
                "shapes[0].x0": when,
                "shapes[0].x1": when,
            }],  # layout attribute
        ))

    # Zero-size marker whose only purpose is to display the colour bar of the map images
    fig.add_trace(
        go.Scattermapbox(
            lat=[lat],
            lon=[long],
            hoverinfo=None,
            showlegend=False,
            mode='markers',
            marker=dict(
                size=0,
                showscale=True,
                colorbar=dict(len=.65, y=.7, title='µg m<sup>-3</sup>'),
                colorscale=[[0, _rgb(0.0)],
                            [0.5, _rgb(0.5)],
                            [1, _rgb(1.0)]],
                cmin=min_val,
                cmax=max_val,
            ),
            line={'color': 'red'},
        ),
        row=1,
        col=2,
    )

    fig.add_trace(go.Scattermapbox(
        lat=[lat],
        lon=[long],
        text='UCC Measurement Site',
        hoverinfo='text',
        mode='markers',
        name='UCC Measurement Site',
        below="",
        showlegend=True,
        marker=go.scattermapbox.Marker(
            size=9,
            color='Blue',
            opacity=1
        )
    ), row=1, col=2)

    fig.add_trace(_fire_event_trace(['EFFIS', 'FIRMS'], 'Detected', 'FE Satellite', '#f15bb5'))
    fig.add_trace(_fire_event_trace(['Fire Brigade'], 'Reported', 'FE Fire Brigade', '#00bbf9'))

    fig.update_layout(legend=dict(
        font=dict(size=10),
        yanchor="top",
        y=.95,
        xanchor="left",
        x=0.05,
    ))

    sliders = [dict(
        currentvalue={"prefix": "Date&Time: "},
        steps=steps,
        xanchor='left',
        x=0,
        ticklen=0,
        tickwidth=0
    )]

    fig.update_layout(
        title={
            'text': f'CAMS {name_pollutant_html} & Burnscars visualization',
            'y': 0.95
        },
        sliders=sliders,
    )

    if name is not None:
        fig.write_html(OUTPUT_DIR.joinpath(f'{name}_{pollutant}.html'))

    return fig

In [None]:
# Ground measurements with a parsed UTCDateTime column
df = process_ground_data(pd.read_excel("data/Wicklow_purpleair_l_comb.xlsx"))

# Convert to hourly means ('h' replaces the deprecated 'H' frequency alias)
df_hourly = df.set_index('UTCDateTime').resample('h').mean().reset_index()

# Copy of the dataframe to combine all the data arrays with
df_all_data = df_hourly.copy().set_index('UTCDateTime')

In [None]:
# First and last timestamp of the ground measurements
start_time, end_time = df['UTCDateTime'].iloc[[0, -1]]

# Time between the first and the median timestamp; its day count sizes the data windows below
dif = df['UTCDateTime'].median() - start_time

# Location of the ground measurements
lat, long = 53.19836189538404, -6.292349981271148

## PM10

In [None]:
pollutant = 'PM10'
name_pollutant_var = POLLUTANTS[pollutant]['CAMS']
# Ground-measurement column of this pollutant, taken from the shared name table
ground_var = POL_VAR_NAMES[pollutant]['GROUND']

# Centre and size (in days) of the window used for the baselines and the station data
event_time = df['UTCDateTime'].median().round('h')
window_days = dif.days + 1

df_cams = get_cams_data(pollutant, start_time, end_time, lat, long)

# Both baselines are computed with the same settings
baseline_kwargs = dict(
    fe_lat=lat,
    fe_long=long,
    timestamp=event_time,
    pollutant=pollutant,
    days=window_days,  # time window around fire event
    meteo_dataset='MERA',
    min_distance_km=50,
    max_distance_km=250,
    number_of_neighbours=50,
    mask_ocean=True,
)

df_spatial, ds_fe, _, _ = get_spatial_baseline(**baseline_kwargs)
df_sptp = get_spatiotemporal_baseline(**baseline_kwargs)

gs_info = get_closest_active_epa_ground_station(lat, long, name_pollutant_var, quantity=15)

# Station name -> (data, distance, pearson, spearman); stations without data are left out
gs_data = {}

for ind, gs in gs_info.iterrows():
    df_gs = get_epa_data(
        gs['epa_code'],
        event_time,
        name_pollutant_var,
        days=window_days
    )

    if df_gs is None:
        continue

    pearson_r, spearman_r = calc_correlations(df_gs, df_hourly, ground_var)

    gs_info.at[ind, 'spearman_r'] = spearman_r[0]
    gs_info.at[ind, 'spearman_p'] = spearman_r[1]
    gs_info.at[ind, 'pearson_r'] = pearson_r[0]
    gs_info.at[ind, 'pearson_p'] = pearson_r[1]

    gs_data[gs['epa_name']] = (df_gs, gs['distance'], pearson_r, spearman_r)

In [None]:
# Five stations with the highest Pearson correlation. Stations that returned no data have
# no entry in gs_data, so they are filtered out first to avoid a KeyError below.
best_gs = (
    gs_info[gs_info['epa_name'].isin(list(gs_data))]
    .sort_values(by='pearson_r', ascending=False)
    .head(5)
)
gs_data_best = {x: gs_data[x] for x in best_gs['epa_name'].tolist()}

In [None]:
# The <sup> tag is now closed; the original opened a second <sup> instead of closing the first
title = "CAMS vs. UCC Ground Measurements<br><sup>PM10</sup>"
create_multiplot(pollutant, 'pm10_0_atm', df, df_cams, df_spatial, df_sptp, gs_data_best, title=title)  # name='PM10'

In [None]:
# Up to fifteen stations with the highest Pearson correlation. Stations that returned no
# data have no entry in gs_data, so they are filtered out first to avoid a KeyError below.
best_gs = (
    gs_info[gs_info['epa_name'].isin(list(gs_data))]
    .sort_values(by='pearson_r', ascending=False)
    .head(15)
)
gs_data_best = {x: gs_data[x] for x in best_gs['epa_name'].tolist()}

# Renumber the rows 0..n-1 so that the label len(df_gs_map) used below is guaranteed to be
# new; with the labels inherited from gs_info it could overwrite an existing station row.
df_gs_map = (
    best_gs[['epa_name', 'latitude', 'longitude', 'spearman_r', 'spearman_p', 'pearson_r', 'pearson_p']]
    .copy()
    .reset_index(drop=True)
)
df_gs_map['type'] = 'EPA Ground Station'
df_gs_map['pearson_r'] = df_gs_map['pearson_r'].round(2)
df_gs_map['spearman_r'] = df_gs_map['spearman_r'].round(2)
# Extra row for the measurement site itself (no correlation values)
df_gs_map.loc[len(df_gs_map)] = ['UCC Ground Measurement', lat, long, None, None, None, None, 'UCC Measurement Location']

fig = px.scatter_mapbox(
    df_gs_map,
    lat="latitude",
    lon="longitude",
    hover_name="epa_name",
    hover_data=["epa_name", 'pearson_r', 'spearman_r'],
    zoom=6,
    center={'lat': df_gs_map.latitude.mean(), 'lon': df_gs_map.longitude.mean()},
    color='type',
)
fig.update_layout(mapbox_style="dark", mapbox_accesstoken=mapbox_token, title='PM 10 Ground Stations vs. UCC Ground Measurements')
fig.update_geos(fitbounds="locations")
# fig.update_layout(height=400, margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## Rolling window correlation plot

In [None]:
# Preprocessing
pollutant = 'PM10'

# The first entry of gs_data_best is used as the "best" ground station; its station column
# is turned into a two-column frame (time, value)
best_gs_name = list(gs_data_best)[0]
df_best_gs = (
    gs_data_best[best_gs_name][0][POL_VAR_NAMES[pollutant]['STATION']]
    .reset_index()
    .rename(columns={'index': 'time'})
)

# Date range; leave as None to use the full range of the hourly ground measurements
start_date = None # Set custom date by using -> datetime(year=2021, month=4, day=20)
end_date = None # datetime(year=2021, month=4, day=25)

start_date = df_hourly['UTCDateTime'].iloc[0] if start_date is None else start_date
end_date = df_hourly['UTCDateTime'].iloc[-1] if end_date is None else end_date

df_ground, df_cams_merged, df_best_gs_merged = preprocess_and_calc_correlation(
    pollutant, df_hourly, df_cams, df_best_gs
)
fig = create_rolling_correlation_plot(
    df_ground, df_cams_merged, df_best_gs_merged, start_date,
    pollutant, best_gs_name=best_gs_name, name='correlation',
)
fig

In [None]:
pollutant = 'PM10'
pm_wildfire = False  # Use the CAMS analysis wildfire dataset for the maps

if pm_wildfire:
    ds_pmwf = xr.open_dataset(Path(DATA_DIR_CAMS_AN).joinpath("pmwf_conc.nc")).sel(
        time=slice(start_date, end_date)).squeeze()

# Every calendar year touched by the plot window, in ascending order. A plain
# set of the two end years would skip intermediate years for windows spanning
# more than two years and has no guaranteed ordering.
years = list(range(start_date.year, end_date.year + 1))
ds_pol = create_dataset(pollutant, years=years)
ds_pol = ds_pol.sel(time=slice(start_date, end_date)).squeeze()

cmap = plt.get_cmap('YlOrBr')

# Burn-scar polygons. The location can be overridden with the
# BURNSCARS_GEOJSON environment variable; the default is the original
# machine-specific path, so existing setups keep working unchanged.
burnscars_path = os.environ.get(
    'BURNSCARS_GEOJSON', r"F:\randbee\flares\data\wicklow\burnscars.geojson")
gdf_burnscars = geopandas.read_file(burnscars_path)
burnscars = json.loads(gdf_burnscars.to_json())

# Pick the dataset that is rendered on the map, then derive the colour scale
# limits from its 2.5 % and 97.5 % quantiles.
if pm_wildfire:
    map_dataset, map_pollutant = ds_pmwf, 'PMWF'
else:
    map_dataset, map_pollutant = ds_pol, pollutant

map_values = map_dataset[POL_VAR_NAMES[map_pollutant]['CAMS']]
min_val = float(map_values.quantile(.025))
max_val = float(map_values.quantile(.975))
images = create_dataset_images(map_dataset, map_pollutant)

fig = create_map_vs_graph_plot(
    pollutant, df_hourly, df_cams, start_date,
    lat, long, min_val, max_val, images, gdf_burnscars, gdf_fireevents, name='cams_and_fires')

In [None]:
# Add the CAMS series and the selected EPA station series (`best_gs_name`) to
# the combined table, joined on the index of both frames.
#
# Each target column is dropped first (if it already exists) so that re-running
# this cell replaces it instead of producing `_x` / `_y` suffixed duplicates.
cams_col = 'cams_pm_10'
df_cams_merge = df_cams.set_index('time').rename(columns={'pm10_conc': cams_col})
df_all_data = df_all_data.drop(columns=[cams_col], errors='ignore').merge(
    df_cams_merge, how='left', left_index=True, right_index=True)

station_col = f"{best_gs_name.replace(', ', '_').replace(' ', '_').lower()}_pm_10"
df_best_gs_merge = df_best_gs.set_index('time').rename(columns={'PM10': station_col})
df_all_data = df_all_data.drop(columns=[station_col], errors='ignore').merge(
    df_best_gs_merge, how='left', left_index=True, right_index=True)

# PM25

In [None]:
pollutant = 'PM25'
name_pollutant_var = POLLUTANTS[pollutant]['CAMS']

df_cams = get_cams_data(pollutant, start_time, end_time, lat, long)

# Keyword arguments shared by the spatial and the spatiotemporal baseline:
# both are centred on the measurement location and on the median measurement
# timestamp (rounded to the hour), with a window of `dif.days + 1` days.
# (`upwind_downwind='upwind'` can optionally be passed to the spatiotemporal
# baseline.)
baseline_kwargs = dict(
    fe_lat=lat,
    fe_long=long,
    timestamp=df['UTCDateTime'].median().round('H'),
    pollutant=pollutant,
    days=dif.days + 1,
    meteo_dataset='MERA',
    min_distance_km=50,
    max_distance_km=250,
    number_of_neighbours=50,
    mask_ocean=True,
)

df_spatial, ds_fe, _, _ = get_spatial_baseline(**baseline_kwargs)

df_sptp = get_spatiotemporal_baseline(**baseline_kwargs)

In [None]:
gs_info = get_closest_active_epa_ground_station(lat, long, name_pollutant_var, quantity=15)

# Append empty columns that will receive the correlation statistics.
corr_columns = ['spearman_r', 'spearman_p', 'pearson_r', 'pearson_p']
gs_info = gs_info.reindex(columns=[*gs_info.columns, *corr_columns])

# Maps station name -> (station data, distance, pearson result, spearman result)
gs_data = {}

# The query window is the same for every station.
query_timestamp = df['UTCDateTime'].median().round('H')
query_days = dif.days + 1

for ind, gs in gs_info.iterrows():

    df_gs = get_epa_data(gs['epa_code'], query_timestamp, name_pollutant_var, days=query_days)

    # Only stations that returned data are correlated and stored.
    if df_gs is not None:
        pearson_r, spearman_r = calc_correlations(df_gs, df_hourly, 'pm2_5_atm')

        # Element 0 of each result goes to the *_r column, element 1 to *_p.
        corr_values = (spearman_r[0], spearman_r[1], pearson_r[0], pearson_r[1])
        for col, val in zip(corr_columns, corr_values):
            gs_info.at[ind, col] = val

        gs_data[gs['epa_name']] = (df_gs, gs['distance'], pearson_r, spearman_r)

In [None]:
# Keep the five stations with the highest Pearson correlation.
# Stations for which no data was stored in `gs_data` are excluded first;
# otherwise one of them ending up in the top five (e.g. when fewer than five
# stations delivered data) would raise a KeyError in the lookup below.
gs_with_data = gs_info[gs_info['epa_name'].isin(list(gs_data))]
best_gs = gs_with_data.sort_values(by='pearson_r', ascending=False).head(5)
gs_data_best = {x: gs_data[x] for x in best_gs['epa_name'].tolist()}

In [None]:
# Two-line plot title: the pollutant is rendered as a smaller subtitle.
# The <sup> tag is now properly closed (the original opened it twice).
title = "CAMS vs. UCC Ground Measurements<br><sup>PM2.5</sup>"
create_multiplot(pollutant, 'pm2_5_atm', df, df_cams, df_spatial, df_sptp, gs_data_best, title=title)  # name='PM25'

## Map the 15 Ground Stations with the highest correlations

In [None]:
# Rank the EPA stations by Pearson correlation and keep the top 15.
# Stations skipped while fetching data have no entry in `gs_data`; they are
# filtered out so the dictionary lookup below cannot raise a KeyError.
gs_with_data = gs_info[gs_info['epa_name'].isin(list(gs_data))]
best_gs = gs_with_data.sort_values(by='pearson_r', ascending=False).head(15)
gs_data_best = {x: gs_data[x] for x in best_gs['epa_name'].tolist()}

# Build the table that feeds the map. reset_index gives it a clean 0..n-1
# index, so writing to label len(df_gs_map) always appends a new row instead of
# overwriting a station whose original index label equals the row count.
df_gs_map = best_gs[
    ['epa_name', 'latitude', 'longitude', 'spearman_r', 'spearman_p', 'pearson_r', 'pearson_p']
].reset_index(drop=True)
df_gs_map['type'] = 'EPA Ground Station'
df_gs_map['pearson_r'] = df_gs_map['pearson_r'].round(2)
df_gs_map['spearman_r'] = df_gs_map['spearman_r'].round(2)

# Add the UCC measurement location as an extra marker (no correlation values).
df_gs_map.loc[len(df_gs_map)] = ['UCC Ground Measurement', lat, long, None, None, None, None, 'UCC Measurement Location']

fig = px.scatter_mapbox(
    df_gs_map,
    lat="latitude",
    lon="longitude",
    hover_name="epa_name",
    hover_data=["epa_name", 'pearson_r', 'spearman_r'],
    zoom=7,
    # Centre the view on the mean position of all plotted markers.
    center={'lat': df_gs_map.latitude.mean(), 'lon': df_gs_map.longitude.mean()},
    color='type',
)
fig.update_layout(mapbox_style="dark", mapbox_accesstoken=mapbox_token, title='PM 2.5 Ground Stations vs. UCC Ground Measurements')
# NOTE(review): update_geos targets `geo` subplots; this figure is mapbox-based,
# so this call probably has no effect - confirm before removing it.
fig.update_geos(fitbounds="locations")
fig.show()

## Rolling window correlation plot

In [None]:
# --- Inputs for the rolling-window correlation plot ---
pollutant = 'PM25'

# Use the first station stored in `gs_data_best`. Element 0 of its tuple is the
# station's data frame; keep only the pollutant column(s) and expose the former
# index as a regular `time` column.
best_gs_name = list(gs_data_best)[0]
station_var = POL_VAR_NAMES[pollutant]['STATION']
df_best_gs = (
    gs_data_best[best_gs_name][0][station_var]
    .reset_index()
    .rename(columns={'index': 'time'})
)

# Plot window: leave as None to use the first / last timestamp of `df_hourly`,
# or set a custom value, e.g. datetime(year=2021, month=4, day=20).
start_date = None
end_date = None  # e.g. datetime(year=2021, month=4, day=25)

start_date = df_hourly['UTCDateTime'].iloc[0] if start_date is None else start_date
end_date = df_hourly['UTCDateTime'].iloc[-1] if end_date is None else end_date

# Prepare the three series, then draw the rolling correlation figure.
df_ground, df_cams_merged, df_best_gs_merged = preprocess_and_calc_correlation(
    pollutant, df_hourly, df_cams, df_best_gs
)
fig = create_rolling_correlation_plot(
    df_ground,
    df_cams_merged,
    df_best_gs_merged,
    start_date,
    pollutant,
    best_gs_name=best_gs_name,
    name='correlation',
)
fig

## Create interactive Map vs. Graph Plot

In [None]:
pollutant = 'PM25'
pm_wildfire = False  # Use the CAMS analysis wildfire dataset for the maps

if pm_wildfire:
    ds_pmwf = xr.open_dataset(Path(DATA_DIR_CAMS_AN).joinpath("pmwf_conc.nc")).sel(
        time=slice(start_date, end_date)).squeeze()

# Every calendar year touched by the plot window, in ascending order. A plain
# set of the two end years would skip intermediate years for windows spanning
# more than two years and has no guaranteed ordering.
years = list(range(start_date.year, end_date.year + 1))
ds_pol = create_dataset(pollutant, years=years)
ds_pol = ds_pol.sel(time=slice(start_date, end_date)).squeeze()

cmap = plt.get_cmap('YlOrBr')

# Pick the dataset that is rendered on the map, then derive the colour scale
# limits from its 2.5 % and 97.5 % quantiles.
if pm_wildfire:
    map_dataset, map_pollutant = ds_pmwf, 'PMWF'
else:
    map_dataset, map_pollutant = ds_pol, pollutant

map_values = map_dataset[POL_VAR_NAMES[map_pollutant]['CAMS']]
min_val = float(map_values.quantile(.025))
max_val = float(map_values.quantile(.975))
images = create_dataset_images(map_dataset, map_pollutant)

# `gdf_burnscars` and `gdf_fireevents` are not defined in this cell; they must
# already exist in the notebook namespace.
fig = create_map_vs_graph_plot(
    pollutant, df_hourly, df_cams, start_date,
    lat, long, min_val, max_val, images, gdf_burnscars, gdf_fireevents, name='cams_and_fires')

In [None]:
# Add the CAMS series and the selected EPA station series (`best_gs_name`) to
# the combined table, joined on the index of both frames.
#
# Each target column is dropped first (if it already exists) so that re-running
# this cell replaces it instead of producing `_x` / `_y` suffixed duplicates.
cams_col = 'cams_pm2_5'
df_cams_merge = df_cams.set_index('time').rename(columns={'pm2p5_conc': cams_col})
df_all_data = df_all_data.drop(columns=[cams_col], errors='ignore').merge(
    df_cams_merge, how='left', left_index=True, right_index=True)

station_col = f"{best_gs_name.replace(', ', '_').replace(' ', '_').lower()}_pm2_5"
df_best_gs_merge = df_best_gs.set_index('time').rename(columns={'PM2.5': station_col})
df_all_data = df_all_data.drop(columns=[station_col], errors='ignore').merge(
    df_best_gs_merge, how='left', left_index=True, right_index=True)

In [None]:
# Write the combined table to disk. The target folder is created first so the
# export does not fail when the relative `data/` directory does not exist yet.
all_data_csv = Path('data/all_data.csv')
all_data_csv.parent.mkdir(parents=True, exist_ok=True)
df_all_data.to_csv(all_data_csv)