In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
df = pd.read_csv('airquality.csv')
df

Unnamed: 0,site_id,site,country,site_type,site_area,elevation,date,pm10,pm2.5,no2,o3,so2
0,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-01,18.279,16.070,6.234,28.100,0.380
1,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-02,13.359,12.360,9.243,24.121,0.339
2,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-03,11.934,10.135,17.199,16.999,0.404
3,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-04,8.834,7.792,6.280,50.592,0.408
4,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-05,5.947,5.075,4.881,69.127,0.383
...,...,...,...,...,...,...,...,...,...,...,...,...
24525,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-27,13.754,5.612,10.912,27.125,
24526,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-28,13.238,4.188,17.721,25.500,
24527,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-29,15.954,7.617,18.112,21.250,
24528,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-30,28.319,15.081,22.683,26.250,


In [3]:
who_standards = {
    'pm10': {'limit':45, 'permitted':3},
    'pm2.5': {'limit':15, 'permitted':3},
    'no2' : {'limit':25, 'permitted':3},
    'o3' : {'limit':100, 'permitted':3},
    'so2' : {'limit':40, 'permitted':3},
}

# change ['pm10', 'pm2.5', 'no2', 'o3', 'so2'] to float
df['pm10'] = pd.to_numeric(df['pm10'], errors='raise')
df['pm2.5'] = pd.to_numeric(df['pm2.5'], errors='raise')
df['no2'] = pd.to_numeric(df['no2'], errors='raise')
df['o3'] = pd.to_numeric(df['o3'], errors='raise')
df['so2'] = pd.to_numeric(df['so2'], errors='raise')

dff = pd.DataFrame(
    index=df['site_id'].unique(),
    columns=who_standards.keys()
)

for i in who_standards:
    dff[i] = df[df[i] > who_standards[i]['limit']].groupby('site_id').size()

dff.reset_index(inplace=True, names=['site_id'])
dff

Unnamed: 0,site_id,pm10,pm2.5,no2,o3,so2
0,at0ill1,,41.0,2.0,101.0,
1,at31401,,34.0,19.0,66.0,
2,at32701,,46.0,24.0,74.0,
3,at4s406,2.0,49.0,58.0,67.0,
4,at4s416,3.0,76.0,124.0,48.0,
...,...,...,...,...,...,...
63,pt02019,1.0,46.0,1.0,82.0,
64,pt03063,9.0,43.0,79.0,33.0,
65,pt03071,1.0,30.0,142.0,44.0,
66,pt03072,2.0,23.0,220.0,25.0,


In [4]:
df[df[i] > who_standards['so2']['limit']].groupby('site_id').size()

site_id
dehh015     3
gr0030a     1
hu0057a    52
it0461a     1
dtype: int64

In [5]:
who_standards = {
    'pm10': {'limit':45, 'permitted':3},
    'pm2.5': {'limit':15, 'permitted':3},
    'no2' : {'limit':25, 'permitted':3},
    'o3' : {'limit':100, 'permitted':3},
    'so2' : {'limit':40, 'permitted':3},
}

### Number of total excedences by date

In [6]:
df_date = pd.DataFrame(
    index=df['date'].unique(),
    columns=who_standards.keys()
)
for i in who_standards:
    df_date[i] = df[df[i] > who_standards[i]['limit']].groupby('date').size()

df_date["total"] = df_date.sum(axis=1)
df_date

Unnamed: 0,pm10,pm2.5,no2,o3,so2,total
2023-01-01,8.0,20.0,11,,,39.0
2023-01-02,6.0,15.0,21,,,42.0
2023-01-03,7.0,20.0,34,,,61.0
2023-01-04,6.0,20.0,21,,,47.0
2023-01-05,5.0,16.0,20,,,41.0
...,...,...,...,...,...,...
2023-12-27,5.0,14.0,30,,1.0,50.0
2023-12-28,4.0,24.0,25,,1.0,54.0
2023-12-29,9.0,15.0,24,,1.0,49.0
2023-12-30,6.0,20.0,26,,1.0,53.0


In [7]:
fig = px.line(df_date, x=df_date.index, y='total', title='Exceedances per day', labels={'pm10':'Count', 'date':'Date'})
fig.show()

### Spain

In [8]:
df_spain = df.query('country == "spain"')
df_spain

Unnamed: 0,site_id,site,country,site_type,site_area,elevation,date,pm10,pm2.5,no2,o3,so2
7608,es0118a,ESCUELAS AGUIRRE,spain,traffic,urban,672.0,2023-01-01,21.542,13.583,37.417,42.044,2.542
7609,es0118a,ESCUELAS AGUIRRE,spain,traffic,urban,672.0,2023-01-02,15.250,8.250,38.143,37.875,2.091
7610,es0118a,ESCUELAS AGUIRRE,spain,traffic,urban,672.0,2023-01-03,15.792,9.625,42.739,39.250,2.292
7611,es0118a,ESCUELAS AGUIRRE,spain,traffic,urban,672.0,2023-01-04,18.500,9.667,51.750,31.000,2.792
7612,es0118a,ESCUELAS AGUIRRE,spain,traffic,urban,672.0,2023-01-05,19.375,10.792,52.958,29.875,2.875
...,...,...,...,...,...,...,...,...,...,...,...,...
9758,es1610a,FONERS,spain,traffic,urban,23.0,2023-12-27,34.625,5.000,32.583,56.375,3.292
9759,es1610a,FONERS,spain,traffic,urban,23.0,2023-12-28,44.625,5.042,35.391,59.250,2.783
9760,es1610a,FONERS,spain,traffic,urban,23.0,2023-12-29,53.250,5.739,43.478,60.625,2.522
9761,es1610a,FONERS,spain,traffic,urban,23.0,2023-12-30,51.391,3.957,37.304,48.500,2.435


In [9]:
df_spain = df.query('country == "spain"')


In [10]:
def calculate_mean_diff(site_id_1, site_id_2):
    return abs(df_spain.pivot(index='date', columns='site_id', values=['pm10'])['pm10'][site_id_1] - df_spain.pivot(index='date', columns='site_id', values=['pm10'])['pm10'][site_id_2]).mean()

l = df_spain.site_id.unique()
means = []
for li in l:
    for lj in l:
        if li != lj:
            means.append((calculate_mean_diff(li, lj), li, lj))

fig = px.bar(pd.DataFrame(means,
                          columns=['mean', 'site_id_1', 'site_id_2']).sort_values(by='mean', ascending=False),
                          x='site_id_1', 
                          labels={'site_id_1':'Site 1', 'site_id_2':'Site 2', 'mean':'Mean difference'},
                          y='mean', 
                          color='site_id_2', 
                          title='Mean difference between sites')
fig.show()

es0118a MADRID 

es1239a VALENCIA

es1269a OVIEDO

es1271a GIJON

es1353a OVIEDO

es1610a MALLORCA

In [11]:
display(df)
df.country.unique()

Unnamed: 0,site_id,site,country,site_type,site_area,elevation,date,pm10,pm2.5,no2,o3,so2
0,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-01,18.279,16.070,6.234,28.100,0.380
1,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-02,13.359,12.360,9.243,24.121,0.339
2,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-03,11.934,10.135,17.199,16.999,0.404
3,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-04,8.834,7.792,6.280,50.592,0.408
4,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-05,5.947,5.075,4.881,69.127,0.383
...,...,...,...,...,...,...,...,...,...,...,...,...
24525,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-27,13.754,5.612,10.912,27.125,
24526,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-28,13.238,4.188,17.721,25.500,
24527,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-29,15.954,7.617,18.112,21.250,
24528,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-30,28.319,15.081,22.683,26.250,


array(['austria', 'belgium', 'germany', 'spain', 'france', 'greece',
       'hungary', 'italy', 'netherlands', nan, 'poland', 'portugal'],
      dtype=object)

In [12]:
df.query('country == "spain" or country == "portugal" or country == "italy" or country == "greece"')
df.query('country == "germany" or country == "belgium" or country == "poland" or country == "hungary"')

Unnamed: 0,site_id,site,country,site_type,site_area,elevation,date,pm10,pm2.5,no2,o3,so2
2186,betn043,41N043 - HAREN,belgium,industrial,suburban,17.0,2023-01-01,12.571,6.696,10.521,52.125,2.215
2187,betn043,41N043 - HAREN,belgium,industrial,suburban,17.0,2023-01-02,10.441,7.083,20.935,49.875,2.500
2188,betn043,41N043 - HAREN,belgium,industrial,suburban,17.0,2023-01-03,11.375,6.067,25.938,51.750,2.423
2189,betn043,41N043 - HAREN,belgium,industrial,suburban,17.0,2023-01-04,8.643,5.160,11.375,60.625,2.226
2190,betn043,41N043 - HAREN,belgium,industrial,suburban,17.0,2023-01-05,17.219,8.516,21.977,60.875,2.380
...,...,...,...,...,...,...,...,...,...,...,...,...
22337,pl0212a,Wschowa ul. Kazimierza Wielkiego,poland,background,urban,90.0,2023-12-27,12.038,8.345,9.816,67.191,7.280
22338,pl0212a,Wschowa ul. Kazimierza Wielkiego,poland,background,urban,90.0,2023-12-28,15.999,11.276,16.043,34.463,12.401
22339,pl0212a,Wschowa ul. Kazimierza Wielkiego,poland,background,urban,90.0,2023-12-29,7.387,5.321,8.965,53.523,12.876
22340,pl0212a,Wschowa ul. Kazimierza Wielkiego,poland,background,urban,90.0,2023-12-30,14.073,10.653,11.161,66.304,12.836


Dia de la semana

    MONDAY = 0

### Normalized mean air quality per weekday

In [13]:
df_week = pd.read_csv('airquality.csv')
df_week['date'] = pd.to_datetime(df_week['date'])
df_week['weekday'] = df_week['date'].dt.weekday

df_week

Unnamed: 0,site_id,site,country,site_type,site_area,elevation,date,pm10,pm2.5,no2,o3,so2,weekday
0,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-01,18.279,16.070,6.234,28.100,0.380,6
1,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-02,13.359,12.360,9.243,24.121,0.339,0
2,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-03,11.934,10.135,17.199,16.999,0.404,1
3,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-04,8.834,7.792,6.280,50.592,0.408,2
4,at0ill1,Illmitz am Neusiedler See,austria,background,rural_regional,117.0,2023-01-05,5.947,5.075,4.881,69.127,0.383,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24525,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-27,13.754,5.612,10.912,27.125,,2
24526,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-28,13.238,4.188,17.721,25.500,,3
24527,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-29,15.954,7.617,18.112,21.250,,4
24528,pt03083,Laranjeiro,portugal,background,urban,63.0,2023-12-30,28.319,15.081,22.683,26.250,,5


In [None]:
df_week_gb = df_week.groupby('weekday')[['pm10','pm2.5','no2','o3','so2']].mean()
df_week_gb = df_week_gb / df_week_gb.mean()
df_week_gb = round(df_week_gb,2)

fig = go.Figure()
for i in df_week_gb.columns:
    fig.add_scatter(x=df_week_gb.index, 
                    y=df_week_gb[i], 
                    mode='lines+markers', 
                    name=i,
                    opacity=1 if i == 'no2' else 0.25)

fig.update_layout(yaxis=dict(showgrid=True,
                             tickmode='array',
                             gridcolor='rgba(0,0,0,0.2)', 
                             gridwidth=2,
                             griddash='dash',                             
                             tickvals=[1],), 
                  xaxis=dict(showgrid=True,
                             tickmode='array', 
                             tickvals=[0,1,2,3,4,5,6], 
                             ticktext=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']),
                  title=dict(
                      text='Normalized by mean air quality per weekday', 
                      font=dict(size=25, color='black', weight='bold'), 
                      subtitle=dict(text='pm10, pm2.5, no2, o3, so2', 
                                    font=dict(size=15))),
                  plot_bgcolor='white',
                  hovermode='x unified',
                  hoverlabel=dict(align='right'),
                  spikedistance=10)
fig.add_annotation(x=5.96, 
                   y=0.7960,
                   showarrow=True, 
                   arrowwidth=2, 
                   arrowhead=4, 
                   arrowside='end', 
                   arrowcolor='black',
                   ax=-130,
                   ay=-33)
fig.add_annotation(x=5, 
                   y=0.89, 
                   showarrow=True, 
                   arrowwidth=2, 
                   arrowhead=4, 
                   arrowside='end', 
                   arrowcolor='black',
                   ax=-80,
                   ay=30)
fig.add_annotation(x=3.5,
                   y=.84,
                   showarrow=False,
                   text='no2 messurements are improving on the weekends',
                   font=dict(size=15, color='black', weight='bold'),)
fig.add_annotation(x=.50,
                   y=0.995,
                   showarrow=True,
                   arrowwidth=1, 
                   arrowhead=4, 
                   arrowside='end', 
                   arrowcolor='gray',
                   ax=0,
                   ay=80,
                   text='1 means the mean value of all weekdays',
                   font=dict(size=12, color='gray'),)

fig.show()

### Which is the improvment of no2 on weekends by country?

In [417]:
df_week_gb_country = df_week.groupby(['weekday','country'])[['no2']].mean()
# df_week_gb_country = df_week_gb_country / df_week_gb_country.mean()
df_week_gb_country = round(df_week_gb_country,2)

df_week_gb_country.reset_index(inplace=True, level=1)
df_week_gb_country.dropna(inplace=True, how='any')

fig = go.Figure()
for country in df_week_gb_country['country'].unique():
    fig.add_scatter(x=df_week_gb_country[df_week_gb_country.country==country].index, 
                    y=df_week_gb_country[df_week_gb_country.country==country]['no2'], 
                    mode='lines+markers', 
                    name=country,
                    marker=dict(
                        color='lightgrey',
                        size=8),
                    opacity=0.8)
  
fig.add_scatter(x=df_week_gb_country.reset_index().groupby('weekday').no2.mean().index,
                y=df_week_gb_country.reset_index().groupby('weekday').no2.mean(),
                mode='lines+markers',
                name='mean',
                marker=dict(
                    color='black',
                    size=10),
                line=dict(color='black', width=2),
                opacity=1)
    
fig.update_layout(yaxis=dict(showgrid=True,
                             tickmode='array',
                             gridcolor='rgba(0,0,0,0.2)', 
                             gridwidth=2,
                             griddash='dash',                             
                             tickvals=[1],), 
                  xaxis=dict(showgrid=True,
                             tickmode='array', 
                             tickvals=[0,1,2,3,4,5,6], 
                             ticktext=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']),
                  title=dict(
                      text='Normalized by mean air quality per weekday', 
                      font=dict(size=25, color='black', weight='bold'), 
                      subtitle=dict(text='pm10, pm2.5, no2, o3, so2', 
                                    font=dict(size=20))),
                  plot_bgcolor='white',
                #   hovermode='x unified',
                #   hoverlabel=dict(align='right'),
                  showlegend=False,
                  spikedistance=10)

In [16]:
df_week_gb_country[df_week_gb_country.country=='greece']

Unnamed: 0_level_0,country,no2
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
0,greece,36.48
1,greece,38.86
2,greece,39.06
3,greece,40.15
4,greece,40.49
5,greece,34.29
6,greece,30.75


In [17]:
df_week_gb_country.country.unique()

array(['austria', 'belgium', 'france', 'germany', 'greece', 'hungary',
       'italy', 'netherlands', 'poland', 'portugal', 'spain'],
      dtype=object)

In [18]:
df_site_area = pd.read_csv('airquality.csv')

In [19]:
df_week_gb_site_area = df_week.groupby(['weekday','site_area'])[['no2']].mean()
display(df_week_gb_site_area)
# df_week_gb_site_area = df_week_gb_site_area / df_week_gb_site_area.mean()
df_week_gb_site_area = round(df_week_gb_site_area,2)

df_week_gb_site_area.reset_index(inplace=True, level=1)
df_week_gb_site_area.dropna(inplace=True, how='any')

fig = go.Figure()
for site_area in df_week_gb_site_area['site_area'].unique():
    fig.add_scatter(x=df_week_gb_site_area[df_week_gb_site_area.site_area==site_area].index, 
                    y=df_week_gb_site_area[df_week_gb_site_area.site_area==site_area]['no2'], 
                    mode='lines+markers', 
                    name=site_area,
                    opacity=1 if site_area == 'greece' else 1)
    
fig.update_layout(yaxis=dict(showgrid=True,
                             tickmode='array',
                             gridcolor='rgba(0,0,0,0.2)', 
                             gridwidth=2,
                             griddash='dash',                             
                             tickvals=[1],), 
                  xaxis=dict(showgrid=True,
                             tickmode='array', 
                             tickvals=[0,1,2,3,4,5,6], 
                             ticktext=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']),
                  title=dict(
                      text='Normalized by mean air quality per weekday', 
                      font=dict(size=25, color='black', weight='bold'), 
                      subtitle=dict(text='pm10, pm2.5, no2, o3, so2', 
                                    font=dict(size=20))),
                  plot_bgcolor='white',
                #   hovermode='x unified',
                #   hoverlabel=dict(align='right'),
                  spikedistance=10)

fig.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,no2
weekday,site_area,Unnamed: 2_level_1
0,rural,5.462773
0,rural_regional,4.696866
0,suburban,18.172574
0,urban,22.36959
1,rural,6.08416
1,rural_regional,5.14296
1,suburban,18.757258
1,urban,23.13694
2,rural,6.51125
2,rural_regional,5.06851


### Pollution given the elevation of the site

In [119]:
df_elevation = pd.read_csv('airquality.csv')

# plot number of rows by elevation
fig = px.histogram(df_elevation, 
                   x='elevation', 
                   title='Number of rows by elevation',
                #    subtitle='each bar represents the number of rows with +/- 50 meters elevation',
                   nbins=50)
fig.show()

In [129]:
# group elevation by 50 meters
df_elevation['elevation_rounded'] = df_elevation['elevation'] // 50 * 50
df_elevation = df_elevation[df_elevation['elevation_rounded'] < 400]

In [None]:
def plot_elevation(pollutant):
    df_elevation_gb = df_elevation.groupby('elevation_rounded')[[pollutant]].mean()

    fig = go.Figure()
    fig.add_scatter(
        x=df_elevation_gb.index,
        y=df_elevation_gb[pollutant],
        mode='lines+markers',
        name=pollutant,
    )

    fig.update_layout(yaxis=dict(showgrid=True,
                                tickmode='array',
                                gridcolor='rgba(0,0,0,0.2)', 
                                gridwidth=2,
                                griddash='dash',                             
                                tickvals=[1],), 
                    xaxis=dict(showgrid=True,),
                    title=dict(
                        text='Mean air quality per elevation', 
                        font=dict(size=25, color='black', weight='bold'), 
                        subtitle=dict(text=pollutant, 
                                        font=dict(size=15))),
                    plot_bgcolor='white',
                    hovermode='x unified',
                    hoverlabel=dict(align='right'),
                    spikedistance=10)

    fig.show()


df_elevation
pollutants = ['pm10', 'pm2.5', 'no2', 'o3', 'so2']
for i in pollutants:
    plot_elevation(i)

### Number of exceedances per area

In [None]:
df_excee_area = pd.read_csv('airquality.csv')

who_standards = {
    'pm10': 45,
    'pm2.5': 15,
    'no2': 25,
    'o3': 100,
    'so2': 40,
}

for i in who_standards:
    df_excee_area[f"{i}_exc"] = df_excee_area[i] > who_standards[i]

df_excee_area_gb = df_excee_area.groupby('site_area')[['pm10_exc', 'pm2.5_exc', 'no2_exc', 'o3_exc', 'so2_exc']]

# how many times each site area exceeded the limits
df_excee_area_frec = df_excee_area_gb.sum() / df_excee_area_gb.count()

fig = go.Figure()
for i in df_excee_area_frec.columns:
    fig.add_bar(
        x=df_excee_area_frec.index, 
        y=round(df_excee_area_frec[i] * 100,1), 
        name=i, 
        text=round(df_excee_area_frec[i] * 100,1), 
        textposition='auto',
        opacity=1 if i == 'no2_exc' else 0.40
    )
    
fig.update_layout(
    title='Percentage of exceedances per site area',
    xaxis=dict(title='Site area'),
    yaxis=dict(title='Exceedances'),
    barmode='group')

fig.update_layout(
    barmode='group',
    yaxis=dict(
        showgrid=False,
        tickvals=[],
        title='',
    ),
    xaxis=dict(
        title='',
    ),          
)
fig.show()

### Measurements over the eu-limit of no2 from urban spain

In [456]:
standards = {
    'pm10': {'who':{'limit':45, 'permitted':3}, 'eu':{'limit':50, 'permitted':35}},
    'pm2.5': {'who':{'limit':15, 'permitted':3}, 'eu':{'limit':None, 'permitted':None}},
    'no2': {'who':{'limit':25, 'permitted':3}, 'eu':{'limit':None, 'permitted':None}},
    'o3': {'who':{'limit':100, 'permitted':3}, 'eu':{'limit':120, 'permitted':25/3}},
    'so2': {'who':{'limit':40, 'permitted':3}, 'eu':{'limit':125, 'permitted':3}},
}

def plot_urban_spain(pollutant):
    limit1 = standards[pollutant]['who']['limit']
    limit2 = standards[pollutant]['eu']['limit']

    df_urban_spain = pd.read_csv('airquality.csv')
    df_urban_spain = df_urban_spain.query('country == "spain" and site_area == "urban"')

    df_urban_spain = df_urban_spain[['site_id','date', pollutant]]
    df_urban_spain['date'] = pd.to_datetime(df_urban_spain['date'])

    df_urban_spain = df_urban_spain.pivot(index='date', columns='site_id', values=pollutant)

    colors = px.colors.qualitative.Plotly
    color_map = {col: colors[idx % len(colors)] for idx, col in enumerate(df_urban_spain.columns)}

    fig = go.Figure()
    for i in df_urban_spain.columns:
        df_urban_spain_i_lower = df_urban_spain[i][df_urban_spain[i] <= limit1]
        df_urban_spain_i_higher = df_urban_spain[i][df_urban_spain[i] > limit1]

        fig.add_scatter(
            x=df_urban_spain_i_lower.index, 
            y=df_urban_spain_i_lower, 
            mode='markers', 
            name=i,
            opacity=1,
            marker=dict(color='rgb(255,255,255)'),        
            showlegend=False,
        )

        fig.add_scatter(
            x=df_urban_spain_i_lower.index, 
            y=df_urban_spain_i_lower, 
            mode='markers', 
            name=i,
            opacity=0.4,
            marker=dict(color=color_map[i]),        
            showlegend=False
        )

        fig.add_scatter(
            x=df_urban_spain_i_higher.index, 
            y=df_urban_spain_i_higher, 
            mode='markers', 
            name=i,
            opacity=1,
            marker=dict(color=color_map[i]),        
        )

    fig.update_layout(
        title=f'{pollutant} levels in urban areas in Spain',
        plot_bgcolor='white',
        paper_bgcolor='white',
        yaxis=dict(
            showgrid=False,
            tickvals=[limit1, limit2] if limit2 != None else [limit1],
            title=f'{pollutant} levels',
        ),
        xaxis=dict(
            showgrid=False,
        ),
    )

    fig.add_shape(
        type="rect",
        x0=0, x1=1,
        y0=df_urban_spain.min().min(), y1=limit1,
        xref="paper",
        yref="y",
        fillcolor="green",
        opacity=0.25,
        layer="below",
        line=dict(width=0)
    )

    if limit2 != None:
        fig.add_shape(
            type="rect",
            x0=0, x1=1, 
            y0=limit1, y1=limit2,
            xref="paper",
            yref="y",
            fillcolor="yellow",
            opacity=0.25,
            layer="below",
            line=dict(width=0)
        )

    fig.add_shape(
        type="rect",
        x0=0, x1=1, 
        y0=limit2 if limit2 != None else limit1, y1=df_urban_spain.max().max(),
        xref="paper",
        yref="y",
        fillcolor="red",
        opacity=0.25,
        layer="below",
        line=dict(width=0)
    )

    if limit1 != None:
        fig.add_shape(
            type="line",
            x0=0, x1=1,
            y0=limit1, y1=limit1,
            xref="paper",
            yref="y",
            layer="above",
            line=dict(color="black", width=2)
        )

    if limit2 != None:
            fig.add_shape(
            type="line",
            x0=0, x1=1,
            y0=limit2, y1=limit2,
            xref="paper",
            yref="y",
            layer="above",
            line=dict(color="black", width=2)
    )


    fig.show()

plot_urban_spain('no2')

### Check worse no2 site in spain

(most values over limit)

In [645]:
df_worse_site_spain = pd.read_csv('airquality.csv')
df_worse_site_spain = df_worse_site_spain.query('country == "spain" and site_area == "urban" and no2 > 25')

df_worse_site_spain = df_worse_site_spain.groupby('site_id')[['no2']].count()
df_worse_site_spain = df_worse_site_spain.sort_values(by='no2', ascending=False)

colors = [
    'rgba(26, 26, 63, 1)',
    'rgba(112, 157, 147, 0.7)',
    'rgba(141, 178, 163, 0.7)',
    'rgba(170, 198, 182, 0.7)',
    'rgba(200, 219, 204, 0.7)',
    'rgba(230, 239, 230, 0.7)',
]

fig = go.Figure()
fig.add_pie(
    labels=df_worse_site_spain.index, 
    values=df_worse_site_spain['no2'], 
    text=df_worse_site_spain['no2'], 
    textposition='auto',
    textinfo='value',
    marker=dict(
        colors=colors,
    ),
    hole=0.15,
)


fig.update_layout(
    title=dict(
        text='Number of times each site exceeded the no2 limit in Spain',
        font=dict(
                size=30,
        ),
        subtitle=dict(
            text='out of all exceedances, how many times each site exceeded the limit',
            font=dict(
                size=20,
            ),
        ),
        pad=dict(
            b=100,
        ),
    ),
    xaxis=dict(
        title=''
    ),
    yaxis=dict(
        title='',
        tickvals=[],
    ),
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.add_annotation(
    text="site \'es01108a\' exceeded the most",
    xref="paper",
    yref="paper",
    x=0.67,
    y=0.5,
    showarrow=True,
    ay=0,
    ax=150,
    arrowhead=1,
    arrowwidth=2,
    arrowcolor="black",
    font=dict(
        size=12,
        color="black"
    ),
    align="left",
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1,
    borderpad=4
)
fig.show()

### Analyse the most no2 polluted site in spain

In [None]:
limit = 25
most_polluted_site = 'es0118a'
df_most_site_urban_spain = pd.read_csv('airquality.csv')
df_most_site_urban_spain = df_most_site_urban_spain.query(f'site_id == "{most_polluted_site}"')[['site_id','date','no2']]
df_most_site_urban_spain['date'] = pd.to_datetime(df_most_site_urban_spain['date'])

df_most_site_urban_spain['month'] = df_most_site_urban_spain['date'].dt.month
df_most_site_urban_spain_month = df_most_site_urban_spain.groupby('month')[['no2']].mean()

fig = go.Figure()
fig.add_scatter(
    x=df_most_site_urban_spain_month.index, 
    y=df_most_site_urban_spain_month['no2'], 
    mode='lines+markers', 
    name='no2', 
    opacity=1
)

fig.add_scatter(
    x=df_most_site_urban_spain_month.query(f"no2 > {limit}").index, 
    y=df_most_site_urban_spain_month.query(f"no2 > {limit}")['no2'], 
    mode='markers', 
    name='no2', 
    opacity=1,
    marker=dict(
        size=10,
        color='red',
    ),
)

fig.update_layout(
    title=dict(
        text='Levels of no2 in the most polluted site in Spain',
        font=dict(
                size=30,
        ),
        subtitle=dict(
            text='almost every month the no2 levels exceed the limit',
            font=dict(
                size=20,
            ),
        ),
        pad=dict(
            b=100,
        ),
    ),
    xaxis=dict(
        showgrid=False,
        tickvals=[1,2,3,4,5,6,7,8,9,10,11,12],
        ticktext=['January','February','March','April','May','June','July','August','September','October','November','December'],
    ),
    yaxis=dict(
        showgrid=False,
        title='NO2 levels (µg/m³)',
    ),
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False
)

fig.add_shape(
    type="line",
    x0=0, x1=1,
    y0=limit, y1=limit,
    xref="paper",
    yref="y",
    layer="above",
    line=dict(
        color="red", 
        width=2,
        dash="dash"        
    )
)

fig.show()

### Number of no2 exceedances per site in urban spain

In [422]:
df_number_urban_spain = pd.read_csv('airquality.csv')
df_number_urban_spain = df_number_urban_spain.query('country == "spain" and site_area == "urban"')

for i in who_standards:
    df_number_urban_spain[f"{i}_exc"] = df_number_urban_spain[i] > 25

df_number_urban_spain = df_number_urban_spain[['site_id','pm10_exc']]
df_number_urban_spain_by = df_number_urban_spain.groupby('site_id').sum()
df_number_urban_spain_by = df_number_urban_spain_by / df_number_urban_spain_by.sum() *100
df_number_urban_spain_by = round(df_number_urban_spain_by, 3)

fig = go.Figure()

fig.add_pie(
    labels=df_number_urban_spain_by.index, 
    values=df_number_urban_spain_by['pm10_exc'], 
    name='no2_exc',
    hole=.3,
    opacity=1,
    textinfo='percent',
)
fig.show()

### Most polluted site by country

In [54]:
pollutant = 'no2'
df_most_site_country = pd.read_csv('airquality.csv')
df_most_site_country = df_most_site_country.groupby(['country','site_id'])[[pollutant]].mean()
df_most_site_country.sort_values(by=pollutant, ascending=False, inplace=True)
df_most_site_country.reset_index(inplace=True)
df_most_site_country

colors = [
    'blue' if country == "greece" 
    else 'yellow' if country == "germany" 
    else 'green' if country == "italy" 
    else 'red' if country == "spain" 
    else 'green' if country == "portugal" 
    else 'gray' if country == "hungary" 
    else 'grey'
        for country in df_most_site_country['country'].head(10)
]

texts = [
    f'{country}<br>{round(value,2)}' 
    for country, value in
    zip(df_most_site_country['country'].head(10), df_most_site_country[pollutant].head(10))
]

fig = go.Figure()
fig.add_bar(
    x=df_most_site_country['site_id'].head(10), 
    y=df_most_site_country[pollutant].head(10), 
    text=texts, 
    textangle=0,
    textposition='auto',
    marker=dict(
        color=colors,
    ),
    opacity=0.8,
)

fig.update_layout(
    title=dict(
        text='Most polluted sites in Europe',
        font=dict(
                size=30,
        ),
        subtitle=dict(
            text='Displaying no2 levels in the top 10 most polluted sites',
            font=dict(
                size=20,
            ),
        ),
        pad=dict(
            b=100,
        ),
    ),
    xaxis=dict(
        title='Site ID'
    ),
    yaxis=dict(
        title=f'{pollutant} (µg/m³)',
        tickvals=[],
    ),
    plot_bgcolor='rgba(0,0,0,0)',
    barmode='group',
)
