In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

Air Quality Dataset from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Air+Quality).

In [15]:
# Read the prepared Air Quality CSV; the Date_Time column is parsed as
# datetimes and used as the index of the frame.
csv_path = '../../Datasets/AirQualityUCI_ready.csv'
df = pd.read_csv(csv_path, index_col=['Date_Time'], parse_dates=['Date_Time'])
df.head()

Unnamed: 0_level_0,CO_true,CO_sensor,NMHC_true,C6H6_true,NMHC_sensor,NOX_true,NOX_sensor,NO2_true,NO2_sensor,O3_sensor,T,RH,AH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-10-03 18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-10-03 19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-10-03 20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-10-03 21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-10-03 22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


In [10]:
# One panel per column of the raw frame, to eyeball data quality over time.
df.plot(figsize=(25, 25), subplots=True);


<img src='./plots/air-pollution-raw-data-plot.png'>

Due to data quality issues, for this demo we will:

- reduce the data to the span between April 2004 and April 2005 (best quality).

- resample the time series to ensure that the time between each row is one hour.

- work only with the sensor values.

- remove negative values (treated as outliers) by replacing them with NaN.


In [17]:
# Keep only the rows whose timestamp lies between '2004-04-01' and
# '2005-04-30' (both bounds inclusive).

window_start, window_end = '2004-04-01', '2005-04-30'
in_window = (df.index >= window_start) & (df.index <= window_end)
data = df[in_window]
print('Start date :',data.index.min(), 'End date :', data.index.max())

Start date : 2004-04-04 00:00:00 End date : 2005-04-04 14:00:00


In [18]:
# Re-index the data onto a regular hourly grid.
# Any timestamp missing from the original index (i.e. a gap larger than 1 hour)
# is inserted as a row of NaN.
# The lowercase 'h' alias is used because the uppercase 'H' alias is deprecated
# since pandas 2.2; 'h' is accepted by older versions as well.

data = data.asfreq(freq='1h')

In [21]:
# Remove measurements from fixed stations (the `*_true` columns).
# We'll only be using sensor data.
cols_to_remove = [f for f in data.columns if '_true' in f]
# Remove absolute humidity.
cols_to_remove.append('AH')

# Rebind instead of dropping in place, and ignore labels that are already
# gone, so that re-running this cell does not raise a KeyError on 'AH'.
data = data.drop(columns=cols_to_remove, errors='ignore')

In [30]:
# Negative readings are treated as invalid: blank them out as NaN.
data = data.mask(data < 0)

In [31]:
# Number of missing values in each column.
data.isna().sum(axis=0)

CO_sensor      1382
NMHC_sensor    1382
NOX_sensor     1382
NO2_sensor     1382
O3_sensor      1382
T              1395
RH             1382
dtype: int64

In [121]:
# One panel per remaining column, after filtering and cleaning.
data.plot(figsize=(25, 25), subplots=True);

<img src='./plots/air-pollution-working-data.png'>

## Let's look for patterns

In [41]:
# Calendar features derived from the datetime index; they serve as grouping
# keys for the pattern plots further down.
timestamps = data.index
data = data.assign(
    date=timestamps.date,
    hour_of_day=timestamps.hour,
    day_of_week=timestamps.day_of_week,
    month=timestamps.month,
    week_of_year=timestamps.isocalendar().week,
)

## Daily pattern

In [119]:
# Sensor columns only (names containing '_sensor').
cols = [c for c in data.columns if '_sensor' in c]
# Select the sensor columns *before* aggregating: the frame also holds the
# non-numeric `date` column, and averaging every column first raises a
# TypeError on pandas >= 2.0 (and wastes work on older versions).
data.groupby(by='hour_of_day')[cols].mean().plot(subplots=True, figsize=(10, 8));

<img src='./plots/daily pattern - mean.png'>

In [120]:
# One panel per sensor: every individual day as a faint line, with the
# mean over all days for each hour drawn on top in blue.
# squeeze=False keeps `ax` an array even when there is a single sensor column.
fig, ax = plt.subplots(nrows=len(cols), ncols=1, figsize=(10, 20), sharex=True, squeeze=False)
ax = ax.ravel()
# Compute the hourly means once, on the sensor columns only. Averaging the
# whole frame would include the non-numeric `date` column, which raises a
# TypeError on pandas >= 2.0.
hourly_mean = data.groupby('hour_of_day')[cols].mean()
for i, c in enumerate(cols):
    data.pivot_table(values=c, index='hour_of_day', columns='date').plot(legend=False, color='lightblue', alpha=0.5, title=c, ax=ax[i])
    hourly_mean[c].plot(ax=ax[i], color='b')
plt.tight_layout()

<img src='./plots/daily pattern.png'>

In [118]:
# Mean temperature and relative humidity for each hour of the day.
# The two columns are selected before aggregating so that the non-numeric
# `date` column is never averaged (TypeError on pandas >= 2.0).
data.groupby(by='hour_of_day')[['T', 'RH']].mean().plot(
    subplots=True, figsize=(10, 8), title=['Temperature', 'Relative Humidity']
);

<img src='./plots/daily pattern - Temp and RH - mean.png'>

In [117]:
# Temperature and relative humidity: every individual day as a faint line,
# with the mean over all days for each hour drawn on top in blue.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,4))
ax = ax.ravel()
title = {'T':'Temperature','RH':'Relative Humidity'}
weather_cols = ['T', 'RH']
# Compute the hourly means once, on the two plotted columns only. Averaging
# the whole frame would include the non-numeric `date` column, which raises
# a TypeError on pandas >= 2.0.
hourly_mean = data.groupby('hour_of_day')[weather_cols].mean()
for i, c in enumerate(weather_cols):
    data.pivot_table(values=c, index='hour_of_day', columns='date').plot(color='lightblue', alpha=0.5, title=title[c], legend=False, ax=ax[i])
    hourly_mean[c].plot(color='b', ax=ax[i])

<img src='./plots/daily pattern - Temp and RH.png'>

## Weekly pattern

In [113]:
# One row per sensor.
# Left panel: hourly profile, one line per day of the week.
# Right panel: mean reading for each day of the week.
fig, ax = plt.subplots(nrows=len(cols), ncols=2, figsize=(20, 25))
ax = ax.ravel()
# Compute the per-weekday means once, on the sensor columns only. Averaging
# the whole frame would include the non-numeric `date` column, which raises
# a TypeError on pandas >= 2.0.
weekday_mean = data.groupby('day_of_week')[cols].mean()
for i, c in enumerate(cols):
    sns.lineplot(x='hour_of_day', y=c, hue='day_of_week', data=data, ax=ax[2*i])
    weekday_mean[c].plot(ax=ax[2*i+1])

<img src='./plots/Weekly pattern.png'>

In [128]:
# Weekly pattern for Temp and RH.
# Left panel: hourly profile, one line per day of the week.
# Right panel: the variable against the day of the week.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20,15))
ax = ax.ravel()
title = {'T':'Temperature','RH':'Relative Humidity'}
for i, c in enumerate(['T', 'RH']):
    sns.lineplot(x='hour_of_day', y=c, hue='day_of_week', data=data, ax=ax[2*i])
    # The title mapping was previously built but never applied to the panels.
    ax[2*i].set_title(f'{title[c]} by hour of day, per day of week')
    sns.lineplot(x='day_of_week', y=c, data=data, ax=ax[2*i+1])
    ax[2*i+1].set_title(f'{title[c]} by day of week')

<img src='./plots/Weekly pattern - Temp and RH.png'>

## Yearly pattern

In [132]:
# One row per sensor.
# Left panel: hourly profile, one line per month.
# Right panel: mean reading for each month.
fig, ax = plt.subplots(nrows=len(cols), ncols=2, figsize=(20, 25))
ax = ax.ravel()
# Compute the per-month means once, on the sensor columns only. Averaging
# the whole frame would include the non-numeric `date` column, which raises
# a TypeError on pandas >= 2.0.
monthly_mean = data.groupby('month')[cols].mean()
for i, c in enumerate(cols):
    sns.lineplot(x='hour_of_day', y=c, hue='month', data=data, ax=ax[2*i])
    monthly_mean[c].plot(ax=ax[2*i+1])

<img src='./plots/Yearly pattern.png'>

In [131]:
# Yearly pattern for Temp and RH.
# Left panel: hourly profile, one line per month.
# Right panel: the variable against the month.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20,15))
ax = ax.ravel()
title = {'T':'Temperature','RH':'Relative Humidity'}
for i, c in enumerate(['T', 'RH']):
    sns.lineplot(x='hour_of_day', y=c, hue='month', data=data, ax=ax[2*i])
    # The title mapping was previously built but never applied to the panels.
    ax[2*i].set_title(f'{title[c]} by hour of day, per month')
    sns.lineplot(x='month', y=c, data=data, ax=ax[2*i+1])
    ax[2*i+1].set_title(f'{title[c]} by month')

<img src='./plots/Yearly pattern Temp and RH.png'>