In [2]:
import glob
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from fbprophet import Prophet
from sklearn import neighbors, ensemble, tree, metrics
from statsmodels.graphics import tsaplots
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import *
import calendar

%matplotlib notebook

package_dir = os.getcwd()

print(package_dir)

/home/sebastian/Programming/Bachelorthesis


In [3]:
df = pd.DataFrame()

for file in glob.glob('res/*2013_timeseries.csv'):
    # read single file, index on StationEoI and DatetimeEnd
    read = pd.read_csv(file,
                 encoding="utf-16", parse_dates=[13, 14],
                 infer_datetime_format=True,
                 index_col=[14])
    # drop 'bulk' files because they have different averaging
    bulks = read.SamplingPoint.str.lower().str.contains('bulk')
    clean = read[~bulks].copy()
    
    # ignore unnecessary columns
    clean.drop(columns=['Countrycode', 'Namespace', 'AirQualityNetwork',
                 'AirQualityStation', 'SamplingPoint', 'Sample',
                 'SamplingProcess', 'AirPollutantCode',
                 'DatetimeBegin', 'Validity', 'Verification',
                 'AveragingTime'],
        inplace=True)
    
    
    df = pd.concat([df, clean])

# make pollutant a column for better memory usage
df = df.pivot_table(columns='AirPollutant',
                   index=[df.index, 'AirQualityStationEoICode', 'UnitOfMeasurement'],
                   values='Concentration').reset_index(level=[1,2])

# make names shorter    
df.index.names = ['Timestamp']
#df.columns.names = [None, 'Pollutant']

df = df.sort_index()
#df = df.groupby(level=[0]).first()



In [19]:
def create_artificial_features(series, frequency='H', steps=7):
    nondups = series[~series.index.duplicated()]
    lagged = create_lagged_features(nondups, frequency, steps)
    
    statistics = lagged
    statistics['sum'] = lagged.sum(axis=1)
    statistics['mean'] = lagged.mean(axis=1)
    statistics['median'] = lagged.median(axis=1)
    
    weekdays = pd.get_dummies(lagged.index.weekday_name)
    weekdays = weekdays.applymap(lambda x: bool(x))
    weekdays.index = lagged.index
    
    months = pd.get_dummies(lagged.index.month.map(lambda x: calendar.month_abbr[x]))
    months = months.applymap(lambda x: bool(x))
    months.index = lagged.index
    
    out = statistics.join(weekdays).join(months)
    
    return out

def create_lagged_features(series, frequency='H', steps=7):
    lagged = pd.DataFrame()


    for i in range(0, steps):
        lagged['lag {}{}'.format(i, frequency)] = series.shift(i, freq=frequency)

    print(lagged.head())
    print(series.head())

    return lagged.interpolate()

In [20]:
test = create_artificial_features(df[df.AirQualityStationEoICode == 'DESN025'].SO2)

                     lag 0H  lag 1H  lag 2H  lag 3H  lag 4H  lag 5H  lag 6H
Timestamp                                                                  
2013-01-01 00:00:00  27.458     NaN     NaN     NaN     NaN     NaN     NaN
2013-01-01 01:00:00  10.514  27.458     NaN     NaN     NaN     NaN     NaN
2013-01-01 02:00:00   5.561  10.514  27.458     NaN     NaN     NaN     NaN
2013-01-01 03:00:00   4.948   5.561  10.514  27.458     NaN     NaN     NaN
2013-01-01 04:00:00   4.043   4.948   5.561  10.514  27.458     NaN     NaN
Timestamp
2013-01-01 00:00:00    27.458
2013-01-01 01:00:00    10.514
2013-01-01 02:00:00     5.561
2013-01-01 03:00:00     4.948
2013-01-01 04:00:00     4.043
Name: SO2, dtype: float64


In [21]:
test

Unnamed: 0_level_0,lag 0H,lag 1H,lag 2H,lag 3H,lag 4H,lag 5H,lag 6H,sum,mean,median,...,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01 00:00:00,27.4580,,,,,,,27.4580,27.458000,27.458000,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 01:00:00,10.5140,27.4580,,,,,,37.9720,25.314667,26.386333,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 02:00:00,5.5610,10.5140,27.4580,,,,,43.5330,21.766500,21.766500,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 03:00:00,4.9480,5.5610,10.5140,27.4580,,,,48.4810,19.392400,14.953200,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 04:00:00,4.0430,4.9480,5.5610,10.5140,27.4580,,,52.5240,17.508000,10.514000,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 05:00:00,4.0750,4.0430,4.9480,5.5610,10.5140,27.4580,,56.5990,16.171143,8.037500,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 06:00:00,3.7150,4.0750,4.0430,4.9480,5.5610,10.5140,27.4580,60.3140,15.078500,5.561000,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 07:00:00,4.1070,3.7150,4.0750,4.0430,4.9480,5.5610,10.5140,36.9630,9.240750,4.948000,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 08:00:00,3.8400,4.1070,3.7150,4.0750,4.0430,4.9480,5.5610,30.2890,7.572250,4.107000,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 09:00:00,4.0030,3.8400,4.1070,3.7150,4.0750,4.0430,4.9480,28.7310,7.182750,4.075000,...,False,False,True,False,False,False,False,False,False,False


In [5]:
df[df.AirQualityStationEoICode == 'DESN025'].SO2

Timestamp
2013-01-01 00:00:00    27.458
2013-01-01 01:00:00    10.514
2013-01-01 02:00:00     5.561
2013-01-01 03:00:00     4.948
2013-01-01 04:00:00     4.043
2013-01-01 05:00:00     4.075
2013-01-01 06:00:00     3.715
2013-01-01 07:00:00     4.107
2013-01-01 08:00:00     3.840
2013-01-01 09:00:00     4.003
2013-01-01 10:00:00     3.859
2013-01-01 11:00:00     3.792
2013-01-01 12:00:00     4.139
2013-01-01 13:00:00     3.926
2013-01-01 14:00:00     4.165
2013-01-01 15:00:00     3.803
2013-01-01 16:00:00     3.824
2013-01-01 17:00:00     3.713
2013-01-01 18:00:00     3.697
2013-01-01 19:00:00     3.467
2013-01-01 20:00:00     3.401
2013-01-01 21:00:00     3.547
2013-01-01 22:00:00     3.744
2013-01-01 23:00:00       NaN
2013-01-01 23:00:00     3.593
2013-01-02 00:00:00     4.013
2013-01-02 01:00:00     4.964
2013-01-02 02:00:00     5.103
2013-01-02 03:00:00     5.452
2013-01-02 04:00:00     3.372
                        ...  
2013-12-30 19:00:00     2.320
2013-12-30 20:00:00     2.027
