In [105]:
import glob
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from fbprophet import Prophet
from sklearn import neighbors, ensemble, tree, metrics
from statsmodels.graphics import tsaplots
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import *
import calendar

%matplotlib notebook

package_dir = os.getcwd()

print(package_dir)

/home/sebastian/Programming/Bachelorthesis


In [2]:
df = pd.DataFrame()

for file in glob.glob('res/*2013_timeseries.csv'):
    # read single file, index on StationEoI and DatetimeEnd
    read = pd.read_csv(file,
                 encoding="utf-16", parse_dates=[13, 14],
                 infer_datetime_format=True,
                 index_col=[14])
    # drop 'bulk' files because they have different averaging
    bulks = read.SamplingPoint.str.lower().str.contains('bulk')
    clean = read[~bulks].copy()
    
    # ignore unnecessary columns
    clean.drop(columns=['Countrycode', 'Namespace', 'AirQualityNetwork',
                 'AirQualityStation', 'SamplingPoint', 'Sample',
                 'SamplingProcess', 'AirPollutantCode',
                 'DatetimeBegin', 'Validity', 'Verification',
                 'AveragingTime'],
        inplace=True)
    
    
    df = pd.concat([df, clean])

# make pollutant a column for better memory usage
df = df.pivot_table(columns='AirPollutant',
                   index=[df.index, 'AirQualityStationEoICode', 'UnitOfMeasurement'],
                   values='Concentration').reset_index(level=[1,2])

# make names shorter    
df.index.names = ['Timestamp']
#df.columns.names = [None, 'Pollutant']

df = df.sort_index()
#df = df.groupby(level=[0]).first()



In [121]:
def create_artificial_features(series, frequency='H', steps=7):
    nondups = series[~series.index.duplicated()]
    lagged = create_lagged_features(nondups, frequency, steps)
    
    statistics = lagged
    statistics['sum'] = lagged.sum(axis=1)
    statistics['mean'] = lagged.mean(axis=1)
    statistics['median'] = lagged.median(axis=1)
    
    weekdays = pd.get_dummies(lagged.index.weekday_name)
    weekdays = weekdays.applymap(lambda x: bool(x))
    weekdays.index = lagged.index
    
    months = pd.get_dummies(lagged.index.month.map(lambda x: calendar.month_abbr[x]))
    months = months.applymap(lambda x: bool(x))
    months.index = lagged.index
    
    out = statistics.join(weekdays).join(months)
    
    return out

def create_lagged_features(series, frequency='H', steps=7):
    lagged = pd.DataFrame()


    for i in range(0, steps):
        lagged['lag {}{}'.format(i, frequency)] = series.shift(i, freq=frequency)

    lagged.index = series.index
    lagged = lagged[steps:]

    return lagged.interpolate()

In [122]:
test = create_artificial_features(df[df.AirQualityStationEoICode == 'DESN025'].SO2)

In [123]:
test

Unnamed: 0_level_0,lag 0H,lag 1H,lag 2H,lag 3H,lag 4H,lag 5H,lag 6H,sum,mean,median,...,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01 07:00:00,4.1070,3.7150,4.0750,4.0430,4.9480,5.5610,10.5140,36.9630,9.240750,4.9480,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 08:00:00,3.8400,4.1070,3.7150,4.0750,4.0430,4.9480,5.5610,30.2890,7.572250,4.1070,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 09:00:00,4.0030,3.8400,4.1070,3.7150,4.0750,4.0430,4.9480,28.7310,7.182750,4.0750,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 10:00:00,3.8590,4.0030,3.8400,4.1070,3.7150,4.0750,4.0430,27.6420,6.910500,4.0430,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 11:00:00,3.7920,3.8590,4.0030,3.8400,4.1070,3.7150,4.0750,27.3910,6.847750,4.0030,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 12:00:00,4.1390,3.7920,3.8590,4.0030,3.8400,4.1070,3.7150,27.4550,6.863750,4.0030,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 13:00:00,3.9260,4.1390,3.7920,3.8590,4.0030,3.8400,4.1070,27.6660,6.916500,4.0030,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 14:00:00,4.1650,3.9260,4.1390,3.7920,3.8590,4.0030,3.8400,27.7240,6.931000,4.0030,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 15:00:00,3.8030,4.1650,3.9260,4.1390,3.7920,3.8590,4.0030,27.6870,6.921750,4.0030,...,False,False,True,False,False,False,False,False,False,False
2013-01-01 16:00:00,3.8240,3.8030,4.1650,3.9260,4.1390,3.7920,3.8590,27.5080,6.877000,3.9260,...,False,False,True,False,False,False,False,False,False,False
