# ETL of air pollution time series
## The deliverables
Data set containing all available recordings of hourly averaged pollutant concentrations measured in Hamburg in years 2013-2019

In [1]:
import urllib.request
import xml.etree.ElementTree as ET
from lxml import etree
import pandas as pd
import numpy as np

import re, collections
from io import StringIO
import os, fnmatch

import matplotlib.pyplot as plt

import geopandas as gpd
import mplleaflet

%matplotlib inline

In [2]:
## Download and decompress the dataset (2019) itself:
#!mkdir Correlaid.rawData
#!mkdir Correlaid.rawData/AQD_DE_E1a_2019
#!ls -l Correlaid.rawData/
#urllib.request.urlretrieve("https://datahub.uba.de/server/rest/directories/arcgisforinspire/INSPIRE/aqd_MapServer/Daten/AQD_DE_E1a_2019.zip", "Correlaid.rawData/AQD_DE_E1a_2019.zip")
#!mv Correlaid.rawData/AQD_DE_E1a_2019.zip Correlaid.rawData/AQD_DE_E1a_2019/
#!unzip Correlaid.rawData/AQD_DE_E1a_2019/AQD_DE_E1a_2019.zip -d Correlaid.rawData/
#!rm Correlaid.rawData/AQD_DE_E1a_2019/AQD_DE_E1a_2019.zip
#!unzip Correlaid.rawData/DISKO.zip -d Correlaid.rawData/AQD_DE_E1a_2019/
#!unzip Correlaid.rawData/KONTI.zip -d Correlaid.rawData/AQD_DE_E1a_2019/
#!rm Correlaid.rawData/DISKO.zip Correlaid.rawData/KONTI.zip

#Download the rdf
#urllib.request.urlretrieve("https://www.govdata.de/ckan/dataset/luftqualitatsdaten-datenstrom-e1a-validierte-einzelwerte-2019-datensatz.rdf", "Correlaid.rawData/AQD_DE_E1a_2019/luftqualitatsdaten-datenstrom-e1a-validierte-einzelwerte-2019-datensatz.rdf")

#Download Sensor positions
#urllib.request.urlretrieve("https://datahub.uba.de/server/rest/directories/arcgisforinspire/INSPIRE/aqd_MapServer/Daten/AQD_DE_D_2019.zip", "Correlaid.rawData/AQD_DE_D_2019.zip")
#!unzip Correlaid.rawData/AQD_DE_D_2019.zip -d Correlaid.rawData/
#!rm Correlaid.rawData/AQD_DE_D_2019.zip

# Download Town-county dataset:
#urllib.request.urlretrieve("https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/Archiv/GV100ADQ/GV100AD3107.zip?__blob=publicationFile",
#                           "Correlaid.rawData/GV100AD3107.zip")
#!mkdir Correlaid.rawData/GV100AD3107
#!unzip Correlaid.rawData/GV100AD3107.zip -d Correlaid.rawData/GV100AD3107/
#!rm Correlaid.rawData/GV100AD3107.zip

#!mkdir Correlaid.rawData/Geo
#urllib.request.urlretrieve("https://biogeo.ucdavis.edu/data/diva/adm/DEU_adm.zip", "Correlaid.rawData/Geo/DEU_adm.zip" 
#!unzip Correlaid.rawData/Geo/DEU_adm.zip -d Correlaid.rawData/Geo/

#!ls -la Correlaid.rawData/
#!ls -la Correlaid.rawData/AQD_DE_E1a_2019/
#!ls -la Correlaid.rawData/GV100AD3107/

#!pwd

In [3]:
#Download and decompress the dataset (2018) itself:
#!mkdir Correlaid.rawData/AQD_DE_E1a_2018
#!ls -l Correlaid.rawData/
#urllib.request.urlretrieve("https://datahub.uba.de/server/rest/directories/arcgisforinspire/INSPIRE/aqd_MapServer/Daten/AQD_DE_E1a_2018.zip", "Correlaid.rawData/AQD_DE_E1a_2018.zip")
#!mv Correlaid.rawData/AQD_DE_E1a_2018.zip Correlaid.rawData/AQD_DE_E1a_2018/
#!unzip Correlaid.rawData/AQD_DE_E1a_2018/AQD_DE_E1a_2018.zip -d Correlaid.rawData/
#!rm Correlaid.rawData/AQD_DE_E1a_2018/AQD_DE_E1a_2018.zip
#!mv Correlaid.rawData/E1a/* Correlaid.rawData/AQD_DE_E1a_2018/
#!rm -rf Correlaid.rawData/E1a

In [4]:
#Download and decompress the dataset (2017) itself:
#!mkdir Correlaid.rawData/AQD_DE_E1a_2017
#urllib.request.urlretrieve("https://datahub.uba.de/server/rest/directories/arcgisforinspire/INSPIRE/aqd_MapServer/Daten/AQD_DE_E1a_2017.zip", "Correlaid.rawData/AQD_DE_E1a_2017.zip")
#!mv Correlaid.rawData/AQD_DE_E1a_2017.zip Correlaid.rawData/AQD_DE_E1a_2017/
#!unzip Correlaid.rawData/AQD_DE_E1a_2017/AQD_DE_E1a_2017.zip -d Correlaid.rawData/
#!rm Correlaid.rawData/AQD_DE_E1a_2017/AQD_DE_E1a_2017.zip
#!unzip Correlaid.rawData/AQD_DE_E1a_2017/DISKO.zip -d Correlaid.rawData/AQD_DE_E1a_2017/
#!unzip Correlaid.rawData/AQD_DE_E1a_2017/KONTI.zip -d Correlaid.rawData/AQD_DE_E1a_2017/
#!rm Correlaid.rawData/AQD_DE_E1a_2017/DISKO.zip Correlaid.rawData/AQD_DE_E1a_2017/KONTI.zip


In [5]:
#Download and decompress the dataset (2016) itself:
#!mkdir Correlaid.rawData/AQD_DE_E1a_2016
#urllib.request.urlretrieve("https://datahub.uba.de/server/rest/directories/arcgisforinspire/INSPIRE/aqd_MapServer/Daten/AQD_DE_E1a_2016.zip", "Correlaid.rawData/AQD_DE_E1a_2016.zip")
#!mv Correlaid.rawData/AQD_DE_E1a_2016.zip Correlaid.rawData/AQD_DE_E1a_2016/
#!unzip Correlaid.rawData/AQD_DE_E1a_2016/AQD_DE_E1a_2016.zip -d Correlaid.rawData/
#!rm Correlaid.rawData/AQD_DE_E1a_2016/AQD_DE_E1a_2016.zip

#  !!!!!!!!!! No data for Hamburg available here !!!!!!!!!!

In [6]:
#Download and decompress the dataset (2015) itself:
#!mkdir Correlaid.rawData/AQD_DE_E1a_2015
#urllib.request.urlretrieve("https://datahub.uba.de/server/rest/directories/arcgisforinspire/INSPIRE/aqd_MapServer/Daten/AQD_DE_E1a_2015.zip", "Correlaid.rawData/AQD_DE_E1a_2015.zip")
#!mv Correlaid.rawData/AQD_DE_E1a_2015.zip Correlaid.rawData/AQD_DE_E1a_2015/
#!unzip Correlaid.rawData/AQD_DE_E1a_2015/AQD_DE_E1a_2015.zip -d Correlaid.rawData/
#!rm Correlaid.rawData/AQD_DE_E1a_2015/AQD_DE_E1a_2015.zip


In [7]:
#Download and decompress the dataset (2014) itself:
#!mkdir Correlaid.rawData/AQD_DE_E1a_2014
#urllib.request.urlretrieve("https://datahub.uba.de/server/rest/directories/arcgisforinspire/INSPIRE/aqd_MapServer/Daten/AQD_DE_E1a_2014.zip", "Correlaid.rawData/AQD_DE_E1a_2014.zip")
#!mv Correlaid.rawData/AQD_DE_E1a_2014.zip Correlaid.rawData/AQD_DE_E1a_2014/
#!unzip Correlaid.rawData/AQD_DE_E1a_2014/AQD_DE_E1a_2014.zip -d Correlaid.rawData/
#!rm Correlaid.rawData/AQD_DE_E1a_2014/AQD_DE_E1a_2014.zip


In [8]:
#Download and decompress the dataset (2013) itself:
#!mkdir Correlaid.rawData/AQD_DE_E1a_2013
#urllib.request.urlretrieve("https://datahub.uba.de/server/rest/directories/arcgisforinspire/INSPIRE/aqd_MapServer/Daten/AQD_DE_E1a_2013.zip", "Correlaid.rawData/AQD_DE_E1a_2013.zip")
#!mv Correlaid.rawData/AQD_DE_E1a_2013.zip Correlaid.rawData/AQD_DE_E1a_2013/
#!unzip Correlaid.rawData/AQD_DE_E1a_2013/AQD_DE_E1a_2013.zip -d Correlaid.rawData/
#!rm Correlaid.rawData/AQD_DE_E1a_2013/AQD_DE_E1a_2013.zip


In [9]:
def etl_concentrations_timeseries_from_file(input_file):
    # pick all tags from the XML file
    Etree = ET.parse(input_file)
    Eroot = Etree.getroot()
    Eroot.tag
    Eroot.attrib
    AllTags = [elem.tag for elem in Eroot.iter()]
  
    varFull = [s for s in AllTags if 'value' in s][0]
    ColNamesExp = [re.sub(r'[^a-zA-Z0-9:]*\'{http(.*)$', r'', re.sub(r'^.*AQD\/SPO.DE_', r'', str(varr.attrib))) for varr in Eroot.iter(varFull) if 'AQD' in str(varr.attrib)] 

    
    varFull = [s for s in AllTags if 'values' in s][0]

    dff=[]
    for varr in Eroot.iter(varFull):
        dff.append(pd.read_csv(StringIO((varr.text).replace("@@","\n")), sep=",", header=None))
    
    out_df=pd.concat([dff[s][4] for s in range(0,len(dff))], axis=1)
    out_df.columns=ColNamesExp
    out_df.insert(loc=0, column="observation_period", value=dff[0][0])
    return(out_df)    

def etl_concentrations_timeseries_from_dir_and_mask(input_dir, file_mask):
    files_hour = []
    for file in os.listdir(input_dir):
        if fnmatch.fnmatch(file, file_mask):
            files_hour.append(file)

    # pick all tags from the XML file
    Etree = ET.parse(input_dir + files_hour[0])
    Eroot = Etree.getroot()
    Eroot.tag
    Eroot.attrib
    AllTags = [elem.tag for elem in Eroot.iter()]

#    ColNamesExp=SelectAllXMLsensorID(AllTags)
    varFull = [s for s in AllTags if 'values' in s][0]

    dff=[]
    for varr in Eroot.iter(varFull):
        dff.append(pd.read_csv(StringIO((varr.text).replace("@@","\n")), sep=",", header=None))

    out_df = dff[0][[0]]
    out_df.columns=['observation_period']

# get all tags in xml file; Note, that the actual data is kept as a TEXT of *values* tags 
    for file in files_hour:
        Etree = ET.parse(input_dir + file)
        Eroot = Etree.getroot()
        Eroot.tag
        Eroot.attrib
        AllTags = [elem.tag for elem in Eroot.iter()]
           
        varFull = [s for s in AllTags if 'value' in s][0]
        ColNamesExp = [re.sub(r'[^a-zA-Z0-9:]*\'{http(.*)$', r'', re.sub(r'^.*AQD\/SPO.DE_', r'', str(varr.attrib))) for varr in Eroot.iter(varFull) if 'AQD' in str(varr.attrib)] 

        
        # Compare column names with file names, they should encode same country, state and pollutant
        for ColName in ColNamesExp:
            if ((ColName[0:2]!=file[0:2]) or (ColName[2:4]!=file[3:5]) or (ColName[8:11]!=file[11:14])):
                print("Inconsistency in file and column names: ", file, ColName)
                exit()
    
        varFull = [s for s in AllTags if 'values' in s][0]
    
        dff=[] # Temporary list for DataFrames
        # reading actual pollutant data fiom the text field:    
        for varr in Eroot.iter(varFull):
            dff.append(pd.read_csv(StringIO((varr.text).replace("@@","\n")), sep=",", header=None))

        # checking, that measurment timestamps are identical in the files read    
        bad_s = []
        for s in range(0,len(dff)):
            if not (out_df['observation_period']).equals(dff[s][0]):
                print("Inconsistency of observation times in the following files: ", file, files_hour[0])
                print(out_df['observation_period'])
                print(dff[s][0])
                print(s)
                print(ColNamesExp[s])
                bad_s.append(s) 
                
#                exit()

        for s in bad_s:
            del ColNamesExp[s]
            del dff[s]

        # select column 4 - pollutant concentration:
        dff=pd.concat([dff[s][4] for s in range(0,len(dff))], axis=1)
        dff.columns=ColNamesExp
   
        out_df=pd.concat([out_df, dff], axis=1)    
#    out_df=pd.concat([dff[s][4] for s in range(0,len(dff))], axis=1)
#    out_df.columns=ColNamesExp
#    out_df.insert(loc=0, column="observation_period", value=dff[0][0])
    return(out_df)    

In [10]:
wide_df19 = etl_concentrations_timeseries_from_file("Correlaid.rawData/AQD_DE_E1a_2019/DE_HH_2019_hour.xml")

Now we have wide data frame, containing timeseries of all pollutant concentrations for all sensors. The pollutant type and the sensor ID are encoded in column names. The minimal value of pollutant concentrations -999.0 is equivalent to NA and will be imputted, as well as all negative values (the concentration can not be negative). The limit for imputation will be set to 876, i.e. NA sequences exceeding 10% of the year will not be imputted. Since the number of heavily corrupted columns is below 2%, they will be dropped in favor to the information quality:

In [11]:
wide_df19.head(5)

Unnamed: 0,observation_period,DEHH068_CHB_dataGroup1,DEHH070_CHB_dataGroup1,DEHH008_NO2_dataGroup1,DEHH015_NO2_dataGroup1,DEHH016_NO2_dataGroup1,DEHH026_NO2_dataGroup1,DEHH033_NO2_dataGroup1,DEHH047_NO2_dataGroup1,DEHH050_NO2_dataGroup1,...,DEHH016_SO2_dataGroup1,DEHH059_SO2_dataGroup1,DEHH079_SO2_dataGroup1,DEHH081_SO2_dataGroup1,DEHH008_PM2_dataGroup1,DEHH015_PM2_dataGroup1,DEHH033_PM2_dataGroup1,DEHH059_PM2_dataGroup1,DEHH064_PM2_dataGroup1,DEHH068_PM2_dataGroup1
0,2019-01-01T00:00:00+01:00,2.182,0.977,23.896,16.787,13.292,30.217,14.883,13.441,10.037,...,2.5,2.5,2.5,2.5,98.733,116.412,51.636,88.387,216.47,602.38
1,2019-01-01T01:00:00+01:00,0.693,0.773,13.698,11.791,16.222,19.486,6.349,6.496,4.0,...,2.5,2.5,2.5,9.531,33.534,96.405,75.457,65.468,161.832,80.708
2,2019-01-01T02:00:00+01:00,0.454,0.675,7.991,6.998,15.669,12.586,6.243,4.708,2.0,...,2.5,2.5,2.5,11.27,24.592,25.195,15.651,13.072,18.958,36.882
3,2019-01-01T03:00:00+01:00,0.2,-999.0,7.322,5.273,14.999,12.025,4.714,4.13,2.0,...,2.5,2.5,2.5,2.5,22.92,16.258,11.641,12.416,13.909,36.853
4,2019-01-01T04:00:00+01:00,-999.0,-999.0,6.211,5.665,13.821,9.234,5.18,2.0,2.0,...,5.926,2.5,2.5,2.5,30.757,19.862,15.598,17.161,17.068,47.537


In [12]:
def clean_wide_df(df):
    out_df = df.copy()
    df_observation_period = out_df["observation_period"]
    out_df.drop(["observation_period"], axis=1, inplace=True)
    out_df[out_df.loc[:, out_df.columns != 'observation_period'] < 0.0] = np.NaN # concentration cannot be negative
    out_df.interpolate(method='linear', inplace=True, axis=0, limit=876, limit_direction='both')
    out_df.insert(loc=0, column="observation_period", value=pd.to_datetime(df_observation_period))
    out_df.dropna(axis=1, inplace=True)
    out_df = out_df.set_index("observation_period")
    return(out_df)

In [13]:
clean_df19 = clean_wide_df(wide_df19)

In [14]:
clean_df19

Unnamed: 0_level_0,DEHH008_NO2_dataGroup1,DEHH015_NO2_dataGroup1,DEHH016_NO2_dataGroup1,DEHH026_NO2_dataGroup1,DEHH033_NO2_dataGroup1,DEHH047_NO2_dataGroup1,DEHH050_NO2_dataGroup1,DEHH059_NO2_dataGroup1,DEHH064_NO2_dataGroup1,DEHH068_NO2_dataGroup1,...,DEHH016_SO2_dataGroup1,DEHH059_SO2_dataGroup1,DEHH079_SO2_dataGroup1,DEHH081_SO2_dataGroup1,DEHH008_PM2_dataGroup1,DEHH015_PM2_dataGroup1,DEHH033_PM2_dataGroup1,DEHH059_PM2_dataGroup1,DEHH064_PM2_dataGroup1,DEHH068_PM2_dataGroup1
observation_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 00:00:00+01:00,23.896,16.787,13.292,30.217,14.883,13.441,10.037,12.743,26.956,34.375,...,2.500,2.500,2.5,2.500,98.733,116.412,51.636,88.387,216.470,602.380
2019-01-01 01:00:00+01:00,13.698,11.791,16.222,19.486,6.349,6.496,4.000,6.534,21.644,20.793,...,2.500,2.500,2.5,9.531,33.534,96.405,75.457,65.468,161.832,80.708
2019-01-01 02:00:00+01:00,7.991,6.998,15.669,12.586,6.243,4.708,2.000,7.821,19.521,16.449,...,2.500,2.500,2.5,11.270,24.592,25.195,15.651,13.072,18.958,36.882
2019-01-01 03:00:00+01:00,7.322,5.273,14.999,12.025,4.714,4.130,2.000,4.483,11.384,13.806,...,2.500,2.500,2.5,2.500,22.920,16.258,11.641,12.416,13.909,36.853
2019-01-01 04:00:00+01:00,6.211,5.665,13.821,9.234,5.180,2.000,2.000,4.407,9.204,11.069,...,5.926,2.500,2.5,2.500,30.757,19.862,15.598,17.161,17.068,47.537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 19:00:00+01:00,18.548,54.225,22.169,47.611,31.095,27.900,8.188,35.967,37.350,57.787,...,2.500,6.883,2.5,2.500,21.517,13.039,22.387,38.361,23.008,185.311
2019-12-31 20:00:00+01:00,29.200,53.365,54.761,43.915,27.776,37.072,12.948,31.421,58.767,54.685,...,2.500,8.485,2.5,2.500,49.821,16.909,32.743,29.221,35.980,98.057
2019-12-31 21:00:00+01:00,38.042,52.547,66.944,49.817,24.601,32.879,14.205,29.640,61.504,44.541,...,2.500,9.448,2.5,2.500,62.207,28.918,36.081,63.075,63.972,122.113
2019-12-31 22:00:00+01:00,36.366,56.707,52.487,57.050,34.435,33.688,16.485,32.305,53.528,40.023,...,2.500,34.947,2.5,2.500,93.029,31.077,48.492,42.006,84.337,124.006


In [15]:
wide_df18 = etl_concentrations_timeseries_from_dir_and_mask("Correlaid.rawData/AQD_DE_E1a_2018/", "DE_HH*hour*")

Inconsistency of observation times in the following files:  DE_HH_2018_PM2_hour.xml DE_HH_2018_O3_hour.xml
0       2018-01-01T00:00:00+01:00
1       2018-01-01T01:00:00+01:00
2       2018-01-01T02:00:00+01:00
3       2018-01-01T03:00:00+01:00
4       2018-01-01T04:00:00+01:00
                  ...            
8755    2018-12-31T19:00:00+01:00
8756    2018-12-31T20:00:00+01:00
8757    2018-12-31T21:00:00+01:00
8758    2018-12-31T22:00:00+01:00
8759    2018-12-31T23:00:00+01:00
Name: observation_period, Length: 8760, dtype: object
0       2018-04-01T00:00:00+01:00
1       2018-04-01T01:00:00+01:00
2       2018-04-01T02:00:00+01:00
3       2018-04-01T03:00:00+01:00
4       2018-04-01T04:00:00+01:00
                  ...            
5851    2018-11-30T19:00:00+01:00
5852    2018-11-30T20:00:00+01:00
5853    2018-11-30T21:00:00+01:00
5854    2018-11-30T22:00:00+01:00
5855    2018-11-30T23:00:00+01:00
Name: 0, Length: 5856, dtype: object
2
DEHH033_PM2_dataGroup1


In [16]:
clean_df18 = clean_wide_df(wide_df18)

In [17]:
clean_df19.head(5)

Unnamed: 0_level_0,DEHH008_NO2_dataGroup1,DEHH015_NO2_dataGroup1,DEHH016_NO2_dataGroup1,DEHH026_NO2_dataGroup1,DEHH033_NO2_dataGroup1,DEHH047_NO2_dataGroup1,DEHH050_NO2_dataGroup1,DEHH059_NO2_dataGroup1,DEHH064_NO2_dataGroup1,DEHH068_NO2_dataGroup1,...,DEHH016_SO2_dataGroup1,DEHH059_SO2_dataGroup1,DEHH079_SO2_dataGroup1,DEHH081_SO2_dataGroup1,DEHH008_PM2_dataGroup1,DEHH015_PM2_dataGroup1,DEHH033_PM2_dataGroup1,DEHH059_PM2_dataGroup1,DEHH064_PM2_dataGroup1,DEHH068_PM2_dataGroup1
observation_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 00:00:00+01:00,23.896,16.787,13.292,30.217,14.883,13.441,10.037,12.743,26.956,34.375,...,2.5,2.5,2.5,2.5,98.733,116.412,51.636,88.387,216.47,602.38
2019-01-01 01:00:00+01:00,13.698,11.791,16.222,19.486,6.349,6.496,4.0,6.534,21.644,20.793,...,2.5,2.5,2.5,9.531,33.534,96.405,75.457,65.468,161.832,80.708
2019-01-01 02:00:00+01:00,7.991,6.998,15.669,12.586,6.243,4.708,2.0,7.821,19.521,16.449,...,2.5,2.5,2.5,11.27,24.592,25.195,15.651,13.072,18.958,36.882
2019-01-01 03:00:00+01:00,7.322,5.273,14.999,12.025,4.714,4.13,2.0,4.483,11.384,13.806,...,2.5,2.5,2.5,2.5,22.92,16.258,11.641,12.416,13.909,36.853
2019-01-01 04:00:00+01:00,6.211,5.665,13.821,9.234,5.18,2.0,2.0,4.407,9.204,11.069,...,5.926,2.5,2.5,2.5,30.757,19.862,15.598,17.161,17.068,47.537


In [19]:
clean_df18

Unnamed: 0_level_0,DEHH008_O3_dataGroup1,DEHH033_O3_dataGroup1,DEHH047_O3_dataGroup1,DEHH050_O3_dataGroup1,DEHH008_NO2_dataGroup1,DEHH015_NO2_dataGroup1,DEHH016_NO2_dataGroup1,DEHH026_NO2_dataGroup1,DEHH033_NO2_dataGroup1,DEHH047_NO2_dataGroup1,...,DEHH033_CO_dataGroup1,DEHH068_CO_dataGroup1,DEHH070_CO_dataGroup1,DEHH008_PM2_dataGroup1,DEHH015_PM2_dataGroup1,DEHH059_PM2_dataGroup1,DEHH064_PM2_dataGroup1,DEHH068_PM2_dataGroup1,DEHH068_CHB_dataGroup1,DEHH070_CHB_dataGroup1
observation_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01 00:00:00+01:00,5.733,48.233,10.995,50.542,80.490,34.222,11.322,77.654,27.589,68.893,...,0.21172,0.43220,0.37622,549.640,162.097,99.554,488.917,743.743,2.415,1.995
2018-01-01 01:00:00+01:00,53.963,57.160,37.245,75.750,28.161,22.282,11.840,31.900,19.627,39.088,...,0.20650,0.38507,0.38644,84.099,247.070,148.521,369.854,127.531,0.953,1.363
2018-01-01 02:00:00+01:00,73.001,73.601,70.628,79.439,10.369,7.401,5.179,18.802,5.665,10.061,...,0.10000,0.23405,0.25444,22.791,56.550,26.843,21.878,21.889,0.560,0.731
2018-01-01 03:00:00+01:00,67.785,72.184,71.419,76.205,13.777,6.856,4.511,19.693,5.671,9.504,...,0.10000,0.25125,0.24649,19.285,10.776,15.367,11.376,14.496,0.475,0.667
2018-01-01 04:00:00+01:00,61.937,69.518,68.152,74.942,18.196,5.681,2.000,23.576,5.711,11.132,...,0.10000,0.20816,0.28465,19.095,8.729,9.942,12.469,12.429,0.200,0.543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31 19:00:00+01:00,43.933,54.628,52.422,55.125,15.400,13.197,8.648,25.456,6.922,5.469,...,0.10000,0.30992,0.32258,10.605,11.059,15.343,19.100,12.254,0.881,0.809
2018-12-31 20:00:00+01:00,47.734,50.970,50.816,56.675,11.987,12.335,11.623,21.631,9.540,7.180,...,0.10000,0.27090,0.24942,16.148,19.236,19.265,13.946,13.029,0.574,0.768
2018-12-31 21:00:00+01:00,47.771,54.388,51.727,52.691,10.930,11.573,16.141,16.907,7.086,5.618,...,0.10000,0.21563,0.27449,21.532,29.982,51.946,14.733,24.327,0.536,0.733
2018-12-31 22:00:00+01:00,51.835,59.942,57.694,58.781,10.401,11.521,13.631,16.352,5.245,4.195,...,0.10000,0.10000,0.25466,16.163,16.609,34.029,16.310,22.890,0.452,0.666


In [20]:
wide_df17 = etl_concentrations_timeseries_from_dir_and_mask("Correlaid.rawData/AQD_DE_E1a_2017/", "DE_HH*hour*")

In [21]:
clean_df17 = clean_wide_df(wide_df17)

In [22]:
clean_df17

Unnamed: 0_level_0,DEHH008_O3_dataGroup1,DEHH033_O3_dataGroup1,DEHH047_O3_dataGroup1,DEHH050_O3_dataGroup1,DEHH008_SO2_dataGroup1,DEHH015_SO2_dataGroup1,DEHH016_SO2_dataGroup1,DEHH059_SO2_dataGroup1,DEHH079_SO2_dataGroup1,DEHH081_SO2_dataGroup1,...,DEHH015_PM1_dataGroup1,DEHH016_PM1_dataGroup1,DEHH026_PM1_dataGroup1,DEHH033_PM1_dataGroup1,DEHH059_PM1_dataGroup1,DEHH068_PM1_dataGroup1,DEHH070_PM1_dataGroup1,DEHH072_PM1_dataGroup1,DEHH079_PM1_dataGroup1,DEHH081_PM1_dataGroup1
observation_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01 00:00:00+01:00,2.981,2.706,2.145,3.396,8.274,2.500,14.998,9.477,5.913,2.500,...,249.448,44.554,266.052,75.456,192.427,469.227,277.583,52.103,71.111,46.581
2017-01-01 01:00:00+01:00,5.201,4.344,3.539,8.354,2.500,2.500,12.102,2.500,2.500,2.500,...,275.973,80.144,247.447,152.603,196.997,141.295,225.350,47.672,170.415,71.369
2017-01-01 02:00:00+01:00,8.517,7.400,6.842,10.545,2.500,2.500,8.622,2.500,2.500,2.500,...,59.263,51.266,51.252,58.362,51.243,98.176,48.333,51.111,97.089,49.493
2017-01-01 03:00:00+01:00,9.286,8.151,8.131,12.567,2.500,2.500,10.477,2.500,2.500,2.500,...,54.294,49.809,58.486,54.551,51.564,105.842,55.979,58.573,57.202,56.631
2017-01-01 04:00:00+01:00,11.734,11.129,10.738,16.297,2.500,2.500,33.806,2.500,2.500,2.500,...,48.903,48.187,59.614,73.600,47.583,82.288,56.283,45.581,55.794,53.274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-31 19:00:00+01:00,53.323,55.409,54.338,70.754,2.500,5.734,7.575,2.500,2.500,6.704,...,6.638,3.036,24.307,7.809,12.539,51.467,21.077,6.735,9.920,6.485
2017-12-31 20:00:00+01:00,56.664,51.968,48.386,69.373,2.500,2.500,9.512,2.500,2.500,2.500,...,11.923,4.220,18.198,11.233,22.416,45.212,29.480,6.411,11.836,6.662
2017-12-31 21:00:00+01:00,35.169,50.740,36.031,69.102,2.500,2.500,9.660,8.649,2.500,2.500,...,18.904,5.162,33.793,16.538,29.383,63.372,27.154,8.836,12.413,8.112
2017-12-31 22:00:00+01:00,33.934,55.421,33.438,76.691,6.163,5.480,2.500,2.500,2.500,9.431,...,22.250,10.204,47.489,26.634,32.953,74.021,24.785,8.177,15.343,6.805


In [23]:
wide_df15 = etl_concentrations_timeseries_from_dir_and_mask("Correlaid.rawData/AQD_DE_E1a_2015/", "DE_HH*hour*")

In [24]:
clean_df15 = clean_wide_df(wide_df15)

In [25]:
clean_df15

Unnamed: 0_level_0,DEHH033_CHT_dataGroup1,DEHH068_CHT_dataGroup1,DEHH070_CHT_dataGroup1,DEHH008_PM1_dataGroup1,DEHH015_PM1_dataGroup1,DEHH016_PM1_dataGroup1,DEHH026_PM1_dataGroup1,DEHH033_PM1_dataGroup1,DEHH059_PM1_dataGroup1,DEHH068_PM1_dataGroup1,...,DEHH015_SO2_dataGroup1,DEHH016_SO2_dataGroup1,DEHH059_SO2_dataGroup1,DEHH079_SO2_dataGroup1,DEHH008_O3_dataGroup1,DEHH021_O3_dataGroup1,DEHH033_O3_dataGroup1,DEHH047_O3_dataGroup1,DEHH049_O3_dataGroup1,DEHH050_O3_dataGroup1
observation_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00+01:00,0.300,0.606,1.361,194.214,273.973,38.586,387.395,83.905,298.431,259.401,...,22.382,6.346,14.330,2.500,19.080,35.683,31.989,29.137,33.607,26.305
2015-01-01 01:00:00+01:00,0.300,1.579,1.544,251.420,193.017,56.151,282.187,110.601,315.916,83.332,...,12.516,8.291,2.500,2.500,30.527,32.959,44.279,42.519,35.086,31.649
2015-01-01 02:00:00+01:00,0.300,0.926,1.837,66.420,53.664,41.400,90.490,39.807,45.689,64.071,...,2.500,9.858,2.500,2.500,27.346,34.243,35.418,36.046,33.607,22.944
2015-01-01 03:00:00+01:00,0.300,2.012,1.573,57.653,50.648,44.934,72.336,35.418,52.353,77.750,...,7.187,6.538,2.500,2.500,18.246,30.559,30.106,30.021,24.218,18.520
2015-01-01 04:00:00+01:00,0.300,0.960,1.208,76.728,68.766,62.745,71.432,50.039,70.809,102.732,...,2.500,22.388,2.500,2.500,13.692,29.339,24.843,21.460,23.251,22.629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31 19:00:00+01:00,1.358,5.788,3.620,47.061,32.717,27.476,56.207,51.889,103.716,72.818,...,20.393,2.500,2.500,5.584,2.369,7.661,1.000,1.000,14.334,26.720
2015-12-31 20:00:00+01:00,1.250,3.586,3.363,55.820,32.906,23.505,85.627,55.953,151.066,74.357,...,2.500,2.500,6.828,7.954,2.172,5.839,1.000,1.000,14.009,29.487
2015-12-31 21:00:00+01:00,1.481,3.428,3.765,68.944,38.116,31.032,277.325,61.059,138.402,82.741,...,2.500,2.500,6.707,7.313,2.306,6.284,1.000,1.000,17.394,26.797
2015-12-31 22:00:00+01:00,1.278,3.189,2.544,72.957,41.059,52.478,1465.851,75.390,170.917,93.051,...,2.500,2.500,15.909,5.042,3.287,5.465,1.000,2.214,14.682,26.261


In [26]:
wide_df14 = etl_concentrations_timeseries_from_dir_and_mask("Correlaid.rawData/AQD_DE_E1a_2014/", "DE_HH*hour*")

In [27]:
clean_df14 = clean_wide_df(wide_df14)

In [28]:
clean_df14

Unnamed: 0_level_0,DEHH008_NO2_dataGroup1,DEHH015_NO2_dataGroup1,DEHH016_NO2_dataGroup1,DEHH021_NO2_dataGroup1,DEHH026_NO2_dataGroup1,DEHH033_NO2_dataGroup1,DEHH047_NO2_dataGroup1,DEHH049_NO2_dataGroup1,DEHH050_NO2_dataGroup1,DEHH059_NO2_dataGroup1,...,DEHH068_CO_dataGroup1,DEHH070_CO_dataGroup1,DEHH033_CHB_dataGroup1,DEHH068_CHB_dataGroup1,DEHH070_CHB_dataGroup1,DEHH008_SO2_dataGroup1,DEHH015_SO2_dataGroup1,DEHH016_SO2_dataGroup1,DEHH059_SO2_dataGroup1,DEHH079_SO2_dataGroup1
observation_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01 00:00:00+01:00,29.605,20.822,16.216,14.953,45.881,31.896,33.575,24.676,33.214,34.440,...,0.83554,0.44270,1.150,9.265,2.139,32.011,103.738,2.500,43.181,9.318
2014-01-01 01:00:00+01:00,28.176,27.445,19.269,17.767,39.718,28.332,30.736,27.230,27.365,21.734,...,0.46830,0.43795,1.068,2.609,1.301,19.786,88.501,2.500,2.500,96.750
2014-01-01 02:00:00+01:00,26.244,23.602,18.785,18.565,32.359,27.216,21.213,21.052,9.967,20.470,...,0.58952,0.42798,0.844,3.155,1.199,21.290,89.381,2.500,2.500,2.500
2014-01-01 03:00:00+01:00,23.282,24.485,18.739,16.758,27.207,28.378,21.322,14.874,9.705,20.270,...,0.44478,0.37875,0.921,1.839,1.097,15.492,199.038,2.500,2.500,2.500
2014-01-01 04:00:00+01:00,28.525,24.979,18.967,16.233,26.184,30.233,21.905,16.395,9.448,22.727,...,0.44657,0.37548,0.926,1.726,1.039,29.537,136.358,2.500,2.500,2.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-12-31 19:00:00+01:00,25.536,26.290,27.161,19.574,36.385,20.227,20.162,11.558,17.444,34.880,...,0.48515,0.47893,0.200,1.252,1.966,2.500,25.526,9.088,2.500,2.500
2014-12-31 20:00:00+01:00,22.994,22.238,23.154,15.406,33.585,13.337,13.745,10.000,14.015,30.200,...,0.41745,0.37626,0.200,1.095,1.045,2.500,12.553,9.197,2.500,2.500
2014-12-31 21:00:00+01:00,20.452,16.853,18.195,11.773,28.833,10.747,11.876,13.314,11.674,25.584,...,0.29884,0.33854,0.200,0.673,1.055,2.500,10.907,9.218,2.500,2.500
2014-12-31 22:00:00+01:00,22.694,20.431,26.749,10.867,30.057,14.763,13.490,12.834,13.284,20.210,...,0.31764,0.34929,0.200,0.586,0.955,2.500,2.500,5.026,2.500,5.477


In [29]:
wide_df13 = etl_concentrations_timeseries_from_dir_and_mask("Correlaid.rawData/AQD_DE_E1a_2013/", "DE_HH*hour*")

In [30]:
clean_df13 = clean_wide_df(wide_df13)

In [31]:
clean_df13

Unnamed: 0_level_0,DEHH008_NO2_dataGroup1,DEHH015_NO2_dataGroup1,DEHH016_NO2_dataGroup1,DEHH021_NO2_dataGroup1,DEHH026_NO2_dataGroup1,DEHH033_NO2_dataGroup1,DEHH047_NO2_dataGroup1,DEHH049_NO2_dataGroup1,DEHH050_NO2_dataGroup1,DEHH059_NO2_dataGroup1,...,DEHH059_CHB_dataGroup1,DEHH064_CHB_dataGroup1,DEHH068_CHB_dataGroup1,DEHH070_CHB_dataGroup1,DEHH079_CHB_dataGroup1,DEHH008_SO2_dataGroup1,DEHH015_SO2_dataGroup1,DEHH016_SO2_dataGroup1,DEHH059_SO2_dataGroup1,DEHH079_SO2_dataGroup1
observation_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01 00:00:00+01:00,34.333,15.839,4.019,5.006,30.318,12.498,20.798,2.000,10.407,9.326,...,0.516,3.171,1.945,1.348,0.483,16.591,5.577,2.5,16.109,2.500
2013-01-01 01:00:00+01:00,26.471,8.469,2.000,2.000,24.006,11.702,10.173,2.000,4.094,9.326,...,0.516,0.733,0.956,0.694,0.200,2.500,2.500,2.5,2.500,2.500
2013-01-01 02:00:00+01:00,19.417,9.232,2.000,2.000,18.457,9.936,12.994,2.000,2.000,7.934,...,0.458,0.651,0.824,1.000,0.200,2.500,2.500,2.5,2.500,2.500
2013-01-01 03:00:00+01:00,14.176,6.735,2.000,2.000,11.583,6.366,9.166,2.000,2.000,14.909,...,0.615,0.567,0.888,0.577,0.200,2.500,2.500,2.5,8.025,2.500
2013-01-01 04:00:00+01:00,12.790,7.622,2.000,2.000,15.205,4.632,8.025,2.000,2.000,14.560,...,0.843,0.655,0.677,0.572,0.200,2.500,2.500,2.5,10.914,2.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-12-31 19:00:00+01:00,23.626,24.213,17.054,12.441,38.495,31.704,28.332,28.804,15.205,27.946,...,1.189,0.730,1.453,1.150,0.452,10.303,46.797,2.5,2.500,2.500
2013-12-31 20:00:00+01:00,22.667,21.000,17.063,13.093,42.163,30.336,29.873,26.902,13.518,23.503,...,1.221,0.816,1.503,0.975,0.452,8.039,38.059,2.5,9.127,2.500
2013-12-31 21:00:00+01:00,22.120,22.290,16.522,14.058,38.964,26.224,25.481,25.986,19.830,21.529,...,0.923,0.752,1.630,0.983,0.452,9.432,80.898,2.5,2.500,2.500
2013-12-31 22:00:00+01:00,23.986,20.672,15.920,12.740,36.594,24.128,22.239,23.593,26.614,22.157,...,1.018,0.837,1.251,0.995,0.452,11.481,34.765,2.5,5.103,11.387


In [32]:
#clean_df13.to_csv("clean13.csv")
#clean_df13.to_csv("clean13.csv")
#clean_df14.to_csv("clean14.csv")
#clean_df15.to_csv("clean15.csv")
#  ?????? No data for Hamburg 2016  ???
#clean_df17.to_csv("clean17.csv")
#clean_df18.to_csv("clean18.csv")
#clean_df19.to_csv("clean19.csv")

In [37]:
column_names = []

In [38]:
column_names.append(list(clean_df13))
column_names.append(list(clean_df14))
column_names.append(list(clean_df15))

column_names.append(list(clean_df17))
column_names.append(list(clean_df18))
column_names.append(list(clean_df19))

column_names_common = list(set(column_names[0]).intersection(*column_names))

In [39]:
len(column_names_common)

68

In [41]:
column_names_common

['DEHH033_NO2_dataGroup1',
 'DEHH026_NO_dataGroup1',
 'DEHH072_NO2_dataGroup1',
 'DEHH033_CO_dataGroup1',
 'DEHH072_PM1_dataGroup1',
 'DEHH070_NO2_dataGroup1',
 'DEHH073_NOx_dataGroup1',
 'DEHH026_PM1_dataGroup1',
 'DEHH033_PM1_dataGroup1',
 'DEHH064_NO_dataGroup1',
 'DEHH047_NOx_dataGroup1',
 'DEHH008_PM2_dataGroup1',
 'DEHH079_PM1_dataGroup1',
 'DEHH064_NO2_dataGroup1',
 'DEHH033_O3_dataGroup1',
 'DEHH070_NO_dataGroup1',
 'DEHH015_PM2_dataGroup1',
 'DEHH008_PM1_dataGroup1',
 'DEHH079_NOx_dataGroup1',
 'DEHH008_NO2_dataGroup1',
 'DEHH015_SO2_dataGroup1',
 'DEHH016_SO2_dataGroup1',
 'DEHH070_PM1_dataGroup1',
 'DEHH072_NOx_dataGroup1',
 'DEHH068_PM1_dataGroup1',
 'DEHH068_CO_dataGroup1',
 'DEHH059_SO2_dataGroup1',
 'DEHH072_NO_dataGroup1',
 'DEHH070_NOx_dataGroup1',
 'DEHH079_SO2_dataGroup1',
 'DEHH064_NOx_dataGroup1',
 'DEHH068_NOx_dataGroup1',
 'DEHH050_NOx_dataGroup1',
 'DEHH047_O3_dataGroup1',
 'DEHH033_NOx_dataGroup1',
 'DEHH073_NO_dataGroup1',
 'DEHH008_O3_dataGroup1',
 'DEHH068_N

We have 68 sensors completely functional between 2013 and 2019. (**No Info on 2016 for HH**)

In [43]:
df_single = clean_df13[column_names_common]

In [44]:
df_single

Unnamed: 0_level_0,DEHH033_NO2_dataGroup1,DEHH026_NO_dataGroup1,DEHH072_NO2_dataGroup1,DEHH033_CO_dataGroup1,DEHH072_PM1_dataGroup1,DEHH070_NO2_dataGroup1,DEHH073_NOx_dataGroup1,DEHH026_PM1_dataGroup1,DEHH033_PM1_dataGroup1,DEHH064_NO_dataGroup1,...,DEHH070_CO_dataGroup1,DEHH015_NO2_dataGroup1,DEHH079_NO2_dataGroup1,DEHH008_NO_dataGroup1,DEHH059_PM1_dataGroup1,DEHH016_NO_dataGroup1,DEHH047_NO_dataGroup1,DEHH059_NO_dataGroup1,DEHH008_NOx_dataGroup1,DEHH026_NO2_dataGroup1
observation_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01 00:00:00+01:00,12.498,16.626,4.891,0.23456,9.001,39.588,7.346,180.464,42.805,17.011,...,0.31186,15.839,14.249,7.736,503.035,2.0,2.000,70.378,46.414,30.318
2013-01-01 01:00:00+01:00,11.702,2.000,2.000,0.24292,26.454,39.910,7.346,120.688,63.788,15.236,...,0.28016,8.469,13.045,2.000,287.001,2.0,2.000,2.000,29.770,24.006
2013-01-01 02:00:00+01:00,9.936,2.000,2.000,0.20673,27.075,43.901,7.346,20.295,12.569,13.308,...,0.39618,9.232,8.060,2.000,9.672,2.0,2.000,2.000,20.886,18.457
2013-01-01 03:00:00+01:00,6.366,2.000,2.000,0.10000,18.673,37.178,7.346,16.554,10.202,11.746,...,0.34203,6.735,8.938,2.000,6.319,2.0,2.000,2.000,15.730,11.583
2013-01-01 04:00:00+01:00,4.632,2.000,2.000,0.10000,12.701,38.163,7.346,11.787,9.881,10.358,...,0.34969,7.622,10.301,2.000,6.958,2.0,2.000,2.000,13.513,15.205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-12-31 19:00:00+01:00,31.704,18.939,32.582,0.28454,41.154,47.110,30.447,22.981,23.284,15.312,...,0.42257,24.213,32.410,2.000,36.893,2.0,5.694,5.282,27.928,38.495
2013-12-31 20:00:00+01:00,30.336,40.835,26.162,0.29572,40.345,45.106,24.043,85.058,28.960,10.659,...,0.38275,21.000,30.893,2.000,76.168,2.0,9.426,35.455,25.973,42.163
2013-12-31 21:00:00+01:00,26.224,15.810,22.797,0.29079,37.536,41.012,23.243,159.023,36.931,13.672,...,0.38447,22.290,31.223,2.000,505.764,2.0,6.770,2.000,25.226,38.964
2013-12-31 22:00:00+01:00,24.128,13.737,21.694,0.28556,34.650,36.775,21.566,41.285,33.222,11.736,...,0.36517,20.672,29.572,2.000,63.512,2.0,2.000,4.871,29.899,36.594


In [45]:
df_single = df_single.append(clean_df14[column_names_common])
df_single = df_single.append(clean_df15[column_names_common])

df_single = df_single.append(clean_df17[column_names_common])
df_single = df_single.append(clean_df18[column_names_common])
df_single = df_single.append(clean_df19[column_names_common])

In [None]:
df_single.to_csv("df_single.csv")

In [None]:
import seaborn as sns
# Use seaborn style defaults and set the default figure size
sns.set(rc={'figure.figsize':(11, 4)})


In [None]:
df_single['DEHH050_NO_dataGroup1'].plot(linewidth=0.5);

In [None]:
df_single.dtypes

In [None]:
df_single.index