In [None]:
import glob
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from fbprophet import Prophet
from sklearn import neighbors, metrics

%matplotlib notebook

# Record the kernel's current working directory and echo it; the relative
# 'res/...' data paths used by the loading cells are resolved against it.
package_dir = os.getcwd()
print(package_dir)

In [None]:
# Load every CSV export under res/ into one wide frame indexed by
# (StationEoI, Timestamp), with one column per (measure, pollutant).

# Columns of the raw export that are not used by the analysis.
unused_columns = ['Countrycode', 'Namespace', 'AirQualityNetwork',
                  'AirQualityStation', 'SamplingPoint', 'Sample',
                  'SamplingProcess', 'AirPollutantCode',
                  'DatetimeBegin', 'Validity', 'Verification',
                  'AveragingTime']

# Collect the per-file frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies the growing frame on every iteration.
frames = []

print('Loading CSVs\n')
for file in glob.glob('res/*.csv'):
    # read single file, index on StationEoI and DatetimeEnd
    read = pd.read_csv(file,
                       encoding="utf-16", parse_dates=[13, 14],
                       infer_datetime_format=True,
                       index_col=[4, 14])
    # drop 'bulk' files because they have different averaging
    # (na=False: rows without a SamplingPoint are treated as non-bulk
    # instead of breaking the boolean negation below)
    bulks = read.SamplingPoint.str.lower().str.contains('bulk', na=False)
    clean = read[~bulks].drop(columns=unused_columns)

    # make pollutant a column for better memory usage
    clean = clean.pivot(columns='AirPollutant')
    frames.append(clean)

if not frames:
    # Fail early with a clear message instead of an obscure error when the
    # index names are assigned on an empty frame below.
    raise FileNotFoundError("no CSV files matched 'res/*.csv'")

df = pd.concat(frames)

print('\nFinished Loading')
print('Merging')

# make names shorter
df.index.names = ['StationEoI', 'Timestamp']
df.columns.names = [None, 'Pollutant']
df = df.sort_index()
# Rows sharing the same (station, timestamp) are collapsed into one row,
# taking the first non-null value of every column.
df = df.groupby(level=[0, 1]).first()


print('Finished merging')

In [None]:
# Rebuild `df` from the 2013 time-series exports only and fill gaps.
# NOTE: this replaces any `df` built by an earlier cell.

# Columns of the raw export that are not used by the analysis.
unused_columns = ['Countrycode', 'Namespace', 'AirQualityNetwork',
                  'AirQualityStation', 'SamplingPoint', 'Sample',
                  'SamplingProcess', 'AirPollutantCode',
                  'DatetimeBegin', 'Validity', 'Verification',
                  'AveragingTime']

# Collect the per-file frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies the growing frame on every iteration.
frames = []

for file in glob.glob('res/*2013_timeseries.csv'):
    # read single file, index on StationEoI and DatetimeEnd
    read = pd.read_csv(file,
                       encoding="utf-16", parse_dates=[13, 14],
                       infer_datetime_format=True,
                       index_col=[4, 14])
    # drop 'bulk' files because they have different averaging
    # (na=False: rows without a SamplingPoint are treated as non-bulk
    # instead of breaking the boolean negation below)
    bulks = read.SamplingPoint.str.lower().str.contains('bulk', na=False)
    clean = read[~bulks].drop(columns=unused_columns)

    # make pollutant a column for better memory usage
    pivoted = clean.pivot(columns='AirPollutant')
    frames.append(pivoted)

if not frames:
    # Fail early with a clear message instead of an obscure error when the
    # index names are assigned on an empty frame below.
    raise FileNotFoundError("no CSV files matched 'res/*2013_timeseries.csv'")

df = pd.concat(frames)

# make names shorter
df.index.names = ['StationEoI', 'Timestamp']
df.columns.names = [None, 'Pollutant']

df = df.sort_index()
# Rows sharing the same (station, timestamp) are collapsed into one row,
# taking the first non-null value of every column.
df = df.groupby(level=[0, 1]).first()

# Interpolate each station separately. Interpolating the whole frame at once
# would run across station boundaries, filling the gaps of one station from
# the measurements of the station sorted before it.
df = df.groupby(level='StationEoI', group_keys=False).apply(
    lambda station: station.interpolate())

In [None]:
# Number of non-missing concentration values per pollutant column for
# station DESN025.
df.loc['DESN025']['Concentration'].count()

In [None]:
# Lag features for station DESN025: column 'lag <n>h' is the NO concentration
# series with its timestamps shifted by n hours, for n = 1..23.
no_series = df.loc['DESN025'].Concentration.NO

lagged = pd.DataFrame()
for hours in range(1, 24):
    lagged['lag {}h'.format(hours)] = no_series.shift(hours, freq='H')

# Fill missing entries of every lag column by interpolation.
lagged = lagged.interpolate()

lagged

In [None]:
# Fit k-nearest-neighbour regressors on the lag features of station DESN025
# and compare their predictions with the held-out NO concentrations.

# --- Train / test split ------------------------------------------------------
train_start = '2013-10-01'  # first day of the training window
split_date = '2013-12-10'   # first day of the hold-out (test) period
# Original variable names, kept in case other cells still refer to them.
end = train_start
asdf = split_date

target = df.loc['DESN025'].Concentration.NO.dropna()
features = lagged.dropna()

# Keep only timestamps that have both a target value and a complete row of
# lag features, so X and y are aligned by label rather than by position.
common_index = features.index.intersection(target.index).sort_values()
features = features.loc[common_index]
target = target.loc[common_index]

x_test = features.loc[split_date:]
y_test = target.loc[split_date:]

# Label slices include their end point and a bare date matches the whole day,
# so [train_start:split_date] also contains the first test day. Remove the
# test timestamps so that the training and test sets are disjoint.
train_index = features.loc[train_start:split_date].index.difference(x_test.index)
x_train = features.loc[train_index]
y_train = target.loc[train_index]

if x_train.empty or x_test.empty:
    raise ValueError('empty train or test set for split %s / %s'
                     % (train_start, split_date))

index_train = y_train.index.values
index_test = y_test.index.values

# --- Fit regression models ---------------------------------------------------
n_neighbors = 10

fig, ax = plt.subplots(figsize=(9, 4))
ax.scatter(index_train, y_train, c='k', label='train')
ax.scatter(index_test, y_test, c='r', label='test')

colors = ['g', 'c']
solutions = {}  # predictions on the test period, keyed by 'knn <weights>'

for color, weights in zip(colors, ['uniform', 'distance']):
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    knn_model = knn.fit(x_train, y_train)
    y_ = knn_model.predict(x_test)
    solutions['knn %s' % weights] = y_

    ax.plot(index_test, y_, '.-', c=color, label=weights)
    print('{0} mae: {1}\tmse: {2}'.format(weights,
                                          metrics.mean_absolute_error(y_test, y_),
                                          metrics.mean_squared_error(y_test, y_)))

# Figure decoration only needs to happen once, after all curves are drawn.
ax.axis('tight')
ax.legend()
ax.set_title("KNeighborsRegressor (k = %i)" % (n_neighbors))