## Importing all CSV files using a MultiIndex structure

Index levels, from outermost to innermost: Samplingpoint -> Pollutant -> StartTime

Rows whose Samplingpoint name contains "Bulk" are dropped, because they hold only monthly values.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from fbprophet import Prophet
import math
import os
import glob
%matplotlib inline

# Directory the notebook is executed from; the relative 'res/*.csv' pattern
# below is resolved against it.
package_dir = os.getcwd()

print(package_dir)

print('Loading CSVs\n')
# Read every CSV into its own frame and concatenate ONCE afterwards.
# Calling pd.concat inside the loop re-copies all previously loaded rows on
# each iteration, which is quadratic in the number of files.
frames = [
    pd.read_csv(file,
                encoding="utf-16", parse_dates=[13, 14],
                infer_datetime_format=True,
                # Columns 4, 8 and 13 become the three-level row MultiIndex.
                index_col=[4, 8, 13])
    # sorted(): glob returns files in an OS-dependent order; sorting makes the
    # load order reproducible between runs and machines.
    for file in sorted(glob.glob('res/*.csv'))
]
# pd.concat raises ValueError for an empty list, so fall back to an empty
# frame when no file matched the pattern.
df = pd.concat(frames) if frames else pd.DataFrame()
print('\nFinished Loading')
print('Sorting')
# Sort the index so that label-based slicing on it (used by later cells) works.
df = df.sort_index()
print('Finished sorting')

In [None]:
# Sanity check of the loaded frame: column dtypes first, then the first rows.
print(df.dtypes, df.head(), sep='\n')

The MultiIndex can be navigated using the `loc` accessor.

Rows are then selected with the `[]` operator, using either positional (integer) slices or date strings that pandas parses into datetimes.

In [None]:
# C6H6 concentration at sampling point DESN025, restricted to the rows whose
# timestamp falls in January or February 2014.
(
    df.loc[('DESN025', 'C6H6')]
      .loc['2014-01':'2014-02']
      .plot(y='Concentration', figsize=(14, 8))
)

In [None]:
# Walk over the first index level and, for each group, print the number of
# non-null 'AirPollutantCode' entries per 'AirPollutant'.
for eoi, new_df in df.groupby(level=0):
    per_pollutant = new_df.groupby(['AirPollutant']).count()
    print(per_pollutant['AirPollutantCode'], '\n')

In [None]:
# Extract the PM10 series of sampling point DESN025 as two plain arrays:
# the timestamps from the remaining index level and the measured values.
ds = pd.to_datetime(df.loc['DESN025', 'PM10'].index.values)
y = df.loc['DESN025', 'PM10', :]['Concentration'].values

print(ds)
print(y, '\n')

# Two-column frame named 'ds' / 'y'; it is later split and passed to Prophet.
df2 = pd.DataFrame(dict(ds=ds, y=y))

print(df2.head(), df2.dtypes, sep='\n')

#print(df.loc['DESN025'].groupby(['AirPollutant']).count()['AirPollutantCode'])

In [None]:
# Line plot of y over ds, followed by the non-null count of each column.
df2.plot.line(x='ds', y='y')
print(df2.count())

In [None]:
# Log-transform the target column in place.
# NOTE: this overwrites df2['y'], so running the cell twice applies the log twice.
#
# np.errstate silences the divide-by-zero warning of log(0) only inside the
# block and restores the PREVIOUS floating-point error settings on exit, even
# if an exception is raised. The former np.seterr(divide='warn') call forced
# 'warn' regardless of what had been configured before.
with np.errstate(divide='ignore'):
    # log(0) yields -inf; replace infinities by 0 so no infinite values remain.
    # NOTE(review): after this a concentration of 0 is indistinguishable from a
    # concentration of 1 (log(1) == 0) — confirm this is acceptable.
    df2['y'] = np.log(df2['y']).replace([np.inf, -np.inf], 0)

In [None]:
# Same line plot of y over ds, now showing the values currently stored in df2.
df2.plot.line(x='ds', y='y')

In [None]:
# Hold-out split by row position: the first 40000 rows are used for training,
# the 240 rows that follow them are kept back for evaluation.
df2_train = df2.iloc[:40000]
df2_test = df2.iloc[40000:40240]

In [None]:
# Create a Prophet model; no arguments are passed, so its defaults apply.
model = Prophet()

In [None]:
# Train on the training window, then ask the model for the frame of timestamps
# to predict: 240 additional periods at hourly ('H') frequency.
model.fit(df2_train)
future = model.make_future_dataframe(freq='H', periods=240)

In [None]:
# Predict for every timestamp in `future`, then show the point forecast and
# its lower/upper bounds for the last rows.
forecast = model.predict(future)
forecast.tail()[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]



In [None]:
# Plot the forecast frame with the fitted model's own plotting helper.
model.plot(forecast)


In [None]:
# Plot the individual components of the forecast with the model's helper.
model.plot_components(forecast)

In [None]:
# Undo the log transform with exp() for both prediction and hold-out target.
# Only the rows that follow the training window are forecasts of unseen data,
# so the slice starts at len(df2_train). The previous hard-coded start of 20000
# also pulled 20000 in-sample fitted values into y_hat.
y_hat = np.exp(forecast['yhat'].iloc[len(df2_train):])
y_true = np.exp(df2_test['y'])

# Mean squared error. The subtraction aligns both Series on their row labels;
# labels present in only one of them give NaN, which .mean() skips.
# NOTE(review): rows are paired by label, not by the 'ds' timestamp — verify
# that the hold-out rows really are the next 240 hourly steps (no gaps).
mse = ((y_hat - y_true) ** 2).mean()
print('Prediction quality: {:.2f} MSE ({:.2f} RMSE)'.format(mse, math.sqrt(mse)))

In [None]:
# Show the predictions after they were transformed back with exp().
print(y_hat)

In [None]:
# Select the DESN025 / PM10 rows once, then count the non-null entries of every
# column for each distinct value of the remaining index.
pm10_rows = df.loc['DESN025', 'PM10']
df_agg = pm10_rows.groupby(pm10_rows.index).count()

# Group the per-timestamp counts by their 'Namespace' count and list which
# index labels fall into each group.
g = df_agg.groupby('Namespace')
#df_agg = df_agg.reindex('Namespace')

print(g.groups)