# Import modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely.geometry import Point, Polygon

import datetime
import os, gc

# Load data

In [None]:
# Gather every file under data/ whose name starts with 'EPA_OD', as relative
# paths in lexicographic order, then echo the list as the cell output.
epa_path = sorted(
    os.path.join('data', file_name)
    for file_name in os.listdir('data')
    if file_name.startswith('EPA_OD')
)
epa_path

In [None]:
# Read every EPA file, parsing 'PublishTime' as datetimes, and stack them into
# one table. Concatenating once avoids re-copying all accumulated rows on each
# loop iteration and avoids concatenating against an empty placeholder frame.
frames = [pd.read_csv(path, parse_dates=['PublishTime']) for path in epa_path]
# pd.concat rejects an empty list, so keep the empty-frame result when no
# input files were found.
EPA_all = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined EPA table.
EPA_all.head()

In [None]:
# Show the (rows, columns) size of the combined EPA table.
EPA_all.shape

# EDA

In [None]:
# Load the station metadata workbook and preview its first rows.
station_info = pd.read_excel('data/station_info.xlsx')
station_info.head()

In [None]:
# Align the station table's key column name with the EPA table, then attach
# the station id and coordinates to every EPA record. The left join keeps EPA
# rows that have no matching station.
station_info.rename(columns={'SITENAME': 'SiteName'}, inplace=True)
station_cols = ['SiteName', 'station_id', 'lon', 'lat']
EPA_all = pd.merge(EPA_all, station_info[station_cols], how='left', on='SiteName')

In [None]:
# Build one point geometry per record from its 'lon' (x) and 'lat' (y) values.
geom = [Point(x, y) for x, y in zip(EPA_all['lon'], EPA_all['lat'])]
# Declare the CRS with the authority string: the {'init': 'epsg:4326'} dict
# form is deprecated and emits a FutureWarning on current GeoPandas/pyproj.
EPA_all = gpd.GeoDataFrame(EPA_all, geometry=geom, crs='EPSG:4326')

In [None]:
# Map the AQI of every record published at 2018-12-01 12:00, coloured by
# five quantile classes.
snapshot_time = datetime.datetime(2018, 12, 1, 12)
snapshot = EPA_all[EPA_all['PublishTime'] == snapshot_time]
snapshot.plot(
    'AQI',
    figsize=(10, 10),
    legend=True,
    cmap='cool',
    scheme='quantiles',
    k=5,
)

In [None]:
# List the distinct station names present in the data.
EPA_all['SiteName'].unique()

In [None]:
# Keep only the records of station '古亭' published after 2018-05-31 00:00.
is_target_site = EPA_all['SiteName'] == '古亭'
is_recent = EPA_all['PublishTime'] > datetime.datetime(2018, 5, 31)
AQI_df = EPA_all[is_target_site & is_recent]

In [None]:
# Build a gap-free hourly timeline spanning the station's records, then
# left-join the observations onto it so hours without a record become NaN rows.
min_ts = AQI_df['PublishTime'].min()
max_ts = AQI_df['PublishTime'].max()
# A timedelta frequency is used instead of the '1H' alias, whose upper-case
# 'H' spelling is deprecated in pandas 2.2.
time_list = pd.date_range(min_ts, max_ts, freq=datetime.timedelta(hours=1))

data_df = pd.DataFrame({'PublishTime': time_list})
data_df = data_df.merge(AQI_df, on='PublishTime', how='left')

In [None]:
# Show the (rows, columns) size of the hourly, gap-filled table.
data_df.shape

In [None]:
# Line plot of AQI against publish time for the selected station.
plt.figure(figsize=(10, 5))
aqi_by_time = data_df.set_index('PublishTime')['AQI']
plt.plot(aqi_by_time)

In [None]:
# Rank the columns by their correlation with AQI, strongest first.
# Only numeric/boolean columns are correlated: since pandas 2.0
# DataFrame.corr() raises on non-numeric columns (here the station name and
# geometry) instead of silently dropping them.
data_df.select_dtypes(include=['number', 'bool']).corr()['AQI'].sort_values(ascending=False)

In [None]:
# Draw AQI and the four pollutant series as stacked subplots, one per column.
series_to_plot = ['AQI', 'PM2.5', 'PM10', 'O3', 'CO']
data_df[series_to_plot].plot(subplots=True, figsize=(10, 25))
plt.show()

# Data split

In [None]:
# Chronological split at 2018-12-01 00:00: earlier rows train, the rest test.
# Column order matters downstream: AQI (the target) is column 0.
split_time = datetime.datetime(2018, 12, 1, 0)
model_columns = ['AQI', 'PM2.5', 'PM10', 'O3', 'CO']

before_split = data_df['PublishTime'] < split_time
from_split = data_df['PublishTime'] >= split_time

trn_data = data_df.loc[before_split, model_columns].reset_index(drop=True)
tst_data = data_df.loc[from_split, model_columns].reset_index(drop=True)

trn_data.shape, tst_data.shape

# Normalization

In [None]:
# Per-column mean and standard deviation of the training split, ignoring NaNs.
# Only trn_data is used, so the test split does not influence the scaling.
train_mean = np.nanmean(trn_data, axis=0)
train_std = np.nanstd(trn_data, axis=0)

In [None]:
# Standardise both splits column-wise with the training-split statistics.
trn_data = (trn_data - train_mean) / train_std
tst_data = (tst_data - train_mean) / train_std

# Feature engineering

In [None]:
def data_generator(dataset, start_index, end_index, history_size, target_size):
    """Slice a 2-D array into flattened history windows and scalar targets.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    the features are rows ``i - history_size .. i - 1`` of every column except
    column 0, and the target is column 0 of row ``i + target_size``.

    Args:
        dataset: 2-D array; column 0 is the target, the rest are features.
        start_index: First row that may belong to a history window.
        end_index: Exclusive upper bound for ``i``; ``None`` means
            ``len(dataset) - target_size``.
        history_size: Number of rows in each history window.
        target_size: Row offset from ``i`` to the target row.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays. Each row of ``data`` is one
        window flattened row by row.

    A window is skipped when its target is NaN or when more than a third of
    its feature values are NaN. Remaining gaps are interpolated per column in
    both directions; values still missing afterwards become ``-3``.
    """
    first_pos = start_index + history_size
    last_pos = len(dataset) - target_size if end_index is None else end_index

    windows, targets = [], []
    for pos in range(first_pos, last_pos):
        rows = np.arange(pos - history_size, pos)
        window = dataset[rows, 1:]
        target = dataset[pos + target_size, 0]

        if np.isnan(target):
            continue
        # Drop windows that are too sparse to repair reliably.
        if np.isnan(window).sum() > window.size // 3:
            continue

        repaired = (
            pd.DataFrame(window)
            .interpolate(limit_direction='both')
            .fillna(-3)
            .values
        )
        windows.append(repaired.reshape(-1))
        targets.append(target)

    return np.array(windows), np.array(targets)

def create_time_steps(length):
    """Return the integer offsets ``-length, ..., -1`` as a list."""
    return [step - length for step in range(length)]

def show_plot(plot_data, delta, title):
    """Plot four history series plus the true and predicted future points.

    Args:
        plot_data: Sequence whose first four items are the history series
            (PM2.5, PM10, O3, CO, in that order); the optional fifth and sixth
            items are the true future value and the model prediction.
        delta: X position of the future markers; a falsy value means 0.
        title: Title of the current axes.

    Returns:
        The ``plt`` object the drawing calls were made on.
    """
    series_labels = ['History PM2.5', 'History PM10', 'History O3', 'History CO', 'True Future', 'Model Prediction']
    series_styles = ['.-', '.-', '.-', '.-', 'rx', 'go']
    history_steps = create_time_steps(plot_data[0].shape[0])
    future = delta if delta else 0

    plt.title(title)
    for idx, series in enumerate(plot_data):
        style = series_styles[idx]
        label = series_labels[idx]
        if idx <= 3:
            # History series share the negative time axis.
            plt.plot(history_steps, series, style, label=label)
        else:
            # Future values are single markers placed at x = future.
            plt.plot(future, series, style, markersize=8, label=label)
    plt.legend()
    plt.xlim([history_steps[0], (future + 8)])
    plt.xlabel('Time-Step')
    return plt

In [None]:
# Window settings passed to data_generator: 24 rows of history per sample
# and a target offset of 3 rows.
past_history = 24
future_target = 3

x_train, y_train = data_generator(
    trn_data.values,
    start_index=0,
    end_index=None,
    history_size=past_history,
    target_size=future_target,
)
x_test, y_test = data_generator(
    tst_data.values,
    start_index=0,
    end_index=None,
    history_size=past_history,
    target_size=future_target,
)

In [None]:
# Summarise the training windows: array shapes, then the first flattened
# feature window and the normalised AQI value it is paired with.
# (Fixes the 'hidtory' typo in the printed caption.)
print('Train set data shape')
print(x_train.shape, y_train.shape)
print('Single window of past history')
print(x_train[0])
print('Target AQI to predict')
print(y_train[0])

In [None]:
# Summarise the test windows: array shapes, then the first flattened
# feature window and the normalised AQI value it is paired with.
# (Fixes the 'hidtory' typo in the printed caption.)
print('Test set data shape')
print(x_test.shape, y_test.shape)
print('Single window of past history')
print(x_test[0])
print('Target AQI to predict')
print(y_test[0])

In [None]:
# Plot training sample 10. The flattened window interleaves the four
# features, so every 4th value starting at offset k is feature k's history.
plt.figure(figsize=(10, 5))
sample_window, sample_target = x_train[10], y_train[10]
feature_histories = [sample_window[offset::4] for offset in range(4)]
show_plot(feature_histories + [sample_target], future_target, 'Example train data')

# Build models

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

## Linear regression

In [None]:
# Fit ordinary least squares on the flattened windows and predict the test set.
linear_reg = LinearRegression().fit(x_train, y_train)
y_lr = linear_reg.predict(x_test)

In [None]:
# Report test-set error (in normalised units), then plot three randomly
# chosen test windows with their true and predicted values.
lr_mae = mean_absolute_error(y_test, y_lr)
lr_r2 = r2_score(y_test, y_lr)
print(f'Linear regression mae : {lr_mae}, r2 score : {lr_r2}')

rnd_idx = np.random.randint(x_test.shape[0], size=3)
for sample_idx in rnd_idx:
    window, target = x_test[sample_idx], y_test[sample_idx]
    plt.figure(figsize=(10, 3))
    prediction = linear_reg.predict(window.reshape(1, -1))
    show_plot(
        [window[::4], window[1::4], window[2::4], window[3::4], target, prediction],
        future_target,
        'Linear Regression prediction',
    )

## SVM

In [None]:
# Fit a linear-kernel support vector regressor and predict the test set.
# The kernel is passed by keyword: scikit-learn estimator parameters are
# keyword-only, so the positional form SVR('linear') raises TypeError on
# current releases.
svr = SVR(kernel='linear')
svr.fit(x_train, y_train)
y_svr = svr.predict(x_test)

In [None]:
# Report test-set error (in normalised units), then plot three randomly
# chosen test windows with their true and predicted values.
svr_mae = mean_absolute_error(y_test, y_svr)
svr_r2 = r2_score(y_test, y_svr)
print(f'SVM mae : {svr_mae}, r2 score : {svr_r2}')

rnd_idx = np.random.randint(x_test.shape[0], size=3)
for sample_idx in rnd_idx:
    window, target = x_test[sample_idx], y_test[sample_idx]
    plt.figure(figsize=(10, 3))
    prediction = svr.predict(window.reshape(1, -1))
    show_plot(
        [window[::4], window[1::4], window[2::4], window[3::4], target, prediction],
        future_target,
        'SVM prediction',
    )

## Random forest

In [None]:
# Fit a 300-tree random forest (depth capped at 10) and predict the test set.
rf = RandomForestRegressor(max_depth=10, n_estimators=300).fit(x_train, y_train)
y_rf = rf.predict(x_test)

In [None]:
# Report test-set error (in normalised units), then plot three randomly
# chosen test windows with their true and predicted values.
rf_mae = mean_absolute_error(y_test, y_rf)
rf_r2 = r2_score(y_test, y_rf)
print(f'RF mae : {rf_mae}, r2 score : {rf_r2}')

rnd_idx = np.random.randint(x_test.shape[0], size=3)
for sample_idx in rnd_idx:
    window, target = x_test[sample_idx], y_test[sample_idx]
    plt.figure(figsize=(10, 3))
    prediction = rf.predict(window.reshape(1, -1))
    show_plot(
        [window[::4], window[1::4], window[2::4], window[3::4], target, prediction],
        future_target,
        'RF prediction',
    )

## Gradient Boost Tree

In [None]:
# Fit gradient-boosted trees (500 stages, depth 5, learning rate 0.01) and
# predict the test set.
gb = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.01,
).fit(x_train, y_train)
y_gb = gb.predict(x_test)

In [None]:
# Report test-set error (in normalised units), then plot three randomly
# chosen test windows with their true and predicted values.
print(f'GB mae : {mean_absolute_error(y_test, y_gb)}, r2 score : {r2_score(y_test, y_gb)}')

rnd_idx = np.random.randint(x_test.shape[0], size=3)
for x, y in zip(x_test[rnd_idx], y_test[rnd_idx]):
    plt.figure(figsize=(10, 3))
    # Use the gradient-boosting model here; the plot previously showed the
    # random-forest prediction under the 'GB prediction' title.
    gb_prediction = gb.predict(x.reshape(1, -1))
    show_plot([x[::4], x[1::4], x[2::4], x[3::4], y, gb_prediction],
              future_target, 'GB prediction')

# Plot result

In [None]:
# Compare true and predicted AQI on the test set in original AQI units.
# AQI is column 0 of the normalised frames, so its statistics sit at index 0
# of train_mean / train_std; index -1 belongs to CO and would rescale the
# curves with the wrong mean and standard deviation.
aqi_mean, aqi_std = train_mean[0], train_std[0]

plt.figure(figsize=(15, 8))
curves = [
    (y_test, 'True AQI'),
    (y_lr, 'linear reg'),
    (y_svr, 'svr'),
    (y_rf, 'RF'),
    (y_gb, 'GB'),
]
for values, curve_label in curves:
    # Undo the standardisation: value * std + mean.
    plt.plot(values * aqi_std + aqi_mean, label=curve_label)

# Zoom in on test samples 550-700 to keep the curves readable.
plt.xlim(550, 700)
plt.legend()