# Import modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import datetime
import os, gc

# Load data

In [None]:
# Paths of every entry in data/ whose name starts with "codis", sorted.
codis_path = sorted(
    os.path.join('data', fname)
    for fname in os.listdir('data')
    if fname.startswith('codis')
)
codis_path

In [None]:
# Read every file listed in codis_path (ObsTime parsed as datetime) and stack
# the frames row-wise. The frames are collected first and concatenated once:
# calling pd.concat inside the loop re-copies all accumulated rows per file.
frames = [pd.read_csv(path, parse_dates=['ObsTime']) for path in codis_path]
# pd.concat raises on an empty list, so keep the empty-frame result explicit.
codis_all = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# Convert every column except the timestamp with pd.to_numeric(errors='coerce').
value_columns = [col for col in codis_all.columns if col != 'ObsTime']
for st in value_columns:
    numeric_values = pd.to_numeric(codis_all[st], errors='coerce')
    codis_all[st] = numeric_values

In [None]:
# Preview the first rows of the combined frame.
codis_all.head()

In [None]:
# (rows, columns) of the combined frame.
codis_all.shape

# EDA

In [None]:
# Report the earliest and latest observation timestamps.
first_obs = codis_all['ObsTime'].min()
last_obs = codis_all['ObsTime'].max()
print(f"Data from {first_obs} to {last_obs}")

In [None]:
# Summary statistics of the combined frame.
codis_all.describe()

In [None]:
# Line plot of the Temperature column with ObsTime as the index.
codis_all.set_index('ObsTime')['Temperature'].plot()

In [None]:
# One subplot per column, all indexed by ObsTime.
by_time = codis_all.set_index('ObsTime')
by_time.plot(subplots=True, figsize=(14, 18))
plt.show()

In [None]:
# Rank the variables by their correlation with Temperature.
# ObsTime is dropped first: it is a timestamp, and DataFrame.corr() defaults to
# numeric_only=False in pandas >= 2.0, so non-numeric columns are no longer
# skipped silently.
codis_all.drop(columns=['ObsTime']).corr().sort_values('Temperature', ascending=False)['Temperature']

In [None]:
# Subplots for the four columns that are used as model features below.
selected_cols = ['Temperature', 'StnPres', 'RH', 'WD']
codis_all.set_index('ObsTime')[selected_cols].plot(subplots=True, figsize=(14, 12))
plt.show()

# Data split

In [None]:
# Chronological split on ObsTime:
#   train      : before 2019-01-01
#   validation : 2019-01-01 up to (not including) 2020-01-01
#   test       : 2020-01-01 onwards
feature_cols = ['Temperature', 'StnPres', 'RH', 'WD']
val_start = datetime.datetime(2019, 1, 1, 0)
tst_start = datetime.datetime(2020, 1, 1, 0)
obs_time = codis_all['ObsTime']

in_trn = obs_time < val_start
in_val = (obs_time >= val_start) & (obs_time < tst_start)
in_tst = obs_time >= tst_start

trn_data = codis_all.loc[in_trn, feature_cols].reset_index(drop=True)
val_data = codis_all.loc[in_val, feature_cols].reset_index(drop=True)
tst_data = codis_all.loc[in_tst, feature_cols].reset_index(drop=True)

trn_data.shape, val_data.shape, tst_data.shape

# Normalization

In [None]:
# Per-column mean and standard deviation of the training split, ignoring NaNs.
trn_values = trn_data.to_numpy()
train_mean = np.nanmean(trn_values, axis=0)
train_std = np.nanstd(trn_values, axis=0)

In [None]:
# Display the training-split mean and standard deviation computed above.
train_mean, train_std

In [None]:
# Standardise all three splits with the training-split statistics.
trn_data, val_data, tst_data = (
    (split - train_mean) / train_std
    for split in (trn_data, val_data, tst_data)
)

In [None]:
# Per-column max and min of the scaled training split (NaNs ignored).
np.nanmax(trn_data, axis=0), np.nanmin(trn_data, axis=0) 

In [None]:
# Per-column max and min of the scaled validation split (NaNs ignored).
np.nanmax(val_data, axis=0), np.nanmin(val_data, axis=0) 

In [None]:
# Per-column max and min of the scaled test split (NaNs ignored).
np.nanmax(tst_data, axis=0), np.nanmin(tst_data, axis=0) 

# Feature engineering

In [None]:
def data_generator(dataset, start_index, end_index, history_size, target_size):
    """Build flattened sliding-window samples and their future targets.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    the sample is the ``history_size`` rows preceding ``i`` (all columns,
    flattened row by row) and the label is column 0 of row
    ``i + target_size``.

    A window is dropped when its label is NaN or when more than a third of
    its values are NaN. Remaining gaps are interpolated column-wise, and any
    value that is still NaN afterwards is replaced by 0.

    Args:
        dataset: 2-D array-like of shape (n_rows, n_features); column 0 is
            the variable to predict.
        start_index: First row usable as history (expected to be >= 0).
        end_index: Exclusive upper bound for ``i``. ``None`` means "as far as
            the data allows"; larger values are clamped to that limit so the
            label lookup can never run past the last row.
        history_size: Number of past rows in each window.
        target_size: Offset from ``i`` to the row holding the label.

    Returns:
        Tuple ``(data, labels)``: ``data`` has shape
        ``(n_samples, history_size * n_features)`` and ``labels`` has shape
        ``(n_samples,)``. Both keep these shapes when no window qualifies.
    """
    dataset = np.asarray(dataset)
    window_len = history_size * dataset.shape[1]

    # Last admissible exclusive bound: i + target_size must stay in range.
    last_valid = len(dataset) - target_size
    end_index = last_valid if end_index is None else min(end_index, last_valid)

    data = []
    labels = []
    for i in range(start_index + history_size, end_index):
        label = dataset[i + target_size, 0]
        if np.isnan(label):
            continue

        feat = dataset[i - history_size:i]
        nan_count = int(np.isnan(feat).sum())
        if nan_count > feat.size // 3:
            continue

        # Only pay for the pandas round-trip when there is a gap to fill.
        if nan_count:
            feat = pd.DataFrame(feat).interpolate(limit_direction='both').values
            feat = np.where(np.isnan(feat), 0, feat)
        data.append(feat.reshape(-1))
        labels.append(label)

    # The explicit reshape keeps the 2-D shape even when `data` is empty.
    return np.array(data).reshape(len(data), window_len), np.array(labels)

def create_time_steps(length):
    """Return the negative step offsets ``[-length, ..., -1]`` as a list."""
    return [-offset for offset in range(length, 0, -1)]

def show_plot(plot_data, delta, title):
    """Plot history curves plus future point(s) on the current figure.

    The first four entries of ``plot_data`` are drawn as lines over the past
    time steps; any further entry is drawn as a single marker at x = ``delta``
    (0 when ``delta`` is falsy). Returns the ``plt`` module.
    """
    legend_names = ['History Temperature', 'History StnPres', 'History RH', 'History WD', 'True Future', 'Model Prediction']
    line_styles = ['.-', '.-', '.-', '.-', 'rx', 'go']
    time_steps = create_time_steps(plot_data[0].shape[0])
    future = delta if delta else 0

    plt.title(title)
    for idx, series in enumerate(plot_data):
        if idx <= 3:
            # History series: one value per past time step.
            plt.plot(time_steps, series, line_styles[idx], label=legend_names[idx])
        else:
            # Future values: a single marker at the target offset.
            plt.plot(future, series, line_styles[idx], markersize=8, label=legend_names[idx])
    plt.legend()
    plt.xlim([time_steps[0], (future+8)])
    plt.xlabel('Time-Step')
    return plt

In [None]:
past_history = 72    # rows of history flattened into each sample
future_target = 24   # offset handed to data_generator as target_size

# Build (features, labels) for the three splits with identical settings.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = (
    data_generator(split.values, 0, None, past_history, future_target)
    for split in (trn_data, val_data, tst_data)
)

In [None]:
# Shapes of the training samples, the tail of the first flattened window and
# its label. The label is column 0 of the feature frame, i.e. the (scaled)
# temperature, so the messages say "temperature" rather than "AQI".
print('Train set data shape')
print(x_train.shape, y_train.shape)
print('Single window of past history')
print(x_train[0][-5:])
print('Target temperature to predict')
print(y_train[0])

In [None]:
# Shapes of the validation samples, the tail of the first flattened window and
# its label. The label is column 0 of the feature frame, i.e. the (scaled)
# temperature, so the messages say "temperature" rather than "AQI".
print('Validation set data shape')
print(x_valid.shape, y_valid.shape)
print('Single window of past history')
print(x_valid[0][-5:])
print('Target temperature to predict')
print(y_valid[0])

In [None]:
# Shapes of the test samples, the tail of the first flattened window and its
# label. The label is column 0 of the feature frame, i.e. the (scaled)
# temperature, so the messages say "temperature" rather than "AQI".
print('Test set data shape')
print(x_test.shape, y_test.shape)
print('Single window of past history')
print(x_test[0][-5:])
print('Target temperature to predict')
print(y_test[0])

In [None]:
# Plot training sample 10: the flattened window is de-interleaved into its
# four feature series (stride 4) and shown together with the true target.
sample_window = x_train[10]
sample_target = y_train[10]
feature_series = [sample_window[offset::4] for offset in range(4)]

plt.figure(figsize=(10, 5))
show_plot(feature_series + [sample_target], future_target, 'Example train data')

# Build models

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import xgboost

## Linear regression

In [None]:
# Fit ordinary least squares on the training windows and predict the test set.
linear_reg = LinearRegression().fit(x_train, y_train)
y_lr = linear_reg.predict(x_test)

In [None]:
# Test-set metrics for the linear model.
lr_mae = mean_absolute_error(y_test, y_lr)
lr_r2 = r2_score(y_test, y_lr)
print(f'Linear regression mae : {lr_mae}, r2 score : {lr_r2}')

# Three randomly chosen test windows with their true and predicted targets.
rnd_idx = np.random.randint(x_test.shape[0], size=3)
for idx in rnd_idx:
    x, y = x_test[idx], y_test[idx]
    plt.figure(figsize=(10, 3))
    show_plot([x[::4], x[1::4], x[2::4], x[3::4], y, linear_reg.predict(x.reshape(1, -1))],
              future_target, 'Linear Regression prediction')

## SVM

In [None]:
# %%time
# svr = SVR('linear')
# svr.fit(x_train, y_train)
# y_svr = svr.predict(x_test)

In [None]:
# print(f'SVM mae : {mean_absolute_error(y_test, y_svr)}, r2 score : {r2_score(y_test, y_svr)}')

# rnd_idx = np.random.randint(x_test.shape[0], size=3)
# for x, y in zip(x_test[rnd_idx], y_test[rnd_idx]):
#     plt.figure(figsize = (10,3))
#     plt = show_plot([x[::4], x[1::4], x[2::4], x[3::4], y, svr.predict(x.reshape(1, -1))], 
#          future_target, 'SVM prediction')

## Random forest

In [None]:
%%time
# Random forest of 1000 depth-3 trees, trained with n_jobs=-1.
rf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=3,
    n_jobs=-1,
).fit(x_train, y_train)
y_rf = rf.predict(x_test)

In [None]:
# Test-set metrics for the random forest.
rf_mae = mean_absolute_error(y_test, y_rf)
rf_r2 = r2_score(y_test, y_rf)
print(f'RF mae : {rf_mae}, r2 score : {rf_r2}')

# Three randomly chosen test windows with their true and predicted targets.
rnd_idx = np.random.randint(x_test.shape[0], size=3)
for idx in rnd_idx:
    x, y = x_test[idx], y_test[idx]
    plt.figure(figsize=(10, 3))
    show_plot([x[::4], x[1::4], x[2::4], x[3::4], y, rf.predict(x.reshape(1, -1))],
              future_target, 'RF prediction')

## Gradient Boost Tree

In [None]:
%%time
# Gradient boosting settings kept in one place, then fit and predict.
gb_params = {
    'learning_rate': 0.02,
    'n_estimators': 1000,
    'max_depth': 3,
    'verbose': 1,
    'validation_fraction': 0.2,
    'n_iter_no_change': 10,
}
gb = GradientBoostingRegressor(**gb_params)
gb.fit(x_train, y_train)
y_gb = gb.predict(x_test)

In [None]:
# Test-set metrics for the gradient boosting model.
gb_mae = mean_absolute_error(y_test, y_gb)
gb_r2 = r2_score(y_test, y_gb)
print(f'GB mae : {gb_mae}, r2 score : {gb_r2}')

# Three randomly chosen test windows with their true and predicted targets.
rnd_idx = np.random.randint(x_test.shape[0], size=3)
for idx in rnd_idx:
    x, y = x_test[idx], y_test[idx]
    plt.figure(figsize=(10, 3))
    show_plot([x[::4], x[1::4], x[2::4], x[3::4], y, gb.predict(x.reshape(1, -1))],
              future_target, 'GB prediction')

## XGB

In [None]:
%%time
# eval_metric is set on the estimator instead of in fit(): passing it to fit()
# is deprecated in the xgboost scikit-learn API and removed in later releases.
xgb = xgboost.XGBRegressor(learning_rate=0.02,
                           n_estimators=1000,
                           max_depth=3,
                           n_jobs=-1,
                           eval_metric=['mae'])
# NOTE(review): no eval_set is passed, so the metric is presumably never
# reported and x_valid / y_valid stay unused. To monitor validation error, add
# eval_set=[(x_train, y_train), (x_valid, y_valid)] here - TODO confirm intent.
xgb.fit(x_train, y_train,
        verbose=100)
y_xgb = xgb.predict(x_test)

In [None]:
# Test-set metrics for the XGBoost model.
xgb_mae = mean_absolute_error(y_test, y_xgb)
xgb_r2 = r2_score(y_test, y_xgb)
print(f'XGB mae : {xgb_mae}, r2 score : {xgb_r2}')

# Three randomly chosen test windows with their true and predicted targets.
rnd_idx = np.random.randint(x_test.shape[0], size=3)
for idx in rnd_idx:
    x, y = x_test[idx], y_test[idx]
    plt.figure(figsize=(10, 3))
    show_plot([x[::4], x[1::4], x[2::4], x[3::4], y, xgb.predict(x.reshape(1, -1))],
              future_target, 'XGB prediction')

# Plot result

In [None]:
# Overlay the true series and every model's prediction, converted back to the
# original scale with the column-0 training statistics.
temp_std, temp_mean = train_std[0], train_mean[0]
curves = [
    (y_test, 'True Tem'),
    (y_lr, 'linear reg'),
    (y_rf, 'RF'),
    (y_gb, 'GB'),
    (y_xgb, 'XGB'),
]

plt.figure(figsize=(15, 8))
for scaled_values, curve_label in curves:
    plt.plot(scaled_values*temp_std+temp_mean, label=curve_label)
plt.xlim(1500, 2000)
plt.legend()