In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from pandas import DataFrame
from pandas import concat
import itertools
import xgboost as xgb

In [2]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a table of lagged and lead columns.

    The input is wrapped in a DataFrame; shifted copies of it are then
    concatenated side by side: first the lags ``t-n_in .. t-1``, then
    ``t .. t+n_out-1``.

    Parameters
    ----------
    data : anything accepted by ``pandas.DataFrame``
        A flat list, a list of rows, a Series, a 2-D array or a DataFrame.
        Rows are time steps, columns are variables.
    n_in : int, default 1
        Number of lagged steps to include.
    n_out : int, default 1
        Number of steps starting at ``t`` to include.
    dropnan : bool, default True
        Drop every row that contains a NaN (this includes the rows made
        incomplete by shifting).

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<k>(t-<i>)``, ``var<k>(t)`` and
        ``var<k>(t+<i>)``, where ``k`` is the 1-based column position in the
        input. The original row index labels are preserved.
    """
    df = DataFrame(data)
    # Count variables on the constructed frame rather than on the raw input:
    # this is correct for a list of rows and for 1-D Series/arrays, where
    # ``data.shape[1]`` does not exist.
    n_vars = df.shape[1]

    cols, names = [], []
    # Lagged inputs, oldest first: t-n_in, ..., t-1.
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # Current and future steps: t, t+1, ..., t+n_out-1.
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]

    agg = concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg = agg.dropna()
    return agg

def mean_percent_error(y_pred, y_true):
    """Mean absolute percentage error, returned as a fraction (not x100).

    Computes ``sum(|y_pred - y_true| / |y_true|) / y_true.shape[0]``.

    The denominator uses ``|y_true|`` so that negative targets contribute a
    positive relative error instead of cancelling out positive ones.

    Parameters
    ----------
    y_pred, y_true : array-like with a ``shape`` attribute
        Predictions and ground truth. They are combined with ordinary
        broadcasting, so pass them with identical shapes.

    Returns
    -------
    The summed relative error divided by the number of rows of ``y_true``.
    Zero entries in ``y_true`` are not guarded against and yield inf/nan.
    """
    relative_error = np.abs(y_pred - y_true) / np.abs(y_true)
    return np.sum(relative_error) / y_true.shape[0]

In [3]:
# Load the raw table, build lagged features and the target, then split.
PATH = './all_data.csv'
N_LAGS = 24      # number of past time steps used as features for each sample
SPLIT_SEED = 1   # fixed seed so the shuffled split is reproducible across runs

aqi_data = pd.read_csv(PATH)
aqi_data = aqi_data.drop(['Unnamed: 0', 'date'], axis=1)
feature_columns = aqi_data.columns
target_columns = ['PM2_5', 'PM_10', 'SO2', 'NO2', 'O3', 'CO']

# Features: every column at lags t-N_LAGS .. t-1 (the "(t)" columns are excluded).
aqi_data_supervised = series_to_supervised(aqi_data, N_LAGS, 1, True)
X = aqi_data_supervised[aqi_data_supervised.columns[0: len(feature_columns) * N_LAGS]]

# Target: the first entry of target_columns ('PM2_5') at time t, i.e. the first
# of the trailing len(target_columns) "(t)" columns of the target-only frame.
aqi_data_y = series_to_supervised(aqi_data[target_columns], N_LAGS, 1, True)
target_at_t = aqi_data_y.columns[-len(target_columns)]
# X and aqi_data_y each went through their own dropna(); selecting by X.index
# keeps the two row-aligned even when a NaN in a non-target column removed
# rows from X only.
y = aqi_data_y.loc[X.index, [target_at_t]]

# NOTE(review): shuffling rows whose lag windows overlap places near-duplicate
# samples on both sides of the split, so test scores are probably optimistic.
# Consider a chronological split (shuffle=False) - confirm the intended protocol.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.9, shuffle=True, random_state=SPLIT_SEED
)



In [4]:
# Min-max scale features and target. Both scalers learn their ranges from the
# training portion only and are then applied to the train and test portions.
scaler_X = MinMaxScaler().fit(train_X)
scaler_y = MinMaxScaler().fit(train_y)
train_X, test_X = scaler_X.transform(train_X), scaler_X.transform(test_X)
train_y, test_y = scaler_y.transform(train_y), scaler_y.transform(test_y)

In [5]:
# Candidate regressors. Only the lists chained into `models` below are fitted.
N_JOBS = 5       # worker count passed to estimators that accept n_jobs
MAX_DEPTH = 20   # tree depth shared by all tree ensembles

estimators = list(range(50, 101, 50))
models1 = [('RandomForest %d' % estimator, RandomForestRegressor(n_estimators=estimator, n_jobs=N_JOBS, random_state=1, max_depth=MAX_DEPTH)) for estimator in estimators]
models2 = [('GradientBoosting %d' % estimator, GradientBoostingRegressor(n_estimators=estimator, random_state=1, max_depth=MAX_DEPTH)) for estimator in estimators]
model3 = [('LinearRegression', LinearRegression(n_jobs=N_JOBS))]
# XGBoost documents max_depth as >= 0, so the previous -1 would be rejected
# when the model is fitted; use the same depth as the other ensembles.
models4 = [('XGBoost %d' % estimator, xgb.XGBRegressor(n_jobs=N_JOBS, n_estimators=estimator, random_state=1, max_depth=MAX_DEPTH)) for estimator in estimators]

# Add models1 / models2 / models4 to this list to train them as well.
models = list(itertools.chain.from_iterable([model3]))
for model in models:
    # Each entry is (display name, estimator); estimators take a 1-D target.
    model[1].fit(train_X, np.ravel(train_y))

In [6]:
# Score every fitted model on the held-out set in the original target units
# and write one row per model to ./a.csv.
metric_columns = ['mse', 'mae', 'r2', 'mape']

# Undo the target scaling exactly once, into a separate variable. Doing this
# inside the loop and overwriting `test_y` would inverse-transform the targets
# again for every further model and on every re-run of this cell.
test_y_original = scaler_y.inverse_transform(test_y)

metric_rows, model_names = [], []
for model in models:
    pred = model[1].predict(test_X)
    # Predictions are 1-D; the scaler expects a column vector.
    pred = scaler_y.inverse_transform(pred.reshape(-1, 1))
    model_names.append(model[0])
    metric_rows.append({
        'mse': mean_squared_error(test_y_original, pred),
        'mae': mean_absolute_error(test_y_original, pred),
        'mape': mean_percent_error(pred, test_y_original),
        'r2': r2_score(test_y_original, pred),
    })

metric = pd.DataFrame(metric_rows, index=model_names, columns=metric_columns)
metric.to_csv('./a.csv', sep=',', header=True, index=True)