In [1]:
import math
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import PredefinedSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.plots import plot_convergence
from skopt.space import Real, Integer, Categorical

In [2]:
# Notebook-wide configuration constants.

# Presumably the fraction of the training data held out for validation —
# TODO confirm against the cell that consumes it.
VALIDATION_PORTION = 0.2

# Column names used as model inputs (assumed to be columns of the datasets
# returned by model_pipeline — verify there).
FEATURES = [
    'PT08.S1(CO)',
    'PT08.S2(NMHC)',
    'PT08.S3(NOx)',
    'PT08.S4(NO2)',
    'PT08.S5(O3)',
    'T',
    'RH',
    'AH',
]

# Candidate target columns.
TARGET = [
    'NMHC(GT)',
    'CO(GT)',
    'C6H6(GT)',
    'NOx(GT)',
    'NO2(GT)',
]

# Entries of TARGET that are to be left out (reason not visible in this notebook
# section — TODO document why).
TARGET_EXCLUSION = ['NMHC(GT)']

# Data Preparation

In [3]:
from model_pipeline import get_train_test, MissingValueTransformer, \
    CreateFeaturesTransformer, select_non_nan_rows, preprocess_data

In [5]:
# Obtain the train and test datasets from the project-local model_pipeline
# helper. How the data is loaded and split is defined in get_train_test and is
# not visible here (presumably both are pandas DataFrames — confirm in
# model_pipeline).
train_dataset, test_dataset = get_train_test()

In [6]:
# Persist the train/test datasets (as obtained before preprocess_data is
# applied) to CSV, keeping the index as the first column.
#
# pandas' to_csv does not create missing parent directories and raises OSError
# when they are absent, so make sure the output folders exist first. This keeps
# the cell working under Restart & Run All on a fresh checkout.
train_csv_path = Path('train_data/train_data.csv')
test_csv_path = Path('test_data/test_data.csv')
for csv_path in (train_csv_path, test_csv_path):
    csv_path.parent.mkdir(parents=True, exist_ok=True)

train_dataset.to_csv(train_csv_path, index=True)
test_dataset.to_csv(test_csv_path, index=True)

In [7]:
# Run the project's preprocessing on both datasets. 'CO(GT)' (one of the TARGET
# columns) is passed as the third argument — presumably the target column to
# predict; confirm against preprocess_data in model_pipeline.
X_train, Y_train, X_test, Y_test = preprocess_data(
    train_dataset,
    test_dataset,
    'CO(GT)',
)

In [9]:
# Put features and target back side by side (column-wise concat, aligned on the
# index) so each preprocessed dataset can be stored as a single table.
train_cleaned = pd.concat([X_train, Y_train], axis=1)
test_cleaned = pd.concat([X_test, Y_test], axis=1)

# pandas' to_csv does not create missing parent directories and raises OSError
# when they are absent, so make sure the output folders exist before writing.
train_cleaned_csv_path = Path('train_data/train_data_cleaned.csv')
test_cleaned_csv_path = Path('test_data/test_data_cleaned.csv')
for cleaned_csv_path in (train_cleaned_csv_path, test_cleaned_csv_path):
    cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)

# The index is written as the first column of each file.
train_cleaned.to_csv(train_cleaned_csv_path, index=True)
test_cleaned.to_csv(test_cleaned_csv_path, index=True)