## Import libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import os
from matplotlib import pyplot as plt
from sklearn.linear_model import BayesianRidge, ARDRegression
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingRegressor

## Load data

In [2]:
# Relative directory holding the input CSV files, and the name of each file.
DATA_PATH = "data/week-one/"
train_filename = "X_train.csv"
test_filename = "X_test.csv"
macro_filename = "macro.csv"

# All three files are read the same way: the `timestamp` column is parsed
# into datetimes while loading.
data, test, macro = (
    pd.read_csv(os.path.join(DATA_PATH, filename), parse_dates=['timestamp'])
    for filename in (train_filename, test_filename, macro_filename)
)

In [3]:
# Show the (rows, columns) shape of the train, test and macro frames.
print(*(frame.shape for frame in (data, test, macro)))

(21329, 292) (9142, 291) (2484, 100)


## Data Preprocessing

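Helper functions used to preprocess the data: `reduce` drops strongly correlated features, and `inpute` fills the missing values of a column with the predictions of a decision tree regressor.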

In [4]:
def reduce(data, threshold=0.9):
    """Drop features that are strongly correlated with an earlier feature.

    The absolute pairwise correlation matrix of ``data`` is computed and only
    its strict upper triangle is inspected, so each pair of columns is looked
    at once and a column is compared only with the columns that precede it.
    Every column whose absolute correlation with at least one preceding
    column is greater than ``threshold`` is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are the candidate features.
    threshold : float, default 0.9
        Absolute correlation above which the later column of a pair is
        dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns; ``data`` is not modified.
    """
    correlations = data.corr().abs()
    # Strict upper triangle (k=1 excludes the diagonal). The builtin ``bool``
    # is used because the ``np.bool`` alias no longer exists in recent NumPy.
    upper_mask = np.triu(np.ones(correlations.shape, dtype=bool), k=1)
    upper = correlations.where(upper_mask)
    to_drop = [
        column for column in upper.columns
        if (upper[column] > threshold).any()
    ]
    return data.drop(columns=to_drop)

def inpute(data, feature, verbose=False, **kwargs):
    """Fill the missing values of one column with decision-tree predictions.

    A ``DecisionTreeRegressor`` is fitted on the rows where ``feature`` is
    present, using every other non-object column as a predictor (missing
    predictor values are replaced by the column median). Its predictions for
    the rows where ``feature`` is missing are written back into ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``. It is modified in place.
    feature : str
        Name of the column whose missing values are filled.
    verbose : bool, default False
        If true, print the feature name once the model has been fitted.
    **kwargs
        Forwarded to ``DecisionTreeRegressor``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.

    Raises
    ------
    ValueError
        If every value of ``feature`` is missing, so no model can be fitted.
    """
    target = data[feature]
    missing = target.isna()

    if not missing.any():
        # Nothing to fill; fitting and then predicting on an empty frame
        # would only raise inside scikit-learn.
        return data
    if missing.all():
        raise ValueError(
            "Cannot impute %r: the column has no observed values" % feature)

    # ``drop`` already returns a new frame, so no explicit copy is needed.
    predictors = data.drop(columns=[feature]).select_dtypes(exclude=['object'])
    predictors = predictors.fillna(predictors.median())

    model = DecisionTreeRegressor(**kwargs)
    model.fit(predictors[~missing], target[~missing])
    if verbose:
        print("Feature: %s" % feature)

    # One boolean-mask assignment: predictions come back in row order, which
    # is the order in which the mask selects the rows to fill.
    data.loc[missing, feature] = model.predict(predictors[missing])
    return data

Separate the target (`price_doc`) from the features, then exclude the categorical (object-typed) features and the timestamp from the training set.

In [5]:
# Split off the target exactly once. The guard makes the cell re-runnable:
# after the first run `price_doc` has already been removed from `data`, so a
# second run keeps the existing `y` and `data` instead of raising a KeyError.
if "price_doc" in data.columns:
    y = data["price_doc"].copy()
    data.drop(columns=['id', 'price_doc'], inplace=True)

# The macro data is not merged in yet; earlier attempt kept for reference:
# self.X = pd.merge_ordered(
#     self.data.copy(), self.macro.copy(), on='timestamp', how='left')
# self.X.fillna(self.X.median(), inplace=True)

# Take only numeric data for now: object columns are excluded and the parsed
# timestamp is dropped. The copy keeps X independent of `data`, because the
# imputation step later writes into X in place.
X = (
    data.copy()
    .select_dtypes(exclude=['object'])
    .drop(columns=["timestamp"])
)

Reduce dimensionality by removing strongly correlated features

In [6]:
# Drop every feature whose absolute correlation with an earlier column exceeds 0.95.
X = reduce(X, threshold=0.95)

Use a basic decision tree regressor to predict missing values in the data

In [7]:
# Fill each incomplete column in turn. The list of columns to visit is
# computed once, before any of them is filled.
for column in X.columns[X.isna().any()]:
    X = inpute(X, column, min_samples_leaf=100)

In [8]:
# Report the shape of the feature matrix after the preprocessing steps above.
print("Data shape:", X.shape)

Data shape: (21329, 161)


## Model Selection

In [11]:
# Candidate models for the grid search. Each entry maps a display name to
#   'model'      - an unfitted estimator, and
#   'param_grid' - a flat mapping from pipeline parameter name to candidate
#                  values. Names use the `<step>__<parameter>` form, where the
#                  step name is the lower-cased estimator class name that
#                  `make_pipeline` assigns.
# Commented-out entries are configurations that are currently disabled.
models = {
#     "ridge": {
#         'model': sklearn.linear_model.Ridge(),
#         'param_grid': {
#             'ridge__alpha': np.logspace(2, 6, 20)
#         }
#     },
#     # "LinearRegression": LinearRegression(),
#     "lasso": {
#         'model': sklearn.linear_model.Lasso(),
#         'param_grid': {
#             'lasso__alpha': np.logspace(-5, 1, 20)
#         }
#     },
#     "elasticnet": {
#         'model': sklearn.linear_model.ElasticNet(),
#         'param_grid': {
#             'elasticnet__alpha': np.logspace(-5, 1, 20)
#         }
#     },
#     "linearsvr": {
#         'model': sklearn.svm.LinearSVR(),
#         'param_grid': {
#             'linearsvr__C': np.logspace(-5, 0, 10)
#         }
#     },
    # "BayesianRidge": BayesianRidge(),
    # # "ARDRegression": ARDRegression(),
    # "NuSVR": NuSVR(),
    # # "KernelRidge": KernelRidge(),
    # # "GaussianProcessRegressor": GaussianProcessRegressor(),
    "DecisionTreeRegressor": {
        'model': DecisionTreeRegressor(),
        'param_grid': {
            # The grid must map the parameter name directly to its candidate
            # values; nesting it under the step name makes GridSearchCV see a
            # dict where it expects a sequence. Casting the log-spaced values
            # to int produces repeated depths, so duplicates are removed to
            # avoid fitting the same candidate several times.
            'decisiontreeregressor__max_depth': np.unique(
                np.logspace(0, 2, 20, dtype=int))
        }
    },
    # # "MLPRegressor": MLPRegressor(),
    # "PassiveAggressiveRegressor": PassiveAggressiveRegressor()
}

In [12]:
# Run a 5-fold grid search for every configured model. Each estimator is
# wrapped in a pipeline behind a StandardScaler, and the target is
# log1p-transformed, so the (negated) RMSE score is computed on the log scale.
for model_name, spec in models.items():
    print("Performing search for %s model" % model_name)
    pipeline = make_pipeline(StandardScaler(), spec['model'])

    param_grid = spec['param_grid']

    gscv = GridSearchCV(
        pipeline, param_grid, n_jobs=-1,
        scoring='neg_root_mean_squared_error', verbose=1, cv=5,
        # With a single scoring metric `refit` is a plain on/off switch; a
        # string value is only meaningful as a scorer name in multi-metric
        # searches.
        refit=True
    )
    gscv.fit(X, np.log1p(y))
    # Keep the refitted pipeline and its mean cross-validated score next to
    # the configuration that produced them.
    spec['best_estimator'] = gscv.best_estimator_
    spec['best_score'] = gscv.best_score_
models

Performing search for ridge model
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Performing search for lasso model
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.8min finished


Performing search for elasticnet model
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Performing search for linearsvr model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.1min finished


{'ridge': {'model': Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='auto', tol=0.001),
  'param_grid': {'ridge__alpha': array([1.00000000e+02, 1.62377674e+02, 2.63665090e+02, 4.28133240e+02,
          6.95192796e+02, 1.12883789e+03, 1.83298071e+03, 2.97635144e+03,
          4.83293024e+03, 7.84759970e+03, 1.27427499e+04, 2.06913808e+04,
          3.35981829e+04, 5.45559478e+04, 8.85866790e+04, 1.43844989e+05,
          2.33572147e+05, 3.79269019e+05, 6.15848211e+05, 1.00000000e+06])},
  'best_estimator': Pipeline(memory=None,
           steps=[('standardscaler',
                   StandardScaler(copy=True, with_mean=True, with_std=True)),
                  ('ridge',
                   Ridge(alpha=33598.18286283781, copy_X=True, fit_intercept=True,
                         max_iter=None, normalize=False, random_state=None,
                         solver='auto', tol=0.001))],
           verbose=False),
  'best_score': 