### Import libraries

In [28]:
import pandas as pd
import numpy as np
import sklearn
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import os
from matplotlib import pyplot as plt
from sklearn.linear_model import BayesianRidge, ARDRegression
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingRegressor

#### Load data

In [2]:
# Folder holding the week-one files, and the three CSVs read from it.
DATA_PATH = "data/week-one/"
train_filename, test_filename, macro_filename = "X_train.csv", "X_test.csv", "macro.csv"


def _read_timestamped_csv(filename):
    """Read one CSV from DATA_PATH, parsing its 'timestamp' column as datetimes."""
    return pd.read_csv(os.path.join(DATA_PATH, filename), parse_dates=['timestamp'])


data = _read_timestamped_csv(train_filename)
test = _read_timestamped_csv(test_filename)
macro = _read_timestamped_csv(macro_filename)

In [3]:
# Sanity check: (rows, columns) of the train, test and macro frames just loaded.
print(data.shape, test.shape, macro.shape)

(21329, 292) (9142, 291) (2484, 100)


## Data Preprocessing

Some useful functions used for the preprocessing of the data

In [4]:
def reduce(data, threshold=0.9):
    """Drop one column out of every pair of strongly correlated columns.

    The absolute correlation matrix of `data` is computed and, for each
    column, compared against the columns that precede it. A column is dropped
    when its absolute correlation with any earlier column exceeds `threshold`.

    Args:
        data: DataFrame whose columns are to be pruned.
        threshold: Absolute-correlation cut-off above which a column is dropped.

    Returns:
        A new DataFrame without the dropped columns; `data` is not modified.
    """
    correlations = data.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself. The builtin `bool` is used because
    # the `np.bool` alias no longer exists in current NumPy releases.
    upper_mask = np.triu(np.ones(correlations.shape, dtype=bool), k=1)
    upper = correlations.where(upper_mask)
    to_drop = [
        column for column in upper.columns
        if (upper[column] > threshold).any()
    ]
    return data.drop(columns=to_drop)

def inpute(data, feature, verbose=False, **kwargs):
    """Fill the missing values of one column with decision-tree predictions.

    A DecisionTreeRegressor is fitted on the rows where `feature` is present,
    using every other non-object column (with its own gaps replaced by the
    column median) as predictors. Its predictions are then written into the
    rows where `feature` is missing.

    Args:
        data: DataFrame containing `feature`; it is modified in place.
        feature: Name of the column to impute.
        verbose: When true, print the name of the imputed column.
        **kwargs: Forwarded unchanged to DecisionTreeRegressor.

    Returns:
        The same `data` object, with the gaps in `feature` filled.
    """
    missing = data[feature].isna()
    if not missing.any():
        # Nothing to impute; predicting on zero rows would only raise.
        return data

    predictors = data.drop(columns=[feature]).select_dtypes(exclude=['object'])
    predictors = predictors.fillna(predictors.median())

    model = DecisionTreeRegressor(**kwargs)
    model.fit(predictors[~missing], data.loc[~missing, feature])
    if verbose:
        print("Feature: %s" % feature)

    # A boolean mask assigns the predictions positionally, in row order, in a
    # single vectorised write (and stays correct if index labels repeat).
    data.loc[missing, feature] = model.predict(predictors[missing])
    return data

Separate out target and features and exclude categorical features from training set

Add macro data (currently disabled: the merge in the cell below is commented out)

In [5]:
# macro = macro.fillna(macro.median())
# X_all = pd.merge_ordered(data, macro, on='timestamp', how='left')

In [6]:
# Pull the target out of the training frame, then remove the identifier and
# the target from `data` so neither can end up among the features.
y = data["price_doc"].copy()
data.drop(columns=['id', 'price_doc'], inplace=True)

# Macro indicators are not merged in at this stage.
# Take only numeric data for now: object columns are excluded and the raw
# timestamp is not used as a feature.
X = data.select_dtypes(exclude=['object']).drop(columns=["timestamp"])

Reduce dimensionality by removing strongly correlated features

In [7]:
# Drop every column whose absolute correlation with an earlier column exceeds 0.95.
X = reduce(X, threshold=0.95)

#### Add categorical features using one-hot encoding

Some features would be best described with ordinal encoding

In [8]:
# Ordinal scale for the ecology rating; 'no data' becomes a missing value.
ecology_levels = {'excellent': 4, 'good': 3, 'satisfactory': 2, 'poor': 1, 'no data': np.nan}
data['ecology'] = data['ecology'].map(ecology_levels)
X['ecology'] = data['ecology']

In [9]:
# Every remaining object column except the two multi-valued ones is treated as
# a yes/no flag and encoded as 1/0, both in `data` and in the feature matrix.
yes_no_frame = data.select_dtypes(include=['object']).drop(columns=['sub_area', 'product_type'])
yes_no_codes = {'yes': 1, 'no': 0}
for column in yes_no_frame.columns:
    encoded = yes_no_frame[column].map(yes_no_codes)
    data[column] = encoded
    X[column] = encoded

In [10]:
# One-hot encode the object columns still left in `data` and append them to X.
categorical_dummies = pd.get_dummies(data.select_dtypes(include=['object']))
X = pd.concat([X, categorical_dummies], axis=1)

In [11]:
# Feature-matrix size after the categorical columns were added.
print("Data shape:", X.shape)

Data shape: (21329, 321)


Use a basic decision tree regressor to predict missing values in the data

In [12]:
# Impute the columns that contain gaps, one at a time; each call returns the
# frame with that column filled, so later columns see the earlier imputations.
columns_with_gaps = X.columns[X.isna().any()]
for column in columns_with_gaps:
    X = inpute(X, column, min_samples_leaf=100)

In [13]:
# Feature-matrix size after imputation.
print("Data shape:", X.shape)

Data shape: (21329, 321)


Flag outliers with per-feature z-scores (the row filter itself is currently commented out, so no rows are removed)

In [14]:
# Absolute z-score of every value, computed column by column.
z = pd.DataFrame({
    column: abs(stats.zscore(X[column]))
    for column in X.columns
})

In [15]:
# The outlier filter on the next line is disabled, so X is printed unchanged.
# X = X[~((z > 5).sum(axis=1) > 5)]
print("Data shape:", X.shape)

Data shape: (21329, 321)


## Model Selection

In [35]:
from weekone_models import models

# models = {
#     "ridge": {
#         'model': sklearn.linear_model.Ridge(),
#         'param_grid': {
#             'ridge__alpha': np.logspace(2, 6, 10)
#         }
#     },
#     # "LinearRegression": LinearRegression(),
#     "lasso": {
#         'model': sklearn.linear_model.Lasso(),
#         'param_grid': {
#             'lasso__alpha': np.logspace(-5, 1, 10)
#         }
#     },
#     "elasticnet": {
#         'model': sklearn.linear_model.ElasticNet(),
#         'param_grid': {
#             'elasticnet__alpha': np.logspace(-5, 1, 10)
#         }
#     },
# #     "linearsvr": {
# #         'model': sklearn.svm.LinearSVR(),
# #         'param_grid': {
# #             'linearsvr__C': np.logspace(-5, 0, 5)
# #         }
# #     },
#     # "BayesianRidge": BayesianRidge(),
#     # # "ARDRegression": ARDRegression(),
#     # "NuSVR": NuSVR(),
#     # # "KernelRidge": KernelRidge(),
#     # # "GaussianProcessRegressor": GaussianProcessRegressor(),
#     "decisiontreeregressor": {
#         'model': DecisionTreeRegressor(),
#         'param_grid' : {
#             'decisiontreeregressor__max_depth': np.logspace(0, 1.3, 10, dtype=int),
#             'decisiontreeregressor__min_samples_leaf': np.logspace(2, 3, 5, dtype=int)
#         }
#     },
#     "adaboostregressor": {
#         'model': sklearn.ensemble.AdaBoostRegressor(DecisionTreeRegressor(max_depth=3)),
#         'param_grid': {
#             'adaboostregressor__n_estimators': np.logspace(0, 3, 10, dtype=int)
#         }
#     },
# #     "mlpregressor": {
# #         'model': MLPRegressor(),
# #         'param_grid': {
# #             'mlpregressor__alpha': np.logspace(-5,-1,10)
# #         }
# #     }
#     # "PassiveAggressiveRegressor": PassiveAggressiveRegressor()
# }

In [30]:
# Tune each candidate model with a randomised search over its parameter grid.
# Every candidate is wrapped in a pipeline that standardises the features
# first, and the target is fitted in log1p space.
for model in models:
    print("Performing search for %s model" % model)
    pipeline = make_pipeline(StandardScaler(), models[model]['model'])

    param_grid = models[model]['param_grid']

    gscv = RandomizedSearchCV(
        pipeline, param_grid, n_jobs=-1,
        scoring='neg_root_mean_squared_error', verbose=1, cv=5,
        # With a single scoring metric `refit` is simply a flag; the previous
        # value 'best_index_' is not the name of a scorer.
        refit=True,
        # Fixed seed so the sampled candidates are the same on every run.
        random_state=0,
    )
    gscv.fit(X, np.log1p(y.loc[X.index]))
    # Store the refitted pipeline and its mean cross-validated score
    # (negated RMSE, so closer to zero is better) next to the model spec.
    models[model]['best_estimator'] = gscv.best_estimator_
    models[model]['best_score'] = gscv.best_score_
models

Performing search for ridge model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.5s finished


Performing search for lasso model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.4min finished
  positive)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Performing search for elasticnet model
Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

## Combine Models

In [18]:
# Only models whose search actually finished carry a tuned estimator and a
# score; the others are left out instead of raising KeyError.
tuned_names = [name for name in models if 'best_estimator' in models[name]]
if not tuned_names:
    raise ValueError("No tuned models available: run the hyper-parameter search first")

# Average the tuned regressors, weighting each by the inverse of its absolute
# cross-validated score so that lower-error models count for more. The final
# step of each tuned pipeline is the regressor itself; scaling is applied once
# for the whole ensemble.
model = make_pipeline(
    StandardScaler(),
    VotingRegressor(
        estimators=[
            (name, models[name]['best_estimator'].steps[-1][1])
            for name in tuned_names
        ],
        weights=[1 / abs(models[name]['best_score']) for name in tuned_names],
        n_jobs=-1
    )
)

In [19]:
# Display the (still unfitted) ensemble pipeline.
model

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('votingregressor',
                 VotingRegressor(estimators=[('ridge',
                                              Ridge(alpha=46415.888336127726,
                                                    copy_X=True,
                                                    fit_intercept=True,
                                                    max_iter=None,
                                                    normalize=False,
                                                    random_state=None,
                                                    solver='auto', tol=0.001)),
                                             ('lasso',
                                              Lasso(alpha=4.641588833612782e-05,
                                                    copy_X=True,
                                                    f...
                

In [20]:
# Fit the ensemble on the full training set; the target is log1p(price_doc).
model.fit(X,np.log1p(y.loc[X.index]))

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('votingregressor',
                 VotingRegressor(estimators=[('ridge',
                                              Ridge(alpha=46415.888336127726,
                                                    copy_X=True,
                                                    fit_intercept=True,
                                                    max_iter=None,
                                                    normalize=False,
                                                    random_state=None,
                                                    solver='auto', tol=0.001)),
                                             ('lasso',
                                              Lasso(alpha=4.641588833612782e-05,
                                                    copy_X=True,
                                                    f...
                

In [21]:
# Fitted sub-estimators held by the voting step (second step of the pipeline).
model.steps[1][1].estimators_

[Ridge(alpha=46415.888336127726, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=False, random_state=None, solver='auto', tol=0.001),
 Lasso(alpha=4.641588833612782e-05, copy_X=True, fit_intercept=True,
       max_iter=1000, normalize=False, positive=False, precompute=False,
       random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
 ElasticNet(alpha=4.641588833612782e-05, copy_X=True, fit_intercept=True,
            l1_ratio=0.5, max_iter=1000, normalize=False, positive=False,
            precompute=False, random_state=None, selection='cyclic', tol=0.0001,
            warm_start=False),
 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       

In [22]:
# RMSE in log1p(price) space on the same rows the model was fitted on, so this
# is a training error rather than an estimate of generalisation.
# The root is taken explicitly because the `squared=False` shortcut is
# deprecated and later removed in recent scikit-learn releases; arguments are
# passed in the documented (y_true, y_pred) order.
np.sqrt(
    mean_squared_error(
        np.log1p(y.loc[X.index]),
        model.predict(X)
    )
)

0.47701906241468833

## Run predictions on test data

In [23]:
# Encode the test set the same way as the training features. The work is done
# on a copy so the raw `test` frame stays untouched and this cell can be
# re-run (re-mapping already-encoded columns would turn them into NaN).
test_encoded = test.copy()
test_encoded['ecology'] = test_encoded['ecology'].map(
    {'excellent':4,'good':3,'satisfactory':2,'poor':1,'no data':np.nan})
yes_no_columns = test_encoded.select_dtypes(include=['object']).drop(
    columns=['sub_area', 'product_type']).columns
for column in yes_no_columns:
    test_encoded[column] = test_encoded[column].map({'yes':1, 'no':0})
X_predict = pd.concat(
    [test_encoded, pd.get_dummies(test_encoded.select_dtypes(include=['object']))],
    axis=1)

# Align with the training design matrix: columns the test set lacks (e.g.
# dummy levels absent from it) are added as zeros, then columns are put in the
# training order and anything the model was not trained on is discarded.
for column in X.columns:
    if column not in X_predict:
        X_predict[column] = 0
X_predict = X_predict[X.columns]

# Fill the remaining gaps with the same tree-based imputation as for training
# (the imputation trees are fitted on the test rows themselves).
for column in X_predict.columns[X_predict.isna().any()]:
    X_predict = inpute(X_predict, column, min_samples_leaf=100)

In [24]:
# The model predicts log1p(price); expm1 converts back to prices before the
# result is paired with the test ids.
log_price_predictions = model.predict(X_predict)
price_frame = pd.DataFrame(np.expm1(log_price_predictions), columns=["price_doc"])
predictions = pd.concat([test['id'], price_frame], axis=1)

In [25]:
# Write the id / price_doc table next to the input data, without the index column.
predictions.to_csv(os.path.join(DATA_PATH, "predictions.csv"), index=False)

In [26]:
# Replace gaps in the macro indicators with each column's median.
# `numeric_only=True` restricts the medians to numeric columns: pandas >= 2.0
# no longer skips non-numeric columns silently and raises instead.
# NOTE(review): assumes any non-numeric macro columns (including 'timestamp')
# do not need filling here; confirm against macro.csv.
macro = macro.fillna(macro.median(numeric_only=True))

In [27]:
# Left-join the macro indicators onto `data` by timestamp. At this point `data`
# has already lost 'id' and 'price_doc' and had its categorical columns encoded
# by the cells above.
# NOTE(review): X_all is not referenced again in the visible notebook; confirm
# whether it is still needed.
X_all = pd.merge_ordered(data, macro, on='timestamp', how='left')

In [41]:
from sklearn.ensemble import GradientBoostingRegressor

In [53]:
# Cross-validated search over gradient boosting, scored by (negated) RMSE.
# NOTE(review): per the scikit-learn documentation `alpha` only affects the
# 'huber' and 'quantile' losses, so with the default loss both candidates are
# presumably equivalent; confirm which hyper-parameter was meant to be tuned.
gscv = RandomizedSearchCV(
        GradientBoostingRegressor(), {'alpha' :[0.8,0.9]}, n_jobs=-1,
        scoring='neg_root_mean_squared_error', verbose=1, cv=5,
        # With a single scoring metric `refit` is simply a flag.
        refit=True
)
# The evaluation and prediction cells below call `model.predict`. Binding
# `model` to the search object (instead of a second search that is never
# fitted) makes those calls use the refitted best estimator once `gscv` has
# been fitted.
model = gscv

In [None]:
# Run the search on the full training set; the target is log1p(price_doc).
gscv.fit(X,np.log1p(y.loc[X.index]))

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.7min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.6min finished


In [None]:
# Mean cross-validated score of the best candidate (negated RMSE, per the
# `scoring` passed to the search).
gscv.best_score_

In [45]:
# RMSE in log1p(price) space on the training rows themselves (a training
# error, not a held-out estimate).
# The root is taken explicitly because the `squared=False` shortcut is
# deprecated and later removed in recent scikit-learn releases; arguments are
# passed in the documented (y_true, y_pred) order.
np.sqrt(
    mean_squared_error(
        np.log1p(y.loc[X.index]),
        model.predict(X)
    )
)

0.446725374234969

In [46]:
# Back-transform the log1p-space predictions to prices and attach the test ids.
predicted_prices = np.expm1(model.predict(X_predict))
predictions = pd.concat(
    [test['id'], pd.DataFrame(predicted_prices, columns=["price_doc"])],
    axis=1,
)

In [47]:
# Written to the same path as the earlier export, so that file is overwritten.
predictions.to_csv(os.path.join(DATA_PATH, "predictions.csv"), index=False)