In [1]:
import sys
# Add the parent directory to the module search path. The `src.*` imports in the
# next cell presumably resolve through this entry (the data paths used later are
# also relative to `..`) — verify against the repository layout.
sys.path.append('..')

In [2]:
from typing import Optional, List
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from itertools import product

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from tqdm.notebook import tqdm, trange

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

from myutils.stats import corrcoef, acfunc
from myutils.matplotlib import matplotlib_seaborn_style

from src.missing_values import impute_average
from src.pca import ImputePCA
from src.rzd import PlacePriceModel

# Project helper from `myutils.matplotlib`; presumably configures the global
# matplotlib/seaborn plotting style for every figure below — TODO confirm.
matplotlib_seaborn_style()

In [3]:
# Output locations: prediction artefacts, with figures kept in a sub-folder.
results_dir = Path('../data/prediction')
figures_dir = results_dir / 'figures'

# Parent first, then child; an already existing directory is not an error.
for directory in (results_dir, figures_dir):
    directory.mkdir(exist_ok=True)

In [4]:
# Load the observations; `date` is parsed as datetime so the `.dt` accessor is
# available for deriving the weekday number and weekday name columns.
data = pd.read_csv('../data/data.csv', parse_dates=['date'], encoding='utf-8').assign(
    weekday=lambda frame: frame['date'].dt.weekday,
    day_name=lambda frame: frame['date'].dt.day_name(),
)

# Lookup table: weekday number -> weekday name, ordered by weekday number.
weekdays = (
    data[['weekday', 'day_name']]
    .drop_duplicates()
    .sort_values(by='weekday')
    .set_index('weekday')['day_name']
    .to_dict()
)

data.head(2)

Unnamed: 0,date,days,places,price,num,class,train,departure_hour,arrival_hour,places_frac,weekend,short_term,weekday,day_name
0,2021-04-12,1,28.0,3790.0,001А,Купе,001А-Купе,23.916667,7.916667,0.2,0,1,0,Monday
1,2021-04-13,1,52.0,3790.0,001А,Купе,001А-Купе,23.916667,7.916667,0.371429,0,1,1,Tuesday


In [5]:
# Cluster label of every train, as a Series indexed (and sorted) by the file's
# first column.
train_clusters = (
    pd.read_csv(
        '../data/weekends/weekly_avg_pc_clusters.csv',
        index_col=0,
    )['cluster']
    .sort_index()
)

train_clusters.head()

train
001А-Купе    1
002А-Купе    2
005А-СВ      1
006А-СВ      2
701Н-СВ      0
Name: cluster, dtype: int64

In [6]:
# Distinct dates present in the data, in ascending order.
dates = np.sort(data['date'].unique())

dates[:6]

array(['2021-04-12T00:00:00.000000000', '2021-04-13T00:00:00.000000000',
       '2021-04-14T00:00:00.000000000', '2021-04-15T00:00:00.000000000',
       '2021-04-16T00:00:00.000000000', '2021-04-17T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [7]:
# Wide table: one row per (train, date), one column per `days` value, holding
# `places_frac`; combinations absent from `data` come out as NaN.
places = (
    data
    .pivot(
        index=['train', 'date'],
        columns='days',
        values='places_frac',
    )
    .sort_index()
)

places.head()

Unnamed: 0_level_0,days,1,2,3,4,5,6,7,8,9,10,...,80,81,82,83,84,85,86,87,88,89
train,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
001А-Купе,2021-04-12,0.2,0.228571,0.235714,0.514286,0.607143,0.707143,0.735714,0.75,0.778571,0.85,...,,,,,,,,,,
001А-Купе,2021-04-13,0.371429,0.392857,0.4,0.4,0.592857,0.65,0.735714,0.75,0.757143,0.757143,...,,,,,,,,,,
001А-Купе,2021-04-14,0.65,0.735714,0.757143,0.764286,0.771429,0.821429,0.85,0.9,,0.907143,...,,,,,,,,,,
001А-Купе,2021-04-15,0.521429,0.664286,0.742857,0.75,0.764286,0.764286,0.792857,0.807143,0.814286,0.835714,...,,,,,,,,,,
001А-Купе,2021-04-16,0.342857,0.457143,0.521429,0.585714,0.65,0.685714,0.685714,0.742857,0.807143,0.857143,...,,,,,,,,,,


In [8]:
class Regressor:
    """Multi-target regression built from independent single-target models.

    One fresh ``base_regressor()`` instance is fitted for every
    (stratum, target column) pair. Features are passed through the project
    helper ``impute_average`` before fitting and before predicting.
    """

    def __init__(self, base_regressor=DecisionTreeRegressor):
        # Zero-argument callable producing an unfitted model with fit/predict.
        self.base_regressor = base_regressor

        # Both are set by `fit` from the shapes of its inputs.
        self.n_features = None
        self.n_targets = None

        # Maps (stratum, target index) -> fitted model; filled by `fit`.
        self.regressors = None

    @staticmethod
    def _resolve_strata(strata, n_rows):
        """Return `strata`, or a single all-zero stratum when none is given."""
        return np.zeros(n_rows) if strata is None else strata

    def fit(self, x: np.ndarray, y: np.ndarray, strata: Optional[np.ndarray] = None) -> 'Regressor':
        """Fit one model per (stratum, target column).

        :param x: 2-D feature matrix, one row per sample
        :param y: 2-D target matrix with the same rows; NaN targets are dropped
            (together with their feature rows) separately for every column
        :param strata: optional per-row stratum labels; a single stratum if omitted
        :return: self
        """
        strata = self._resolve_strata(strata, x.shape[0])

        self.n_features = x.shape[1]
        self.n_targets = y.shape[1]

        x = impute_average(x)

        strata_values = np.unique(strata)
        self.regressors = {
            (stratum, target): None
            for stratum in strata_values
            for target in range(self.n_targets)
        }

        for stratum in strata_values:
            rows = strata == stratum
            x_rows = x[rows]

            for target in range(self.n_targets):
                y_column = y[rows, target]
                observed = ~np.isnan(y_column)

                model = self.base_regressor()
                model.fit(x_rows[observed], y_column[observed])
                self.regressors[(stratum, target)] = model

        return self

    def predict(self, x: np.ndarray, strata: Optional[np.ndarray] = None) -> np.ndarray:
        """Predict every target column for every row of `x`.

        :param x: 2-D feature matrix
        :param strata: optional per-row stratum labels; each label must have been
            present in `fit`, because the models are looked up by label
        :return: array of shape (number of rows, number of targets)
        """
        strata = self._resolve_strata(strata, x.shape[0])

        x = impute_average(x)

        y_pred = np.empty((x.shape[0], self.n_targets))

        for stratum in np.unique(strata):
            rows = strata == stratum
            x_rows = x[rows]

            for target in range(self.n_targets):
                y_pred[rows, target] = self.regressors[(stratum, target)].predict(x_rows)

        return y_pred

In [9]:
class PrincipalComponentRegressor:
    """Regression onto principal components followed by reconstruction.

    Fitting projects `x_pca` onto `n_components` components with one
    ``ImputePCA`` per PCA stratum, then fits one ``base_regressor()`` per
    (regressor stratum, component) that predicts the component from
    `x_regressor`. Predicting runs those regressors and maps the predicted
    components back to the PCA feature space.
    """

    def __init__(self, n_components: int = 1, base_regressor=DecisionTreeRegressor):
        self.n_components = n_components
        # Zero-argument callable producing an unfitted model with fit/predict.
        self.base_regressor = base_regressor

        # Width of the PCA input and the fitted PCA per stratum (set in fit_predict_pca).
        self.n_pca_features = None
        self.pcas = None

        # Width of the regressor input and the fitted models keyed by
        # (stratum, component index) (set in fit_regressor).
        self.n_regressor_features = None
        self.regressors = None

    @staticmethod
    def _resolve_strata(strata, n_rows):
        """Return `strata`, or a single all-zero stratum when none is given."""
        return np.zeros(n_rows) if strata is None else strata

    def fit_predict_pca(self, x: np.ndarray, strata: Optional[np.ndarray] = None) -> np.ndarray:
        """Fit one PCA per stratum and return the components of every row of `x`."""
        strata = self._resolve_strata(strata, x.shape[0])

        x = impute_average(x)

        self.n_pca_features = x.shape[1]

        strata_values = np.unique(strata)
        self.pcas = {stratum: None for stratum in strata_values}

        components = np.empty((x.shape[0], self.n_components))

        for stratum in strata_values:
            rows = strata == stratum

            pca = ImputePCA(n_components=self.n_components)
            components[rows] = pca.fit_predict(x[rows])

            self.pcas[stratum] = pca

        return components

    def reconstruct_pca(self, components: np.ndarray, strata: Optional[np.ndarray] = None):
        """Map components back to the PCA feature space using each row's stratum PCA."""
        strata = self._resolve_strata(strata, components.shape[0])

        x = np.empty((components.shape[0], self.n_pca_features))

        for stratum in np.unique(strata):
            rows = strata == stratum
            x[rows] = self.pcas[stratum].reconstruct(components[rows])

        return x

    def fit_regressor(self, x: np.ndarray, y: np.ndarray, strata: Optional[np.ndarray] = None):
        """Fit one model per (stratum, column of `y`); `y` must have `n_components` columns."""
        strata = self._resolve_strata(strata, x.shape[0])

        x = impute_average(x)

        assert y.shape[1] == self.n_components

        self.n_regressor_features = x.shape[1]

        strata_values = np.unique(strata)
        n_outputs = y.shape[1]

        self.regressors = {
            (stratum, component): None
            for stratum in strata_values
            for component in range(n_outputs)
        }

        for stratum in strata_values:
            rows = strata == stratum
            x_rows = x[rows]

            for component in range(n_outputs):
                model = self.base_regressor()
                model.fit(x_rows, y[rows, component])

                self.regressors[(stratum, component)] = model

    def predict_regressor(self, x: np.ndarray, strata: Optional[np.ndarray] = None) -> np.ndarray:
        """Predict all components for every row of `x` with the per-stratum models."""
        strata = self._resolve_strata(strata, x.shape[0])

        x = impute_average(x)

        y_pred = np.empty((x.shape[0], self.n_components))

        for stratum in np.unique(strata):
            rows = strata == stratum
            x_rows = x[rows]

            for component in range(self.n_components):
                y_pred[rows, component] = self.regressors[(stratum, component)].predict(x_rows)

        return y_pred

    def fit(
            self,
            x_pca: np.ndarray,
            x_regressor: np.ndarray,
            strata_pca: Optional[np.ndarray] = None,
            strata_regressor: Optional[np.ndarray] = None,
    ) -> 'PrincipalComponentRegressor':
        """Fit the PCA stage on `x_pca`, then regress its components on `x_regressor`.

        :param x_pca: matrix to decompose, one row per sample
        :param x_regressor: regressor features for the same rows
        :param strata_pca: optional per-row labels selecting the PCA
        :param strata_regressor: optional per-row labels selecting the regressors
        :return: self
        """
        self.fit_regressor(
            x_regressor,
            self.fit_predict_pca(x_pca, strata_pca),
            strata_regressor,
        )

        return self

    def predict(
            self,
            x: np.ndarray,
            strata_pca: Optional[np.ndarray] = None,
            strata_regressor: Optional[np.ndarray] = None,
    ) -> np.ndarray:
        """Predict components from `x` and reconstruct the PCA feature space.

        Stratum labels must have been seen during `fit`, since the fitted PCAs
        and regressors are looked up by label.

        :return: array of shape (number of rows, number of PCA features)
        """
        return self.reconstruct_pca(
            self.predict_regressor(x, strata_regressor),
            strata_pca,
        )

In [10]:
# Experiment setups. Each entry is a triple of positional column-index arrays
# into the `places` matrix, unpacked later as (cols, predict_cols, fit_cols):
#   cols         - columns given to the PCA stage,
#   predict_cols - columns to be predicted (the targets),
#   fit_cols     - columns used as regressor inputs.
# The spec below is (n_pca, n_predict, fit_start, fit_stop), stop exclusive.
setups = [
    (np.arange(n_pca), np.arange(n_predict), np.arange(fit_start, fit_stop))
    for n_pca, n_predict, fit_start, fit_stop in [
        (45, 30, 30, 45),
        (60, 30, 30, 45),
        (60, 30, 30, 60),
        (30, 15, 15, 30),
        (45, 15, 15, 30),
        (60, 15, 15, 30),
    ]
]

In [None]:
# Cross-validated comparison, for every column setup, of
#   * direct regression of the target columns ('regr', 'regr_strata'), and
#   * principal-component regression with 1, 2 and 5 components
#     ('{n}comp', '{n}comp_strata'),
# scored by per-target mean squared error (NaN targets ignored). One panel per setup.

ncols = 3
nrows = (len(setups) + ncols - 1) // ncols  # ceiling division: a panel for every setup

N_SPLITS = 5
COMPONENT_COUNTS = [1, 2, 5]
# Fixed seed: without it both the shuffled fold assignment and the decision trees
# (which break ties between equally good splits at random) change on every run.
RANDOM_STATE = 0


def make_tree():
    """Return an unfitted, seeded decision tree (zero-argument factory for the models)."""
    return DecisionTreeRegressor(random_state=RANDOM_STATE)


def append_fold_scores(records, model, target_cols, y_true, y_pred):
    """Append one record per target column with that column's NaN-ignoring MSE."""
    scores = np.nanmean((y_true - y_pred) ** 2, axis=0)
    assert len(scores) == len(target_cols)

    for col, score in zip(target_cols, scores):
        records.append({'model': model, 'target': col, 'score': score})


# squeeze=False keeps `axes` a 2-D array even for a single row or column.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
fig.set_size_inches(4 * ncols, 4 * nrows)
fig.subplots_adjust(wspace=0.35, hspace=0.5)
axes = axes.flatten()

all_values = places.values
train_names = places.index.get_level_values(0)
all_strata_pca = train_names.values                          # one PCA stratum per train
all_strata_regressor = train_names.map(train_clusters).values  # regressor stratum = train cluster

for (cols, predict_cols, fit_cols), ax in zip(tqdm(setups), axes):
    x_pca = all_values[:, cols]
    x_regressor = all_values[:, fit_cols]
    y = all_values[:, predict_cols]

    # Keep rows with at least two observed values in both the PCA block and the
    # regressor block. Rows whose train has no cluster are dropped as well: a NaN
    # stratum never equals itself, so it would select an empty training subset.
    mask = (
        ((~np.isnan(x_pca)).sum(axis=1) >= 2)
        & ((~np.isnan(x_regressor)).sum(axis=1) >= 2)
        & ~pd.isna(all_strata_regressor)
    )

    x_pca = x_pca[mask]
    x_regressor = x_regressor[mask]
    y = y[mask]

    strata_pca = all_strata_pca[mask]
    strata_regressor = all_strata_regressor[mask]

    records = []

    kfold = KFold(N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

    for train_idx, test_idx in kfold.split(x_pca):
        x_pca_train = x_pca[train_idx]

        x_regressor_train = x_regressor[train_idx]
        x_regressor_test = x_regressor[test_idx]

        y_train = y[train_idx]
        y_test = y[test_idx]

        strata_pca_train = strata_pca[train_idx]
        strata_pca_test = strata_pca[test_idx]

        strata_regressor_train = strata_regressor[train_idx]
        strata_regressor_test = strata_regressor[test_idx]

        # Direct regression of the targets, without and with strata.
        regressor = Regressor(make_tree).fit(x_regressor_train, y_train)
        y_pred = regressor.predict(x_regressor_test)
        append_fold_scores(records, 'regr', predict_cols, y_test, y_pred)

        regressor = Regressor(make_tree).fit(x_regressor_train, y_train, strata_regressor_train)
        y_pred = regressor.predict(x_regressor_test, strata_regressor_test)
        append_fold_scores(records, 'regr_strata', predict_cols, y_test, y_pred)

        # Principal-component regression: the model reconstructs all PCA columns,
        # of which only the target columns are scored.
        for n_components in COMPONENT_COUNTS:
            pc_regressor = PrincipalComponentRegressor(n_components, make_tree).fit(
                x_pca_train, x_regressor_train,
            )
            y_pred = pc_regressor.predict(x_regressor_test)[:, predict_cols]
            append_fold_scores(records, f'{n_components}comp', predict_cols, y_test, y_pred)

            pc_regressor = PrincipalComponentRegressor(n_components, make_tree).fit(
                x_pca_train, x_regressor_train, strata_pca_train, strata_regressor_train,
            )
            y_pred = pc_regressor.predict(
                x_regressor_test, strata_pca_test, strata_regressor_test,
            )[:, predict_cols]
            append_fold_scores(records, f'{n_components}comp_strata', predict_cols, y_test, y_pred)

    results = pd.DataFrame(records)

    sns.lineplot(data=results, x='target', y='score', hue='model', ax=ax)

    title = f'PCA columns: {cols[0]}-{cols[-1]}\nFit columns: {fit_cols[0]}-{fit_cols[-1]}\n' \
        f'Predict columns: {predict_cols[0]}-{predict_cols[-1]}'
    ax.set_title(title)

for i, ax in enumerate(axes):
    if i >= len(setups):
        # Grid cell without a setup: leave it blank.
        ax.axis('off')
        continue

    # One legend per row, placed outside the right-most used panel of that row.
    ends_row = i % ncols == ncols - 1 or i == len(setups) - 1

    if ends_row:
        ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    else:
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

fig.savefig(figures_dir / 'prediction.pdf', transparent=True, bbox_inches='tight')

  0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# Shell escape: run nbconvert to render the notebook file prediction.ipynb as HTML.
!jupyter nbconvert --to html prediction.ipynb