In [36]:
import os

import numpy as np
import pandas as pd
import tqdm
import tqdm.auto
from sklearn.model_selection import TimeSeriesSplit

class IDAO_validator:
    """
    Time-series cross-validation applied independently to every ``sat_id``.

    One ``TimeSeriesSplit`` is run over the rows of each satellite, and the
    resulting fold indices are translated back into positional (``iloc``)
    indices of the full frame.

    Parameters
    ----------
    n_splits : int, default 5
        Number of folds produced for every satellite.
    max_train_size : int or None, default None
        Forwarded unchanged to ``TimeSeriesSplit``.
    """

    def __init__(self, n_splits=5, max_train_size=None):
        self.n_splits = n_splits
        self.splitter = TimeSeriesSplit(n_splits=n_splits, max_train_size=max_train_size)

    def get_n_splits(self):
        """Return the number of folds generated per satellite."""
        return self.n_splits

    def split(self, X):
        """
        Yield ``(train_idx, test_idx, sat_id)`` for every fold of every satellite.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``sat_id`` column. Satellites are visited in order of
            first appearance.

        Yields
        ------
        tuple of (numpy.ndarray, numpy.ndarray, scalar)
            Positional row indices into ``X`` (usable with ``X.iloc``) for the
            training and test part of the fold, plus the satellite id.
        """
        sat_ids = X['sat_id'].values
        for sat_id in X['sat_id'].unique():
            # Real row positions of this satellite. Looking them up directly
            # (instead of accumulating the size of ``sat_id - 1``) stays correct
            # when ids are not consecutive, do not start at 0, or when the rows
            # of one satellite are not stored contiguously.
            positions = np.flatnonzero(sat_ids == sat_id)
            for train_idx_, test_idx_ in self.splitter.split(positions):
                yield (positions[train_idx_], positions[test_idx_], sat_id)

In [37]:
def smape(satellite_predicted_values, satellite_true_values):
    """
    Symmetric mean absolute percentage error: mean(|p - t| / (|p| + |t|)).

    All operations are element-wise; the mean is taken over every element of
    the (broadcast) inputs, so 1-D and 2-D arrays both give a single number.

    Parameters
    ----------
    satellite_predicted_values : array-like
        Predicted values.
    satellite_true_values : array-like
        Ground-truth values, broadcastable against the predictions.

    Returns
    -------
    float
        Mean of the per-element ratios. Elements where prediction and truth
        are both exactly zero contribute 0 (a perfect prediction) instead of
        the NaN that 0/0 would otherwise inject into the mean.
    """
    predicted = np.asarray(satellite_predicted_values, dtype=float)
    actual = np.asarray(satellite_true_values, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = np.abs(predicted) + np.abs(actual)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio)

In [38]:
# Load the train and test tables; the `id` column of each CSV becomes the index.
# NOTE(review): the original comment here said only "windows" - presumably about
# the machine the relative path was set up on; confirm.
PATH_TO_DATA = os.path.join('../data')
full_train, full_test = (
    pd.read_csv(os.path.join(PATH_TO_DATA, csv_name), index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

### Пример использования:

In [39]:
# Number of time-series folds built for every satellite.
N_FOLDS = 5

# Splitter used for validation: yields (train_idx, test_idx, sat_id) triples.
idao_splitter = IDAO_validator(n_splits=N_FOLDS)

In [40]:
import lightgbm

In [None]:
%%time

## columns that must not be fed to the model as features
columns_to_drop = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz', 'epoch', 'sat_id']

## target columns (one model is fitted per target)
target_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']

## fold counter, used only for logging
n_fold = 0
## previously seen 'sat_id', used only for logging, to detect the switch to the next satellite
prev_sat_id = 0

## SMAPE over all targets on the held-out part; one entry per (sat_id, fold)
smape_per_sat_id = []

## the feature frame is the same for every fold and every target, so build it once
train_features = full_train.drop(columns=columns_to_drop)

for trn_idxs, tst_idxs, sat_id in idao_splitter.split(full_train):
    ## predictions for the held-out rows, one column per target
    preds = np.zeros((tst_idxs.size, len(target_columns)))
    ## true target values for the held-out rows
    y_true = np.zeros((tst_idxs.size, len(target_columns)))

    ## restart the fold counter whenever a new sat_id begins (logging only)
    if prev_sat_id != sat_id:
        prev_sat_id = sat_id
        n_fold = 0
        print('-' * 15)
    n_fold += 1

    ## features depend only on the fold, not on the target, so slice them once per fold
    X_train = train_features.iloc[trn_idxs].values
    X_test = train_features.iloc[tst_idxs].values

    ## loop over target columns
    for i, col in enumerate(target_columns):

        lgbm = lightgbm.LGBMRegressor(max_depth=3, learning_rate=0.2, n_estimators=500,
                                      random_state=13)
        y_train = full_train[col].iloc[trn_idxs].values
        y_test = full_train[col].iloc[tst_idxs].values

        lgbm.fit(X_train, y_train)

        preds[:, i] = lgbm.predict(X_test)
        y_true[:, i] = y_test

        ## per-target metric; note that the value printed under the label "smape" is 1 - SMAPE
        print(f'sat_id: {sat_id}, fold: {n_fold}, target_col: {col},\
              smape: {1 - smape(preds[:, i], y_test):.5f}')

    ## metric over all target columns at once (for this sat_id and fold)
    fold_smape = smape(preds, y_true)
    print('_' * 15)
    print(f'sat_id: {sat_id}, fold: {n_fold}, target_col: ALL_COLUMNS,\
              smape: {1 - fold_smape:.5f}')
    smape_per_sat_id.append(fold_smape)
    print('_' * 15)

sat_id: 0, fold: 1, target_col: x,              smape: 0.90318
sat_id: 0, fold: 1, target_col: y,              smape: 0.91468
sat_id: 0, fold: 1, target_col: z,              smape: 0.90183
sat_id: 0, fold: 1, target_col: Vx,              smape: 0.91655
sat_id: 0, fold: 1, target_col: Vy,              smape: 0.88539
sat_id: 0, fold: 1, target_col: Vz,              smape: 0.91893
_______________
sat_id: 0, fold: 1, target_col: ALL_COLUMNS,              smape: 0.90676
_______________
sat_id: 0, fold: 2, target_col: x,              smape: 0.95166
sat_id: 0, fold: 2, target_col: y,              smape: 0.97288
sat_id: 0, fold: 2, target_col: z,              smape: 0.95762
sat_id: 0, fold: 2, target_col: Vx,              smape: 0.98062
sat_id: 0, fold: 2, target_col: Vy,              smape: 0.92575
sat_id: 0, fold: 2, target_col: Vz,              smape: 0.98454
_______________
sat_id: 0, fold: 2, target_col: ALL_COLUMNS,              smape: 0.96218
_______________
sat_id: 0, fold: 3, target_c

sat_id: 3, fold: 3, target_col: x,              smape: 0.93872
sat_id: 3, fold: 3, target_col: y,              smape: 0.95801
sat_id: 3, fold: 3, target_col: z,              smape: 0.89936
sat_id: 3, fold: 3, target_col: Vx,              smape: 0.90913
sat_id: 3, fold: 3, target_col: Vy,              smape: 0.86019
sat_id: 3, fold: 3, target_col: Vz,              smape: 0.87490
_______________
sat_id: 3, fold: 3, target_col: ALL_COLUMNS,              smape: 0.90672
_______________
sat_id: 3, fold: 4, target_col: x,              smape: 0.95324
sat_id: 3, fold: 4, target_col: y,              smape: 0.98126
sat_id: 3, fold: 4, target_col: z,              smape: 0.96137
sat_id: 3, fold: 4, target_col: Vx,              smape: 0.98158
sat_id: 3, fold: 4, target_col: Vy,              smape: 0.95976
sat_id: 3, fold: 4, target_col: Vz,              smape: 0.95680
_______________
sat_id: 3, fold: 4, target_col: ALL_COLUMNS,              smape: 0.96567
_______________
sat_id: 3, fold: 5, target_c

sat_id: 6, fold: 5, target_col: x,              smape: 0.84790
sat_id: 6, fold: 5, target_col: y,              smape: 0.83251
sat_id: 6, fold: 5, target_col: z,              smape: 0.85358
sat_id: 6, fold: 5, target_col: Vx,              smape: 0.82446
sat_id: 6, fold: 5, target_col: Vy,              smape: 0.85725
sat_id: 6, fold: 5, target_col: Vz,              smape: 0.86664
_______________
sat_id: 6, fold: 5, target_col: ALL_COLUMNS,              smape: 0.84706
_______________
---------------
sat_id: 7, fold: 1, target_col: x,              smape: 0.56312
sat_id: 7, fold: 1, target_col: y,              smape: 0.56825
sat_id: 7, fold: 1, target_col: z,              smape: 0.53557
sat_id: 7, fold: 1, target_col: Vx,              smape: 0.26620
sat_id: 7, fold: 1, target_col: Vy,              smape: 0.11220
sat_id: 7, fold: 1, target_col: Vz,              smape: 0.13973
_______________
sat_id: 7, fold: 1, target_col: ALL_COLUMNS,              smape: 0.36418
_______________
sat_id: 7, f

In [None]:
# Overall validation score: 1 minus the mean of every SMAPE value appended to smape_per_sat_id.
print(1 - np.mean(smape_per_sat_id))

In [None]:
# NOTE(review): bare literal - presumably the score printed by the previous cell on an earlier run, kept for reference; confirm.
0.8809298917373344

In [None]:
## Give full_test a zero-filled float column for every target; the prediction loop writes into them.
full_test = full_test.assign(**{col_name: 0.0 for col_name in target_columns})

In [None]:
## Fit one model per (satellite, target) on that satellite's training rows
## and write its predictions into the matching rows of full_test.
for sat_id in tqdm.auto.tqdm(full_test['sat_id'].unique()):
    ## rows of this satellite; selected once and reused for every target
    train_rows = full_train[full_train['sat_id'] == sat_id]
    test_mask = full_test['sat_id'] == sat_id

    ## features do not depend on the target (all target columns are dropped), so build them once
    X_train = train_rows.drop(columns=columns_to_drop).values
    X_test = full_test[test_mask].drop(columns=columns_to_drop).values

    ## loop over target columns
    for col in target_columns:

        lgbm = lightgbm.LGBMRegressor(max_depth=3, learning_rate=0.2, n_estimators=500,
                                      random_state=13)
        y_train = train_rows[col].values

        lgbm.fit(X_train, y_train)

        full_test.loc[test_mask, col] = lgbm.predict(X_test)

In [None]:
## Write the predicted target columns to the submission file, keyed by the `id` index.
submission = full_test[target_columns]
submission.to_csv('submission.csv', index_label='id')

In [None]:
# Preview the first rows of full_test to eyeball the target columns.
full_test.head()