In [28]:
import numpy as np
import pandas as pd
import lightgbm as lgbm
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
import time
from sklearn.linear_model import LinearRegression
from tqdm import tqdm_notebook as tqdm
from tsfresh import extract_features
from lightgbm import LGBMRegressor

In [2]:
def smape(satellite_predicted_values, satellite_true_values): 
    # the division, addition and subtraction are pointwise 
    return np.mean(np.abs((satellite_predicted_values - satellite_true_values) 
        / (np.abs(satellite_predicted_values) + np.abs(satellite_true_values))))

In [3]:
# Загрузка данных
# windows
PATH_TO_DATA = os.path.join('../data')
full_train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train.csv'), index_col='id')
full_test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test.csv'), index_col='id')

In [4]:
# from sklearn.model_selection import TimeSeriesSplit

# N_FOLDS = 5
# ts_splitter = TimeSeriesSplit(N_FOLDS)

### Важно! В train.csv и test.csv признаки `epoch` и `sat_id` стоят в разном порядке. Это может быть критично для некоторых моделей (например, lgbm). Поэтому сделаем единный порядок:

In [15]:
test_cols = list(full_test.columns)
test_cols[0] = 'epoch'
test_cols[1] = 'sat_id'
full_test = full_test.reindex(columns=test_cols)

In [16]:
full_train['epoch'] = full_train.epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values
full_test['epoch'] = full_test.epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values

In [17]:
full_train.head()

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1388520000.0,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133
1,1388523000.0,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468
2,1388526000.0,0,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768
3,1388528000.0,0,-9148.251857,-20651.43746,-20720.381279,0.7156,-3.373762,1.722115,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306
4,1388531000.0,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237


In [9]:

# kind_to_fc_parameters = {
#     "x_sim": {"mean": None},#, "max": None, "min": None},
#     "y_sim": {"mean": None},#, "max": None, "min": None},
#     "z_sim": {"mean": None},#, "max": None, "min": None},
#     "Vx_sim": {"mean": None},#, "max": None, "min": None},
#     "Vy_sim": {"mean": None},#, "max": None, "min": None},
#     "Vz_sim": {"mean": None}#, "max": None, "min": None},
    
}

# extracted_features = extract_features(full_train, kind_to_fc_parameters=None,
#                                       column_id="sat_id", column_sort="epoch")

In [18]:
# import pickle

In [19]:
# with open('timeseries_features', 'wb') as handle:
#     pickle.dump(extracted_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# extracted_features

In [21]:
full_train.head()

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1388520000.0,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133
1,1388523000.0,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468
2,1388526000.0,0,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768
3,1388528000.0,0,-9148.251857,-20651.43746,-20720.381279,0.7156,-3.373762,1.722115,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306
4,1388531000.0,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237


In [22]:
# x_num

for i, col in enumerate(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']):
    full_train[col + '_num'] = 0
    arr = full_train[col + '_num'].values
    start = 0
    for sat_id in range(600):
        df = full_train[full_train.sat_id == sat_id]
        step = df.shape[0]
        idx_min = df[col].idxmin() % 24
        arr[start: start + step] = np.fromfunction(lambda i: (i + idx_min) % 24, (step, ))
        start += step
    full_train[col + '_num'] = arr

In [23]:
# x_num

for i, col in enumerate(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']):
    full_test[col + '_num'] = 0
    arr = full_test[col + '_num'].values
    start = 0
    for sat_id in full_test.sat_id.unique():
        df = full_test[full_test.sat_id == sat_id]
        step = df.shape[0]
        idx_last = full_train[full_train.sat_id == sat_id][col + '_num'].iloc[-1] + 1
        arr[start: start + step] = np.fromfunction(lambda i: (i + idx_last) % 24, (step, ))
        start += step
    full_test[col + '_num'] = arr
        

In [24]:
for col_name in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
    full_test[col_name] = 0.0

In [None]:
%%time

features_for_tsfresh = ['sat_id', 'epoch', 'x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']
target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']

width = 8
for sat_id in tqdm(full_test.sat_id.unique()):
    df1_train = full_train[full_train.sat_id == sat_id]
    df1_test =  full_test[full_test.sat_id == sat_id]
    for col in target_cols:
        for i in range(24):
            df2_train = df1_train[df1_train[col + '_sim_num'] == i]
            df2_test = df1_test[df1_test[col + '_sim_num'] == i]
            
            train_features = extract_features(df2_train[features_for_tsfresh], kind_to_fc_parameters=None,
                                              column_id="sat_id", column_sort="epoch")
            test_features = extract_features(df2_test[features_for_tsfresh], kind_to_fc_parameters=None,
                                              column_id="sat_id", column_sort="epoch")
            tmp = np.zeros((df2_train.shape[0], train_features.size))
            tmp = train_features.values + tmp
            df2_train = pd.concat([df2_train, pd.DataFrame(tmp, index=df2_train.index)], axis=1)
            
            tmp = np.zeros((df2_test.shape[0], test_features.size))
            tmp = test_features.values + tmp
            df2_test = pd.concat([df2_test, pd.DataFrame(tmp, index=df2_test.index)], axis=1)
            
            
            lin_model = LGBMRegressor()
            X_train = df2_train.reset_index().drop(columns=target_cols).values[-width:]
            y_train = df2_train[col].values[-width:]
            lin_model.fit(X_train, y_train)
            X_test = df2_test.reset_index().drop(columns=target_cols).values[-width:]
            y_test = lin_model.predict(X_test)
            full_test.loc[X_test[:, 0].ravel(), col] = y_test


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.20it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.78it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.49it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 69.25it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 68.76it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 69.71it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 66.40it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 72.15it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 64.98it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 71.66it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.29it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 64.15it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.52it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 65.87it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 70.26it/s]
Feature Ex

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 70.00it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 70.65it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 65.60it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 71.06it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 70.15it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.81it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 65.94it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 70.36it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 65.63it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 69.14it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 65.65it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 77.87it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.20it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 80.02it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 63.22it/s]
Feature Ex

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.93it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 68.80it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 65.99it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 69.81it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 64.61it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 68.41it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 71.91it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 69.95it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 65.74it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 68.45it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 68.03it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.64it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 67.84it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 68.04it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 65.37it/s]
Feature Ex

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 79.83it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 89.55it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 84.84it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.17it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.82it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 79.69it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 85.30it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 80.46it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.25it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 82.81it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.31it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.40it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.63it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.44it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 85.94it/s]
Feature Ex

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 79.68it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 82.61it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 82.58it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.44it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 86.01it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 105.75it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 82.36it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 86.37it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.08it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 79.64it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 84.14it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.82it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 85.45it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.54it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 79.94it/s]
Feature E

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.06it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.79it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 82.11it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 78.23it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.08it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 79.19it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.03it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 80.54it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 80.77it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 82.57it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.13it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 84.60it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 86.72it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 76.70it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 84.40it/s]
Feature Ex

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 80.59it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 82.77it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.63it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 78.90it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.53it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 80.95it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 85.57it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.70it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 80.02it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 79.41it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 84.83it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.40it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 77.85it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.97it/s]
Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 83.24it/s]
Feature Ex

In [None]:
full_test[['x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv('submission.csv', index_label='id')