In [28]:
import numpy as np
import pandas as pd
import lightgbm as lgbm
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
import time
from sklearn.linear_model import LinearRegression
from tqdm import tqdm_notebook as tqdm
from tsfresh import extract_features
from lightgbm import LGBMRegressor

In [2]:
def smape(satellite_predicted_values, satellite_true_values): 
    # the division, addition and subtraction are pointwise 
    return np.mean(np.abs((satellite_predicted_values - satellite_true_values) 
        / (np.abs(satellite_predicted_values) + np.abs(satellite_true_values))))

In [3]:
# Загрузка данных
# windows
PATH_TO_DATA = os.path.join('../data')
full_train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train.csv'), index_col='id')
full_test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test.csv'), index_col='id')

In [4]:
# from sklearn.model_selection import TimeSeriesSplit

# N_FOLDS = 5
# ts_splitter = TimeSeriesSplit(N_FOLDS)

### Важно! В train.csv и test.csv признаки `epoch` и `sat_id` стоят в разном порядке. Это может быть критично для некоторых моделей (например, lgbm). Поэтому сделаем единный порядок:

In [15]:
test_cols = list(full_test.columns)
test_cols[0] = 'epoch'
test_cols[1] = 'sat_id'
full_test = full_test.reindex(columns=test_cols)

In [16]:
full_train['epoch'] = full_train.epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values
full_test['epoch'] = full_test.epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values

In [17]:
full_train.head()

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1388520000.0,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133
1,1388523000.0,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468
2,1388526000.0,0,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768
3,1388528000.0,0,-9148.251857,-20651.43746,-20720.381279,0.7156,-3.373762,1.722115,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306
4,1388531000.0,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237


In [9]:

# kind_to_fc_parameters = {
#     "x_sim": {"mean": None},#, "max": None, "min": None},
#     "y_sim": {"mean": None},#, "max": None, "min": None},
#     "z_sim": {"mean": None},#, "max": None, "min": None},
#     "Vx_sim": {"mean": None},#, "max": None, "min": None},
#     "Vy_sim": {"mean": None},#, "max": None, "min": None},
#     "Vz_sim": {"mean": None}#, "max": None, "min": None},
    
}

# extracted_features = extract_features(full_train, kind_to_fc_parameters=None,
#                                       column_id="sat_id", column_sort="epoch")

In [18]:
# import pickle

In [19]:
# with open('timeseries_features', 'wb') as handle:
#     pickle.dump(extracted_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# extracted_features

In [21]:
full_train.head()

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1388520000.0,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133
1,1388523000.0,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468
2,1388526000.0,0,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768
3,1388528000.0,0,-9148.251857,-20651.43746,-20720.381279,0.7156,-3.373762,1.722115,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306
4,1388531000.0,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237


In [22]:
# x_num

for i, col in enumerate(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']):
    full_train[col + '_num'] = 0
    arr = full_train[col + '_num'].values
    start = 0
    for sat_id in range(600):
        df = full_train[full_train.sat_id == sat_id]
        step = df.shape[0]
        idx_min = df[col].idxmin() % 24
        arr[start: start + step] = np.fromfunction(lambda i: (i + idx_min) % 24, (step, ))
        start += step
    full_train[col + '_num'] = arr

In [23]:
# x_num

for i, col in enumerate(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']):
    full_test[col + '_num'] = 0
    arr = full_test[col + '_num'].values
    start = 0
    for sat_id in full_test.sat_id.unique():
        df = full_test[full_test.sat_id == sat_id]
        step = df.shape[0]
        idx_last = full_train[full_train.sat_id == sat_id][col + '_num'].iloc[-1] + 1
        arr[start: start + step] = np.fromfunction(lambda i: (i + idx_last) % 24, (step, ))
        start += step
    full_test[col + '_num'] = arr
        

In [24]:
for col_name in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
    full_test[col_name] = 0.0

In [26]:
%%time

features_for_tsfresh = ['sat_id', 'epoch', 'x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']
target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']

width = 8
for sat_id in tqdm(full_test.sat_id.unique()):
    df1_train = full_train[full_train.sat_id == sat_id]
    df1_test =  full_test[full_test.sat_id == sat_id]
    for col in target_cols:
        for i in range(24):
            df2_train = df1_train[df1_train[col + '_sim_num'] == i]
            df2_test = df1_test[df1_test[col + '_sim_num'] == i]
            
            train_features = extract_features(df2_train[features_for_tsfresh], kind_to_fc_parameters=None,
                                              column_id="sat_id", column_sort="epoch")
            test_features = extract_features(df2_test[features_for_tsfresh], kind_to_fc_parameters=None,
                                              column_id="sat_id", column_sort="epoch")
            tmp = np.zeros((df2_train.shape[0], train_features.size))
            tmp = train_features.values + tmp
            df2_train = pd.concat([df2_train, pd.DataFrame(tmp, index=df2_train.index)], axis=1)
            
            tmp = np.zeros((df2_test.shape[0], test_features.size))
            tmp = test_features.values + tmp
            df2_test = pd.concat([df2_test, pd.DataFrame(tmp, index=df2_test.index)], axis=1)
            
            
            lin_model = LGBMRegressor()
            X_train = df2_train.reset_index().drop(columns=target_cols).values[-width:]
            y_train = df2_train[col].values[-width:]
            lin_model.fit(X_train, y_train)
            X_test = df2_test.reset_index().drop(columns=target_cols).values[-width:]
            y_test = lin_model.predict(X_test)
            full_test.loc[X_test[:, 0].ravel(), col] = y_test


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))

NameError: name 'extract_features' is not defined

In [20]:
df2_train

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,...,4514,4515,4516,4517,4518,4519,4520,4521,4522,4523
_temporary_index_column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49187,1388581000.0,25,50148.144734,6095.977588,25768.405552,-0.223176,0.921858,-1.91799,50130.410855,6126.332443,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49211,1388667000.0,25,50147.567353,6067.572767,25805.14441,-0.221002,0.922435,-1.917114,50126.538829,6115.274203,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49235,1388752000.0,25,50148.018441,6038.992785,25841.655839,-0.218733,0.922883,-1.916276,50123.528203,6104.473634,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49259,1388838000.0,25,50149.412834,6011.059028,25877.66288,-0.216324,0.92326,-1.915454,50121.330135,6094.431299,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49283,1388924000.0,25,50151.261376,5984.298706,25913.102319,-0.213777,0.923648,-1.914623,50119.697514,6085.444262,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49307,1389010000.0,25,50152.976126,5958.649188,25948.358951,-0.211128,0.924119,-1.913754,50118.26998,6077.535671,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49331,1389096000.0,25,50154.039289,5933.587719,25983.993861,-0.208435,0.924712,-1.91283,50116.683107,6070.472053,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49355,1389181000.0,25,50154.219765,5908.37788,26020.583503,-0.205754,0.925424,-1.911848,50114.662902,6063.85062,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49379,1389264000.0,25,50219.529077,2508.57179,32518.481628,0.156865,0.956589,-1.701318,50112.079143,6057.216328,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0
49403,1389349000.0,25,50209.599755,2478.677343,32553.898202,0.15918,0.957196,-1.7001,50108.951005,6050.169398,...,1.0,1.0,-26498680000.0,-53133460000.0,-79705800000.0,0.0,0.0,0.0,8105.278816,1.0


In [21]:
df2_test

Unnamed: 0_level_0,sat_id,epoch,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim,x_sim_num,y_sim_num,...,4514,4515,4516,4517,4518,4519,4520,4521,4522,4523
_temporary_index_column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49931,25,1391233000.0,50047.578228,5857.688367,25400.94318,-0.229928,0.927619,-1.93211,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
49955,25,1391319000.0,50046.348709,5849.464732,25388.401057,-0.229974,0.927454,-1.932676,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
49979,25,1391405000.0,50045.270662,5842.236602,25375.586049,-0.229934,0.927367,-1.933207,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
50003,25,1391490000.0,50043.988248,5835.727109,25362.906319,-0.229853,0.927385,-1.933695,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
50027,25,1391576000.0,50042.252345,5829.51459,25350.73585,-0.229776,0.927509,-1.934142,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
50051,25,1391658000.0,50039.963764,5823.149731,25339.335115,-0.22974,0.927715,-1.934554,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
50075,25,1391744000.0,50037.169644,5816.256235,25328.812879,-0.229768,0.927968,-1.934943,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
50099,25,1391830000.0,50034.027099,5808.592258,25319.130175,-0.229862,0.928227,-1.935322,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
50123,25,1391916000.0,50030.753345,5800.06983,25310.132498,-0.230012,0.928453,-1.935703,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0
50147,25,1392002000.0,50027.578165,5790.742489,25301.592868,-0.230198,0.928619,-1.936096,12,16,...,1.0,1.0,-25766720000.0,-51149860000.0,-76318390000.0,0.0,0.0,0.0,6461.359647,1.0


In [19]:
full_test[['x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv('submission.csv', index_label='id')

SystemError: <built-in method item of numpy.ndarray object at 0x7f4d605484e0> returned a result with an error set