In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgbm
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
import time
from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor, Ridge
from sklearn.svm import SVR
from tqdm import tqdm_notebook as tqdm

In [2]:
def smape(satellite_predicted_values, satellite_true_values): 
    # the division, addition and subtraction are pointwise 
    return np.mean(np.abs((satellite_predicted_values - satellite_true_values) 
        / (np.abs(satellite_predicted_values) + np.abs(satellite_true_values))))

In [3]:
# Загрузка данных
# windows
PATH_TO_DATA = os.path.join('../data/')
full_train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train.csv'), index_col='id')
full_test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test.csv'), index_col='id')

In [4]:
full_train

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2014-01-01T00:00:00.000,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.221690,-20741.615306,-0.907527,-3.804930,-2.024133
1,2014-01-01T00:46:43.000,0,-10567.672384,1619.746066,-24451.813271,-0.302590,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468
2,2014-01-01T01:33:26.001,0,-10578.684043,-10180.467460,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,-24271.169776,0.274880,-4.046788,0.718768
3,2014-01-01T02:20:09.001,0,-9148.251857,-20651.437460,-20720.381279,0.715600,-3.373762,1.722115,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306
4,2014-01-01T03:06:52.002,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234089,2014-01-31T22:00:22.602,599,-21721.485878,-14048.557595,5277.807430,-1.351754,3.373418,0.004995,-20717.958996,-16245.240500,5250.939232,-1.653931,3.157321,0.079069
1234090,2014-01-31T22:25:13.240,599,-23176.890569,-8712.016936,5153.371350,-0.575955,3.764450,-0.175109,-22673.444496,-11192.339393,5243.608790,-0.945328,3.603371,-0.092202
1234091,2014-01-31T22:50:03.878,599,-23363.044794,-2906.071320,4747.247386,0.351381,3.992943,-0.372198,-23461.830699,-5570.167175,4966.813869,-0.087089,3.912550,-0.281989
1234092,2014-01-31T23:14:54.515,599,-22058.020262,3074.894039,4038.853542,1.421085,3.984793,-0.578849,-22858.679929,373.249102,4396.055679,0.920162,4.021955,-0.485364


In [5]:
full_test

Unnamed: 0_level_0,sat_id,epoch,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3927,1,2014-02-01T00:01:45.162,-13366.891347,-14236.753503,6386.774555,4.333815,-0.692764,0.810774
3928,1,2014-02-01T00:22:57.007,-7370.434039,-14498.771520,7130.411325,5.077413,0.360609,0.313402
3929,1,2014-02-01T00:44:08.852,-572.068654,-13065.289498,7033.794876,5.519106,2.012830,-0.539412
3930,1,2014-02-01T01:05:20.697,6208.945257,-9076.852425,5548.296900,4.849212,4.338955,-1.869600
3931,1,2014-02-01T01:26:32.542,10768.200284,-2199.706707,2272.014862,1.940505,6.192887,-3.167724
...,...,...,...,...,...,...,...,...
1231060,597,2014-02-28T05:19:01.386,28595.031634,-85458.623976,5021.767767,-1.721131,-0.011611,0.122357
1231061,597,2014-02-28T07:21:46.454,15547.173728,-84233.509948,5840.616702,-1.815323,0.353445,0.098927
1231062,597,2014-02-28T09:24:31.522,1981.107111,-80123.860051,6458.394351,-1.858963,0.773846,0.067379
1231063,597,2014-02-28T11:27:16.590,-11644.801379,-72674.504171,6805.361999,-1.826030,1.262930,0.024782


In [6]:
full_train.groupby('sat_id').describe().T

Unnamed: 0,sat_id,0,1,2,3,4,5,6,7,8,9,...,590,591,592,593,594,595,596,597,598,599
x,count,958.000000,2108.000000,417.000000,354.000000,1210.000000,3531.000000,418.000000,235.000000,2578.000000,330.000000,...,584.000000,339.000000,1281.000000,1307.000000,1210.000000,499.000000,378.000000,366.000000,648.000000,1797.000000
x,mean,4900.339540,-16110.661352,-35645.536202,55850.315483,-6362.637373,-1332.440347,-3992.980574,-69315.624667,1821.125809,3974.751573,...,33969.821907,-46291.314162,-21555.015684,-8668.628135,-1425.664933,10580.871178,-46606.855632,31587.704549,-9840.325692,-1249.927953
x,std,9843.287489,13802.803673,40448.660925,42226.892159,20304.954487,10516.767043,40254.674362,54817.363085,12760.325326,43704.812113,...,32899.164658,45188.396733,19364.957099,20257.571491,20617.707887,12574.760713,41480.443911,44511.927815,30444.932286,15884.479675
x,min,-10752.229751,-33092.618399,-83475.871225,-28036.656341,-33119.544128,-15784.759280,-65975.489409,-130892.958010,-16973.914939,-59286.173946,...,-26260.593295,-98305.756377,-46535.764661,-34693.709309,-30089.826690,-11853.697706,-93502.440554,-42789.959911,-50041.240812,-23364.288658
x,25%,-4846.044039,-28724.877108,-72080.483505,27794.354266,-26239.094949,-11832.997586,-44257.957384,-119053.574244,-10903.902531,-38957.735073,...,3605.024327,-85473.827246,-39108.560422,-27242.692225,-23186.033247,-138.602966,-85035.962802,-10270.594276,-39882.598904,-16966.877633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vz_sim,min,-3.582274,-3.267208,-2.548670,-0.070043,-3.853693,-0.573519,-1.695279,-0.856612,-4.264669,-0.960054,...,-1.970567,-1.991123,-0.739670,-1.813285,-0.592408,-1.481722,-0.600138,-0.446232,-1.391375,-1.037044
Vz_sim,25%,-2.024101,-0.877540,-0.768268,-0.058169,-2.220249,-0.388132,-1.125444,-0.702429,-3.156244,-0.726537,...,-0.586607,-0.526903,-0.343957,-1.438917,-0.479257,-1.353386,-0.535063,-0.118929,-0.765975,-0.483254
Vz_sim,50%,0.714230,0.572531,0.423172,-0.028103,0.270589,0.011565,-0.117304,-0.328873,-0.245740,-0.170071,...,0.286695,0.286162,0.071640,-0.263998,-0.093983,-0.831555,-0.299655,0.077402,0.128822,0.152503
Vz_sim,75%,2.176876,1.214693,0.889922,0.039338,1.967847,0.387142,0.980571,0.378398,2.833273,0.738587,...,0.669490,0.629948,0.373128,1.429139,0.467210,0.375542,0.309803,0.152677,0.817466,0.563858


In [7]:
# захотел посмотреть разницу в значениях для SGP-4 и реальными данными
# типо много на смом деле пздц как сильно разнятся данные
for col_name in full_train.columns[2:8]:
    full_train['delta_' + col_name] = full_train[col_name + '_sim'] - full_train[col_name]

In [8]:
delta_names = full_train.columns[14:].to_list()
full_train[delta_names + ['sat_id']].groupby('sat_id').describe().T

Unnamed: 0,sat_id,0,1,2,3,4,5,6,7,8,9,...,590,591,592,593,594,595,596,597,598,599
delta_x,count,958.0,2108.0,417.0,354.0,1210.0,3531.0,418.0,235.0,2578.0,330.0,...,584.0,339.0,1281.0,1307.0,1210.0,499.0,378.0,366.0,648.0,1797.0
delta_x,mean,1.208894,2112.446451,209.738936,326.081733,8.249695,-12.821916,-13515.094036,449.078722,-266.266534,1837.652562,...,41.971583,-609.134603,4934.673529,-45.598508,424.624676,-498.634398,-346.468126,470.050911,406.748192,-1.919105
delta_x,std,3053.526207,3023.693121,8822.708298,9808.638586,10555.451555,1904.838397,18191.19296,21917.512787,663.871604,4024.603051,...,9866.726522,261.50306,13606.203374,580.305934,1245.23379,4053.668639,6271.068678,17616.065748,6091.914586,1106.953228
delta_x,min,-9158.649797,-6436.1832,-14920.301581,-14903.0451,-18287.374618,-4595.159437,-77497.609193,-39129.84272,-2349.980797,-2219.447596,...,-26350.088327,-1384.493213,-24977.828593,-1616.287974,-1374.456773,-13660.407129,-13088.737558,-27413.897192,-11074.432313,-3373.490862
delta_x,25%,-1828.329475,356.937143,-5421.606604,-6001.73525,-6183.584044,-1218.435437,-17509.553259,-12892.183612,-614.397696,-195.41134,...,-4129.12086,-676.543913,-3849.170155,-387.586543,-383.677554,-950.766496,-1278.515302,-11260.842362,-507.738066,-576.760631
delta_x,50%,-8.945639,1152.946981,128.041125,383.232946,-120.354162,-10.341319,-5265.083207,-172.927364,-89.095419,232.492879,...,119.66992,-514.252345,941.969324,-15.683449,3.197385,30.274255,-219.084875,127.330249,151.992679,80.917657
delta_x,75%,2282.419198,2726.027123,2288.651543,1527.084463,5709.884505,1211.534277,-1460.366024,6780.314991,154.338836,2163.460188,...,4807.368918,-430.638162,11340.1957,330.153067,923.723412,2526.074522,228.651769,4369.971272,1207.905108,743.114525
delta_x,max,6917.312123,17367.556899,19390.882441,28777.855432,31478.635403,4694.625385,33.298655,85752.824417,1083.993789,20008.869272,...,27120.696869,-310.074944,49615.638101,1223.798076,4896.482077,7368.15011,23498.280008,48952.8161,13452.134391,2168.584877
delta_y,count,958.0,2108.0,417.0,354.0,1210.0,3531.0,418.0,235.0,2578.0,330.0,...,584.0,339.0,1281.0,1307.0,1210.0,499.0,378.0,366.0,648.0,1797.0
delta_y,mean,116.593133,967.435361,270.342042,462.127083,-18.785206,4.901992,-2822.584859,999.266593,20.887208,677.148273,...,214.516953,-58.049923,2106.023413,-12.730545,176.778974,-815.344643,178.065723,668.191816,183.237904,4.772428


In [9]:
# отрисовка нужного рядя для анализа

figure = plt.figure(figsize=(10, 5))


sat_id = 28
real_col_name = 'x'
sim_col_name = real_col_name + '_sim'

fig = go.Figure()
fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][real_col_name],
                         name="Real",
                         line_color='deepskyblue'))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][sim_col_name],
                         name="Simulation",
                         line_color='dimgray'))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Sim_test",
                         line_color='MediumPurple'))

fig.update_layout(title_text='Time Series for ' + real_col_name,
                  xaxis_rangeslider_visible=True,
                  yaxis_title=real_col_name)
fig.show()

<Figure size 720x360 with 0 Axes>

In [10]:
# короче sim данные дофига тупые до 225 минут там вроде в задаче написано, нам надо бы предскзаать это дерьмо
# надо проверить гипотезу о периодах

figure = plt.figure(figsize=(10, 5))


sat_id = 113
real_col_name = 'x'
sim_col_name = real_col_name + '_sim'

df_train = full_train[full_train.sat_id == sat_id]
df_test = full_test[full_test.sat_id == sat_id]

d_real = df_train[real_col_name].values
d_sim = df_train[sim_col_name].values
d_test = df_test[sim_col_name].values

fig = go.Figure()
fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][real_col_name],
                         name="Real",
                         line_color='deepskyblue'))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][sim_col_name],
                         name="Simulation",
                         line_color='dimgray'))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Sim_test",
                         line_color='MediumPurple'))

fig.update_layout(title_text='Time Series for ' + real_col_name,
                  xaxis_rangeslider_visible=True,
                  yaxis_title=real_col_name)
fig.show()

<Figure size 720x360 with 0 Axes>

In [11]:
# интерполяция по производной !!! просто интересно было псомотерть


sat_id = 33
fig = make_subplots(rows=3, cols=2,
                    subplot_titles=['x', 'Vx', 'y', 'Vy', 'z', 'Vz'])

for i, real_col_name in enumerate(['x', 'y', 'z', 'Vx', 'Vy', 'Vz']):
    row, col = i % 3 + 1, i // 3 + 1
    sim_col_name = real_col_name + '_sim'
    fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                             y=full_train[full_train.sat_id == sat_id][real_col_name],
                             name="Real " + real_col_name,
                             ), row=row, col=col)

    fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                             y=full_train[full_train.sat_id == sat_id][sim_col_name],
                             name="Simulation " + real_col_name,
                             ), row=row, col=col)

    fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][sim_col_name],
                             name="Sim_test " + real_col_name,
                             ), row=row, col=col)
fig.show()

In [12]:


sat_id = 28

real_col_name = 'x'
sim_col_name = real_col_name + '_sim'

df_train = full_train[full_train.sat_id == sat_id]
df_test = full_test[full_test.sat_id == sat_id]

d_t = df_train.epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values
d_t = d_t[1:] - d_t[:-1]
d_t[d_t == 0] = 10000

d_tt = df_test.epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values
d_tt = d_tt[1:] - d_tt[:-1]
d_tt[d_tt == 0] = 10000

d_real = full_train[full_train.sat_id == sat_id][real_col_name].values[1:] - full_train[full_train.sat_id == sat_id][real_col_name].values[:-1]
d_sim = full_train[full_train.sat_id == sat_id][sim_col_name].values[1:] - full_train[full_train.sat_id == sat_id][sim_col_name].values[:-1]
d_test = full_test[full_test.sat_id == sat_id][sim_col_name].values[1:] - full_test[full_test.sat_id == sat_id][sim_col_name].values[:-1]

fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.epoch.values[1:],
                         y=d_real / d_t,
                         name="Real",
                         line_color='deepskyblue'))

fig.add_trace(go.Scatter(x=df_train.epoch.values[1:],
                         y=d_sim / d_t,
                         name="Simulation",
                         line_color='dimgray'))

fig.add_trace(go.Scatter(x=df_test.epoch.values[1:],
                         y=d_test / d_tt,
                         name="Sim_test",
                         line_color='MediumPurple'))

fig.update_layout(title_text='Time Series for ' + real_col_name,
                  xaxis_rangeslider_visible=True,
                  yaxis_title=real_col_name)
fig.show()

In [13]:
# посчитаем основные статистики и закинем их как признаки хотя конечно
# странно вообще предсказывать временные ряды с помощью деревьев
# получается что 24 точки это период!
# давайте сделаем следующее - найдём минимум или максимум,
# чтобы найти нижнюю или верхнюю точку периода
# а далее будем считать по ним статистики

# ничего плохого в том, чтобы использовать простые модели для каждого спутника


In [14]:
# от каждого спутника мы берём первые 24 значения ищем минимум по каждой колонке

In [15]:
test_sat = full_test.sat_id.unique()
for i, col in enumerate(['x', 'y', 'z', 'Vx', 'Vy', 'Vz']):
    full_train[col + '_num'] = 0
    full_test[col + '_num'] = 0
    arr_train = full_train[col + '_num'].values
    arr_test = full_test[col + '_num'].values
    start_train = 0
    start_test = 0
    for sat_id in range(600):
        df_train = full_train[full_train.sat_id == sat_id]
        step_train = df_train.shape[0]
        idx_min = df_train[col].idxmin() % 24
        arr_train[start_train: start_train + step_train] = np.fromfunction(lambda i: (i + idx_min) % 24, (step_train, ))
        start_train += step_train
        idx_last = arr_train[start_train - 1] + 1
        if sat_id in test_sat:
            df_test = full_test[full_test.sat_id == sat_id]
            step_test = df_test.shape[0]
            arr_test[start_test: start_test + step_test] = np.fromfunction(lambda i: (i + idx_last) % 24, (step_test, ))
            start_test += step_test
    full_train[col + '_num'] = arr_train
    full_test[col + '_num'] = arr_test
    
for col_name in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
    full_test[col_name] = 0.0

In [16]:
# x_num

for i, col in enumerate(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']):
    full_train[col + '_num'] = 0
    arr = full_train[col + '_num'].values
    start = 0
    for sat_id in range(600):
        df = full_train[full_train.sat_id == sat_id]
        step = df.shape[0]
        idx_min = df[col].idxmin() % 24
        arr[start: start + step] = np.fromfunction(lambda i: (i + idx_min) % 24, (step, ))
        start += step
    full_train[col + '_num'] = arr
        

In [17]:
# давайте строить тренд по последним 5 предсказаниям (то есть мы будем делать следующее берём последние 5 
# значений интерполируем с помощью прямой, а далее смотрим чё получим)

In [18]:
# x_num

for i, col in enumerate(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']):
    full_test[col + '_num'] = 0
    arr = full_test[col + '_num'].values
    start = 0
    for sat_id in full_test.sat_id.unique():
        df = full_test[full_test.sat_id == sat_id]
        step = df.shape[0]
        idx_last = full_train[full_train.sat_id == sat_id][col + '_num'].iloc[-1] + 1
        arr[start: start + step] = np.fromfunction(lambda i: (i + idx_last) % 24, (step, ))
        start += step
    full_test[col + '_num'] = arr
        

In [19]:
full_train['x' + '_num'] = 0

In [20]:
full_train[full_train.sat_id == 0]

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,...,z_num,Vx_num,Vy_num,Vz_num,x_sim_num,y_sim_num,z_sim_num,Vx_sim_num,Vy_sim_num,Vz_sim_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2014-01-01T00:00:00.000,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.221690,...,1,0,1,0,2,7,1,22,1,22
1,2014-01-01T00:46:43.000,0,-10567.672384,1619.746066,-24451.813271,-0.302590,-4.272617,-0.612796,-10555.500066,1649.289367,...,2,1,2,1,3,8,2,23,2,23
2,2014-01-01T01:33:26.001,0,-10578.684043,-10180.467460,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,...,3,2,3,2,4,9,3,0,3,0
3,2014-01-01T02:20:09.001,0,-9148.251857,-20651.437460,-20720.381279,0.715600,-3.373762,1.722115,-9149.620794,-20618.200201,...,4,3,4,3,5,10,4,1,4,1
4,2014-01-01T03:06:52.002,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,...,5,4,5,4,6,11,5,2,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,2014-01-31T20:27:33.474,0,17337.596150,-3224.996803,40025.071742,-0.055615,2.650511,-0.222561,15425.677762,12387.078210,...,18,17,18,17,19,0,18,15,18,15
954,2014-01-31T21:14:16.474,0,16849.590836,4217.959953,38636.167298,-0.295282,2.642711,-0.774030,13419.744899,19007.658296,...,19,18,19,18,20,1,19,16,19,16
955,2014-01-31T22:00:59.475,0,15667.981809,11481.446566,35656.909015,-0.550136,2.518368,-1.356292,10662.661170,24548.177490,...,20,19,20,19,21,2,20,17,20,17
956,2014-01-31T22:47:42.475,0,13754.838284,18199.705814,31013.052037,-0.816256,2.247835,-1.959266,7182.757625,28395.590633,...,21,20,21,20,22,3,21,18,21,18


In [21]:
for col_name in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
    full_test[col_name] = 0.0

In [22]:
full_test.dtypes

sat_id          int64
epoch          object
x_sim         float64
y_sim         float64
z_sim         float64
Vx_sim        float64
Vy_sim        float64
Vz_sim        float64
x_num           int64
y_num           int64
z_num           int64
Vx_num          int64
Vy_num          int64
Vz_num          int64
x             float64
y             float64
z             float64
Vx            float64
Vy            float64
Vz            float64
x_sim_num       int64
y_sim_num       int64
z_sim_num       int64
Vx_sim_num      int64
Vy_sim_num      int64
Vz_sim_num      int64
dtype: object

NameError: name 'df1_train' is not defined

In [32]:
%%time
width = 10
for sat_id in tqdm(full_test.sat_id.unique()):
    df1_train = full_train[full_train.sat_id == sat_id]
    df1_test =  full_test[full_test.sat_id == sat_id]
    for col in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
        for i in range(24):
            df2_train = df1_train[df1_train[col + '_num'] == i]
            df2_test = df1_test[df1_test[col + '_num'] == i]
            lin_model = SVR('rbf', 2)
            sz = len(df2_train[col].values[-width:])
            X_train = np.arange(sz).reshape(-1, 1)
            y_train = df2_train[col].values[-width:]
            lin_model.fit(X_train, y_train)
            X_test = (np.arange(len(df2_test)) + sz).reshape(-1, 1)
            y_test = lin_model.predict(X_test)
            full_test.loc[df2_test.reset_index()['id'].values.reshape(-1, 1).ravel(), col] = y_test

HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))


CPU times: user 1min 48s, sys: 56.1 ms, total: 1min 48s
Wall time: 1min 48s


In [33]:
import warnings

warnings.filterwarnings("ignore")

In [34]:
# интерполяция по производной !!! просто интересно было псомотерть


sat_id = 2
fig = make_subplots(rows=3, cols=2,
                    subplot_titles=['x', 'Vx', 'y', 'Vy', 'z', 'Vz'])

for i, real_col_name in enumerate(['x', 'y', 'z', 'Vx', 'Vy', 'Vz']):
    row, col = i % 3 + 1, i // 3 + 1
    sim_col_name = real_col_name + '_sim'
    fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][real_col_name],
                             name="Estimated " + real_col_name,
                             ), row=row, col=col)

    fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][sim_col_name],
                             name="Simulation " + real_col_name,
                             ), row=row, col=col)

    fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][sim_col_name],
                             name="Sim_test " + real_col_name,
                             ), row=row, col=col)
    fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                             y=full_train[full_train.sat_id == sat_id][real_col_name],
                             name="Real " + real_col_name,
                             ), row=row, col=col)

    fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                             y=full_train[full_train.sat_id == sat_id][sim_col_name],
                             name="Simulation " + real_col_name,
                             ), row=row, col=col)

fig.show()

In [35]:
# отрисовка нужного рядя для анализа

figure = plt.figure(figsize=(10, 5))


sat_id = 124
real_col_name = 'x'
sim_col_name = real_col_name + '_sim'

fig = go.Figure()
fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][real_col_name],
                             name="Estimated " + real_col_name,
                             ))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Simulation " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Sim_test " + real_col_name,
                         ))
fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][real_col_name],
                         name="Real " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][sim_col_name],
                         name="Simulation " + real_col_name,
                         ))

fig.update_layout(title_text='Time Series for ' + real_col_name,
                  xaxis_rangeslider_visible=True,
                  yaxis_title=real_col_name)
fig.show()

<Figure size 720x360 with 0 Axes>

In [36]:
full_test[['x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv('submission.csv', index_label='id')

In [28]:
# отрисовка нужного рядя для анализа

figure = plt.figure(figsize=(10, 5))


sat_id = 587
real_col_name = 'y'
sim_col_name = real_col_name + '_sim'

fig = go.Figure()
fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][real_col_name],
                             name="Estimated " + real_col_name,
                             ))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Simulation " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Sim_test " + real_col_name,
                         ))
fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][real_col_name],
                         name="Real " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][sim_col_name],
                         name="Simulation " + real_col_name,
                         ))

fig.update_layout(title_text='Time Series for ' + real_col_name,
                  xaxis_rangeslider_visible=True,
                  yaxis_title=real_col_name)
fig.show()

<Figure size 720x360 with 0 Axes>

In [29]:
full_train[(full_train.sat_id == 587) & (full_train.x_num == 0)]

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,...,z_num,Vx_num,Vy_num,Vz_num,x_sim_num,y_sim_num,z_sim_num,Vx_sim_num,Vy_sim_num,Vz_sim_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1198941,2014-01-01T00:00:00.000,587,11981.661651,3895.893807,-5013.521270,-0.459862,5.063567,2.835773,11986.484916,3890.984518,...,18,20,7,12,8,14,17,1,7,12
1198942,2014-01-01T00:13:59.971,587,10935.809252,7853.551928,-2396.556385,-2.000657,4.275212,3.335077,10942.693146,7848.342852,...,19,21,8,13,9,15,18,2,8,13
1198943,2014-01-01T00:27:59.941,587,8700.761895,10961.699021,483.441071,-3.258906,3.073918,3.459887,8710.102955,10957.299508,...,20,22,9,14,10,16,19,3,9,14
1198944,2014-01-01T00:41:59.912,587,5579.596790,12962.798286,3318.859621,-4.097744,1.676311,3.240258,5590.973108,12959.952251,...,21,23,10,15,11,17,20,4,10,15
1198945,2014-01-01T00:55:59.883,587,1950.365645,13777.601086,5848.833134,-4.488928,0.265298,2.756484,1953.909601,13778.252330,...,22,0,11,16,12,18,21,5,11,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202125,2014-01-31T22:54:26.834,587,-1488.591642,-13517.295688,-8089.100521,4.417639,2.283310,-0.658697,-9202.002156,-14274.428046,...,10,12,23,4,0,6,9,17,23,4
1202126,2014-01-31T23:08:26.805,587,2228.897536,-11090.605026,-8313.865777,4.368465,3.528298,0.177234,-5856.646486,-14734.377607,...,11,13,0,5,1,7,10,18,0,5
1202127,2014-01-31T23:22:26.776,587,5692.708685,-7545.699699,-7685.850197,3.748976,4.922682,1.388175,-2178.409214,-14348.454514,...,12,14,1,6,2,8,11,19,1,6
1202128,2014-01-31T23:36:26.747,587,8230.907661,-2869.568209,-5869.576911,2.072837,6.118921,2.984005,1634.220746,-13048.336525,...,13,15,2,7,3,9,12,20,2,7


In [30]:
full_train[(full_train.sat_id == 587)]

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,...,z_num,Vx_num,Vy_num,Vz_num,x_sim_num,y_sim_num,z_sim_num,Vx_sim_num,Vy_sim_num,Vz_sim_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1198941,2014-01-01T00:00:00.000,587,11981.661651,3895.893807,-5013.521270,-0.459862,5.063567,2.835773,11986.484916,3890.984518,...,18,20,7,12,8,14,17,1,7,12
1198942,2014-01-01T00:13:59.971,587,10935.809252,7853.551928,-2396.556385,-2.000657,4.275212,3.335077,10942.693146,7848.342852,...,19,21,8,13,9,15,18,2,8,13
1198943,2014-01-01T00:27:59.941,587,8700.761895,10961.699021,483.441071,-3.258906,3.073918,3.459887,8710.102955,10957.299508,...,20,22,9,14,10,16,19,3,9,14
1198944,2014-01-01T00:41:59.912,587,5579.596790,12962.798286,3318.859621,-4.097744,1.676311,3.240258,5590.973108,12959.952251,...,21,23,10,15,11,17,20,4,10,15
1198945,2014-01-01T00:55:59.883,587,1950.365645,13777.601086,5848.833134,-4.488928,0.265298,2.756484,1953.909601,13778.252330,...,22,0,11,16,12,18,21,5,11,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202125,2014-01-31T22:54:26.834,587,-1488.591642,-13517.295688,-8089.100521,4.417639,2.283310,-0.658697,-9202.002156,-14274.428046,...,10,12,23,4,0,6,9,17,23,4
1202126,2014-01-31T23:08:26.805,587,2228.897536,-11090.605026,-8313.865777,4.368465,3.528298,0.177234,-5856.646486,-14734.377607,...,11,13,0,5,1,7,10,18,0,5
1202127,2014-01-31T23:22:26.776,587,5692.708685,-7545.699699,-7685.850197,3.748976,4.922682,1.388175,-2178.409214,-14348.454514,...,12,14,1,6,2,8,11,19,1,6
1202128,2014-01-31T23:36:26.747,587,8230.907661,-2869.568209,-5869.576911,2.072837,6.118921,2.984005,1634.220746,-13048.336525,...,13,15,2,7,3,9,12,20,2,7


In [31]:
full_train['epoch_seconds'] = full_train.epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values
full_test['epoch_seconds'] = full_test.epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values

In [32]:
full_train['diff_seconds'] = 0.0
arr = full_train['diff_seconds'].values
start = 0
for sat_id in range(600):
    arr_ = full_train[full_train.sat_id == sat_id]['epoch_seconds'].values[1:] \
          - full_train[full_train.sat_id == sat_id]['epoch_seconds'].values[:-1]
    step = len(arr_) + 1
    arr[start: start + step] = np.hstack(([-1], arr_))
    start += step
full_train['diff_seconds'] = arr

In [33]:
full_train

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,...,Vy_num,Vz_num,x_sim_num,y_sim_num,z_sim_num,Vx_sim_num,Vy_sim_num,Vz_sim_num,epoch_seconds,diff_seconds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2014-01-01T00:00:00.000,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.221690,...,1,0,2,7,1,22,1,22,1.388520e+09,-1.0
1,2014-01-01T00:46:43.000,0,-10567.672384,1619.746066,-24451.813271,-0.302590,-4.272617,-0.612796,-10555.500066,1649.289367,...,2,1,3,8,2,23,2,23,1.388523e+09,2803.0
2,2014-01-01T01:33:26.001,0,-10578.684043,-10180.467460,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,...,3,2,4,9,3,0,3,0,1.388526e+09,2803.0
3,2014-01-01T02:20:09.001,0,-9148.251857,-20651.437460,-20720.381279,0.715600,-3.373762,1.722115,-9149.620794,-20618.200201,...,4,3,5,10,4,1,4,1,1.388528e+09,2803.0
4,2014-01-01T03:06:52.002,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,...,5,4,6,11,5,2,5,2,1.388531e+09,2803.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234089,2014-01-31T22:00:22.602,599,-21721.485878,-14048.557595,5277.807430,-1.351754,3.373418,0.004995,-20717.958996,-16245.240500,...,12,8,3,20,12,20,13,8,1.391191e+09,1491.0
1234090,2014-01-31T22:25:13.240,599,-23176.890569,-8712.016936,5153.371350,-0.575955,3.764450,-0.175109,-22673.444496,-11192.339393,...,13,9,4,21,13,21,14,9,1.391193e+09,1491.0
1234091,2014-01-31T22:50:03.878,599,-23363.044794,-2906.071320,4747.247386,0.351381,3.992943,-0.372198,-23461.830699,-5570.167175,...,14,10,5,22,14,22,15,10,1.391194e+09,1490.0
1234092,2014-01-31T23:14:54.515,599,-22058.020262,3074.894039,4038.853542,1.421085,3.984793,-0.578849,-22858.679929,373.249102,...,15,11,6,23,15,23,16,11,1.391196e+09,1491.0


In [34]:
inconsistent = []
for sat_id in full_test.sat_id.unique():
    arr_train = full_test[full_test.sat_id == sat_id].epoch.values
    arr_test = full_train[full_train.sat_id == sat_id].epoch.values
    if list(np.unique(np.sort(arr_train[1:] - arr_train[:-1]))) != list(np.unique(np.sort(arr_test[1:] - arr_test[:-1]))):
        inconsistent.append(sat_id)
print(len(inconsistent))

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [None]:
inconsistent

In [None]:
# 241, 242, 252, 310, 385, 515

In [None]:
# отрисовка нужного рядя для анализа

figure = plt.figure(figsize=(10, 5))


sat_id = 1
real_col_name = 'x'
sim_col_name = real_col_name + '_sim'

fig = go.Figure()
fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][real_col_name],
                             name="Estimated " + real_col_name,
                             ))

# fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
#                          y=full_test[full_test.sat_id == sat_id][sim_col_name],
#                          name="Simulation " + real_col_name,
#                          ))

# fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
#                          y=full_test[full_test.sat_id == sat_id][sim_col_name],
#                          name="Sim_test " + real_col_name,
#                          ))
fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][real_col_name],
                         name="Real " + real_col_name,
                         ))

# fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
#                          y=full_train[full_train.sat_id == sat_id][sim_col_name],
#                          name="Simulation " + real_col_name,
#                          ))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch.values[1:],
                         y=full_train[full_train.sat_id == sat_id].diff_seconds.values[1:],
                         name="Diff_time",
                         ))

fig.update_layout(title_text='Time Series for ' + real_col_name,
                  xaxis_rangeslider_visible=True,
                  yaxis_title=real_col_name)
fig.show()

In [None]:
full_train[(full_train.diff_seconds < 1) & (full_train.diff_seconds > -1)]

In [None]:
# отрисовка нужного рядя для анализа

figure = plt.figure(figsize=(10, 5))


sat_id = 33

real_col_name = 'x'
sim_col_name = real_col_name + '_sim'

fig = go.Figure()
fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][real_col_name],
                             name="Estimated " + real_col_name,
                             ))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Simulation " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Sim_test " + real_col_name,
                         ))
fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][real_col_name],
                         name="Real " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][sim_col_name],
                         name="Simulation " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch.values[1:],
                         y=full_train[full_train.sat_id == sat_id].diff_seconds.values[1:],
                         name="Diff_time",
                         ))

fig.update_layout(title_text='Time Series for ' + real_col_name,
                  xaxis_rangeslider_visible=True,
                  yaxis_title=real_col_name)
fig.show()

In [None]:
# отрисовка нужного рядя для анализа

figure = plt.figure(figsize=(10, 5))


sat_id = uniq[13]
real_col_name = 'y'
sim_col_name = real_col_name + '_sim'

fig = go.Figure()
fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                             y=full_test[full_test.sat_id == sat_id][real_col_name],
                             name="Estimated " + real_col_name,
                             ))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Simulation " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_test[full_test.sat_id == sat_id].epoch,
                         y=full_test[full_test.sat_id == sat_id][sim_col_name],
                         name="Sim_test " + real_col_name,
                         ))
fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][real_col_name],
                         name="Real " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch,
                         y=full_train[full_train.sat_id == sat_id][sim_col_name],
                         name="Simulation " + real_col_name,
                         ))

fig.add_trace(go.Scatter(x=full_train[full_train.sat_id == sat_id].epoch.values[1:],
                         y=full_train[full_train.sat_id == sat_id].diff_seconds.values[1:],
                         name="Diff_time",
                         ))

fig.update_layout(title_text='Time Series for ' + real_col_name,
                  xaxis_rangeslider_visible=True,
                  yaxis_title=real_col_name)
fig.show()

In [None]:
uniq = full_test.sat_id.unique()