In [1]:
import glob
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.manifold import TSNE
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
def calc_price_from_tick(df):
    tick = sorted(np.diff(sorted(np.unique(df.values.flatten()))))[0]
    return 0.01/tick

def calc_prices(r):
    df = pd.read_parquet(r.book_path,
                         columns=[
                             'time_id',
                             'ask_price1',
                             'ask_price2',
                             'bid_price1',
                             'bid_price2'
                         ])
    df = df.groupby('time_id', group_keys=False) \
        .apply(calc_price_from_tick).to_frame('price').reset_index()
    df['stock_id'] = r.stock_id
    return df


def reconstruct_time_id_order():
    paths = glob.glob('optiver-realized-volatility-prediction/book_train.parquet/**/*.parquet')

    df_files = pd.DataFrame(
        {'book_path': paths}) \
        .eval('stock_id = book_path.str.extract("stock_id=(\d+)").astype("int")',
              engine='python')

    # build price matrix using tick-size
    df_prices = pd.concat(
        Parallel(n_jobs=4)(
            delayed(calc_prices)(r) for _, r in df_files.iterrows()
        )
    )
    df_prices = df_prices.pivot(index='time_id', columns='stock_id', values='price')

    # t-SNE to recovering time-id order
    clf = TSNE(
        n_components=1,
        perplexity=400,
        random_state=0,
        n_iter=2000
    )
    compressed = clf.fit_transform(
        pd.DataFrame(minmax_scale(df_prices.fillna(df_prices.mean())))
    )

    order = np.argsort(compressed[:, 0])
    ordered = df_prices.reindex(order).reset_index(drop=True)

    # correct direction of time-id order using known stock (id61 = AMZN)
    if ordered[61].iloc[0] > ordered[61].iloc[-1]:
        ordered = ordered.reindex(ordered.index[::-1])\
            .reset_index(drop=True)

    return ordered

In [9]:
timeseries = reconstruct_time_id_order()



In [17]:
timeseries.dropna(inplace=True)

In [18]:
timeseries.shape

(460, 112)

In [19]:
timeseries.head()

stock_id,0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,...,82,83,84,85,86,87,88,89,90,93,94,95,96,97,98,99,100,101,102,103,104,105,107,108,109,110,111,112,113,114,115,116,118,119,120,122,123,124,125,126
8,206.615952,108.100616,137.743973,210.240799,616.809387,693.273376,290.766296,244.565826,122.104919,165.782761,62.206955,120.873314,36.472206,97.202873,116.025002,111.107391,339.619751,1215.740234,71.636276,123.180733,34.56369,247.451553,328.965027,68.478432,295.373505,115.070068,282.444702,185.178986,6.25502,24.958666,381.300354,207.638809,41.100479,112.297295,1784.810181,276.851746,152.797958,607.870117,42.090355,95.705734,...,99.627174,94.042686,197.379013,39.718788,58.375835,141.46051,321.402588,28.130812,352.462524,136.178696,86.97364,139.345642,493.44751,207.638809,460.912506,66.470741,96.199631,101.311691,322.638763,586.615906,138.197815,48.349323,73.778435,23.649866,202.623383,371.177338,49.099255,323.88446,1582.756226,314.180054,82.809555,212.369812,272.357391,49.257828,83.13784,110.376419,77.172104,83.802277,57.693314,203.606979
9,208.154037,112.447823,142.179794,215.092514,603.496948,735.842773,521.031555,233.665955,132.73114,204.102386,61.862888,135.082245,42.217453,103.180908,124.830475,129.6539,335.544312,1290.555054,72.628639,115.545563,31.207619,253.432266,374.491425,73.519791,451.000427,111.699173,285.326782,189.3591,7.735,24.888319,425.817657,211.833527,34.836411,128.462601,1823.610352,265.46228,145.383148,574.562195,38.782284,100.824615,...,94.042686,90.491997,205.100433,37.566536,52.20042,150.064545,339.619751,30.693771,326.404968,139.577499,93.832306,148.208618,487.709747,217.321442,460.912506,60.414894,100.824615,113.054016,332.881256,640.351746,149.263489,43.919415,84.054184,23.28875,206.108307,377.865204,54.050308,327.679993,1613.193848,326.404968,80.273758,225.500214,209.192215,43.942421,81.127731,109.942436,89.145676,80.043968,56.794907,205.100433
11,212.908829,111.402496,140.985001,231.091125,621.378357,739.084412,383.041443,255.360962,121.047729,182.758347,62.836014,127.680481,37.974686,103.307976,118.81881,115.228127,350.987762,1133.595703,75.301689,127.19648,34.435993,250.03302,361.577911,69.327339,318.353241,116.508438,292.796082,191.084457,6.525053,26.128664,388.361481,221.043686,39.803596,119.410789,1950.838989,283.398926,153.637512,599.186279,41.221661,101.556992,...,94.893753,93.310432,199.254349,37.382389,54.120049,143.395004,340.308624,30.207447,352.462524,142.663391,95.001221,144.382233,537.731262,218.738144,477.983337,67.001663,104.335915,105.783203,343.092346,594.936707,142.300385,47.907528,77.636353,24.778048,211.034164,379.575012,51.685814,338.250305,1677.721558,320.175873,85.380234,219.310013,275.94104,52.379692,80.659691,111.402496,87.518082,84.605225,59.346359,208.412613
17,208.671829,105.120399,118.483162,169.466827,521.031555,594.936707,254.200241,215.092514,123.361877,187.036957,61.455002,120.35305,37.349098,98.45784,123.908531,117.323189,308.972656,998.643799,71.57515,121.31031,32.400959,247.087128,315.361206,62.000057,268.006653,107.546257,250.406204,167.772156,7.0051,25.025681,326.404968,181.179428,46.942406,90.491997,1677.721558,271.915985,143.640549,635.50061,45.63987,102.300095,...,89.62188,108.660721,175.127518,42.864628,49.902485,129.254364,273.244568,26.512667,367.921387,131.896347,92.385551,127.875122,471.270111,200.924744,432.402466,75.64119,89.95826,96.420776,296.417236,562.993835,151.692734,42.517017,74.831467,27.625912,206.108307,371.177338,60.262989,294.337128,1471.685547,296.417236,70.849731,194.631271,298.526978,53.042099,77.243164,94.893753,75.64119,55.151924,52.330677,194.18074
40,215.092514,109.942436,126.716133,74.433075,513.064697,616.809387,76.46862,211.299942,135.300125,132.104065,64.157616,135.518707,47.86652,136.845154,120.525978,137.293091,326.404968,665.762512,76.190804,143.886932,30.581873,196.454514,315.361206,62.508255,279.62027,82.809555,224.895645,163.839996,11.435632,34.309235,290.263245,230.456253,67.028427,92.794334,2046.001953,291.271118,148.998367,361.577911,49.200047,122.282913,...,90.006523,78.840302,116.025002,53.635601,35.810493,125.577965,284.359589,35.719002,331.565521,143.395004,117.983231,133.576553,327.679993,126.334457,439.194122,63.358067,145.383148,102.425003,291.271118,539.460327,178.861572,35.956314,89.910049,48.377209,248.18367,297.468353,62.276226,276.851746,1497.965698,300.666962,67.378372,153.919403,235.635056,80.197014,60.436657,74.764778,93.000084,51.527077,55.738258,181.571594
