In [None]:
%reload_ext autoreload
%autoreload

import os
from itertools import product

import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

In [None]:
# Sanity check: displays True when the benchmark dataset folder is present at this
# absolute path, False otherwise.
os.path.exists("C:/Users/q1371/Desktop/Projects/BenchmarkDatasets")

In [None]:
# Normalisation variant to work with; must be one of the three names checked below.
norm_type = 'DecPre'
assert norm_type in ['Zscore','MinMax','DecPre']

# Numeric prefix that goes with each variant when the dataset paths are built.
norm_num = {'Zscore': 1, 'MinMax': 2, 'DecPre': 3}[norm_type]
# File names use the spelling 'ZScore' (capital S) for the Zscore variant;
# the other variants use their name unchanged.
file_type = 'ZScore' if norm_type == 'Zscore' else norm_type

In [None]:
# Project folder that contains BenchmarkDatasets.
root = "C:/Users/q1371/Desktop/Projects"

# Directories holding the training and testing files of the selected normalisation.
Trainset_path = f'{root}/BenchmarkDatasets/NoAuction/{norm_num}.NoAuction_{norm_type}/NoAuction_{norm_type}_Training'
Testset_path = f'{root}/BenchmarkDatasets/NoAuction/{norm_num}.NoAuction_{norm_type}/NoAuction_{norm_type}_Testing'

# Training file CF_1 and testing files CF_7, CF_8 and CF_9.
Train_path = os.path.join(Trainset_path,f'Train_Dst_NoAuction_{file_type}_CF_1.txt')
Test_path_1, Test_path_2, Test_path_3 = (
    os.path.join(Testset_path, file_name)
    for file_name in (
        f'Test_Dst_NoAuction_{file_type}_CF_7.txt',
        f'Test_Dst_NoAuction_{file_type}_CF_8.txt',
        f'Test_Dst_NoAuction_{file_type}_CF_9.txt',
    )
)

In [None]:
def load_raw(
    day: int,
    normalization="DecPre",
    base_dir="C:/Users/q1371/Desktop/Projects/BenchmarkDatasets/NoAuction",
) -> np.ndarray:
    """Load the raw text matrix that belongs to one day.

    Parameters
    ----------
    day : int
        ``1`` reads the training file ``..._CF_1.txt``; any other value reads
        the testing file ``..._CF_{day-1}.txt``.
    normalization : str
        One of ``"Zscore"``, ``"MinMax"`` or ``"DecPre"``.
    base_dir : str
        Folder containing the ``<index>.NoAuction_<normalization>`` directories.
        Defaults to the location this notebook was written for.

    Returns
    -------
    np.ndarray
        Whatever ``np.loadtxt`` parses from the selected file.

    Raises
    ------
    KeyError
        If ``normalization`` is not one of the supported names.
    """
    # Same numbering as the notebook's norm_type configuration cell.
    index = {"Zscore": 1, "MinMax": 2, "DecPre": 3}[normalization]
    # Directory names spell "Zscore", file names spell "ZScore".
    file_tag = "ZScore" if normalization == "Zscore" else normalization
    root = f"{base_dir}/{index}.NoAuction_{normalization}"

    if day == 1:
        path = f"{root}/NoAuction_{normalization}_Training/Train_Dst_NoAuction_{file_tag}_CF_1.txt"
    else:
        path = f"{root}/NoAuction_{normalization}_Testing/Test_Dst_NoAuction_{file_tag}_CF_{day-1}.txt"
    return np.loadtxt(path)

In [None]:
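# # How to find the split points between stocks
# # 1. Take the difference between each value and the previous one
# # 2. Sort these differences
# # 3. The differences between different stocks are certainly the largest
# # 4. Find the five split points with the largest differences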
# diff = np.diff(a_raw[0],prepend=np.inf)

# diff = np.abs(diff)
# sort_idx = np.argsort(diff)
# split_point = np.sort(sort_idx[-5:])
# split_point = np.append(split_point,[a_raw.shape[1]])
# # for i in range(len(sort_idx)):
# #     print(sort_idx[i])
# split_point

In [None]:
def split(array, n_stocks=5):
    """Cut ``array`` column-wise into ``n_stocks`` consecutive blocks.

    The cut points are the ``n_stocks`` positions in the first row with the
    largest absolute change from the previous entry. Position 0 always counts,
    because the row is compared against a prepended ``inf``.

    Returns a tuple of ``n_stocks`` column slices of ``array``, left to right.
    """
    first_row_jumps = np.abs(np.diff(array[0], prepend=np.inf))
    largest_jumps = np.argsort(first_row_jumps)[-n_stocks:]
    # Ascending start columns, closed off by the total column count.
    edges = np.append(np.sort(largest_jumps), [array.shape[1]])

    pieces = []
    for k in range(n_stocks):
        pieces.append(array[:, edges[k] : edges[k + 1]])
    return tuple(pieces)

In [None]:
def to_dataframe(array) -> pd.DataFrame:
    """Rearrange the rows of ``array`` into named DataFrame columns.

    Rows ``4*level + 0/1/2/3`` (level 0..9) become ``PRICE_ASK_<level>``,
    ``VOLUME_ASK_<level>``, ``PRICE_BID_<level>`` and ``VOLUME_BID_<level>``.
    The last five rows become the ``LABEL_*TICK`` columns, stored as
    ``2 - value``.

    Column order: all ask prices, all bid prices, all ask volumes, all bid
    volumes, then the five labels.
    """
    # (column prefix, row offset inside each group of four rows), in output order.
    layout = (
        ("PRICE_ASK", 0),
        ("PRICE_BID", 2),
        ("VOLUME_ASK", 1),
        ("VOLUME_BID", 3),
    )
    columns = {
        f"{prefix}_{level}": array[4 * level + offset]
        for prefix, offset in layout
        for level in range(10)
    }

    # Rows -5..-1 hold the labels for the 1, 2, 3, 5 and 10 tick columns.
    for row, ticks in zip(range(-5, 0), (1, 2, 3, 5, 10)):
        columns[f"LABEL_{ticks}TICK"] = 2 - array[row]

    return pd.DataFrame(columns)

In [None]:
def revert_decimal_normalization(df, max_denom=10 ** 8):
    """Turn the decimal-scaled feature columns back into integers.

    Every column except the last five is multiplied by ``max_denom`` and
    rounded to the nearest integer. It is then divided by 10 for as long as
    *all* of its entries are multiples of 10. The last five columns are
    returned unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose last five columns are not features. It is not modified.
    max_denom : int
        Scale factor applied before rounding.

    Returns
    -------
    pd.DataFrame
        A new frame with integer-typed feature columns.

    Raises
    ------
    AssertionError
        If any feature value is exactly zero.
    """
    assert (df.iloc[:, :-5].values != 0).all()

    # Work on a copy so the caller's frame stays usable if the cell is re-run.
    out = df.copy()
    for c in out.columns[:-5]:
        col = (out[c] * max_denom).round().astype("int64")
        # `.any()` ends the loop for an empty column and for one that rounds
        # to all zeros; both satisfy "every entry is a multiple of 10" forever.
        while col.any() and (col % 10 == 0).all():
            col = col // 10
        out[c] = col
    return out

In [None]:
# Load days 1..10 with the default ("DecPre") normalisation, cut each daily matrix
# into its five column blocks and cache the frames as dfs[stock][day] (both 1-based).
dfs = {i_stock: {} for i_stock in range(1, 5 + 1)}

for day in tqdm(range(1, 10 + 1)):
    a_raw = load_raw(day)
    # enumerate() counts from 0, hence the +1 to get the 1-based stock key.
    for i_stock, a in enumerate(split(a_raw)):
        dfs[i_stock + 1][day] = to_dataframe(a)

In [None]:
# Stack the de-normalised frames of every (stock, day) pair, stock-major, and
# write the result to data_denorm.csv.
listdf = []

for i_stock, day in tqdm(list(product(range(1, 5 + 1), range(1, 10 + 1)))):
    # Work on a copy so that adding STOCK/DAY below never alters the frames
    # cached in `dfs`; this keeps the cell safe to re-run.
    df = revert_decimal_normalization(dfs[i_stock][day].copy())
    df["STOCK"] = i_stock
    df["DAY"] = day
    # Move the two identifier columns to the front.
    df = df[list(df.columns[-2:]) + list(df.columns[:-2])]

    listdf.append(df)

dataframe = pd.concat(listdf, axis=0)
dataframe = dataframe.reset_index(drop=True)
dataframe = dataframe.astype(int)
dataframe.to_csv("data_denorm.csv")

In [None]:
# Display the stacked frame built in the previous cell.
dataframe

In [None]:
# Rebuild the dfs[stock][day] cache (both keys 1-based) from the "Zscore" files.
# This replaces the frames cached by the earlier loading cell.
dfs = {i_stock: {} for i_stock in range(1, 5 + 1)}

for day in tqdm(range(1, 10 + 1)):
    a_raw = load_raw(day, normalization="Zscore")
    # enumerate() counts from 0, hence the +1 to get the 1-based stock key.
    for i_stock, a in enumerate(split(a_raw)):
        dfs[i_stock + 1][day] = to_dataframe(a)

In [None]:
# Stack the frames of every (stock, day) pair, stock-major, keeping the feature
# values exactly as loaded (no de-normalisation), and write data_no_date.csv.
listdf = []

for i_stock, day in tqdm(list(product(range(1, 5 + 1), range(1, 10 + 1)))):
    # Copy so that the frames cached in `dfs` do not gain STOCK/DAY columns.
    df = dfs[i_stock][day].copy()
    df["STOCK"] = i_stock
    df["DAY"] = day
    # Move the two identifier columns to the front.
    df = df[list(df.columns[-2:]) + list(df.columns[:-2])]

    listdf.append(df)

dataframe = pd.concat(listdf, axis=0)
dataframe = dataframe.reset_index(drop=True)
# Cast the five label columns to int with astype(). Assigning through
# `.iloc[:, -5:] = ...` sets values in place and, depending on the pandas
# version, leaves the columns as float, so the CSV would contain 1.0 not 1.
dataframe = dataframe.astype({label: int for label in dataframe.columns[-5:]})
dataframe.to_csv("data_no_date.csv")

In [None]:
# Display the stacked frame built in the previous cell.
dataframe

In [None]:
# Display only the rows whose STOCK column equals 1.
dataframe.loc[dataframe['STOCK']==1]

In [None]:
# Start and end timestamp strings for the ten days, one entry per DAY value
# (DAY d uses index d-1). Each day starts at 07:30:00.000 and ends at 15:25:00.000.
day_begin = [
    f"2010-06-{day_of_month} 07:30:00.000"
    for day_of_month in ("01", "02", "03", "04", "07", "08", "09", "10", "11", "14")
]
# Same dates, with the start time swapped for the end time.
day_end = [stamp.replace("07:30:00.000", "15:25:00.000") for stamp in day_begin]


# day_one_begin = datetime.strptime(day_begin[0], "%Y-%m-%d %H:%M:%S.%f")
# day_one_end = datetime.strptime(day_end[0], "%Y-%m-%d %H:%M:%S.%f")

In [None]:
# Display how many end timestamps are defined.
len(day_end)

In [None]:
# step = (day_one_end - day_one_begin)/3454

In [None]:
# new = day_one_begin + step

In [None]:
# print(new)

In [None]:
# Give every row of `dataframe` a timestamp, evenly spaced between its day's
# begin and end, separately for each (STOCK, DAY) group.
#
# The slots are filled by row position, so all_date[p] always belongs to row p
# of `dataframe`. The frame is stacked stock-major; appending dates in a
# day-major loop would attach them to the wrong rows when the list is later
# inserted as a column.
all_date = [None] * len(dataframe)
for (k, j), positions in tqdm(dataframe.groupby(['STOCK', 'DAY']).indices.items()):
    # Parse the day's bounds once per group rather than once per row.
    d_begin = datetime.strptime(day_begin[int(j) - 1], "%Y-%m-%d %H:%M:%S.%f")
    d_end = datetime.strptime(day_end[int(j) - 1], "%Y-%m-%d %H:%M:%S.%f")
    step = (d_end - d_begin) / len(positions)
    # `positions` are the integer row positions of stock k on day j.
    for i, pos in enumerate(positions):
        all_date[pos] = d_begin + i * step


In [None]:
# Convert the list of timestamps to a NumPy array and save it to disk as 'date'.
all_date = np.array(all_date)
np.save('date',all_date)

In [None]:
# Wrap the timestamps in a DataFrame (its column gets a name in a later cell).
date_df = pd.DataFrame(all_date)

In [None]:
# Display the timestamp frame.
date_df

In [None]:
# Name the single timestamp column 'date'.
date_df.columns = ['date']

In [None]:
# Display the timestamp frame again, now with its column name.
date_df

In [None]:
# Attach the timestamps to `dataframe` as column "date" at position 1, passing
# the single column of `date_df` as a Series (aligned on the row index).
if "date" in dataframe.columns:
    # Cell re-run: refresh the existing column instead of failing on a duplicate insert.
    dataframe["date"] = date_df.iloc[:, 0]
else:
    dataframe.insert(1, "date", date_df.iloc[:, 0])

In [None]:
# All column names of `dataframe`, and the two columns to leave out of the export.
all_cols = dataframe.columns.values.tolist()
del_col = ['STOCK','DAY']

In [None]:
# Drop the columns listed in `del_col` from `all_cols`. Filtering, unlike
# list.remove(), does not raise ValueError when this cell is run a second time.
all_cols = [col for col in all_cols if col not in del_col]

In [None]:
# Build the export frame from `dataframe`, restricted to the columns in `all_cols`.
new_data = pd.DataFrame(dataframe,columns=all_cols)

In [None]:
# Display the export frame.
new_data

In [175]:
# Write the export frame to data.csv in the current working directory.
new_data.to_csv(f"data.csv")