In [None]:
'''
@Author: Yitao Qiu
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
#import torch
from copy import deepcopy
import os

### Read data

In [None]:
# Asset symbol: it selects the input CSV here and prefixes the output paths used later.
product = 'eth'
source_csv = f'datasets/{product}_usdt.csv'
df = pd.read_csv(source_csv)
# Bare expression so the notebook renders the loaded frame.
df

In [None]:
# Create the output directories for the cached moving averages and for the labels.
# exist_ok=True turns the call into a no-op when the directory is already there,
# so the cell can be re-run freely and there is no gap between checking for the
# directory and creating it.
for out_dir in (f'./price_{product}', f'./label_{product}'):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Window lengths (in rows) for the moving averages; a later cell indexes this
# list to choose the labelling horizon.
k = [20, 30, 50, 100]

In [None]:
# m_-(t): mean of the `window` mid-prices ending at row t (rows t-window+1 .. t),
# computed for t = window .. len(df)-1 and saved once per window length in k.
#
# A rolling mean replaces the per-row Python loop, which sliced the Series once
# for every row. min_periods=1 keeps the NaN-skipping behaviour of Series.mean()
# inside a window. Values agree with the per-window means up to floating-point
# rounding.
for window in k:
    trailing_mean = df['mid_price'].rolling(window, min_periods=1).mean()
    # Drop the first `window` rows so that entry 0 corresponds to t = window.
    m_minus = trailing_mean.to_numpy()[window:]
    np.save(f"price_{product}/m_minus_{window}", m_minus)

In [None]:
# m_+(t): mean of the `window` mid-prices that follow row t (rows t+1 .. t+window),
# computed for t = 0 .. len(df)-window-1 and saved once per window length in k.
#
# A rolling mean replaces the per-row Python loop, which sliced the Series once
# for every row. min_periods=1 keeps the NaN-skipping behaviour of Series.mean()
# inside a window. Values agree with the per-window means up to floating-point
# rounding.
for window in k:
    trailing_mean = df['mid_price'].rolling(window, min_periods=1).mean()
    # The window covering rows t+1 .. t+window ends at row t+window, so entry t
    # of m_plus is the trailing mean taken at row t+window.
    m_plus = trailing_mean.to_numpy()[window:]
    np.save(f"price_{product}/m_plus_{window}", m_plus)

### Make labels using the smoothed labelling method

In [None]:
# Pick the labelling horizon: `horizon` is a position in the list k.
horizon = 3
k_selected = k[horizon]
# Threshold on the relative price change that the labelling step compares against.
alpha = 9.2e-5

# Reload the moving averages cached on disk for the selected window.
m_minus = np.load(f"price_{product}/m_minus_{k_selected}.npy")
m_plus = np.load(f"price_{product}/m_plus_{k_selected}.npy")
for cached in (m_minus, m_plus):
    print(cached.shape)

In [None]:
# Put both arrays on the same time axis. As computed earlier in the notebook,
# entry j of m_minus refers to row j + k_selected while entry j of m_plus refers
# to row j, so m_minus loses its last k_selected entries and m_plus its first.
# NOTE: this cell is not idempotent; running it twice crops the arrays again.
keep = len(m_minus) - k_selected
m_minus = m_minus[:keep]
m_plus = m_plus[k_selected:]
print(m_minus.shape, m_plus.shape, sep="\n")

In [None]:
# l(t): relative change from the mean of the past window (m_minus) to the mean
# of the future window (m_plus), thresholded into three classes:
#   2 when the change is above  alpha
#   0 when the change is below -alpha
#   1 otherwise (NaN changes also land here, because both comparisons are False)
# Vectorized with NumPy instead of a Python loop over every row.
l_v = (m_plus[:len(m_minus)] - m_minus) / m_minus
label = np.where(l_v > alpha, 2, np.where(l_v < -alpha, 0, 1))

In [None]:
# Class counts over the whole label array: printed as a dict and drawn as bars.
plt.rcParams["figure.figsize"] = (5,5)
unique, counts = np.unique(label, return_counts=True)
print({value: count for value, count in zip(unique, counts)})
plt.bar(unique, height=counts)

In [None]:
# Class balance (in percent) of the leading slice of the labels.
# NOTE(review): 5110311 is presumably the row where the training split ends;
# confirm it against the split used for training.
train_end = 5110311 - k_selected
unique, counts = np.unique(label[:train_end], return_counts=True)
print({value: count for value, count in zip(unique, counts)})
plt.rcParams["figure.figsize"] = (5,5)
all_count = counts.sum()
plt.bar(unique, height=counts / all_count * 100)

In [None]:
# Class balance (in percent) of the middle slice of the labels.
# NOTE(review): 5110311 and 7675082 are presumably the validation split
# boundaries; confirm them against the split used for training.
val_start = 5110311 - k_selected
val_end = 7675082
unique, counts = np.unique(label[val_start:val_end], return_counts=True)
print({value: count for value, count in zip(unique, counts)})
plt.rcParams["figure.figsize"] = (5,5)
all_count = counts.sum()
plt.bar(unique, height=counts / all_count * 100)

In [None]:
# Class balance (in percent) of the trailing slice of the labels.
# NOTE(review): 7675082 and 10255344 are presumably the test split boundaries;
# confirm them against the split used for training.
test_start = 7675082
test_end = 10255344 - k_selected
unique, counts = np.unique(label[test_start:test_end], return_counts=True)
print({value: count for value, count in zip(unique, counts)})
plt.rcParams["figure.figsize"] = (5,5)
all_count = counts.sum()
plt.bar(unique, height=counts / all_count * 100)

In [None]:
# Persist the labels of the selected window; np.save adds the ".npy" suffix.
np.save(file=f"label_{product}/label_{k_selected}", arr=label)

### Plot to check whether the labels are correct

In [None]:
# Inspect the labels saved for the window of 100; k_selected is set to match
# so the plotting cell uses the same window length.
k_selected = 100
label = np.load(f"label_{product}/label_100.npy")

In [None]:
# Shade each step of the first 10000 rows by its label, under the mid-price curve:
#   blue  - this label and the next one are both 1
#   green - this label and the next one are both 2
#   red   - everything else (label 0, or the label changes at the next step)
plt.rcParams["figure.figsize"] = (10,5)
n_rows = 10000
# label[i] belongs to row i + k_selected of df (the notebook later pairs the
# window-100 labels with df[100:]), while the price curve is drawn against the
# row index. Shift every span by k_selected so it sits under its own price.
# The upper bound also keeps label[i+1] inside the array.
n_spans = min(n_rows - k_selected, len(label) - 1)
for i in range(0, n_spans):
    x = i + k_selected
    if label[i] == 1 and label[i+1] == 1:
        plt.axvspan(x, x+1, facecolor='b', alpha=0.5)
    elif label[i] == 2 and label[i+1] == 2:
        plt.axvspan(x, x+1, facecolor='g', alpha=0.5)
    else:
        plt.axvspan(x, x+1, facecolor='r', alpha=0.5)
plt.plot(df['mid_price'][k_selected:n_rows])


### Append the labels back to the original CSV

In [None]:
# Load the label arrays saved for windows 20, 30, 50 and 100 (in that order).
label_2, label_3, label_4, label_5 = [
    np.load(path)
    for path in (
        f"label_{product}/label_20.npy",
        f"label_{product}/label_30.npy",
        f"label_{product}/label_50.npy",
        f"label_{product}/label_100.npy",
    )
]

In [None]:
# Trim everything to the rows covered by the largest window (100).
# label_5 (window 100) is used as is; label_1 is an all-zero column of that length.
label_1 = np.zeros(label_5.shape[0])
# Each offset is 100 minus the window of that label file (20, 30, 50).
label_2 = label_2[80:-80]
label_3 = label_3[70:-70]
label_4 = label_4[50:-50]
# NOTE: this rebinds df, so re-running the cell crops the frame again.
df = df[100:-100]
print(df.shape)

In [None]:
# Add the five label columns at column positions 42 to 46, in order.
# NOTE(review): the positions assume the frame has 42 columns before this cell;
# confirm against the input CSV.
new_columns = (
    ("label_1", label_1),
    ("label_2", label_2),
    ("label_3", label_3),
    ("label_4", label_4),
    ("label_5", label_5),
)
for position, (name, values) in enumerate(new_columns, start=42):
    df.insert(position, name, values)

### Save the dataset

In [None]:
# Write the labelled frame into the datasets folder, without the index column.
output_csv = f'datasets/{product}_usdt_label.csv'
df.to_csv(output_csv, index=False)