data_processing.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


class DataProcessing:
    def __init__(self, split):
        # Fraction of the rows (in chronological order) used for the training set.
        self.split = split
    def make_features(self, file_path, window, csv_path, make_y=True, verbose=True, save_csv=False):
        """
        Builds return, moving-average, EWM and volatility features for every column after the
        first, plus liquidity (close * volume) features, from a CSV of price bars.
        The make_y flag is currently unused.
        """
        df = pd.read_csv(f"{file_path}", index_col=0)
        print(df.shape)
        cols = df.columns
        # print(type(df[cols[1]].iloc[1]))
        # print(df[cols[2]].head())
        # Build features for every column except the first.
        for i in cols[1:]:
            print(i)
            # 1-period log return of the raw series.
            df[f'{i}_ret1'] = np.log(df[i]/df[i].shift(1))
            # df[f'{i}_autocorr1'] = df[f'{i}_ret1'].corr(df[f'{i}_ret1'].shift(1))
            for j in range(2, window, 2):
                # print((df[i]/df[i].shift(1)).head())
                df[f'{i}_ret{j}'] = np.log(df[i]/df[i].shift(j))
                df[f'{i}_mavg{j}'] = df[f'{i}_ret1'].rolling(j).mean()
                df[f'{i}_ewm{j}'] = df[f'{i}_ret1'].ewm(span=j).mean()
                # df[f'{i}_autocorr{j}'] = df[f'{i}_ret1'].corr(df[f'{i}_ret1'].shift(j))
            if window > 10:
                for j in range(10, window, 5):
                    df[f'{i}_vol{j}'] = df[f'{i}_ret1'].rolling(j).std()
                    df[f'{i}_ewmvol{j}'] = df[f'{i}_ret1'].ewm(span=j).std()
                    # Note: this break stops after the first pass, so only the
                    # j=10 volatility features are generated.
                    break
        # Liquidity proxy and its features.
        df['liq'] = df['close']*df['volume']
        df['liq_ret1'] = df['liq']/df['liq'].shift(1)
        # df['liq_autocorr1'] = df['liq'].corr(df['liq'].shift(1))
        for j in range(2, window, 2):
            df[f'liq_ret{j}'] = df['liq']/df['liq'].shift(j)
            df[f'liq_mavg{j}'] = df['liq'].rolling(j).mean()
            df[f'liq_ewm{j}'] = df['liq'].ewm(span=j).mean()
            # df[f'liq_autocorr{j}'] = df['liq'].corr(df['liq'].shift(j))
        if window > 10:
            for j in range(10, window, 5):
                # df[f'liq_vol{j}'] = df['liq'].rolling(j).std()
                # df[f'liq_ewmvol{j}'] = df['liq'].ewm(span=j).std()
                break
        # Drop the rows with NaNs introduced by the shifts and rolling windows.
        df = df.dropna()
        if verbose:
            print(df.shape)
            print(df.tail())
            print(df.head())
        if save_csv:
            df.to_csv(f'{csv_path}/full_features.csv')
        return df
    def make_train_test(self, df_x, df_y, window, csv_path, has_y=False, binary_y=False, save_csv=False):
        """
        Splits the dataset into train and test sets, chronologically.

        A commented usage sketch appears at the bottom of this file.

        :param df_x: dataframe of x variables
        :type df_x: pd.DataFrame
        :param df_y: dataframe of y values, only used when has_y is True
        :type df_y: pd.DataFrame
        :param window: the prediction window; 0 uses the raw close price as the target
        :type window: int
        :param csv_path: subdirectory of data/processed_data the CSVs are written to
        :type csv_path: str
        :param has_y: whether df_y is supplied separately; if False the target is derived from df_x['close']
        :type has_y: bool
        :param binary_y: whether to collapse the target into -1/0/1 direction labels
        :type binary_y: bool
        :param save_csv: whether to write the resulting frames to CSV
        :type save_csv: bool
        :return: fulldata, y_values, train_x, train_y, test_x, test_y
        :rtype: pd.DataFrame
        """
        if has_y:
            y_values = df_y.copy()
            y_values.columns = ['y_values']
            fulldata = df_x.copy()
        else:
            if window == 0:
                # Use the raw close price as the target.
                y_values = df_x['close'].copy().to_frame('y_values')
                fulldata = df_x.copy()
            else:
                # Target is the log-ratio of the current close to the close `window` steps ahead.
                y_values = np.log(df_x['close'] / df_x['close'].shift(-window)).dropna().to_frame('y_values')
                # Drop the last `window` rows, which have no target.
                fulldata = df_x.iloc[:-window, :].copy()
        if binary_y:
            # Collapse the continuous target into direction labels {-1, 0, 1}.
            y_values.loc[y_values['y_values'] < 0] = -1
            y_values.loc[y_values['y_values'] > 0] = 1
            y_values.loc[y_values['y_values'] == 0] = 0
        print(y_values.shape)
        print(fulldata.shape)
        # Chronological split; note that the row at split_idx ends up in neither set.
        split_idx = int(len(y_values) * self.split)
        train_y = y_values.iloc[:split_idx]
        test_y = y_values.iloc[split_idx + 1:]
        train_x = fulldata.iloc[:split_idx, :]
        test_x = fulldata.iloc[split_idx + 1:len(y_values), :]
        print(train_y.shape)
        print(train_x.shape)
        if save_csv:
            train_x.to_csv(f'data/processed_data/{csv_path}/train_x.csv')
            train_y.to_csv(f'data/processed_data/{csv_path}/train_y.csv', header=['y_values'])
            test_x.to_csv(f'data/processed_data/{csv_path}/test_x.csv')
            test_y.to_csv(f'data/processed_data/{csv_path}/test_y.csv', header=['y_values'])
            fulldata.to_csv(f'data/processed_data/{csv_path}/full_x.csv')
            y_values.to_csv(f'data/processed_data/{csv_path}/full_y.csv', header=['y_values'])
        return fulldata, y_values, train_x, train_y, test_x, test_y
    def check_labels(self, y_values):
        """Prints the class distribution of the 'y_values' column."""
        print(y_values['y_values'].value_counts())
if __name__ == "__main__":
    # preprocess = DataProcessing(0.8)
    # df = preprocess.make_features(file_path="price_bars/tick_bars.csv", window=20,
    #                               csv_path="autoencoder_data", save_csv=True)
    # fulldata, y_values, train_x, train_y, test_x, test_y = preprocess.make_train_test(df_x=df, df_y=None, window=1,
    #                                                                                   csv_path="autoencoder_data", save_csv=True)
    preprocess = DataProcessing(0.8)
    df1 = pd.read_csv("../data/processed_data/nn_data/full_x.csv", index_col=0)
    df2 = pd.read_csv('../data/processed_data/autoencoder_data/full_y.csv', index_col=0)
    fulldata, y_values, train_x, train_y, test_x, test_y = preprocess.make_train_test(df_x=df1, df_y=df2, window=1,
                                                                                      csv_path="train_test_data", has_y=True, save_csv=True)