In [1]:
import os
import threading
from concurrent.futures import ThreadPoolExecutor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [2]:
# Folder that holds the input CSV files (presumably one file per stock -- confirm).
input_folder = r"D:\AShare_2"
# os.listdir returns names in arbitrary order; sort them so the list of files
# (and therefore the order in which they are submitted for processing) is
# reproducible from run to run.
csv_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".csv"))

In [3]:
# Names of the columns that get_window standardises with a rolling z-score.
features = [
    'return1',
    'open', 'close', 'high', 'low',
    'volume', 'VWAP',
    'turn', 'free_turn',
]

In [4]:
# Module-level accumulators filled by get_window (one entry per processed file).
X, Y, Y_dates, empty = [], [], [], []
# get_window runs on several threads at once; this lock makes the three appends
# for one file happen as a unit so X[k], Y[k] and Y_dates[k] always belong to
# the same file.
_results_lock = threading.Lock()


def get_window(file_name):
    """Build sliding-window samples from one CSV file and store them globally.

    The file is read from ``input_folder``. Every column listed in ``features``
    is replaced by a rolling z-score that uses only earlier rows (mean and std
    over the previous 30 rows, shifted by one). Windows of 30 consecutive rows
    are then taken every 10 rows; each window is paired with the (normalised)
    ``return1`` value and the ``date`` found 40 rows after the window start.

    Parameters
    ----------
    file_name : str
        Name of a CSV file inside ``input_folder``. The file must contain a
        ``date`` column (not in position 0, because the first column is
        dropped from each window) and the columns named in ``features``.

    Returns
    -------
    None
        Results are appended to the module-level lists ``X``, ``Y`` and
        ``Y_dates``. If the file is too short to yield a single window, its
        identifier is appended to ``empty`` instead and nothing else is stored.
    """
    file_path = os.path.join(input_folder, file_name)
    df = pd.read_csv(file_path)

    norm_lookback = 30   # rows used for the rolling mean / std
    window_len = 30      # rows per input window
    target_offset = 40   # target row = window start + this offset
    stride = 10          # step between consecutive window starts

    # NOTE(review): the rolling statistics are undefined for the first
    # `norm_lookback` rows, so those rows normalise to NaN and the earliest
    # windows of every file contain NaNs -- confirm this is handled downstream.
    for feature in features:
        rolling = df[feature].rolling(norm_lookback)
        # shift(1) keeps the current row out of its own statistics; the small
        # epsilon avoids division by zero on constant stretches.
        df[feature] = (df[feature] - rolling.mean().shift(1)) / (rolling.std().shift(1) + 1e-8)

    x, y, dates = [], [], []
    for start in range(0, len(df) - target_offset, stride):
        target_row = start + target_offset
        dates.append(df['date'].iloc[target_row])

        # Drop the first column, index by date, and transpose so that rows are
        # columns of the file and columns are the dates of the window.
        window = df.iloc[start:start + window_len, 1:].set_index('date').transpose()
        x.append(np.array(window))

        y.append(df['return1'].iloc[target_row])

    if not x:
        # Not enough rows for even one window: record the file and stop here,
        # because np.stack raises on an empty list.
        has_code = 'code' in df.columns and len(df) > 0
        empty.append(df['code'].iloc[0] if has_code else file_name)
        return

    x = np.stack(x)
    y = np.stack(y)
    y_dates = np.stack(dates)

    with _results_lock:
        X.append(x)
        Y.append(y)
        Y_dates.append(y_dates)

In [5]:
# Start from fresh accumulators so that re-running this cell (for example after
# an interrupted run) does not append the same files a second time, and so it
# still works if X / Y were rebound to arrays by a later cell. get_window looks
# these names up at call time, so rebinding them here is sufficient.
X, Y, Y_dates, empty = [], [], [], []

with ThreadPoolExecutor() as executor:
    # list(...) drains the lazy map so every file is processed and any
    # exception raised inside a worker is re-raised here; tqdm shows progress.
    list(tqdm(executor.map(get_window, csv_files), total=len(csv_files)))

  7%|█████▏                                                                         | 317/4843 [02:52<40:56,  1.84it/s]


KeyboardInterrupt: 

In [None]:
# Flatten the per-file date arrays and compute the permutation that sorts the
# samples in ascending order of their date values.
Y_dates = np.concatenate(Y_dates, axis=0)
order = np.argsort(Y_dates)

# Flatten X and Y the same way and apply that permutation to all three arrays.
# Note: this rebinds X, Y and Y_dates from lists of arrays to single arrays.
X, Y, Y_dates = (
    np.concatenate(X, axis=0)[order],
    np.concatenate(Y, axis=0)[order],
    Y_dates[order],
)

print('Shape of X: ', X.shape)
print('Shape of Y: ', Y.shape)
print('Stocks with not enough data: ', empty)

In [None]:
# Write the three result arrays to .npy files in the current working directory.
for out_name, out_array in (
    ('X_fe.npy', X),
    ('Y_fe.npy', Y),
    ('Y_dates.npy', Y_dates),
):
    np.save(out_name, out_array)