In [3]:
import yaml
import pickle
import numpy as np
import pandas as pd
import logging

# Logging setup: root logger writes records at INFO level and above to
# make_dataset.log, each line prefixed with timestamp and level name.
logging.basicConfig(
    level=logging.INFO,
    filename='make_dataset.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def make_DL_dataset(data, data_len, batch_size=1000):
    """
    Build every contiguous ``data_len``-row window of ``data``.

    Window ``k`` covers rows ``k`` .. ``k + data_len - 1`` (positional), so
    ``max(len(data) - data_len + 1, 0)`` windows are produced, in row order.
    The output array is allocated once and filled in chunks from a
    sliding-window view, so no per-window intermediate copies are kept.

    Args:
        data: pandas DataFrame (or Series) whose values can be cast to float32.
        data_len: Number of rows in each window; must be positive.
        batch_size: Number of windows copied into the output per step; must
            be positive. It does not affect the result.

    Returns:
        tuple ``(dataset, times)``:
            dataset: float32 ndarray of shape ``(n_windows, data_len, n_columns)``
                for a DataFrame (``(n_windows, data_len)`` for a Series). When
                ``data`` has fewer than ``data_len`` rows the array is empty
                but keeps that rank and dtype.
            times: list with one pandas Index per window holding that window's
                index labels converted to strings.

    Raises:
        ValueError: If ``data_len`` or ``batch_size`` is not positive.
    """
    # Local import keeps this function self-contained (requires NumPy >= 1.20).
    from numpy.lib.stride_tricks import sliding_window_view

    if data_len <= 0:
        raise ValueError("data_len must be a positive integer")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Cast once for the whole frame instead of once per window.
    values = np.asarray(data.values, dtype=np.float32)
    # Labels are formatted once for the whole index, so every window uses the
    # same string format; each window then takes a cheap slice of this Index.
    labels = data.index.astype(str)

    n_windows = max(len(data) - data_len + 1, 0)
    dataset = np.empty((n_windows, data_len) + values.shape[1:], dtype=np.float32)
    if n_windows == 0:
        # Too few rows for a single window: empty result with a stable shape.
        return dataset, []

    # Zero-copy view of shape (n_windows, *column_dims, data_len); the window
    # axis is last, so it is moved to position 1 while copying into `dataset`.
    view = sliding_window_view(values, data_len, axis=0)
    for start in range(0, n_windows, batch_size):
        stop = min(start + batch_size, n_windows)
        dataset[start:stop] = np.moveaxis(view[start:stop], -1, 1)

    times = [labels[k:k + data_len] for k in range(n_windows)]
    return dataset, times

def save_pickle(filename, data):
    """
    Serialize ``data`` to ``filename`` with pickle.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is written using the highest pickle protocol available.

    Args:
        filename: Path of the file to write.
        data: Any picklable object.
    """
    with open(filename, 'wb') as sink:
        writer = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(data)

def data_split(data, train_len, pred_len, train_dates, val_dates, test_dates, batch_size=1000):
    """
    Slice ``data`` into train/validation/test date ranges and window each one.

    Each ``*_dates`` argument is a ``(start, end)`` pair that is parsed with
    ``pd.to_datetime`` and used as a label slice ``data.loc[start:end]``
    (pandas label slices include both endpoints). Every slice is then passed
    to ``make_DL_dataset`` with a window length of ``train_len + pred_len``.

    Args:
        data: Frame to split; presumably indexed by a sorted DatetimeIndex so
            that the label slices select contiguous date ranges (verify
            against the caller).
        train_len: Rows of each window counted as model input.
        pred_len: Rows of each window counted as prediction horizon.
        train_dates: ``(start, end)`` of the training range.
        val_dates: ``(start, end)`` of the validation range.
        test_dates: ``(start, end)`` of the test range.
        batch_size: Forwarded unchanged to ``make_DL_dataset``.

    Returns:
        ``(x_tr, y_tr, x_val, y_val, x_te, y_te)``. Each ``x_*`` is the window
        array and each ``y_*`` is the list of per-window index labels returned
        by ``make_DL_dataset`` -- the ``y_*`` values are timestamps, not
        prediction targets.
    """
    window_len = train_len + pred_len

    # Parse all three date pairs first; unpacking enforces exactly two entries.
    bounds = []
    for date_pair in (train_dates, val_dates, test_dates):
        first, last = pd.to_datetime(date_pair)
        bounds.append((first, last))

    # Label-based slices for train, validation and test, in that order.
    segments = [data.loc[first:last] for first, last in bounds]

    # Window every segment and flatten the (windows, labels) pairs in order.
    outputs = []
    for segment in segments:
        windows, stamps = make_DL_dataset(segment, window_len, batch_size)
        outputs.append(windows)
        outputs.append(stamps)

    return tuple(outputs)

if __name__ == "__main__":
    # Directory that receives the pickled dataset.
    path = "../data/"

    # Window lengths and split dates are read from the YAML config.
    with open("../config/config.yaml", "r", encoding="utf8") as file:
        config = yaml.safe_load(file)

    # Read us_ret.feather and index the rows by the parsed Date column.
    infile = '../RIPT_processed_data/us_ret.feather'
    df = pd.read_feather(infile)
    df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

    # Date x StockID matrix of 'Ret', sorted by date. Missing entries are
    # filled with -2 and the result is stored as float32. pivot_table uses
    # its default aggregation (mean) if a (Date, StockID) pair repeats.
    data = (
        df.pivot_table(values='Ret', index=df.index, columns='StockID')
        .sort_index()
        .fillna(-2)
        .astype(np.float32)
    )

    # Split by the configured date ranges and build the windows.
    x_tr, y_tr, x_val, y_val, x_te, y_te = data_split(
        data=data,
        train_len=config["TRAIN_LEN"],
        pred_len=config["PRED_LEN"],
        train_dates=(config["TRAIN_START_DATE"], config["TRAIN_END_DATE"]),
        val_dates=(config["VAL_START_DATE"], config["VAL_END_DATE"]),
        test_dates=(config["TEST_START_DATE"], config["TEST_END_DATE"]),
        batch_size=1000,  # windows handled per batch
    )

    # Persist all six outputs in a single pickle file.
    save_pickle(
        path + "dataset.pkl",
        {
            "x_tr": x_tr,
            "y_tr": y_tr,
            "x_val": x_val,
            "y_val": y_val,
            "x_te": x_te,
            "y_te": y_te,
        },
    )

    logging.info("데이터셋 생성 완료")


: 

In [4]:
import pandas as pd
# Load us_ret.feather and index the rows by the parsed Date column.
infile = '../RIPT_processed_data/us_ret.feather'
df = pd.read_feather(infile)
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')
df

Unnamed: 0_level_0,StockID,Low,High,Close,Vol,Shares,Open,Ret,MarketCap,log_ret,...,Ret_month,Ret_quarter,Ret_year,Ret_5d,Ret_20d,Ret_60d,Ret_65d,Ret_180d,Ret_250d,Ret_260d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1985-01-02,1,5.520,5.633,5.520,279650.0,,5.596,-0.014,,-0.014099,...,,,,0.025252,0.068656,-0.060027,-0.035552,-0.057351,0.113820,0.101161
1985-01-02,1004,1.585,1.601,1.585,12814.0,,,-0.029,,-0.029429,...,,,,0.030097,0.140979,0.270089,0.374783,0.136983,0.320233,0.408542
1985-01-02,1016,0.895,0.895,0.895,13200.0,,0.895,0.000,,0.000000,...,,,,0.009223,0.159632,0.319429,0.302277,0.686962,0.861958,0.908591
1985-01-02,1017,0.433,0.439,0.437,1180800.0,,0.437,0.000,,0.000000,...,,,,-0.015364,0.094631,0.023336,0.022464,0.113492,0.369712,0.425809
1985-01-02,1034,0.702,0.702,0.702,6834.0,,0.702,0.027,,0.026642,...,,,,-0.034712,0.112347,-0.006131,-0.032301,-0.273319,-0.024784,-0.024690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-30,995,39.690,40.140,40.120,4417.0,,40.120,-0.010,,-0.010050,...,,,,,,,,,,
2024-08-30,996,37.070,37.970,37.240,437600.0,,37.930,-0.014,,-0.014099,...,,,,,,,,,,
2024-08-30,997,89.090,94.890,91.020,757100.0,,94.590,-0.022,,-0.022246,...,,,,,,,,,,
2024-08-30,998,0.870,1.010,0.922,5863000.0,,1.000,-0.087,,-0.091019,...,,,,,,,,,,


In [6]:
# Drop every row dated before 2001-01-01.
import numpy as np
df = df.loc[df.index >= '2001-01-01']

# Date x StockID matrix of 'Ret', sorted by date; missing entries are filled
# with -2 and the result is stored as float32.
data = (
    df.pivot_table(values='Ret', index=df.index, columns='StockID')
    .sort_index()
    .fillna(-2)
    .astype(np.float32)
)
data

StockID,0,1,100,1000,1001,1002,1004,1005,1006,1007,...,99,990,991,992,994,995,996,997,998,999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-01-02,-0.071,-0.037,-0.059,-2.000,-2.000,-2.000,0.000,-2.000,-2.000,-2.000,...,-2.000,-0.024,-0.044,0.053,0.031,-2.000,-2.000,-0.127,-2.000,-2.000
2001-01-03,0.103,0.014,0.187,-2.000,-2.000,-2.000,0.175,-2.000,-2.000,-2.000,...,-2.000,-0.026,0.039,-0.009,0.000,-2.000,-2.000,0.175,-2.000,-2.000
2001-01-04,0.038,0.033,-0.011,-2.000,-2.000,-2.000,0.000,-2.000,-2.000,-2.000,...,-2.000,-0.071,0.000,0.065,0.000,-2.000,-2.000,-0.058,-2.000,-2.000
2001-01-05,-0.055,-0.019,-0.053,-2.000,-2.000,-2.000,-0.006,-2.000,-2.000,-2.000,...,-2.000,0.048,-0.023,-0.013,-0.030,-2.000,-2.000,-0.026,-2.000,-2.000
2001-01-08,-0.033,0.015,0.017,-2.000,-2.000,-2.000,-0.022,-2.000,-2.000,-2.000,...,-2.000,0.008,0.007,0.013,-0.083,-2.000,-2.000,-0.072,-2.000,-2.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-26,-0.003,0.019,-0.014,-0.011,-0.019,0.072,0.013,0.063,0.002,-0.002,...,-0.011,0.002,-0.001,-0.008,-0.001,-0.003,0.004,-0.061,0.069,0.013
2024-08-27,0.004,-0.008,-0.029,-0.028,-0.005,-0.020,-0.031,-0.030,-0.022,-0.001,...,0.005,0.004,0.000,0.005,-0.001,0.004,-0.023,0.027,-0.028,-0.042
2024-08-28,0.000,-0.064,0.007,0.000,-0.013,-0.021,-0.057,-0.008,0.005,-0.013,...,-0.012,0.009,-0.018,-0.036,0.002,0.000,0.000,-0.025,-0.063,0.004
2024-08-29,0.010,0.012,0.063,0.029,0.006,0.002,0.016,-0.011,0.015,0.026,...,0.000,0.010,-0.014,0.012,0.006,0.000,0.026,0.010,0.047,-0.026


In [7]:
# Write the current `data` frame (the pivoted 'Ret' matrix when the preceding
# cell has been run) to ../data/data.csv, including its index.
data.to_csv('../data/data.csv')