In [7]:
import yaml
import pickle
import numpy as np
import pandas as pd
import logging

# Logging setup: records at INFO level and above are written to
# make_dataset.log, each prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='make_dataset.log',
)

def make_DL_dataset(data, data_len):
    """
    Build a sliding-window dataset from a DataFrame.

    Every run of ``data_len`` consecutive rows becomes one sample; the window
    advances one row at a time, in the existing row order of ``data``.

    Args:
        data (pd.DataFrame): Input data to be windowed.
        data_len (int): Number of rows in each window.

    Returns:
        tuple: (dataset, times)
            dataset (np.ndarray): Stacked windows with shape
                (n_windows, data_len, n_columns). When ``data`` has fewer
                than ``data_len`` rows the array has zero windows but keeps
                its trailing dimensions, so callers can still slice it
                along the window axis.
            times (list): One index slice per window, holding the row labels
                covered by that window.
    """
    # Materialise the values and index once instead of going through
    # ``.iloc`` for every window.
    values = data.values
    index = data.index
    n_windows = len(data) - data_len + 1

    if n_windows <= 0:
        # Not enough rows for a single window: return an empty array that
        # still has the (window, row, column) layout rather than shape (0,).
        empty_shape = (0, data_len) + values.shape[1:]
        return np.empty(empty_shape, dtype=values.dtype), []

    dataset = np.array([values[i:i + data_len] for i in range(n_windows)])
    times = [index[i:i + data_len] for i in range(n_windows)]
    return dataset, times

def data_split(data, train_len, pred_len, train_dates, val_dates, test_dates):
    """
    Cut ``data`` into train / validation / test window sets.

    Each split is taken from ``data`` by label-slicing between its start date
    and a trimmed end date, then turned into sliding windows of
    ``train_len + pred_len`` rows by ``make_DL_dataset``. The first
    ``train_len`` rows of every window form the input and the last
    ``pred_len`` rows form the target.

    Args:
        data (pd.DataFrame): Input data, sliced with ``.loc`` using
            timestamps (so a date-like index is expected).
        train_len (int): Number of input rows per window.
        pred_len (int): Number of target rows per window.
        train_dates (tuple): (train start date, train end date)
        val_dates (tuple): (validation start date, validation end date)
        test_dates (tuple): (test start date, test end date)

    Returns:
        tuple: (x_tr, y_tr, x_val, y_val, x_te, y_te,
                times_tr, times_val, times_te)
            The ``times_*`` entries are lists with one index slice per
            window, covering only the target rows of that window.
    """
    window_len = train_len + pred_len

    # Parse every boundary into a pandas Timestamp before touching the data.
    bounds = [
        (pd.to_datetime(span[0]), pd.to_datetime(span[1]))
        for span in (train_dates, val_dates, test_dates)
    ]

    # Pull each split's end date back by (pred_len - 1) days.
    # NOTE(review): this shift is measured in calendar days, not in rows of
    # ``data`` - confirm that is the intended trimming.
    trimmed_ends = [
        last - pd.Timedelta(days=pred_len - 1) for _, last in bounds
    ]

    # Window each split in train, validation, test order.
    windows = []
    stamps = []
    for (first, _), last in zip(bounds, trimmed_ends):
        block, block_times = make_DL_dataset(data.loc[first:last], window_len)
        windows.append(block)
        stamps.append(block_times)

    # Per split: leading rows are the inputs, trailing rows are the targets.
    inputs_and_targets = []
    for block in windows:
        inputs_and_targets.append(block[:, :train_len])
        inputs_and_targets.append(block[:, -pred_len:])

    # Keep only the timestamps that belong to the target part of each window.
    target_times = [
        [labels[-pred_len:] for labels in split_times]
        for split_times in stamps
    ]

    return (*inputs_and_targets, *target_times)

if __name__ == "__main__":
    # Entry point: load the config and the returns table, build the
    # train/validation/test window sets, log their shapes, and pickle them.
    from pathlib import Path

    path = "data/"
    # Both output pickles go to the same directory, derived once from
    # ``path``. Create it up front so a missing directory is reported before
    # the expensive dataset construction rather than after it.
    out_dir = Path("..") / path
    out_dir.mkdir(parents=True, exist_ok=True)

    with open("../config/config.yaml", "r", encoding="utf8") as file:
        config = yaml.safe_load(file)

    # Read the us_ret.feather file and index it by date.
    infile = '../RIPT_processed_data/us_ret.feather'
    df = pd.read_feather(infile)
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

    # Reshape to one row per date and one column per StockID, holding 'Ret'.
    data = df.pivot_table(values='Ret', index=df.index, columns='StockID')

    # Sort by date.
    data = data.sort_index()

    # Missing values are replaced with the sentinel -2.
    data = data.fillna(-2)

    # Split into train / validation / test sets using the configured
    # window lengths and date ranges.
    x_tr, y_tr, x_val, y_val, x_te, y_te, times_tr, times_val, times_te = data_split(
        data,
        config["TRAIN_LEN"],
        config["PRED_LEN"],
        train_dates=(config["TRAIN_START_DATE"], config["TRAIN_END_DATE"]),
        val_dates=(config["VAL_START_DATE"], config["VAL_END_DATE"]),
        test_dates=(config["TEST_START_DATE"], config["TEST_END_DATE"])
    )

    # Log shapes and the last target timestamp of each split as a sanity check.
    logging.info("데이터셋 검증:")
    logging.info(f"Train images shape: {x_tr.shape}")
    logging.info(f"Train labels shape: {y_tr.shape}")
    logging.info(f"Validation images shape: {x_val.shape}")
    logging.info(f"Validation labels shape: {y_val.shape}")
    logging.info(f"Test images shape: {x_te.shape}")
    logging.info(f"Test labels shape: {y_te.shape}")
    logging.info(f"Train times length: {len(times_tr)}, Train times last: {times_tr[-1][-1]}")
    logging.info(f"Validation times length: {len(times_val)}, Validation times last: {times_val[-1][-1]}")
    logging.info(f"Test times length: {len(times_te)}, Test times last: {times_te[-1][-1]}")

    # Save the target timestamps of every split.
    with open(out_dir / "date.pkl", "wb") as f:
        pickle.dump({'train': times_tr, 'val': times_val, 'test': times_te}, f)

    # Save the input/target arrays.
    with open(out_dir / "dataset.pkl", "wb") as f:
        pickle.dump([x_tr, y_tr, x_val, y_val, x_te, y_te], f)

    logging.info("데이터셋 생성 완료")


Unnamed: 0,Date,StockID,Low,High,Close,Vol,Shares,Open,Ret,MarketCap,...,Ret_month,Ret_quarter,Ret_year,Ret_5d,Ret_20d,Ret_60d,Ret_65d,Ret_180d,Ret_250d,Ret_260d
0,1985-01-02,1,5.520,5.633,5.520,279650.0,,5.596,-0.014,,...,,,,0.025252,0.068656,-0.060027,-0.035552,-0.057351,0.113820,0.101161
1,1985-01-02,1004,1.585,1.601,1.585,12814.0,,,-0.029,,...,,,,0.030097,0.140979,0.270089,0.374783,0.136983,0.320233,0.408542
2,1985-01-02,1016,0.895,0.895,0.895,13200.0,,0.895,0.000,,...,,,,0.009223,0.159632,0.319429,0.302277,0.686962,0.861958,0.908591
3,1985-01-02,1017,0.433,0.439,0.437,1180800.0,,0.437,0.000,,...,,,,-0.015364,0.094631,0.023336,0.022464,0.113492,0.369712,0.425809
4,1985-01-02,1034,0.702,0.702,0.702,6834.0,,0.702,0.027,,...,,,,-0.034712,0.112347,-0.006131,-0.032301,-0.273319,-0.024784,-0.024690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23787879,2024-08-30,995,39.690,40.140,40.120,4417.0,,40.120,-0.010,,...,,,,,,,,,,
23787880,2024-08-30,996,37.070,37.970,37.240,437600.0,,37.930,-0.014,,...,,,,,,,,,,
23787881,2024-08-30,997,89.090,94.890,91.020,757100.0,,94.590,-0.022,,...,,,,,,,,,,
23787882,2024-08-30,998,0.870,1.010,0.922,5863000.0,,1.000,-0.087,,...,,,,,,,,,,
