In [1]:
import pandas as pd

In [2]:
def create_time_windows(df, window_size, stride):
    # 입력 유효성 검사
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")
    if not isinstance(window_size, int) or window_size <= 0:
        raise ValueError("window_size must be a positive integer")
    if not isinstance(stride, int) or stride <= 0:
        raise ValueError("stride must be a positive integer")
    
    # 결과를 저장할 딕셔너리 생성
    result_dict = {}
    
    # 각 열에 대해 반복
    for column in df.columns:
        # 시간 윈도우 생성
        windows = {}
        for i in range(0, len(df) - window_size + 1, stride):
            window = df[column].iloc[i:i+window_size].values
            windows[df.index[i+window_size-1]] = window
        
        # 윈도우 데이터로 새 DataFrame 생성
        window_df = pd.DataFrame.from_dict(windows, orient='index')
        window_df.columns = [f"{column}_t-{window_size-i-1}" for i in range(window_size)]
        
        # 결과 딕셔너리에 추가
        result_dict[column] = window_df
    
    return result_dict

In [3]:
df = pd.read_parquet(r"D:\Workspace\DnS\data\AJ네트웍스_20190825_20240825.parquet")

In [4]:
df.shape

(1233, 6)

In [5]:
window_size = 5  # window 크기
stride = 2  # stride 크기
df_list = create_time_windows(df, window_size, stride)

In [6]:
print(df.shape)
print(df.columns)
print(df.head(5))

(1233, 6)
Index(['시가', '고가', '저가', '종가', '거래량', '등락률'], dtype='object')
              시가    고가    저가    종가    거래량       등락률
날짜                                                 
2019-08-26  4615  4615  4480  4540  34971 -2.365591
2019-08-27  4505  4585  4505  4560  20983  0.440529
2019-08-28  4540  4640  4490  4580  20526  0.438596
2019-08-29  4650  4650  4365  4650  22742  1.528384
2019-08-30  4645  4715  4610  4700  20754  1.075269


In [7]:
df_list['종가']

Unnamed: 0,종가_t-4,종가_t-3,종가_t-2,종가_t-1,종가_t-0
2019-08-30,4540,4560,4580,4650,4700
2019-09-03,4580,4650,4700,4670,4590
2019-09-05,4700,4670,4590,4575,4440
2019-09-09,4590,4575,4440,4455,4375
2019-09-11,4440,4455,4375,4385,4395
...,...,...,...,...,...
2024-08-12,4320,4315,4275,4320,4360
2024-08-14,4275,4320,4360,4325,4350
2024-08-19,4360,4325,4350,4410,4400
2024-08-21,4350,4410,4400,4500,4620


In [8]:
from acf import calculate_acf
acf_df = calculate_acf(df_list['종가'], window_size=window_size)
acf_df

Unnamed: 0,lag_0,lag_1,lag_2,lag_3,lag_4
2019-08-30,1.0,0.403125,-0.153571,-0.403348,-0.346205
2019-09-03,1.0,0.046442,-0.579401,-0.227715,0.260674
2019-09-05,1.0,0.260341,-0.030414,-0.333942,-0.395985
2019-09-09,1.0,0.293466,-0.070114,-0.385350,-0.338002
2019-09-11,1.0,0.205000,-0.330000,-0.285000,-0.090000
...,...,...,...,...,...
2024-08-12,1.0,0.033333,-0.522865,-0.033609,0.023140
2024-08-14,1.0,0.010069,-0.208696,-0.021281,-0.280092
2024-08-19,1.0,0.343426,-0.442629,-0.345219,-0.055578
2024-08-21,1.0,0.269285,-0.116328,-0.269373,-0.383583


In [9]:
from buffett import calculate_buffett_index
buffett = calculate_buffett_index(df['종가'], 'KOR')
buffett_df = create_time_windows(buffett.to_frame(), window_size, stride)['종가']

In [10]:
from deMartini import demartini_index
de = demartini_index(df['종가'])
de_df = create_time_windows(de.to_frame(), window_size, stride)['rsi']

In [11]:
from div_each_before import div_each_before

deb = div_each_before(df['종가'])
deb_df = create_time_windows(deb.to_frame(), window_size, stride)['종가']


In [12]:
from fractional_difference import fractional_difference
fracdiff = fractional_difference(df['종가'], 0.3)
fracdiff_df = create_time_windows(fracdiff.to_frame(), window_size, stride)[0]

In [13]:
from pivot import calculate_pivot_points
pivot_points = calculate_pivot_points(df['고가'], df['저가'], df['종가'])
pivot_points_df = create_time_windows(pivot_points, window_size, stride)['Pivot']

In [14]:

from sonar import sonar_indicator
sn = sonar_indicator(df, window_size=14)
sn_df = create_time_windows(sn.to_frame(), window_size, stride)[0]

In [15]:
from stocastic import stochastic_fast, stochastic_slow
stfa = stochastic_fast(df)
stsl = stochastic_slow(df)

In [16]:
fastk_df = create_time_windows(stfa['fastk'].to_frame(), window_size, stride)[0]

In [17]:
fastd_df = create_time_windows(stfa['fastd'].to_frame(), window_size, stride)[0]

In [18]:
slowk_df = create_time_windows(stsl['slowk'].to_frame(), window_size, stride)[0]

In [19]:
slowd_df = create_time_windows(stsl['slowd'].to_frame(), window_size, stride)[0]

In [20]:
slowd_df.shape

(615, 5)

In [21]:
from time_delay import time_delay_embedding
time_delay_df = time_delay_embedding(df['종가'], 154, 5)[:615 :]

In [22]:
time_delay_df.shape

(615, 5)

In [23]:
from vix import calculate_vix
calVix = calculate_vix(df['종가'], window_size)
calVix_df = create_time_windows(calVix.to_frame(), window_size, stride)['종가']

In [24]:
from williams import williams_r
will = williams_r(df, 5) 
will_df = create_time_windows(will.to_frame(), window_size, stride)[0]

날짜
2019-08-26   -55.555556
2019-08-27   -40.740741
2019-08-28   -37.500000
2019-08-29    -0.000000
2019-08-30    -4.285714
                ...    
2024-08-19   -21.052632
2024-08-20    -7.142857
2024-08-21   -18.918919
2024-08-22   -24.242424
2024-08-23   -22.222222
Length: 1233, dtype: float64


In [25]:
from sklearn.preprocessing import MinMaxScaler

In [58]:
def prepare_autoencoder_data(df_list, additional_dfs):
    """
    Prepare data for autoencoder by combining multiple indicators.
    
    :param df_list: Dictionary of DataFrames from create_time_windows function
    :param additional_dfs: List of additional DataFrames to include
    :return: numpy array ready for autoencoder input
    """
    # Get the shape of the first DataFrame to determine the number of samples and features
    first_df = next(iter(df_list.values()))
    n_samples, n_features = first_df.shape
    
    # Calculate the total number of indicators
    n_indicators = len(df_list) + len(additional_dfs)
    
    # Initialize the result array
    result = np.zeros((n_samples, n_indicators, n_features))
    
    # Fill in the data from df_list
    for i, df in enumerate(df_list.values()):
        result[:, i, :] = df.values
    
    # Fill in the data from additional_dfs
    for i, df in enumerate(additional_dfs, start=len(df_list)):
        result[:, i, :] = df.values
    
    return result

# Usage example:
df_list = {
    'acf': acf_df,
    'buffett': buffett_df,
    'demartini': de_df,
    'div_each_before': deb_df,
    'fractional_diff': fracdiff_df,
    'pivot': pivot_points_df,
    'sonar': sn_df,
    'vix': calVix_df,
    'williams': will_df
}

additional_dfs = [
    fastk_df,
    fastd_df,
    slowk_df,
    slowd_df,
    time_delay_df  # Note: This might need adjustment if the shape doesn't match
]

# Prepare the data
autoencoder_input = prepare_autoencoder_data(df_list, additional_dfs)

print(f"Shape of autoencoder input: {autoencoder_input.shape}")

Shape of autoencoder input: (615, 14, 5)


In [73]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def apply_minmax_scaling_to_each_column(df):
    """
    Apply MinMax scaling to each column of a DataFrame independently.
    
    :param df: Input DataFrame
    :return: DataFrame with each column scaled independently
    """
    scaler = MinMaxScaler()
    df_scaled = df.copy()
    for column in df.columns:
        df_scaled[column] = scaler.fit_transform(df[[column]])
    return df_scaled

def prepare_autoencoder_data(df_list, additional_dfs):
    """
    Prepare data for autoencoder by combining multiple indicators.
    
    :param df_list: Dictionary of DataFrames from create_time_windows function
    :param additional_dfs: List of additional DataFrames to include
    :return: numpy array ready for autoencoder input
    """
    # Get the shape of the first DataFrame to determine the number of samples and features
    first_df = next(iter(df_list.values()))
    n_samples, n_features = first_df.shape
    
    # Calculate the total number of indicators
    n_indicators = len(df_list) + len(additional_dfs)
    
    # Initialize the result array
    result = np.zeros((n_samples, n_indicators, n_features))
    
    # Fill in the data from df_list and apply scaling
    for i, df in enumerate(df_list.values()):
        scaled_df = apply_minmax_scaling_to_each_column(df)
        result[:, i, :] = scaled_df.values
    
    # Fill in the data from additional_dfs and apply scaling
    for i, df in enumerate(additional_dfs, start=len(df_list)):
        scaled_df = apply_minmax_scaling_to_each_column(df)
        result[:, i, :] = scaled_df.values
    
    return result

# Usage example:
df_list = {
    'acf': acf_df,
    'buffett': buffett_df,
    'demartini': de_df,
    'div_each_before': deb_df,
    'fractional_diff': fracdiff_df,
    'pivot': pivot_points_df,
    'sonar': sn_df,
    'vix': calVix_df,
    'williams': will_df
}

additional_dfs = [
    fastk_df,
    fastd_df,
    slowk_df,
    slowd_df,
    time_delay_df
]

# Prepare the data with MinMax scaling applied to each column independently
autoencoder_input = prepare_autoencoder_data(df_list, additional_dfs)

print(f"Shape of autoencoder input: {autoencoder_input.shape}")

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required by MinMaxScaler.

In [74]:
autoencoder_input

array([[[0.        , 0.92884299, 0.42138562, 0.09838865, 0.17300311],
        [0.40257649, 0.40342679, 0.40901771, 0.41744548, 0.42834138],
        [       nan,        nan,        nan,        nan,        nan],
        ...,
        [0.43929593, 0.52538647, 0.55559695, 0.7616876 , 0.88114618],
        [0.4498144 , 0.48442208, 0.51631447, 0.6174826 , 0.74696515],
        [0.53319502, 0.12769784, 0.16759156, 0.4721346 , 0.38380652]],

       [[0.        , 0.65471183, 0.0809289 , 0.2776527 , 0.9614132 ],
        [0.40901771, 0.41744548, 0.42834138, 0.42056075, 0.41062802],
        [       nan,        nan,        nan,        nan,        nan],
        ...,
        [0.55559695, 0.7616876 , 0.88114618, 0.97538368, 0.83442737],
        [0.51631447, 0.6174826 , 0.74696515, 0.88084358, 0.91435649],
        [0.5373444 , 0.14118705, 0.15871254, 0.46582545, 0.35015773]],

       [[0.        , 0.8191051 , 0.51985207, 0.16923023, 0.10833279],
        [0.42834138, 0.42056075, 0.41062802, 0.40576324, 0.3