## 수정종가 처리

In [1]:
import pandas as pd
import numpy as np
import random
import os
import pickle
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

import pandas_ta as ta
warnings.filterwarnings("ignore")

In [2]:
# Read in price data
path = "./data/train.csv"
train = pd.read_csv(path, parse_dates=True, index_col="일자")
train = train.reset_index()
train.columns = ['date', 'ticker', 'firm', 'volume', 'open', 'high', 'low', 'close']
df = train.sort_values(by=['ticker', 'date'], ascending=True)

df['adjustTrue'] = 1
df.loc[df['volume'] == 0, 'adjustTrue'] = -1
df = df.sort_values(['ticker','date'], ascending=[True,False])
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
987995,2021-06-07,A383800,LX홀딩스,2714980,10550,11150,10500,10800,1
987996,2021-06-04,A383800,LX홀딩스,1737593,10450,10650,10350,10450,1
987997,2021-06-03,A383800,LX홀딩스,2709800,10650,10700,10300,10400,1
987998,2021-06-02,A383800,LX홀딩스,2426922,10700,10850,10600,10700,1
987999,2021-06-01,A383800,LX홀딩스,1879288,11000,11300,10900,11000,1


In [3]:
# Convert 'date' to datetime and sort the data by date
data = df
data

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
0,2023-05-30,A000020,동화약품,201361,9960,10040,9640,9700,1
1,2023-05-26,A000020,동화약품,196257,10050,10150,9850,9850,1
2,2023-05-25,A000020,동화약품,398326,9660,10180,9660,10040,1
3,2023-05-24,A000020,동화약품,205243,9770,9820,9550,9740,1
4,2023-05-23,A000020,동화약품,641524,9160,9900,9160,9770,1
...,...,...,...,...,...,...,...,...,...
987995,2021-06-07,A383800,LX홀딩스,2714980,10550,11150,10500,10800,1
987996,2021-06-04,A383800,LX홀딩스,1737593,10450,10650,10350,10450,1
987997,2021-06-03,A383800,LX홀딩스,2709800,10650,10700,10300,10400,1
987998,2021-06-02,A383800,LX홀딩스,2426922,10700,10850,10600,10700,1


In [4]:
result = []
ticker_list = data['ticker'].unique()

for ticker in tqdm(ticker_list, leave=True):
    temp = data[data['ticker'] == ticker]
    temp = temp.reset_index(drop=True)
    # Find the index where trading was suspended
    suspension_indices = temp[temp['adjustTrue'] == -1].index

    if len(suspension_indices) == 0: # 거래정지가 없는 경우 패스
        result.append(temp)
        continue
    else:
        for index in tqdm(suspension_indices, leave=True):
            # Get the split ratio from the close price at the suspension date and the open price at the date following the suspension
            close_price_at_suspension = temp.loc[index, 'close']
            try: # 23-05-30에 거래정지인 경우 + 다른 거래정지도 고려
                open_price_after_suspension = temp.loc[index-1, 'open']  # 미래 데이터
            except: # 23-05-30 하루만 거래정지면 for문 탈출
                continue
            split_ratio = close_price_at_suspension / open_price_after_suspension if open_price_after_suspension != 0 else 1
            # Adjust the volume, open, high, low, and close prices for all previous dates (because the data is in descending order)
            # 거래 정지 이후 값은 액면분할을 반영하여 덮어쓰기
            temp.loc[index+1:, ['open', 'high', 'low', 'close']] /= split_ratio
            temp.loc[index+1:, 'volume'] *= split_ratio
        
        # Sort the data in ascending order of date
        # 다시 과거-현재 순으로 재정렬
        temp = temp.sort_values('date', ascending=True)

        # Interpolate zero values in the data using 'pad' method
        # 아직도 0이 남았다 == 거래 정지일이 끝 날짜에 하루밖에 없었다
        # 과거, 미래 값으로 채우기
        temp.replace(0, pd.NA, inplace=True)
        temp.interpolate(method='ffill', inplace=True)
        temp.interpolate(method='bfill', inplace=True)

        # 액면분할 시 최초 거래정지일 기준 변경되지 않은 기준 close값 변경
        try:
            temp.loc[suspension_indices[0], 'close'] = temp.loc[suspension_indices[0] - 1, 'close']
        except:
            pass

        result.append(temp)

result = pd.concat(result, axis=0)

100%|██████████| 5/5 [00:00<00:00, 833.43it/s]/s]
100%|██████████| 14/14 [00:00<00:00, 1035.63it/s]
100%|██████████| 1/1 [00:00<?, ?it/s], 35.54it/s]
100%|██████████| 17/17 [00:00<00:00, 1029.96it/s]
100%|██████████| 3/3 [00:00<00:00, 664.29it/s]/s]
100%|██████████| 5/5 [00:00<00:00, 807.16it/s]/s]
100%|██████████| 1/1 [00:00<00:00, 500.27it/s]t/s]
100%|██████████| 1/1 [00:00<00:00, 483.33it/s]t/s]
100%|██████████| 15/15 [00:00<00:00, 964.40it/s]s]
100%|██████████| 21/21 [00:00<00:00, 1049.39it/s]]
100%|██████████| 3/3 [00:00<00:00, 664.39it/s]t/s]
100%|██████████| 6/6 [00:00<00:00, 799.02it/s]
100%|██████████| 75/75 [00:00<00:00, 1046.55it/s]]
100%|██████████| 36/36 [00:00<00:00, 1041.95it/s]]
100%|██████████| 474/474 [00:00<00:00, 1079.46it/s]
100%|██████████| 1/1 [00:00<00:00, 499.68it/s]t/s]
100%|██████████| 8/8 [00:00<00:00, 940.66it/s]t/s]
100%|██████████| 13/13 [00:00<00:00, 1039.36it/s]]
100%|██████████| 29/29 [00:00<00:00, 1093.07it/s]]
100%|██████████| 5/5 [00:00<00:00, 833.7

In [5]:
stock = result.round(2).copy()
st_price = stock.groupby('firm')['close'].agg('mean').sort_values(ascending=False)
print('변환전\n', st_price[:5])

변환전
 firm
LG생활건강      927289.473684
태광산업        919595.141700
삼성바이오로직스    837560.728745
LG화학        671966.599190
삼성SDI       656896.761134
Name: close, dtype: float64


In [6]:
for i, a in enumerate(list(st_price.index)):
    stock.loc[stock['firm'] == a, 'firm'] = i
st_price = result.groupby('firm')['close'].agg('mean').sort_values(ascending=False)
print('변환후\n', st_price[:5])

변환후
 firm
LG생활건강      927289.473684
태광산업        919595.141700
삼성바이오로직스    837560.728745
LG화학        671966.599190
삼성SDI       656896.761134
Name: close, dtype: float64


In [7]:
stock['firm'] = stock['firm'].astype('category').cat.codes

# 데이터 재배열
stock1 = stock[['date', 'ticker', 'firm', 'volume','open','high','low', 'adjustTrue', 'close']]
stock1 = stock1.sort_values(['ticker', 'date'])
stock1.head()

Unnamed: 0,date,ticker,firm,volume,open,high,low,adjustTrue,close
493,2021-06-01,A000020,822,114966.0,14700.0,14700.0,14450.0,1,14600.0
492,2021-06-02,A000020,822,109559.0,14700.0,14700.0,14450.0,1,14500.0
491,2021-06-03,A000020,822,96158.0,14550.0,14650.0,14450.0,1,14600.0
490,2021-06-04,A000020,822,133900.0,14600.0,14800.0,14550.0,1,14700.0
489,2021-06-07,A000020,822,511140.0,14800.0,15550.0,14750.0,1,15150.0


---

## 기술적 분석

In [9]:
def calculate_technical_indicators(df: pd.DataFrame) -> pd.DataFrame:
    # Moving Averages
    df.ta.sma(close='Close', length=10, append=True)
    df.ta.sma(close='Close', length=20, append=True)
    df.ta.sma(close='Close', length=50, append=True)
    df.ta.sma(close='Close', length=100, append=True)
    df.ta.sma(close='Close', length=200, append=True)

    df.ta.ema(close='Close', length=10, append=True)
    df.ta.ema(close='Close', length=20, append=True)
    df.ta.ema(close='Close', length=50, append=True)
    df.ta.ema(close='Close', length=100, append=True)
    df.ta.ema(close='Close', length=200, append=True)

    # Momentum Indicators
    df.ta.rsi(close='Close', length=14, append=True)
    df.ta.macd(close='Close', fast=12, slow=26, signal=9, append=True)
    df.ta.stoch(close='Close', append=True)
    df.ta.roc(close='Close', append=True)
    # MACD는 단기 EMA (12일)가 장기 EMA (26일)에 비해 얼마나 빠르게 움직이는지를 보여줍니다.

    # Volume Indicators
    df.ta.vp(close='Close', volume='Volume', append=True)
    df.ta.obv(close='Close', volume='Volume', append=True)

    # Volatility Indicators
    df.ta.atr(close='Close', append=True)
    df.ta.bbands(close='Close', append=True)

    # Trend Strength Indicators
    df.ta.adx(close='Close', append=True)

    df.ta.efi(length=13, append=True) # Elder's Force Index (EFI): 알렉산더 엘더가 개발한 이 지표는 가격의 변동성과 거래량을 결합하여 주식의 '힘'을 측정합니다.
    df.ta.kama(length=10, append=True) # Kaufman's Adaptive Moving Average (KAMA): 이 지표는 변동성을 고려하여 보다 유연한 이동 평균을 제공합니다.
    df.ta.mfi(high='High', low='Low', close='Close', volume='Volume', length=14, append=True) # Money Flow Index (MFI): 이 지표는 가격과 거래량을 결합하여 주식이 과매수 또는 과매도 상태인지 판단합니다.
    df.ta.vortex(high='High', low='Low', close='Close', length=14, append=True) # Vortex Indicator (VI): 이 지표는 최근 가격의 상승과 하락을 추적하여 상승 추세와 하락 추세를 식별합니다.

    return df


In [10]:
train = pd.read_csv("./data/train_adj.csv")
train = train.drop('Unnamed: 0', axis=1)
train = train.sort_values(['ticker', 'date'], ascending=True)
train.set_index('date', inplace=True)
train

Unnamed: 0_level_0,ticker,firm,volume,open,high,low,adjustTrue,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-06-01,A000020,822,114966.0,14700.0,14700.0,14450.0,1,14600.0
2021-06-02,A000020,822,109559.0,14700.0,14700.0,14450.0,1,14500.0
2021-06-03,A000020,822,96158.0,14550.0,14650.0,14450.0,1,14600.0
2021-06-04,A000020,822,133900.0,14600.0,14800.0,14550.0,1,14700.0
2021-06-07,A000020,822,511140.0,14800.0,15550.0,14750.0,1,15150.0
...,...,...,...,...,...,...,...,...
2023-05-23,A383800,976,150364.0,8390.0,8390.0,8310.0,1,8330.0
2023-05-24,A383800,976,122457.0,8310.0,8340.0,8280.0,1,8300.0
2023-05-25,A383800,976,84241.0,8300.0,8310.0,8270.0,1,8310.0
2023-05-26,A383800,976,126681.0,8300.0,8310.0,8270.0,1,8280.0


In [11]:
ticker_list = train['ticker'].unique()
data_frames = []  # store DataFrames here

for ticker in tqdm(ticker_list):
    temp = calculate_technical_indicators(train[train['ticker'] == ticker])
    data_frames.append(temp)

# concat all at once
data = pd.concat(data_frames, axis=0)
data.head()

100%|██████████| 2000/2000 [01:42<00:00, 19.47it/s]


Unnamed: 0_level_0,ticker,firm,volume,open,high,low,adjustTrue,close,SMA_10,SMA_20,...,BBB_5_2.0,BBP_5_2.0,ADX_14,DMP_14,DMN_14,EFI_13,KAMA_10_2_30,MFI_14,VTXP_14,VTXM_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,A000020,822,114966.0,14700.0,14700.0,14450.0,1,14600.0,,,...,,,,,,,,,,
2021-06-02,A000020,822,109559.0,14700.0,14700.0,14450.0,1,14500.0,,,...,,,,,,,,,,
2021-06-03,A000020,822,96158.0,14550.0,14650.0,14450.0,1,14600.0,,,...,,,,,,,,,,
2021-06-04,A000020,822,133900.0,14600.0,14800.0,14550.0,1,14700.0,,,...,,,,,,,,,,
2021-06-07,A000020,822,511140.0,14800.0,15550.0,14750.0,1,15150.0,,,...,6.224622,0.980537,,,,,,,,


In [12]:
data_ta = data.dropna(axis=1, how='all')
data_ta = data_ta.dropna(axis=0)
ticker_list = data_ta['ticker'].unique()
print("ticker_list 길이 : ",  len(ticker_list))

ticker_list 길이 :  1985


In [14]:
data_ta.to_pickle("./data/train_adj_tactical.pkl")

---

## Train / Test 분리

In [3]:
import pandas as pd
import numpy as np
import random
import os
import pickle
from tqdm import tqdm

import pandas_ta as ta

import warnings
warnings.filterwarnings("ignore")

In [4]:
data_ta = pd.read_pickle("./data/train_adj_tactical.pkl")
data_ta = data_ta.reset_index()

In [5]:
data_ta = data_ta.sort_values(['ticker', 'date'])
data_ta2 = data_ta.copy()
data_ta2['target'] = data_ta.groupby('firm')['close'].shift(-15)

In [15]:
train = data_ta2[~data_ta2['target'].isna()]
test = data_ta2[data_ta2['target'].isna()]

In [16]:
train['pct_change'] = ((train['target'] - train['close']) / train['close'] * 100).round(5)

def up_or_down(df):
    conditions = [
        (df['pct_change'] > 4), 
        (df['pct_change'] > 0),
        (df['pct_change'] < 0)
    ]
    choices = [2, 1, 0]
    df['class_target'] = np.select(conditions, choices, default=0)

    return df

train = up_or_down(train)
train.head()

Unnamed: 0,date,ticker,firm,volume,open,high,low,adjustTrue,close,SMA_10,...,DMP_14,DMN_14,EFI_13,KAMA_10_2_30,MFI_14,VTXP_14,VTXM_14,target,pct_change,class_target
0,2022-03-23,A000020,822,396150.0,13800.0,14100.0,13600.0,1,13650.0,13335.0,...,30.358104,13.516124,207597900.0,13031.963084,74.680545,1.109091,0.842424,12950.0,-5.12821,0
1,2022-03-24,A000020,822,164839.0,13600.0,13700.0,13500.0,1,13600.0,13405.0,...,29.6279,14.393671,176763600.0,13056.980792,72.14329,1.086957,0.913043,13200.0,-2.94118,0
2,2022-03-25,A000020,822,248995.0,13700.0,13950.0,13500.0,1,13900.0,13510.0,...,31.055802,13.600975,162182900.0,13116.285955,73.136564,1.078788,0.872727,13250.0,-4.67626,0
3,2022-03-28,A000020,822,160036.0,13900.0,13900.0,13600.0,1,13750.0,13630.0,...,29.874583,13.083657,135584500.0,13175.382776,70.75483,1.092593,0.901235,13200.0,-4.0,0
4,2022-03-29,A000020,822,160334.0,13850.0,14000.0,13650.0,1,13750.0,13720.0,...,29.815148,12.486931,116215300.0,13215.806253,72.694644,1.11875,0.86875,13200.0,-4.0,0


In [17]:
train.to_pickle("./data/train_완료.pkl")
test.to_pickle("./data/prediction_완료.pkl")