In [1]:
import pandas as pd
import numpy as np
import random
import os
import pickle
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read in price data
path = "./data/train.csv"
train = pd.read_csv(path, parse_dates=True, index_col="일자")
train = train.reset_index()
train.columns = ['date', 'ticker', 'firm', 'volume', 'open', 'high', 'low', 'close']
df = train.sort_values(by=['ticker', 'date'], ascending=True)

df['adjustTrue'] = 1
df.loc[df['volume'] == 0, 'adjustTrue'] = -1
df = df.sort_values(['ticker','date'], ascending=[True,False])
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
987995,2021-06-07,A383800,LX홀딩스,2714980,10550,11150,10500,10800,1
987996,2021-06-04,A383800,LX홀딩스,1737593,10450,10650,10350,10450,1
987997,2021-06-03,A383800,LX홀딩스,2709800,10650,10700,10300,10400,1
987998,2021-06-02,A383800,LX홀딩스,2426922,10700,10850,10600,10700,1
987999,2021-06-01,A383800,LX홀딩스,1879288,11000,11300,10900,11000,1


In [3]:
# Convert 'date' to datetime and sort the data by date
data = df
data

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
0,2023-05-30,A000020,동화약품,201361,9960,10040,9640,9700,1
1,2023-05-26,A000020,동화약품,196257,10050,10150,9850,9850,1
2,2023-05-25,A000020,동화약품,398326,9660,10180,9660,10040,1
3,2023-05-24,A000020,동화약품,205243,9770,9820,9550,9740,1
4,2023-05-23,A000020,동화약품,641524,9160,9900,9160,9770,1
...,...,...,...,...,...,...,...,...,...
987995,2021-06-07,A383800,LX홀딩스,2714980,10550,11150,10500,10800,1
987996,2021-06-04,A383800,LX홀딩스,1737593,10450,10650,10350,10450,1
987997,2021-06-03,A383800,LX홀딩스,2709800,10650,10700,10300,10400,1
987998,2021-06-02,A383800,LX홀딩스,2426922,10700,10850,10600,10700,1


In [4]:
result = []
ticker_list = data['ticker'].unique()

for ticker in tqdm(ticker_list, leave=True):
    temp = data[data['ticker'] == ticker]
    temp = temp.reset_index(drop=True)
    # Find the index where trading was suspended
    suspension_indices = temp[temp['adjustTrue'] == -1].index

    if len(suspension_indices) == 0: # 거래정지가 없는 경우 패스
        result.append(temp)
        continue
    else:
        for index in tqdm(suspension_indices, leave=True):
            # Get the split ratio from the close price at the suspension date and the open price at the date following the suspension
            close_price_at_suspension = temp.loc[index, 'close']
            try: # 23-05-30에 거래정지인 경우 + 다른 거래정지도 고려
                open_price_after_suspension = temp.loc[index-1, 'open']  # 미래 데이터
            except: # 23-05-30 하루만 거래정지면 for문 탈출
                continue
            split_ratio = close_price_at_suspension / open_price_after_suspension if open_price_after_suspension != 0 else 1
            # Adjust the volume, open, high, low, and close prices for all previous dates (because the data is in descending order)
            # 거래 정지 이후 값은 액면분할을 반영하여 덮어쓰기
            temp.loc[index+1:, ['open', 'high', 'low', 'close']] /= split_ratio
            temp.loc[index+1:, 'volume'] *= split_ratio
        
        # Sort the data in ascending order of date
        # 다시 과거-현재 순으로 재정렬
        temp = temp.sort_values('date', ascending=True)

        # Interpolate zero values in the data using 'pad' method
        # 아직도 0이 남았다 == 거래 정지일이 끝 날짜에 하루밖에 없었다
        # 과거, 미래 값으로 채우기
        temp.replace(0, pd.NA, inplace=True)
        temp.interpolate(method='ffill', inplace=True)
        temp.interpolate(method='bfill', inplace=True)

        # 액면분할 시 최초 거래정지일 기준 변경되지 않은 기준 close값 변경
        try:
            temp.loc[suspension_indices[0], 'close'] = temp.loc[suspension_indices[0] - 1, 'close']
        except:
            pass

        result.append(temp)

result = pd.concat(result, axis=0)

100%|██████████| 5/5 [00:00<00:00, 908.25it/s]/s]
100%|██████████| 14/14 [00:00<00:00, 998.13it/s]]
100%|██████████| 1/1 [00:00<00:00, 999.36it/s]/s]
100%|██████████| 17/17 [00:00<00:00, 1028.61it/s]
100%|██████████| 3/3 [00:00<00:00, 665.73it/s]/s]
100%|██████████| 5/5 [00:00<00:00, 824.74it/s]/s]
100%|██████████| 1/1 [00:00<00:00, 398.96it/s]t/s]
100%|██████████| 1/1 [00:00<00:00, 499.98it/s]t/s]
100%|██████████| 15/15 [00:00<00:00, 965.87it/s]s]
100%|██████████| 21/21 [00:00<00:00, 1048.73it/s]]
100%|██████████| 3/3 [00:00<00:00, 855.63it/s]t/s]
100%|██████████| 6/6 [00:00<00:00, 793.25it/s]t/s]
100%|██████████| 75/75 [00:00<00:00, 1024.49it/s]]
100%|██████████| 36/36 [00:00<00:00, 1089.44it/s]]
100%|██████████| 474/474 [00:00<00:00, 1081.01it/s]
100%|██████████| 1/1 [00:00<00:00, 499.98it/s]t/s]
100%|██████████| 8/8 [00:00<00:00, 1027.45it/s]/s]
100%|██████████| 13/13 [00:00<00:00, 999.14it/s]s]
100%|██████████| 29/29 [00:00<00:00, 1093.23it/s]]
100%|██████████| 5/5 [00:00<00:00, 9

In [5]:
stock = result.round(2).copy()
# 종목명별 중간값을 가지고 비싼 애들부터 인코딩해주기
st_price = stock.groupby('firm')['close'].agg('mean').sort_values(ascending=False)
print('변환전\n', st_price[:5])

변환전
 firm
LG생활건강      927289.473684
태광산업        919595.141700
삼성바이오로직스    837560.728745
LG화학        671966.599190
삼성SDI       656896.761134
Name: close, dtype: float64


In [6]:
#가격별로 0 ~ 인코딩 해주기
for i, a in enumerate(list(st_price.index)):
    stock.loc[stock['firm'] == a, 'firm'] = i + 1
st_price = result.groupby('firm')['close'].agg('mean').sort_values(ascending=False)
print('변환후\n', st_price[:5])

변환후
 firm
LG생활건강      927289.473684
태광산업        919595.141700
삼성바이오로직스    837560.728745
LG화학        671966.599190
삼성SDI       656896.761134
Name: close, dtype: float64


In [7]:
stock['firm'] = stock['firm'].astype('category').cat.codes

# 데이터 재배열
stock1 = stock[['date', 'ticker', 'firm', 'volume','open','high','low', 'adjustTrue', 'close']]
stock1 = stock1.sort_values(['ticker', 'date'])
stock1.head()

Unnamed: 0,date,ticker,firm,volume,open,high,low,adjustTrue,close
493,2021-06-01,A000020,822,114966.0,14700.0,14700.0,14450.0,1,14600.0
492,2021-06-02,A000020,822,109559.0,14700.0,14700.0,14450.0,1,14500.0
491,2021-06-03,A000020,822,96158.0,14550.0,14650.0,14450.0,1,14600.0
490,2021-06-04,A000020,822,133900.0,14600.0,14800.0,14550.0,1,14700.0
489,2021-06-07,A000020,822,511140.0,14800.0,15550.0,14750.0,1,15150.0


---

## 기술적 분석

In [9]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
import warnings

import pandas_ta as ta
warnings.filterwarnings("ignore")

In [10]:
def calculate_technical_indicators(df: pd.DataFrame) -> pd.DataFrame:
    # Moving Averages
    df.ta.sma(close='Close', length=10, append=True)
    df.ta.sma(close='Close', length=20, append=True)
    df.ta.sma(close='Close', length=50, append=True)
    df.ta.sma(close='Close', length=100, append=True)
    df.ta.sma(close='Close', length=200, append=True)

    df.ta.ema(close='Close', length=10, append=True)
    df.ta.ema(close='Close', length=20, append=True)
    df.ta.ema(close='Close', length=50, append=True)
    df.ta.ema(close='Close', length=100, append=True)
    df.ta.ema(close='Close', length=200, append=True)

    # Momentum Indicators
    df.ta.rsi(close='Close', length=14, append=True)
    df.ta.macd(close='Close', fast=12, slow=26, signal=9, append=True)
    df.ta.stoch(close='Close', append=True)
    df.ta.roc(close='Close', append=True)
    # MACD는 단기 EMA (12일)가 장기 EMA (26일)에 비해 얼마나 빠르게 움직이는지를 보여줍니다.

    # Volume Indicators
    df.ta.vp(close='Close', volume='Volume', append=True)
    df.ta.obv(close='Close', volume='Volume', append=True)

    # Volatility Indicators
    df.ta.atr(close='Close', append=True)
    df.ta.bbands(close='Close', append=True)

    # Trend Strength Indicators
    df.ta.adx(close='Close', append=True)

    df.ta.efi(length=13, append=True) # Elder's Force Index (EFI): 알렉산더 엘더가 개발한 이 지표는 가격의 변동성과 거래량을 결합하여 주식의 '힘'을 측정합니다.
    df.ta.kama(length=10, append=True) # Kaufman's Adaptive Moving Average (KAMA): 이 지표는 변동성을 고려하여 보다 유연한 이동 평균을 제공합니다.
    df.ta.mfi(high='High', low='Low', close='Close', volume='Volume', length=14, append=True) # Money Flow Index (MFI): 이 지표는 가격과 거래량을 결합하여 주식이 과매수 또는 과매도 상태인지 판단합니다.
    df.ta.vortex(high='High', low='Low', close='Close', length=14, append=True) # Vortex Indicator (VI): 이 지표는 최근 가격의 상승과 하락을 추적하여 상승 추세와 하락 추세를 식별합니다.

    return df


In [11]:
train = pd.read_csv("./data/train_adj.csv")
train = train.drop('Unnamed: 0', axis=1)
train = train.sort_values(['ticker', 'date'], ascending=True)
train.set_index('date', inplace=True)
train

Unnamed: 0_level_0,ticker,firm,volume,open,high,low,adjustTrue,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-06-01,A000020,822,114966.0,14700.0,14700.0,14450.0,1,14600.0
2021-06-02,A000020,822,109559.0,14700.0,14700.0,14450.0,1,14500.0
2021-06-03,A000020,822,96158.0,14550.0,14650.0,14450.0,1,14600.0
2021-06-04,A000020,822,133900.0,14600.0,14800.0,14550.0,1,14700.0
2021-06-07,A000020,822,511140.0,14800.0,15550.0,14750.0,1,15150.0
...,...,...,...,...,...,...,...,...
2023-05-23,A383800,976,150364.0,8390.0,8390.0,8310.0,1,8330.0
2023-05-24,A383800,976,122457.0,8310.0,8340.0,8280.0,1,8300.0
2023-05-25,A383800,976,84241.0,8300.0,8310.0,8270.0,1,8310.0
2023-05-26,A383800,976,126681.0,8300.0,8310.0,8270.0,1,8280.0


In [12]:
ticker_list = train['ticker'].unique()
data_frames = []  # store DataFrames here

for ticker in tqdm(ticker_list):
    temp = calculate_technical_indicators(train[train['ticker'] == ticker])
    data_frames.append(temp)

# concat all at once
data = pd.concat(data_frames, axis=0)
data.head()

100%|██████████| 2000/2000 [01:42<00:00, 19.56it/s]


Unnamed: 0_level_0,ticker,firm,volume,open,high,low,adjustTrue,close,SMA_10,SMA_20,...,BBB_5_2.0,BBP_5_2.0,ADX_14,DMP_14,DMN_14,EFI_13,KAMA_10_2_30,MFI_14,VTXP_14,VTXM_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,A000020,822,114966.0,14700.0,14700.0,14450.0,1,14600.0,,,...,,,,,,,,,,
2021-06-02,A000020,822,109559.0,14700.0,14700.0,14450.0,1,14500.0,,,...,,,,,,,,,,
2021-06-03,A000020,822,96158.0,14550.0,14650.0,14450.0,1,14600.0,,,...,,,,,,,,,,
2021-06-04,A000020,822,133900.0,14600.0,14800.0,14550.0,1,14700.0,,,...,,,,,,,,,,
2021-06-07,A000020,822,511140.0,14800.0,15550.0,14750.0,1,15150.0,,,...,6.224622,0.980537,,,,,,,,


In [42]:
data_ta = data.dropna(axis=1, how='all')
data_ta = data_ta.dropna(axis=0)
ticker_list = data_ta['ticker'].unique()
train_frames = []  # store DataFrames here
submission_frames = []

for ticker in tqdm(ticker_list):
    temp = data_ta[data_ta['ticker'] == ticker]
    temp['target'] = temp['close'].shift(-15)
    train_frames.append(temp.dropna(axis=0))
    submission_frames.append(temp[temp['target'].isna()])

# concat all at once
train = pd.concat(train_frames, axis=0)
submission = pd.concat(submission_frames, axis=0)

100%|██████████| 1985/1985 [00:35<00:00, 56.44it/s]


In [43]:
train['pct_change'] = ((train['target'] - train['close']) / train['close'] * 100).round(5)

def up_or_down(df):
    conditions = [
        (df['pct_change'] > 4), 
        (df['pct_change'] > 0),
        (df['pct_change'] < 0)
    ]
    choices = [1, 0, -1]
    df['class_target'] = np.select(conditions, choices, default=0)

    return df

train = up_or_down(train)
train.head()

Unnamed: 0_level_0,ticker,firm,volume,open,high,low,adjustTrue,close,SMA_10,SMA_20,...,DMP_14,DMN_14,EFI_13,KAMA_10_2_30,MFI_14,VTXP_14,VTXM_14,target,pct_change,class_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-03-23,A000020,822,396150.0,13800.0,14100.0,13600.0,1,13650.0,13335.0,13020.0,...,30.358104,13.516124,207597900.0,13031.963084,74.680545,1.109091,0.842424,12950.0,-5.12821,-1
2022-03-24,A000020,822,164839.0,13600.0,13700.0,13500.0,1,13600.0,13405.0,13062.5,...,29.6279,14.393671,176763600.0,13056.980792,72.14329,1.086957,0.913043,13200.0,-2.94118,-1
2022-03-25,A000020,822,248995.0,13700.0,13950.0,13500.0,1,13900.0,13510.0,13125.0,...,31.055802,13.600975,162182900.0,13116.285955,73.136564,1.078788,0.872727,13250.0,-4.67626,-1
2022-03-28,A000020,822,160036.0,13900.0,13900.0,13600.0,1,13750.0,13630.0,13205.0,...,29.874583,13.083657,135584500.0,13175.382776,70.75483,1.092593,0.901235,13200.0,-4.0,-1
2022-03-29,A000020,822,160334.0,13850.0,14000.0,13650.0,1,13750.0,13720.0,13270.0,...,29.815148,12.486931,116215300.0,13215.806253,72.694644,1.11875,0.86875,13200.0,-4.0,-1


In [60]:
train

Unnamed: 0_level_0,ticker,firm,volume,open,high,low,adjustTrue,close,SMA_10,SMA_20,...,DMP_14,DMN_14,EFI_13,KAMA_10_2_30,MFI_14,VTXP_14,VTXM_14,target,pct_change,class_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-11-21,A051900,1,84034.0,663000.0,667000.0,630000.0,1,634000.0,631100.0,581200.00,...,31.299275,22.001785,4.484854e+08,646366.421052,67.242230,1.214834,0.621483,702000.0,10.72555,1
2022-12-08,A051900,1,89487.0,690000.0,693000.0,668000.0,1,679000.0,663100.0,653500.00,...,27.554343,17.254613,7.705734e+08,660924.711773,49.864806,0.988263,0.936620,722000.0,6.33284,1
2022-12-09,A051900,1,107848.0,688000.0,727000.0,686000.0,1,722000.0,674000.0,658950.00,...,32.419195,15.315078,1.322986e+09,669539.997653,58.648988,1.084668,0.828375,720000.0,-0.27701,-1
2022-12-12,A051900,1,99837.0,717000.0,735000.0,699000.0,1,702000.0,682500.0,661450.00,...,31.570467,14.040353,8.487397e+08,672269.714411,64.378884,1.138158,0.745614,708000.0,0.85470,0
2022-12-13,A051900,1,63477.0,706000.0,707000.0,681000.0,1,699000.0,687000.0,662800.00,...,29.650952,17.395983,7.002868e+08,673392.111505,59.733604,1.091304,0.826087,703000.0,0.57225,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-22,A004410,2000,4076990.0,236.0,246.0,233.0,1,243.0,241.5,246.10,...,16.710705,23.170606,3.241243e+06,242.934931,32.007307,0.802326,1.081395,219.0,-9.87654,-1
2022-09-21,A004410,2000,933236.0,236.0,239.0,235.0,1,238.0,241.8,246.65,...,10.465810,27.067099,3.839583e+05,242.933787,12.483298,0.794872,1.179487,218.0,-8.40336,-1
2022-09-20,A004410,2000,800515.0,237.0,239.0,236.0,1,238.0,242.5,247.50,...,10.994044,27.171430,4.479513e+05,243.256509,13.181058,0.807692,1.192308,213.0,-10.50420,-1
2022-09-16,A004410,2000,947741.0,244.0,244.0,240.0,1,242.0,244.8,249.95,...,12.524390,24.756472,1.550881e+06,245.052695,21.713168,0.805195,1.168831,215.0,-11.15702,-1


In [61]:
submission

Unnamed: 0_level_0,ticker,firm,volume,open,high,low,adjustTrue,close,SMA_10,SMA_20,...,BBP_5_2.0,ADX_14,DMP_14,DMN_14,EFI_13,KAMA_10_2_30,MFI_14,VTXP_14,VTXM_14,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-05-09,A000020,822,77887.0,8600.0,8710.0,8520.0,1,8610.0,8466.0,8536.0,...,0.744587,26.573859,31.201427,13.154691,3.165895e+06,8498.704593,57.084641,0.955556,1.028571,
2023-05-10,A000020,822,26578.0,8620.0,8650.0,8540.0,1,8600.0,8489.0,8540.5,...,0.634815,27.581870,29.982978,12.640987,2.675656e+06,8515.099421,59.544055,1.003279,1.029508,
2023-05-11,A000020,822,53660.0,8600.0,8640.0,8500.0,1,8500.0,8503.0,8533.5,...,0.014919,28.169827,28.459686,13.450336,1.526848e+06,8514.278856,59.233261,0.993548,0.990323,
2023-05-12,A000020,822,50012.0,8500.0,8540.0,8370.0,1,8460.0,8506.0,8522.5,...,0.120763,27.666941,26.686688,17.376410,1.022944e+06,8513.703556,59.974022,0.977419,1.022581,
2023-05-15,A000020,822,35224.0,8410.0,8490.0,8350.0,1,8450.0,8508.0,8511.5,...,0.229110,27.047717,25.289416,17.214586,8.264891e+05,8513.192069,60.227909,0.983974,1.009615,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-23,A383800,976,150364.0,8390.0,8390.0,8310.0,1,8330.0,8394.0,8476.0,...,0.130726,24.830786,12.066227,21.651061,-2.119785e+06,8402.067450,35.186028,0.830769,1.130769,
2023-05-24,A383800,976,122457.0,8310.0,8340.0,8280.0,1,8300.0,8370.0,8465.5,...,0.055122,25.443523,11.486704,23.012618,-2.341775e+06,8374.877786,26.722607,0.801587,1.214286,
2023-05-25,A383800,976,84241.0,8300.0,8310.0,8270.0,1,8310.0,8351.0,8455.0,...,0.280735,26.128448,11.103822,23.078862,-1.886891e+06,8361.207569,21.844744,0.758065,1.241935,
2023-05-26,A383800,976,126681.0,8300.0,8310.0,8270.0,1,8280.0,8337.0,8441.5,...,0.170044,26.764450,10.719044,22.279117,-2.160254e+06,8347.169606,11.271104,0.717949,1.333333,


In [62]:
train.to_pickle("train_기술적_분석.pkl")
submission.to_pickle("prediction_기술적_분석.pkl")