In [1]:
import pandas as pd
import numpy as np
import random
import os
import pickle
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read in price data
path = "./data/train.csv"
train = pd.read_csv(path, parse_dates=True, index_col="일자")
train = train.reset_index()
train.columns = ['date', 'ticker', 'firm', 'volume', 'open', 'high', 'low', 'close']
df = train.sort_values(by=['ticker', 'date'], ascending=True)

df['adjustTrue'] = 1
df.loc[df['volume'] == 0, 'adjustTrue'] = -1
df = df.sort_values(['ticker','date'], ascending=[True,False])
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
987995,2021-06-07,A383800,LX홀딩스,2714980,10550,11150,10500,10800,1
987996,2021-06-04,A383800,LX홀딩스,1737593,10450,10650,10350,10450,1
987997,2021-06-03,A383800,LX홀딩스,2709800,10650,10700,10300,10400,1
987998,2021-06-02,A383800,LX홀딩스,2426922,10700,10850,10600,10700,1
987999,2021-06-01,A383800,LX홀딩스,1879288,11000,11300,10900,11000,1


In [3]:
# Convert 'date' to datetime and sort the data by date
data = df
data

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
0,2023-05-30,A000020,동화약품,201361,9960,10040,9640,9700,1
1,2023-05-26,A000020,동화약품,196257,10050,10150,9850,9850,1
2,2023-05-25,A000020,동화약품,398326,9660,10180,9660,10040,1
3,2023-05-24,A000020,동화약품,205243,9770,9820,9550,9740,1
4,2023-05-23,A000020,동화약품,641524,9160,9900,9160,9770,1
...,...,...,...,...,...,...,...,...,...
987995,2021-06-07,A383800,LX홀딩스,2714980,10550,11150,10500,10800,1
987996,2021-06-04,A383800,LX홀딩스,1737593,10450,10650,10350,10450,1
987997,2021-06-03,A383800,LX홀딩스,2709800,10650,10700,10300,10400,1
987998,2021-06-02,A383800,LX홀딩스,2426922,10700,10850,10600,10700,1


In [4]:
result = []
ticker_list = data['ticker'].unique()

for ticker in tqdm(ticker_list, leave=True):
    temp = data[data['ticker'] == ticker]
    temp = temp.reset_index(drop=True)
    # Find the index where trading was suspended
    suspension_indices = temp[temp['adjustTrue'] == -1].index

    if len(suspension_indices) == 0: # 거래정지가 없는 경우 패스
        result.append(temp)
        continue
    else:
        for index in tqdm(suspension_indices, leave=True):
            # Get the split ratio from the close price at the suspension date and the open price at the date following the suspension
            close_price_at_suspension = temp.loc[index, 'close']
            try: # 23-05-30에 거래정지인 경우 + 다른 거래정지도 고려
                open_price_after_suspension = temp.loc[index-1, 'open']  # 미래 데이터
            except: # 23-05-30 하루만 거래정지면 for문 탈출
                continue
            split_ratio = close_price_at_suspension / open_price_after_suspension if open_price_after_suspension != 0 else 1
            # Adjust the volume, open, high, low, and close prices for all previous dates (because the data is in descending order)
            # 거래 정지 이후 값은 액면분할을 반영하여 덮어쓰기
            temp.loc[index+1:, ['open', 'high', 'low', 'close']] /= split_ratio
            temp.loc[index+1:, 'volume'] *= split_ratio
        
        # Sort the data in ascending order of date
        # 다시 과거-현재 순으로 재정렬
        temp = temp.sort_values('date', ascending=True)

        # Interpolate zero values in the data using 'pad' method
        # 아직도 0이 남았다 == 거래 정지일이 끝 날짜에 하루밖에 없었다
        # 과거, 미래 값으로 채우기
        temp.replace(0, pd.NA, inplace=True)
        temp.interpolate(method='ffill', inplace=True)
        temp.interpolate(method='bfill', inplace=True)

        # 액면분할 시 최초 거래정지일 기준 변경되지 않은 기준 close값 변경
        try:
            temp.loc[suspension_indices[0], 'close'] = temp.loc[suspension_indices[0] - 1, 'close']
        except:
            pass

        result.append(temp)

result = pd.concat(result, axis=0)

100%|██████████| 5/5 [00:00<00:00, 768.55it/s]/s]
100%|██████████| 14/14 [00:00<00:00, 964.35it/s]]
100%|██████████| 1/1 [00:00<?, ?it/s], 35.66it/s]
100%|██████████| 17/17 [00:00<00:00, 1061.06it/s]
100%|██████████| 3/3 [00:00<00:00, 665.76it/s]/s]
100%|██████████| 5/5 [00:00<00:00, 831.25it/s]/s]
100%|██████████| 1/1 [00:00<00:00, 399.50it/s]t/s]
100%|██████████| 1/1 [00:00<00:00, 663.87it/s]t/s]
100%|██████████| 15/15 [00:00<00:00, 966.36it/s]s]
100%|██████████| 21/21 [00:00<00:00, 1048.85it/s]]
100%|██████████| 3/3 [00:00<00:00, 665.59it/s]t/s]
100%|██████████| 6/6 [00:00<00:00, 921.99it/s]
100%|██████████| 75/75 [00:00<00:00, 1015.75it/s]]
100%|██████████| 36/36 [00:00<00:00, 1073.43it/s]]
100%|██████████| 474/474 [00:00<00:00, 1081.90it/s]
100%|██████████| 1/1 [00:00<00:00, 333.30it/s]t/s]
100%|██████████| 8/8 [00:00<00:00, 1012.38it/s]/s]
100%|██████████| 13/13 [00:00<00:00, 999.06it/s]s]
100%|██████████| 29/29 [00:00<00:00, 1034.49it/s]]
100%|██████████| 5/5 [00:00<00:00, 768.5

In [25]:
stock = result.round(2).copy()
# 종목명별 중간값을 가지고 비싼 애들부터 인코딩해주기
st_price = stock.groupby('firm')['close'].agg('mean').sort_values(ascending=False)
print('변환전\n', st_price[:5])

변환전
 firm
LG생활건강      927289.473684
태광산업        919595.141700
삼성바이오로직스    837560.728745
LG화학        671966.599190
삼성SDI       656896.761134
Name: close, dtype: float64


In [26]:
#가격별로 0 ~ 인코딩 해주기
for i, a in enumerate(list(st_price.index)):
    stock.loc[stock['firm'] == a, 'firm'] = i 
st_price = result.groupby('firm')['close'].agg('mean').sort_values(ascending=False)
print('변환후\n', st_price[:5])

변환후
 firm
LG생활건강      927289.473684
태광산업        919595.141700
삼성바이오로직스    837560.728745
LG화학        671966.599190
삼성SDI       656896.761134
Name: close, dtype: float64


In [30]:
stock['firm'] = stock['firm'].astype('category').cat.codes

# 데이터 재배열
stock1 = stock[['date', 'ticker', 'firm', 'volume','open','high','low', 'adjustTrue', 'close']]
stock1 = stock1.sort_values(['ticker', 'date'])
stock1.head()

Unnamed: 0,date,ticker,firm,volume,open,high,low,adjustTrue,close
492,2021-06-02,A000020,821,109559.0,14700.0,14700.0,14450.0,1,14500.0
491,2021-06-03,A000020,821,96158.0,14550.0,14650.0,14450.0,1,14600.0
490,2021-06-04,A000020,821,133900.0,14600.0,14800.0,14550.0,1,14700.0
489,2021-06-07,A000020,821,511140.0,14800.0,15550.0,14750.0,1,15150.0
488,2021-06-08,A000020,821,272839.0,15350.0,15500.0,15000.0,1,15050.0


In [31]:
stock1.to_csv("./data/train_adj.csv")