In [38]:
import pandas as pd
import numpy as np
import random
import os
import pickle
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [166]:
# Read in price data
train = pd.read_csv("./train.csv", parse_dates=True, index_col="일자")
train = train.reset_index()
train.columns = ['date', 'ticker', 'firm', 'volume', 'open', 'high', 'low', 'close']

df = train.sort_values(by=['ticker', 'date'], ascending=True)
df['adjustTrue'] = 0
df.loc[df['volume'] == 0, 'adjustTrue'] = 1
df.tail()

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
978145,2023-05-23,A383800,LX홀딩스,150364,8390,8390,8310,8330,0
980145,2023-05-24,A383800,LX홀딩스,122457,8310,8340,8280,8300,0
982145,2023-05-25,A383800,LX홀딩스,84241,8300,8310,8270,8310,0
984145,2023-05-26,A383800,LX홀딩스,126681,8300,8310,8270,8280,0
986145,2023-05-30,A383800,LX홀딩스,70489,8300,8300,8270,8290,0


In [167]:
# Convert 'date' to datetime and sort the data by date
data = df

# Sort the data in descending order of date
data = data.sort_values(['ticker','date'], ascending=False)
data

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
986145,2023-05-30,A383800,LX홀딩스,70489,8300,8300,8270,8290,0
984145,2023-05-26,A383800,LX홀딩스,126681,8300,8310,8270,8280,0
982145,2023-05-25,A383800,LX홀딩스,84241,8300,8310,8270,8310,0
980145,2023-05-24,A383800,LX홀딩스,122457,8310,8340,8280,8300,0
978145,2023-05-23,A383800,LX홀딩스,150364,8390,8390,8310,8330,0
...,...,...,...,...,...,...,...,...,...
8502,2021-06-07,A000020,동화약품,511140,14800,15550,14750,15150,0
6502,2021-06-04,A000020,동화약품,133900,14600,14800,14550,14700,0
4502,2021-06-03,A000020,동화약품,96158,14550,14650,14450,14600,0
2502,2021-06-02,A000020,동화약품,109559,14700,14700,14450,14500,0


In [168]:
data[data['adjustTrue'] == 1]

Unnamed: 0,date,ticker,firm,volume,open,high,low,close,adjustTrue
426048,2022-04-12,A383220,F&F,0,0,0,0,770000,1
424048,2022-04-11,A383220,F&F,0,0,0,0,770000,1
422048,2022-04-08,A383220,F&F,0,0,0,0,770000,1
169805,2021-10-01,A373200,하인크코리아,0,0,0,0,2350,1
167805,2021-09-30,A373200,하인크코리아,0,0,0,0,2350,1
...,...,...,...,...,...,...,...,...,...
925481,2023-04-12,A000480,조선내화,0,0,0,0,100400,1
923481,2023-04-11,A000480,조선내화,0,0,0,0,100400,1
921481,2023-04-10,A000480,조선내화,0,0,0,0,100400,1
919481,2023-04-07,A000480,조선내화,0,0,0,0,100400,1


In [169]:
# Find the index where trading was suspended
suspension_indices = data[data['adjustTrue'] == 1].index
suspension_indices

Index([426048, 424048, 422048, 169805, 167805, 165805, 163805, 161805, 159805,
       157805,
       ...
       958272, 956272, 954272, 952272, 950272, 925481, 923481, 921481, 919481,
       917481],
      dtype='int64', length=21220)

In [171]:
# Convert 'date' to datetime and sort the data by date
data = df

# Sort the data in descending order of date
data = data.sort_values('date', ascending=False)

# Find the index where trading was suspended
suspension_indices = data[data['adjustTrue'] == 1].index

# Iterate over each suspension index
for index in tqdm(suspension_indices):
    # Get the split ratio from the close price at the suspension date and the open price at the date following the suspension
    close_price_at_suspension = data.loc[index, 'close']
    open_price_after_suspension = data.loc[index-1, 'open']  # because the data is in descending order
    split_ratio = close_price_at_suspension / open_price_after_suspension if open_price_after_suspension != 0 else 1

    # Adjust the volume, open, high, low, and close prices for all previous dates (because the data is in descending order)
    data.loc[index+1:, ['open', 'high', 'low', 'close']] /= split_ratio
    data.loc[index+1:, 'volume'] *= split_ratio

# Sort the data in ascending order of date
data = data.sort_values('date', ascending=True)

# Interpolate zero values in the data using 'pad' method
data.replace(0, pd.NA, inplace=True)
data.interpolate(method='pad', limit_direction='forward', inplace=True)
data.fillna(0, inplace=True) # If there are still NA values, replace them with 0

data

  8%|▊         | 1748/21220 [01:18<12:53, 25.16it/s]