## Import

In [1]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA

import warnings
warnings.filterwarnings("ignore")
import ta

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Data Load

In [3]:
df = pd.read_csv('train.csv')

# 거래량이 0인 데이터 필터링
zero_volume_data = df[df['거래량'] == 0]

unique_codes = df['종목명'].unique()

# 필요한 데이터 프레임 생성
zero_volume_df = zero_volume_data[['일자', '종목명']]

zero_volume_df

Unnamed: 0,일자,종목명
22,20210601,CNT85
43,20210601,DXVX
86,20210601,ITX-AI
107,20210601,KG모빌리티
233,20210601,강원에너지
...,...,...
987842,20230530,한국테크놀로지
987873,20230530,한송네오텍
987930,20230530,현대약품
987983,20230530,휴먼엔


In [4]:
train = pd.read_csv('train.csv')

In [5]:
train

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,20210601,A060310,3S,166690,2890,2970,2885,2920
1,20210601,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,20210601,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,20210601,A054620,APS,462544,14600,14950,13800,14950
4,20210601,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,20230530,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,20230530,A000540,흥국화재,50218,3250,3255,3195,3215
987997,20230530,A003280,흥아해운,130664,1344,1395,1340,1370
987998,20230530,A037440,희림,141932,9170,9260,9170,9200


In [6]:
# Read in price data
train = pd.read_csv("train.csv", parse_dates=True, index_col="일자")
train = train.reset_index()
#train.columns = ['date', 'ticker', 'firm', 'volume', 'open', 'high', 'low', 'close']
#train.set_index('date', inplace=True)
train

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,2021-06-01,A060310,3S,166690,2890,2970,2885,2920
1,2021-06-01,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,2021-06-01,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,2021-06-01,A054620,APS,462544,14600,14950,13800,14950
4,2021-06-01,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,2023-05-30,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,2023-05-30,A000540,흥국화재,50218,3250,3255,3195,3215
987997,2023-05-30,A003280,흥아해운,130664,1344,1395,1340,1370
987998,2023-05-30,A037440,희림,141932,9170,9260,9170,9200


In [7]:
def calculate_indicators(df):
    df = df.sort_index()
    df['log_return'] = np.log(df['종가'] / df['종가'].shift(1))
    df['sma5'] = ta.trend.sma_indicator(df['종가'], window=5)
    df['sma20'] = ta.trend.sma_indicator(df['종가'], window=20)
    df['macd'] = ta.trend.MACD(df['종가']).macd()
    df['rsi'] = ta.momentum.RSIIndicator(df['종가']).rsi()
    return df

# Apply function to each group (i.e., for each ticker)
train = train.groupby('종목코드').apply(calculate_indicators)
train

Unnamed: 0_level_0,Unnamed: 1_level_0,일자,종목코드,종목명,거래량,시가,고가,저가,종가,log_return,sma5,sma20,macd,rsi
종목코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A000020,502,2021-06-01,A000020,동화약품,114966,14700,14700,14450,14600,,,,,
A000020,2502,2021-06-02,A000020,동화약품,109559,14700,14700,14450,14500,-0.006873,,,,
A000020,4502,2021-06-03,A000020,동화약품,96158,14550,14650,14450,14600,0.006873,,,,
A000020,6502,2021-06-04,A000020,동화약품,133900,14600,14800,14550,14700,0.006826,,,,
A000020,8502,2021-06-07,A000020,동화약품,511140,14800,15550,14750,15150,0.030153,14710.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A383800,978145,2023-05-23,A383800,LX홀딩스,150364,8390,8390,8310,8330,-0.003595,8354.0,8476.0,-58.577781,34.780717
A383800,980145,2023-05-24,A383800,LX홀딩스,122457,8310,8340,8280,8300,-0.003608,8338.0,8465.5,-62.937391,32.654549
A383800,982145,2023-05-25,A383800,LX홀딩스,84241,8300,8310,8270,8310,0.001204,8330.0,8455.0,-64.838084,34.100667
A383800,984145,2023-05-26,A383800,LX홀딩스,126681,8300,8310,8270,8280,-0.003617,8316.0,8441.5,-67.981497,31.888417


In [8]:
train = train.fillna(0)
train

Unnamed: 0_level_0,Unnamed: 1_level_0,일자,종목코드,종목명,거래량,시가,고가,저가,종가,log_return,sma5,sma20,macd,rsi
종목코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A000020,502,2021-06-01,A000020,동화약품,114966,14700,14700,14450,14600,0.000000,0.0,0.0,0.000000,0.000000
A000020,2502,2021-06-02,A000020,동화약품,109559,14700,14700,14450,14500,-0.006873,0.0,0.0,0.000000,0.000000
A000020,4502,2021-06-03,A000020,동화약품,96158,14550,14650,14450,14600,0.006873,0.0,0.0,0.000000,0.000000
A000020,6502,2021-06-04,A000020,동화약품,133900,14600,14800,14550,14700,0.006826,0.0,0.0,0.000000,0.000000
A000020,8502,2021-06-07,A000020,동화약품,511140,14800,15550,14750,15150,0.030153,14710.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A383800,978145,2023-05-23,A383800,LX홀딩스,150364,8390,8390,8310,8330,-0.003595,8354.0,8476.0,-58.577781,34.780717
A383800,980145,2023-05-24,A383800,LX홀딩스,122457,8310,8340,8280,8300,-0.003608,8338.0,8465.5,-62.937391,32.654549
A383800,982145,2023-05-25,A383800,LX홀딩스,84241,8300,8310,8270,8310,0.001204,8330.0,8455.0,-64.838084,34.100667
A383800,984145,2023-05-26,A383800,LX홀딩스,126681,8300,8310,8270,8280,-0.003617,8316.0,8441.5,-67.981497,31.888417


In [9]:
from statsmodels.tsa.api import VAR

# 추론 결과를 저장하기 위한 dataframe 생성
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

# train 데이터에 존재하는 독립적인 종목코드 추출
unique_codes = train['종목코드'].unique()

# 각 종목코드에 대해서 모델 학습 및 추론 반복
for code in tqdm(unique_codes):
    
    # 학습 데이터 생성
    train_data = train[train['종목코드'] == code][['일자', 'log_return', 'sma5', 'sma20', 'macd', 'rsi']]
    train_data['일자'] = pd.to_datetime(train_data['일자'], format='%Y%m%d')
    train_data.set_index('일자', inplace=True)
    
    # Remove columns where all values are the same
    train_data = train_data.loc[:, train_data.nunique() > 1]
    
    # 모델 선언, 학습 및 추론
    model = VAR(train_data)
    model_fit = model.fit()
    predictions = model_fit.forecast(model_fit.endog[-model_fit.k_ar:], steps=15) # 향후 15개의 거래일에 대해서 예측
    
    # 최종 수익률 계산 (종가 예측치만 사용)
    final_return = (predictions[-1, 0] - predictions[0, 0]) / predictions[0, 0]
    
    # 결과 저장
    results_df = results_df.append({'종목코드': code, 'final_return': final_return}, ignore_index=True)

100%|██████████| 2000/2000 [07:53<00:00,  4.23it/s]


In [10]:
results_df['순위'] = results_df['final_return'].rank(method='first', ascending=False).astype('int') # 각 순위를 중복없이 생성
results_df

Unnamed: 0,종목코드,final_return,순위
0,A000020,-1.483798,1585
1,A000040,0.344402,418
2,A000050,4.921949,75
3,A000070,-0.438477,950
4,A000080,0.738147,288
...,...,...,...
1995,A375500,2.256127,142
1996,A378850,-0.373600,890
1997,A383220,-0.605348,1088
1998,A383310,-1.123964,1444


In [12]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

Unnamed: 0,종목코드,순위
0,A000020,1
1,A000040,2
2,A000050,3
3,A000070,4
4,A000080,5
...,...,...
1995,A375500,1996
1996,A378850,1997
1997,A383220,1998
1998,A383310,1999


In [13]:
baseline_submission = sample_submission[['종목코드']].merge(results_df[['종목코드', '순위']], on='종목코드', how='left')
baseline_submission

Unnamed: 0,종목코드,순위
0,A000020,1585
1,A000040,418
2,A000050,75
3,A000070,950
4,A000080,288
...,...,...
1995,A375500,142
1996,A378850,890
1997,A383220,1088
1998,A383310,1444


In [14]:
baseline_submission.to_csv('submit1.csv', index=False)