In [5]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA

import warnings
warnings.filterwarnings("ignore")

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [7]:
train = pd.read_csv('./train.csv')
train.head()

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,20210601,A060310,3S,166690,2890,2970,2885,2920
1,20210601,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,20210601,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,20210601,A054620,APS,462544,14600,14950,13800,14950
4,20210601,A265520,AP시스템,131987,29150,29150,28800,29050


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# 추론 결과를 저장하기 위한 dataframe 생성
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

# train 데이터에 존재하는 독립적인 종목코드 추출
unique_codes = train['종목코드'].unique()

features = ['거래량','시가', '고가', '저가', '종가']
# 각 종목코드에 대해서 모델 학습 및 추론 반복
for code in tqdm(unique_codes):
    
    # 학습 데이터 생성
    train_close = train[train['종목코드'] == code][features]
    # train_close['일자'] = pd.to_datetime(train_close['일자'], format='%Y%m%d')
    # train_close.set_index('일자', inplace=True)
    tc = train_close['종가']
    
    
    # 데이터 스케일링
    tc_scaled = train_close

    scaler = MinMaxScaler(feature_range=(0, 1))
    tc_scaled['거래량'] = scaler.fit_transform(train_close['거래량'].values.reshape(-1, 1))
    scaler = MinMaxScaler(feature_range=(0, 1))
    tc_scaled['시가'] = scaler.fit_transform(train_close['시가'].values.reshape(-1, 1))
    scaler = MinMaxScaler(feature_range=(0, 1))
    tc_scaled['고가'] = scaler.fit_transform(train_close['고가'].values.reshape(-1, 1))
    scaler = MinMaxScaler(feature_range=(0, 1))
    tc_scaled['저가'] = scaler.fit_transform(train_close['저가'].values.reshape(-1, 1))
    scaler = MinMaxScaler(feature_range=(0, 1))
    tc_scaled['종가'] = scaler.fit_transform(train_close['종가'].values.reshape(-1, 1))


    # print(tc_scaled)
    # tc_scaled = scaler.fit_transform(train_close)

    # 데이터셋 생성
    def create_dataset(dataset, time_steps=1):
        X, y = [], []
        for i in range(len(dataset)-time_steps):
            X.append(dataset.iloc[i:(i+time_steps), :].values)
            y.append(dataset['종가'].iloc[i+time_steps])
       
        return np.array(X), np.array(y)

    time_steps = 10  # 시퀀스 길이 설정
    X, y = create_dataset(tc_scaled, time_steps)

    # 데이터셋 분할: 학습 데이터와 테스트 데이터
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # LSTM 모델 구축
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(time_steps, len(features))))
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # 모델 학습
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    
    # 향후 15개의 거래일에 대한 예측
    predictions = model.predict(X_test[-15:])
    predictions = scaler.inverse_transform(predictions)

    # 최종 수익률 계산
    final_return = (predictions[-1] - predictions[0]) / predictions[0]

    # 결과 저장
    results_df = results_df.append({'종목코드': code, 'final_return': final_return}, ignore_index=True)

  0%|          | 0/2000 [00:00<?, ?it/s]



  0%|          | 1/2000 [00:08<4:42:21,  8.48s/it]



  0%|          | 2/2000 [00:16<4:32:19,  8.18s/it]



  0%|          | 3/2000 [00:24<4:26:21,  8.00s/it]



  0%|          | 4/2000 [00:33<4:47:30,  8.64s/it]



  0%|          | 5/2000 [00:42<4:47:49,  8.66s/it]



  0%|          | 6/2000 [00:51<4:54:23,  8.86s/it]



  0%|          | 7/2000 [01:00<4:54:09,  8.86s/it]



  0%|          | 8/2000 [01:09<4:58:49,  9.00s/it]



  0%|          | 9/2000 [01:18<4:53:43,  8.85s/it]



  0%|          | 10/2000 [01:27<4:53:19,  8.84s/it]



  1%|          | 11/2000 [01:36<4:58:09,  8.99s/it]



  1%|          | 12/2000 [01:45<4:56:37,  8.95s/it]



  1%|          | 13/2000 [01:54<4:52:02,  8.82s/it]



  1%|          | 14/2000 [02:03<4:57:09,  8.98s/it]



  1%|          | 15/2000 [02:11<4:47:54,  8.70s/it]



  1%|          | 16/2000 [02:20<4:49:31,  8.76s/it]



  1%|          | 17/2000 [02:28<4:48:13,  8.72s/it]



  1%|          | 18/2000 [02:37<4:49:40,  8.77s/it]



  1%|          | 19/2000 [02:46<4:49:00,  8.75s/it]



  1%|          | 20/2000 [02:54<4:43:16,  8.58s/it]



  1%|          | 21/2000 [03:03<4:44:48,  8.64s/it]



  1%|          | 22/2000 [03:11<4:41:30,  8.54s/it]



  1%|          | 23/2000 [03:20<4:39:24,  8.48s/it]



  1%|          | 24/2000 [03:29<4:44:48,  8.65s/it]



  1%|▏         | 25/2000 [03:37<4:41:28,  8.55s/it]



  1%|▏         | 26/2000 [03:47<4:52:01,  8.88s/it]



  1%|▏         | 27/2000 [03:55<4:44:06,  8.64s/it]



  1%|▏         | 28/2000 [04:03<4:44:39,  8.66s/it]



  1%|▏         | 29/2000 [04:11<4:37:32,  8.45s/it]



  2%|▏         | 30/2000 [04:20<4:40:27,  8.54s/it]



  2%|▏         | 31/2000 [04:28<4:36:24,  8.42s/it]



  2%|▏         | 32/2000 [04:37<4:43:05,  8.63s/it]



  2%|▏         | 33/2000 [04:46<4:37:41,  8.47s/it]



  2%|▏         | 34/2000 [04:54<4:42:28,  8.62s/it]



  2%|▏         | 35/2000 [05:04<4:52:05,  8.92s/it]



  2%|▏         | 36/2000 [05:13<4:55:41,  9.03s/it]



  2%|▏         | 37/2000 [05:22<4:54:40,  9.01s/it]



  2%|▏         | 38/2000 [05:32<4:59:13,  9.15s/it]



  2%|▏         | 39/2000 [05:42<5:05:31,  9.35s/it]



  2%|▏         | 40/2000 [05:50<4:59:41,  9.17s/it]



  2%|▏         | 41/2000 [05:59<4:58:03,  9.13s/it]



  2%|▏         | 42/2000 [06:09<5:02:33,  9.27s/it]



  2%|▏         | 43/2000 [06:18<4:56:43,  9.10s/it]



  2%|▏         | 44/2000 [06:28<5:03:23,  9.31s/it]

In [None]:
results_df

In [None]:
results_df['순위'] = results_df['final_return'].rank(method='first').astype('int') # 각 순위를 중복없이 생성
results_df.sort_values('순위')

Unnamed: 0,종목코드,final_return,순위
5,A211270,[-0.06308102],1
1,A095570,[-0.04776682],2
8,A126600,[-0.038834322],3
7,A282330,[-0.01267713],4
6,A027410,[-0.009586],5
2,A006840,[0.016609492],6
9,A138930,[0.01706481],7
3,A054620,[0.01961614],8
4,A265520,[0.05338428],9
0,A060310,[0.073663026],10


In [None]:
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission

Unnamed: 0,종목코드,final_return,순위
0,A060310,[-8.4089585e-08],1
1,A095570,[0.0],2
2,A006840,[8.503587e-08],9
3,A054620,[0.0],3
4,A265520,[0.0],4
5,A211270,[0.0],5
6,A027410,[0.0],6
7,A282330,[0.0],7
8,A126600,[0.0],8
9,A138930,[8.691532e-08],10


In [None]:
baseline_submission = sample_submission[['종목코드']].merge(results_df[['종목코드', '순위']], on='종목코드', how='left')
baseline_submission

In [None]:
baseline_submission.to_csv('baseline_submission.csv', index=False)