In [1]:
import numpy as np
import pandas as pd
from init_env import *
from data.data import *
from data.dataset import *
from model.model import *
from train.train import *
from utils.utils import *
from sklearn.preprocessing import StandardScaler


torch version: 2.3.0+cu121 device: cuda:0
model_dir: D:\PROJECT\FINTECH\stock_price_predict\models
images_dir: D:\PROJECT\FINTECH\stock_price_predict\images


In [2]:
# hyper parameters
stock_code = 'sz399300'
day_nums = 10000
random_seed = 2021
day_after_nums = 1  # target after day_after_nums days to predict
days_seq_len = 7
test_data_ratio = 0.3

hidden_dim = 16
num_layers = 2
out_dim = 1
batch_size = 32
learning_rate = 0.01
num_epochs = 500


In [3]:
# ========================= run =========================
# ========================= data collect phase =========================
raw_data = get_data(stock_code, day_nums)
print(raw_data.shape)

(4726, 5)


In [4]:
# ========================= dataset prepare (data modeling) phase =========================

# feature scaling
normalizer = StandardScaler()
train_dataset, test_dataset = get_dataset(raw_data, days_seq_len, day_after_nums, test_data_ratio, normalizer)
print(train_dataset, test_dataset)
sample_size, sequence_len, feature_dim = train_dataset.features.shape
print(f"sample_size: {sample_size}, sequence_len: {sequence_len}, feature_dim: {feature_dim}")
print(train_dataset.time_seq[:10])

# prepare the data loader
generator = torch.Generator().manual_seed(random_seed)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,generator=generator)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

for features, labels, time_seq_index in test_loader:
    print(test_dataset.get_time_seq(time_seq_index))
    

<data.dataset.MyDataset object at 0x000002ADB42B2C10> <data.dataset.MyDataset object at 0x000002ADB42B2B50>
sample_size: 3303, sequence_len: 7, feature_dim: 5
DatetimeIndex(['2005-01-12', '2005-01-13', '2005-01-14', '2005-01-17',
               '2005-01-18', '2005-01-19', '2005-01-20', '2005-01-21',
               '2005-01-24', '2005-01-25'],
              dtype='datetime64[ns]', name='', freq=None)
DatetimeIndex(['2018-08-16', '2018-08-17', '2018-08-20', '2018-08-21',
               '2018-08-22', '2018-08-23', '2018-08-24', '2018-08-27',
               '2018-08-28', '2018-08-29', '2018-08-30', '2018-08-31',
               '2018-09-03', '2018-09-04', '2018-09-05', '2018-09-06',
               '2018-09-07', '2018-09-10', '2018-09-11', '2018-09-12',
               '2018-09-13', '2018-09-14', '2018-09-17', '2018-09-18',
               '2018-09-19', '2018-09-20', '2018-09-21', '2018-09-25',
               '2018-09-26', '2018-09-27', '2018-09-28', '2018-10-08'],
              dtype='datetim

In [5]:
raw_data

Unnamed: 0,open,high,low,close,volume
,,,,,
2005-01-04,994.769,994.769,980.658,982.794,7.412869e+08
2005-01-05,981.577,997.323,979.877,992.564,7.119109e+08
2005-01-06,993.331,993.788,980.330,983.174,6.288029e+08
2005-01-07,983.045,995.711,979.812,983.958,7.298694e+08
2005-01-10,983.760,993.959,979.789,993.879,5.791698e+08
...,...,...,...,...,...
2024-06-17,3521.437,3540.793,3520.738,3536.198,1.374334e+10
2024-06-18,3533.488,3551.708,3532.784,3545.590,1.236718e+10
2024-06-19,3543.360,3543.360,3524.845,3528.749,1.066428e+10


In [10]:
train_dataset.labels[:10], train_dataset.time_seq 

(tensor([[1.],
         [0.],
         [0.],
         [1.],
         [0.],
         [0.],
         [1.],
         [1.],
         [0.],
         [0.]]),
 DatetimeIndex(['2005-01-12', '2005-01-13', '2005-01-14', '2005-01-17',
                '2005-01-18', '2005-01-19', '2005-01-20', '2005-01-21',
                '2005-01-24', '2005-01-25',
                ...
                '2018-08-02', '2018-08-03', '2018-08-06', '2018-08-07',
                '2018-08-08', '2018-08-09', '2018-08-10', '2018-08-13',
                '2018-08-14', '2018-08-15'],
               dtype='datetime64[ns]', name='', length=3303, freq=None))

In [5]:
# ========================= model train phase =========================
input_dim = feature_dim

model = LSTM(input_dim, hidden_dim, num_layers, out_dim).to(device)
print(model)

# define the loss function and the optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train the model
best_model, train_loss, test_loss, train_accuracy_list, test_accuracy_list = train(model, {'train': train_loader,
                                                                                           'val': test_loader},
                                                                                   criterion, num_epochs, optimizer)

LSTM(
  (lstm): LSTM(5, 16, num_layers=2, batch_first=True)
  (fc): Linear(in_features=16, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
torch.Size([32, 1]) torch.float32
torch.int64 torch.int64


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [11]:
features, labels, time_seq = get_processed_raw_data(raw_data, days_seq_len, day_after_nums)
print(features.shape,labels.shape,time_seq.shape) 

(4719, 7, 5) (4719, 1) (4719,)


In [12]:
labels

array([[1.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [14]:
time_seq

DatetimeIndex(['2005-01-12', '2005-01-13', '2005-01-14', '2005-01-17',
               '2005-01-18', '2005-01-19', '2005-01-20', '2005-01-21',
               '2005-01-24', '2005-01-25',
               ...
               '2024-06-06', '2024-06-07', '2024-06-11', '2024-06-12',
               '2024-06-13', '2024-06-14', '2024-06-17', '2024-06-18',
               '2024-06-19', '2024-06-20'],
              dtype='datetime64[ns]', name='', length=4719, freq=None)

In [13]:
raw_data

Unnamed: 0,open,high,low,close,volume
,,,,,
2005-01-04,994.769,994.769,980.658,982.794,7.412869e+08
2005-01-05,981.577,997.323,979.877,992.564,7.119109e+08
2005-01-06,993.331,993.788,980.330,983.174,6.288029e+08
2005-01-07,983.045,995.711,979.812,983.958,7.298694e+08
2005-01-10,983.760,993.959,979.789,993.879,5.791698e+08
...,...,...,...,...,...
2024-06-17,3521.437,3540.793,3520.738,3536.198,1.374334e+10
2024-06-18,3533.488,3551.708,3532.784,3545.590,1.236718e+10
2024-06-19,3543.360,3543.360,3524.845,3528.749,1.066428e+10


In [21]:
for features, labels, time_seq_index in train_loader:
    for i in range(features.shape[0]):
        print(train_dataset.get_time_seq(time_seq_index[i]))
        feature = features[i]
        label = labels[i]
        print(feature.shape,label.shape)
        print(feature)
        print(label)
        break
    break

2011-02-24 00:00:00
torch.Size([7, 5]) torch.Size([1])
tensor([[ 0.3462,  0.3501,  0.3656,  0.3798,  0.0572],
        [ 0.3895,  0.3659,  0.3888,  0.3773,  0.0288],
        [ 0.3744,  0.3439,  0.3732,  0.3442, -0.1342],
        [ 0.3278,  0.3592,  0.3601,  0.3889, -0.0859],
        [ 0.4012,  0.3696,  0.3324,  0.2972,  0.2555],
        [ 0.2892,  0.2860,  0.3168,  0.3080, -0.1828],
        [ 0.3065,  0.2990,  0.3286,  0.3238, -0.2409]])
tensor([1.])


In [28]:
raw_data.loc[['2011-02-21 00:00:00','2011-02-22 00:00:00','2011-02-23 00:00:00','2011-02-24 00:00:00',]]

Unnamed: 0,open,high,low,close,volume
,,,,,
2011-02-21,3192.146,3258.28,3189.848,3257.908,7939978000.0
2011-02-22,3267.615,3269.048,3161.833,3163.578,10786370000.0
2011-02-23,3152.441,3182.185,3145.986,3174.742,7132486000.0
2011-02-24,3170.236,3195.688,3157.969,3190.935,6648689000.0
