In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer



In [20]:
# Xは4週分のデータ、yは5週目＋80時間分のデータとなるようなデータセット

class FinancialDataset():
    def __init__(self, data, week_len=4, target_len=80):
        self.data = data
        self.week_len = week_len
        self.target_len = target_len
        
        self.x_tech, self.x_text, self.t = self.preprocessing(data, week_len, target_len)
        # print("self.x_tech" , self.x_tech )
        # print("self.x_text" , self.x_text )
        # print("self.t" , self.t )

    # def __len__(self):
    #     return len(self.x_tech)

    def get_epoch(self, batch_size):
        print( len(self.x_tech))
        print( len(self.x_text))
        print( len(self.t))
        data_length =  len(self.x_tech)
        # データのシャッフル
        shuffled_indices = np.random.permutation(data_length)
        
        epoch_x_tech =[]
        epoch_x_text =[]
        epoch_target =[]

        # バッチごとにデータを取得
        for batch_start in range(0, data_length, batch_size):
            batch_indices = shuffled_indices[batch_start:batch_start+batch_size]
            batch_x_tech = []  # バッチごとのデータを格納するリスト
            batch_x_text = []  # バッチごとのデータを格納するリスト
            batch_target = []  # バッチごとのデータを格納するリスト
            
            for idx in batch_indices:
                batch_x_tech.append(self.x_tech[idx])
                batch_x_text.append(self.x_text[idx])
                batch_target.append(self.t[idx])
            epoch_x_tech.append(batch_x_tech)
            epoch_x_text.append(batch_x_text)
            epoch_target.append(batch_target)
        return epoch_x_tech, epoch_x_text, epoch_target
   
    def preprocessing(self, data, week_len, target_len):
    
        target_len = int(target_len / 4) # 20
        print(target_len)
        # 4週分のデータと5週目＋80時間分のデータを格納
        x_tech_list = []
        x_text_list = []
        t_list = []

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        #print(data["Week"].max()+1-week_len)
        for week in range(1, data["Week"].max()+1-week_len):
            # dataのweek列がweek+5以降の80時間分（20データ）のデータを抽出
            if not len(data[data["Week"] >= week+4]) < target_len:
                target_data = data[data["Week"] >= week+4].reset_index(drop=True)[:target_len]
                #print(data[data["Week"] >= week+4].reset_index(drop=True)[:target_len])
                #print()
            else:
                break
            target_data = target_data.reset_index(drop=True)
            #Open, High, Low, Close, Volume, Adj Close
            target_data = target_data[["open_mid", "high_mid", "low_mid", "close_mid", "20SMA", "20Upper", "20Lower"]].values
            #print(target_data.shape)
            t_list.append(target_data)
            
            x_text_region_list = []
            week_4_data_list = []
            for i in range(week, week+4):
                # テクニカル指標データ
                week_data = data[data["Week"] == i]
                x_tech = week_data.reset_index(drop=True)
                #print(len(x_tech))
                x_tech = x_tech.drop(["Week", "Year", "US",	"JP", "EU", "CH", "GE"], axis=1)
                x_tech = x_tech.values
                x_tech = torch.tensor(x_tech, dtype=torch.float32)
                week_4_data_list.append(x_tech)

                # センチメントデータ
                ids_list = []
                for region in ["US", "JP", "EU", "CH", "GE"]:
                    # sentiment = week_data[week_data["Week"] == i][region][0]

                    sentiment = week_data[week_data["Week"] == i].iloc[0][region]

                    encoding = tokenizer(
                                        sentiment, 
                                        max_length = 850, 
                                        padding ="max_length", 
                                        truncation=True,
                                        return_tensors="pt"
                                        )
                    ids_list.append(encoding.input_ids)
                x_text_region = torch.stack(ids_list, dim=0)
                x_text_region_list.append(x_text_region)

            x_text_list.append(torch.stack(x_text_region_list, dim=0))
            x_tech_list.append(week_4_data_list)

        x_text = torch.stack(x_text_list, dim=0)
        t = torch.tensor(t_list, dtype=torch.float32)
        x_tech = x_tech_list
        return x_tech, x_text, t

# データ読み込み
data = pd.read_pickle('sentiment_tech_data.pkl')
# print(data)
financial_dataset = FinancialDataset(data)
epoch_x_tech, epoch_x_text, epoch_target = financial_dataset.get_epoch(32)
print("epoch_x_tech" , len(epoch_x_tech))
for i in range(len(epoch_x_tech)):
    print(len(epoch_x_tech[i]))
    x_tech = np.array(epoch_x_tech[i])

    print(x_tech)
    # リストをPyTorchテンソルに変換
    x_tech = torch.tensor(epoch_x_tech[i])
    x_text = torch.tensor(epoch_x_text[i])
    target = torch.tensor(epoch_target[i])



20
195
195
195
epoch_x_tech 7
32


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (32, 4) + inhomogeneous part.