In [27]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


DATA_DIR = 'gs://time_series_datasets'
LOCAL_CACHE_DIR = './data_loader/dataset/'

class DataExtractor:
    def __init__(self, df, row_length=10, tail_length=4):
        self.data = self.extract_contiguous_rows_with_stride(df, row_length, tail_length)

    def extract_contiguous_rows_with_stride(self, df, row_length=10, tail_length=4):
        num_rows = len(df)
        num_chunks = num_rows - row_length + 1

        contiguous_rows = []
        last_four_rows = []
        indices = []

        for i in range(num_chunks):
            chunk = df.iloc[i:i+row_length].values
            contiguous_rows.append(chunk[:row_length-tail_length])
            last_four_rows.append(chunk[-tail_length:])
            indices.append(i)  # Adding the index

        data = {
            "contiguous_rows": np.array(contiguous_rows),
            "last_four_rows": np.array(last_four_rows),
            "indices": np.array(indices)
        }

        return data

    def __len__(self):
        return len(self.data["indices"])

    def __getitem__(self, index):
        idx = self.data["indices"][index]
        return {
            "contiguous_rows": self.data["contiguous_rows"][idx],
            "last_four_rows": self.data["last_four_rows"][idx]
        }

In [28]:
class CustomDataset(Dataset):
    def __init__(self, data_extractor):
        self.data_extractor = data_extractor

    def __len__(self):
        return len(self.data_extractor)

    def __getitem__(self, index):
        data = self.data_extractor[index]
        contiguous_rows = torch.tensor(data["contiguous_rows"], dtype=torch.float32)
        last_four_rows = torch.tensor(data["last_four_rows"], dtype=torch.float32)
        return contiguous_rows, last_four_rows

In [29]:
class CustomDataModule(pl.LightningDataModule):
    def __init__(self, data, batch_size=32):
        super(CustomDataModule, self).__init__()
        self.data = data
        self.batch_size = batch_size

    def setup(self, stage=None):
        self.train_dataset = CustomDataset(self.data)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

In [30]:
import torch.nn.functional as F

class SimpleMLP(pl.LightningModule):
    def __init__(self, input_dim, output_dim):
        super(SimpleMLP, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.fc1 = torch.nn.Linear(input_dim, input_dim)
        self.fc2 = torch.nn.Linear(input_dim, output_dim)
        

    def forward(self, x):
        x= F.relu(self.fc1(x))
        x= self.fc2(x)
        return x

    def training_step(self, batch, batch_idx):
        contiguous_rows, last_four_rows = batch
        contiguous_rows = contiguous_rows.view(contiguous_rows.size(0), -1)
        last_four_rows = last_four_rows.view(last_four_rows.size(0), -1)
        #nputs = torch.cat((contiguous_rows, last_four_rows), dim=1)
        outputs = self(contiguous_rows)
        loss = torch.nn.functional.mse_loss(outputs, last_four_rows)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

# Example usage:
# Creating a sample DataFrame with 25 rows and 10 columns
data = np.random.rand(25, 10)
df = pd.DataFrame(data)

# Creating a DataExtractor instance
data_extractor = DataExtractor(df)

# Creating a LightningDataModule instance
data_module = CustomDataModule(data_extractor)

# Creating a SimpleMLP model
input_dim = 100 #10 * 10 * 2  # Input dimension after concatenating contiguous_rows and last_four_rows
output_dim = 40 # 10 * 4  # Output dimension (last_four_rows)
model = SimpleMLP(input_dim, output_dim)

# Training the model using PyTorch Lightning Trainer
trainer = pl.Trainer(max_epochs=10, accelerator="mps")  # Use gpus=0 if you don't have a GPU
trainer.fit(model, datamodule=data_module)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name | Type   | Params
--------------------------------
0 | fc1  | Linear | 10.1 K
1 | fc2  | Linear | 4.0 K 
--------------------------------
14.1 K    Trainable params
0         Non-trainable params
14.1 K    Total params
0.057     Total estimated model params size (MB)


Epoch 0:   0%|                                                                                                             | 0/1 [00:00<?, ?it/s]

RuntimeError: linear(): input and weight.T shapes cannot be multiplied (16x60 and 100x100)

In [14]:
model

SimpleMLP(
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=40, bias=True)
)

In [17]:
model.fc1.weight.shape

torch.Size([100, 100])

In [18]:
model.fc2.weight.shape

torch.Size([40, 100])

In [26]:
data_extractor[1]

{'contiguous_rows': array([[0.43454086, 0.91774252, 0.81755908, 0.88681113, 0.28718882,
         0.28449548, 0.19368366, 0.60960398, 0.73235801, 0.73489605],
        [0.76002761, 0.06592186, 0.88510859, 0.4896565 , 0.90979782,
         0.35089878, 0.90794758, 0.37193497, 0.17532796, 0.3264832 ],
        [0.44617834, 0.67250446, 0.57958535, 0.5784431 , 0.4682064 ,
         0.56831142, 0.5356919 , 0.746383  , 0.4658705 , 0.93253102],
        [0.32192509, 0.75438999, 0.87487555, 0.74173823, 0.2140094 ,
         0.38117118, 0.36246459, 0.95003304, 0.4016367 , 0.96532168],
        [0.31361665, 0.25050078, 0.8575137 , 0.38618546, 0.98043371,
         0.03147002, 0.53097426, 0.25311736, 0.43975443, 0.49740552],
        [0.83072649, 0.62504849, 0.98332652, 0.39593384, 0.30251191,
         0.02406806, 0.95300034, 0.29199166, 0.78709345, 0.04215947],
        [0.42560292, 0.65642105, 0.0292086 , 0.80586793, 0.96638566,
         0.94847003, 0.34266104, 0.39191948, 0.80315779, 0.03798605],
        