### Huggingface load_dataset blog:
- https://huggingface.co/docs/datasets/v1.12.0/loading.html

### D4RL dataset repo:
- Original repo: https://github.com/Farama-Foundation/D4RL
- D4RL used DT example code : https://github.com/kzl/decision-transformer/blob/master/gym/data/download_d4rl_datasets.py

### Huggingface Decision Transformer explained:
- https://huggingface.co/blog/train-decision-transformers

In [27]:
import pickle as pkl
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
with open ('/home/berk/VS_Project/simglucose/examples/trajectories/hopper-medium-replay-v2.pkl', 'rb') as obj:
    df = pkl.load(obj)

In [12]:
class CustomDataset(Dataset):
    def __init__(self, observation, next_observation, action, reward, terminal):
        self.obs = observation
        self.nex_obs = next_observation
        self.actions = action
        self.rewards = reward
        self.terminals = terminal

    def __len__(self):
        return len(self.obs)

    def __getitem__(self, idx):
        s = self.obs[idx]
        n = self.nex_obs[idx]
        a = self.actions[idx]
        r = self.rewards[idx]
        d = self.terminals[idx]
        sample = {"observations": s, "next_observations": n, "actions":a, "rewards": r, "terminals":d}
        return sample


In [19]:
state = np.random.rand(1,2)
next_state = np.random.rand(1,2)
action = np.random.rand(1)
reward = np.random.rand(1)
done = False

memory = {'observations': [], 'next_observations':[], 'actions':[], 'rewards':[], 'terminals':[]}
for i in range(3):
    memory["observations"].append(state)
    memory["next_observations"].append(next_state)
    memory["actions"].append(action)
    memory["rewards"].append(reward)
    memory["terminals"].append(done)


In [45]:
def coll_function(batch):
    obs = batch["observations"]
    print(type(obs[0]))
    return obs

In [20]:
df = CustomDataset(memory["observations"],
                memory["next_observations"],
                memory["actions"],
                memory["rewards"],
                memory["terminals"])

In [46]:
# # Display text and label.
# print('\nFirst iteration of data set: ', next(iter(df)), '\n')
# # Print how many items are in the data set
# print('Length of data set: ', len(df), '\n')
# # Print entire data set
coll_function(df[0])
print('Entire data set: ', list(DataLoader(df, batch_size=3, shuffle=True, collate_fn=coll_function)), '\n')

<class 'numpy.ndarray'>


TypeError: list indices must be integers or slices, not str

In [17]:
# Import libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# create custom dataset class
class CustomTextDataset(Dataset):
    def __init__(self, text, labels):
        self.labels = labels
        self.text = text

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        label = self.labels[idx]
        data = self.text[idx]
        sample = {"Text": data, "Class": label}
        return sample

# define data and class labels
text = ['Happy', 'Amazing', 'Sad', 'Unhapy', 'Glum']
labels = ['Positive', 'Positive', 'Negative', 'Negative', 'Negative']

# create Pandas DataFrame
text_labels_df = pd.DataFrame({'Text': text, 'Labels': labels})

# define data set object
TD = CustomTextDataset(text_labels_df['Text'], text_labels_df['Labels'])

# Display image and label.
print('\nFirst iteration of data set: ', next(iter(TD)), '\n')

# Print how many items are in the data set
print('Length of data set: ', len(TD), '\n')

# Print entire data set
print('Entire data set: ', list(DataLoader(TD)), '\n')


# collate_fn
def collate_batch(batch):
    word_tensor = torch.tensor([[1.], [0.], [45.]])
    label_tensor = torch.tensor([[1.]])

    text_list, classes = [], []

    for (_text, _class) in batch:
        text_list.append(word_tensor)
        classes.append(label_tensor)

    text = torch.cat(text_list)
    classes = torch.tensor(classes)

    return text, classes

# create DataLoader object of DataSet object
bat_size = 2
DL_DS = DataLoader(TD, batch_size=bat_size, shuffle=True)

# loop through each batch in the DataLoader object
for (idx, batch) in enumerate(DL_DS):

    # Print the 'text' data of the batch
    print(idx, 'Text data: ', batch, '\n')

    # Print the 'class' data of batch
    print(idx, 'Class data: ', batch, '\n')


First iteration of data set:  {'Text': 'Happy', 'Class': 'Positive'} 

Length of data set:  5 

Entire data set:  [{'Text': ['Happy'], 'Class': ['Positive']}, {'Text': ['Amazing'], 'Class': ['Positive']}, {'Text': ['Sad'], 'Class': ['Negative']}, {'Text': ['Unhapy'], 'Class': ['Negative']}, {'Text': ['Glum'], 'Class': ['Negative']}] 

0 Text data:  {'Text': ['Amazing', 'Unhapy'], 'Class': ['Positive', 'Negative']} 

0 Class data:  {'Text': ['Amazing', 'Unhapy'], 'Class': ['Positive', 'Negative']} 

1 Text data:  {'Text': ['Glum', 'Sad'], 'Class': ['Negative', 'Negative']} 

1 Class data:  {'Text': ['Glum', 'Sad'], 'Class': ['Negative', 'Negative']} 

2 Text data:  {'Text': ['Happy'], 'Class': ['Positive']} 

2 Class data:  {'Text': ['Happy'], 'Class': ['Positive']} 

