In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from policy import *

In [2]:
def get_fully_observed_bandit(path='../data/letter-recognition.data'):
    """
    Load a multiclass classification dataset and recast it as a fully observed bandit problem.

    The file at ``path`` is read as a header-less CSV whose first column is the
    class label and whose remaining 16 columns are the features.  Every class
    is treated as an action: the action matching an example's label earns
    reward 1 and every other action earns reward 0.

    Parameters
    ----------
    path : str
        Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns
    -------
    contexts : pandas.DataFrame
        The 16 feature columns, named ``x0`` ... ``x15``, one row per example.
    full_rewards : numpy.ndarray
        Array of shape ``(n, k)``; row ``i`` is the one-hot encoding of the
        label of example ``i``, i.e. the reward of each of the ``k`` actions.
    best_actions : pandas.Series
        Integer class index of the rewarded action for each example.

    Raises
    ------
    ValueError
        If the file yields no examples, or if any label is missing or negative
        (such a label would otherwise be one-hot encoded into the wrong column).
    """
    df_l = pd.read_csv(path, names = ['a']+[f'x{i}' for i in range(16)])
    X = df_l.drop(columns=['a'])

    # Convert labels to integer class indices.
    y = df_l['a']
    # If y is not already a column of integers (that represent classes),
    # map each distinct label to a code 0..k-1 in sorted label order.
    if not is_integer_dtype(y.dtype):
        y = y.astype('category').cat.codes

    n = len(y)
    if n == 0:
        raise ValueError(f"no examples found in {path!r}")
    # Missing labels become code -1 after the categorical conversion; used as a
    # column index that would silently reward the *last* action, so reject it.
    if (y < 0).any():
        raise ValueError(f"missing or negative class labels found in {path!r}")

    ## Full rewards
    # Plain Python int so the class count is independent of the (possibly
    # narrow) integer dtype of the label codes.
    k = int(y.max()) + 1
    full_rewards = np.zeros([n, k])
    full_rewards[np.arange(n), y.to_numpy()] = 1
    contexts = X
    best_actions = y
    return contexts, full_rewards, best_actions

In [3]:
# Load the bandit data and read off its dimensions:
# n examples, k actions, d context features.
contexts, full_rewards, best_actions = get_fully_observed_bandit()
(n, k), (_, d) = full_rewards.shape, contexts.shape

# Describe the problem size and walk through the first example.
print(
    f"There are {k} actions, the context space is {d} dimensional, and there are {n} examples.",
    f"For example, the first item has context vector:\n{contexts.iloc[0:1]}.",
    f"The best action is {best_actions[0]}.  The reward for that action is 1 and all other actions get reward 0.",
    f"The reward information is store in full_rewards as the row\n{full_rewards[0]}.",
    sep="\n",
)

There are 26 actions, the context space is 16 dimensional, and there are 20000 examples.
For example, the first item has context vector:
   x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  x10  x11  x12  x13  x14  x15
0   2   8   3   5   1   8  13   0   6   6   10    8    0    8    0    8.
The best action is 19.  The reward for that action is 1 and all other actions get reward 0.
The reward information is store in full_rewards as the row
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0.].


In [4]:
## Choose train/test indices
# Seeded generator so the split is identical after every kernel restart.
# Referenced through `np.random` explicitly: the notebook never imports a bare
# `default_rng`, so the unqualified name only resolved if a star import
# happened to provide it.
rng = np.random.default_rng(7)
train_frac = 0.8
train_size = round(train_frac * n)
# Draw the training rows without replacement; every remaining row index forms
# the test split (both index sets contain unique values).
train_idx = rng.choice(n, size = train_size, replace = False)
test_idx = np.setdiff1d(np.arange(n), train_idx, assume_unique=True)

In [5]:
## Get train/test data
# Contexts and reward rows are stored as float tensors; the label tensors keep
# whatever integer dtype `best_actions` carries.
# NOTE(review): when labels were derived from category codes that dtype is a
# narrow integer — confirm downstream consumers cast to long where they index
# or compute a classification loss.
# NOTE(review): the context tensors are created with requires_grad=True —
# confirm gradients with respect to the inputs are actually needed.

# Training split
full_rewards_train = torch.tensor(full_rewards[train_idx], dtype=torch.float)
y_train = torch.tensor(best_actions.iloc[train_idx].to_numpy())
X_train = torch.tensor(
    contexts.iloc[train_idx].to_numpy(),
    dtype=torch.float,
    requires_grad=True,
)

# Test split
full_rewards_test = torch.tensor(full_rewards[test_idx], dtype=torch.float)
y_test = torch.tensor(best_actions.iloc[test_idx].to_numpy())
X_test = torch.tensor(
    contexts.iloc[test_idx].to_numpy(),
    dtype=torch.float,
    requires_grad=True,
)

In [6]:
## Get logging policy
# The logging policy is constructed with one action per class (k) and asked to
# select an action for every training context.
# NOTE(review): the second returned value is stored as `uniform_props`,
# presumably the propensity of each selected action — confirm in policy.py.
uniform_policy = UniformActionPolicy(num_actions=k)
uniform_actions, uniform_props = uniform_policy.select_actions(X_train)

## Build datasets
# Train rows: (context, logged action, logged propensity, best action, full rewards).
train_dataset = TensorDataset(
    X_train,
    uniform_actions,
    uniform_props,
    y_train,
    full_rewards_train,
)
# Test rows: (context, best action, full rewards) — no logged actions here.
test_dataset = TensorDataset(
    X_test,
    y_test,
    full_rewards_test,
)

In [7]:
# Persist both datasets to disk so later steps can load them from files.
torch.save(obj=train_dataset, f='../data/train_dataset.pt')
torch.save(obj=test_dataset, f='../data/test_dataset.pt')