In [1]:
import abc

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeCV, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
from pandas.api.types import is_integer_dtype
import numpy as np
from numpy.random import default_rng
from scipy.special import expit
import seaborn as sns
import warnings;
warnings.filterwarnings('ignore');
import sys

In [2]:
def get_fully_observed_bandit(path='data/letter-recognition.data'):
    """
    Load a multiclass classification dataset and reformulate it as a fully
    observed bandit problem: each class is an action, and the reward is 1 for
    the action matching the example's label and 0 for every other action.

    Args:
        path (str): location of a header-less CSV whose first column is the
            class label and whose next 16 columns are the features. Defaults
            to the letter-recognition data file used by this notebook.

    Returns:
        contexts (pd.DataFrame): the 16 feature columns, named x0..x15.
        full_rewards (np.array): 2-dim array with one row per example and one
            column per action; holds 1 in the column of the correct action.
        best_actions (pd.Series): integer action (class code) for each example.
    """
    df_l = pd.read_csv(path,
                       names=['a'] + [f'x{i}' for i in range(16)])
    contexts = df_l.drop(columns=['a'])

    # Labels must be integer class ids; anything else (e.g. letters) is
    # mapped to category codes 0..k-1.
    best_actions = df_l['a']
    if not is_integer_dtype(best_actions.dtype):
        best_actions = best_actions.astype('category').cat.codes

    ## Full rewards: one-hot encode the best action of every example
    n = len(best_actions)
    # int() guards against small-integer dtypes (cat codes are often int8).
    k = int(best_actions.max()) + 1
    full_rewards = np.zeros([n, k])
    full_rewards[np.arange(n), best_actions.to_numpy()] = 1
    return contexts, full_rewards, best_actions

In [3]:
# Load the fully observed bandit, record its dimensions, and describe the first example.
contexts, full_rewards, best_actions = get_fully_observed_bandit()
n, k = full_rewards.shape
_, d = contexts.shape
print("\n".join([
    f"There are {k} actions, the context space is {d} dimensional, and there are {n} examples.",
    f"For example, the first item has context vector:\n{contexts.iloc[0:1]}.",
    f"The best action is {best_actions[0]}.  The reward for that action is 1 and all other actions get reward 0.",
    f"The reward information is store in full_rewards as the row\n{full_rewards[0]}.",
]))

There are 26 actions, the context space is 16 dimensional, and there are 20000 examples.
For example, the first item has context vector:
   x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  x10  x11  x12  x13  x14  x15
0   2   8   3   5   1   8  13   0   6   6   10    8    0    8    0    8.
The best action is 19.  The reward for that action is 1 and all other actions get reward 0.
The reward information is store in full_rewards as the row
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0.].


In [4]:
## Choose train/test indices
# Fixed seed so the same split is drawn on every fresh run.
rng = default_rng(7)
train_frac = 0.5
train_size = round(train_frac * n)
# Train rows are sampled without replacement; test rows are every remaining index.
train_idx = rng.choice(n, size = train_size, replace = False)
test_idx = np.setdiff1d(np.arange(n), train_idx, assume_unique=True)

In [5]:
class Policy:
    """
    Base class for contextual-bandit policies over integer-coded actions.

    Subclasses override get_action_distribution; propensities, sampling and
    value estimation are all derived from that distribution.
    """
    def __init__(self, num_actions=2):
        self.num_actions = num_actions

    # Policy does not use ABCMeta, so this decorator is not enforced at
    # instantiation time; the NotImplementedError below is the actual guard.
    @abc.abstractmethod
    def get_action_distribution(self, X):
        """   
        This method is intended to be overridden by each implementation of Policy.

        Args:
            X (pd.DataFrame): contexts

        Returns:
            2-dim numpy array with the same number of rows as X and self.num_actions columns. 
                Each row gives the policy's probability distribution over actions conditioned on the context in the corresponding row of X
        """   
        raise NotImplementedError("Must override method")

    def get_action_propensities(self, X, actions):
        """   
        Args:
            X (pd.DataFrame): contexts, rows correspond to entries of actions
            actions (np.array): actions taken, represented by integers, corresponding to rows of X

        Returns:
            1-dim numpy array of probabilities (same size as actions) for taking each action in its corresponding context
        """   
        action_distribution = self.get_action_distribution(X)
        # np.asarray lets callers pass a list or pandas Series, not only an ndarray.
        action_columns = np.asarray(actions).reshape(-1, 1)
        return np.take_along_axis(action_distribution, action_columns, axis=1).flatten()

    def select_actions(self, X, rng=default_rng(1)):
        """   
        Args:
            X (pd.DataFrame): contexts, rows correspond to entries of actions and propensities returned
            rng: random generator used to draw the actions. The default generator is created once,
                when the method is defined, and is shared by every call that does not pass its own rng,
                so such calls continue one common random stream.

        Returns:
            actions (np.array): 1-dim numpy array of length equal to the number of rows of X.  Each entry is an integer indicating the action selected for the corresponding context in X. 
                The action is selected randomly according to the policy, conditional on the context specified in the appropriate row of X.
            propensities (np.array): 1-dim numpy array of length equal to the number of rows of X; gives the propensity for each action selected in actions

        """   
        action_distribution = self.get_action_distribution(X)
        num_contexts, num_actions = action_distribution.shape
        # Draw one action per row from that row's distribution, using the
        # caller-supplied generator. fromiter with an explicit integer dtype
        # keeps the result an index array even when X has no rows.
        actions = np.fromiter(
            (rng.choice(num_actions, p=row) for row in action_distribution),
            dtype=np.intp,
            count=num_contexts,
        )
        # Read the propensities from the distribution already computed rather
        # than evaluating the policy a second time.
        propensities = action_distribution[np.arange(num_contexts), actions]
        assert len(actions) == len(propensities) == X.shape[0]

        return actions, propensities

    def get_value_estimate(self, X, full_rewards):
        """   
        Args:
            X (pd.DataFrame): contexts, rows correspond to entries of full_rewards
            full_rewards (np.array): 2-dim numpy array with the same number of rows as X and self.num_actions columns; 
                each row gives the rewards that would be received for each action for the context in the corresponding row of X.
                This would only be known in a full-feedback bandit, or estimated in a direct method

        Returns:
            scalar value giving the expected average reward received for playing the policy for contexts X and the given full_rewards

        """   
        # The expectation is computed exactly from the action distribution, so
        # no actions need to be sampled here.
        action_distribution = self.get_action_distribution(X)
        return (full_rewards * action_distribution).sum() / X.shape[0]


class UniformActionPolicy(Policy):
    """Context-independent policy that plays every action with probability 1/num_actions."""

    def __init__(self, num_actions=2):
        self.num_actions = num_actions

    def get_action_distribution(self, X):
        """Return an array with one row per row of X and num_actions columns, every entry 1/num_actions."""
        uniform_prob = 1.0 / self.num_actions
        shape = (X.shape[0], self.num_actions)
        return np.ones(shape) * uniform_prob

In [6]:
# Materialise the train/test splits as numpy arrays, keep the test rewards,
# and build the uniform baseline policy over all k actions.
X_train, X_test = (contexts.iloc[idx].to_numpy() for idx in (train_idx, test_idx))
y_train, y_test = (best_actions.iloc[idx].to_numpy() for idx in (train_idx, test_idx))
full_rewards_test = full_rewards[test_idx]

uniform_policy = UniformActionPolicy(num_actions=k)

In [7]:
class SKLearnPolicy(Policy):
    """ 
    An SKLearnPolicy uses a scikit learn model to generate an action distribution.  If the SKLearnPolicy is built with is_deterministic=False, 
    then the predict distribution for a context x should be whatever predict_proba for the model returns.  If is_deterministic=True, then all the probability mass 
    should be concentrated on whatever predict of the model returns.

    NOTE(review): this assumes the model's labels are the integer action ids 0..num_actions-1 and that
    predict_proba returns one column per action in that order -- confirm for models trained on data
    that does not contain every action.
    """
    def __init__(self, model, num_actions=2, is_deterministic=False):
        self.is_deterministic = is_deterministic
        self.num_actions = num_actions
        self.model = model

    def get_action_distribution(self, X):
        """
        Args:
            X: contexts accepted by the wrapped model

        Returns:
            2-dim numpy array with one row per context: a one-hot row on the model's prediction
            when is_deterministic is True, otherwise the model's predict_proba output.
        """
        if self.is_deterministic:
            predictions = self.model.predict(X)
            # Row i of the identity matrix is the one-hot vector for action i.
            return np.eye(self.num_actions)[predictions.reshape(-1)]
        return self.model.predict_proba(X)

    def select_actions(self, X, rng=default_rng(1)):
        """
        Args:
            X: contexts accepted by the wrapped model
            rng: random generator forwarded to the base-class sampler; not used when is_deterministic is True

        Returns:
            actions (np.array): selected action for each context
            propensities (np.array): probability with which each selected action was chosen
        """
        if self.is_deterministic:
            # The predicted action is played with certainty.
            actions = self.model.predict(X)
            propensities = np.full(len(actions), 1.0)
            return actions, propensities
        # Forward the caller's generator instead of silently falling back to the default one.
        return super().select_actions(X, rng=rng)

In [8]:
# Fit a logistic regression on the training split and wrap it as two policies:
# one that samples from predict_proba and one that always plays the prediction.
# The `multi_class='multinomial'` argument is omitted on purpose: it is
# deprecated in recent scikit-learn releases, and the default solver already
# fits a multinomial model when the target has more than two classes.
model = LogisticRegression()
model.fit(X_train, y_train)
policy_stochastic = SKLearnPolicy(model=model, num_actions=k, is_deterministic=False)
policy_deterministic = SKLearnPolicy(model=model, num_actions=k, is_deterministic=True)

In [9]:
# Probability the stochastic model policy assigns to the action in y_test for each test context.
pi_w = policy_stochastic.get_action_propensities(X_test, y_test)

In [10]:
# Same propensities under the uniform policy (1/k for every action).
pi_0 = uniform_policy.get_action_propensities(X_test, y_test)

In [11]:
# Display the stochastic policy's propensities.
pi_w

array([0.96096041, 0.44768231, 0.54512254, ..., 0.06543477, 0.95818714,
       0.95826883])

In [12]:
# Display the uniform policy's propensities.
pi_0

array([0.03846154, 0.03846154, 0.03846154, ..., 0.03846154, 0.03846154,
       0.03846154])

In [13]:
# Reload the train/test arrays from saved .npy files. This rebinds the names
# that were assigned earlier from the in-memory split.
# NOTE(review): presumably these files hold the same split as above -- confirm.
X_train = np.load("data/X_train.npy")
y_train = np.load("data/y_train.npy")
X_test = np.load("data/X_test.npy")
y_test = np.load("data/y_test.npy")
full_rewards_test = np.load("data/full_rewards_test.npy")

In [17]:
# Sanity check: each split must pair exactly one label with every context row.
assert all(
    features.shape[0] == labels.shape[0]
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

class MyDataset(Dataset):
    """Map-style dataset that pairs each row of `data` with the matching entry of `targets`."""

    def __init__(self, data, targets):
        # Both inputs are stored as 64-bit integer tensors.
        self.data = torch.LongTensor(data)
        self.targets = torch.LongTensor(targets)

    def __len__(self):
        """Number of examples, taken from the first dimension of the data tensor."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (features, target) pair stored at `index`."""
        return self.data[index], self.targets[index]


# Wrap the training arrays in a dataset and iterate over them in batches of five.
dataset = MyDataset(data=X_train, targets=y_train)
dataloader = DataLoader(dataset=dataset, batch_size=5)

In [24]:
# Peek at the first (features, targets) mini-batch produced by the loader.
next(iter(dataloader))

[tensor([[ 3,  5,  3,  6,  3,  9,  6,  6,  3,  8,  6, 11,  3,  9,  5,  8],
         [ 2,  4,  4,  3,  2,  7,  2,  2,  2,  5,  2,  8,  2,  5,  2,  7],
         [ 5, 11,  8,  8,  4,  8,  8,  5,  2,  7,  8,  8,  9,  9,  0,  8],
         [ 7, 10,  9,  7,  7,  9,  6,  4,  7,  9,  5,  6,  2,  8,  7, 10],
         [ 4,  7,  6,  5,  4,  7,  9,  2,  9, 11,  8,  5,  1,  8,  6,  5]]),
 tensor([16,  0, 22,  1, 25])]

In [27]:
# Sample one action per training context from the stochastic policy, along with its propensity.
actions, props = policy_stochastic.select_actions(X_train)

In [28]:
# Display the sampled actions.
actions

array([16,  0, 22, ...,  4,  3, 16])

In [29]:
# Display the propensities of the sampled actions.
props

array([0.94485422, 0.87272023, 0.64129014, ..., 0.83652646, 0.30229926,
       0.55424757])

In [37]:
# Bandit feedback: reward is 1.0 where the sampled action equals the label in y_train, else 0.0.
rewards = (actions == y_train).astype(float)

In [38]:
# Display the observed rewards.
rewards

array([1., 1., 1., ..., 1., 0., 0.])

In [39]:
# Random 3x5 tensor used for the gather example in the next cell.
x = torch.rand(size=(3, 5))
x

tensor([[0.3023, 0.5194, 0.0963, 0.6305, 0.2436],
        [0.1353, 0.4597, 0.0173, 0.1843, 0.2086],
        [0.6548, 0.4161, 0.2754, 0.6642, 0.6489]])

In [54]:
# Pick one entry per row (columns 0, 1 and 4 for rows 0, 1 and 2): the torch
# counterpart of np.take_along_axis(..., axis=1) followed by flatten().
x.gather(1, torch.LongTensor([0,1,4]).unsqueeze(1)).flatten()

tensor([0.3023, 0.4597, 0.6489])