In [1]:
import abc
import torch
from torch import nn

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeCV, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
from pandas.api.types import is_integer_dtype
import numpy as np
from numpy.random import default_rng
from scipy.special import expit
import seaborn as sns
import warnings;
warnings.filterwarnings('ignore');
import sys

## Get Data

In [2]:
def get_fully_observed_bandit():
    """
    This loads in a multiclass classification problem and reformulates it as a fully observed bandit problem.
    
    """
    df_l = pd.read_csv('data/letter-recognition.data',
                       names = ['a']+[f'x{i}' for i in range(16)])
    X = df_l.drop(columns=['a'])

    # Convert labels to ints and one-hot
    y = df_l['a']
    # if y is not column of integers (that represent classes), then convert
    if not is_integer_dtype(y.dtype):
        y = y.astype('category').cat.codes

    ## Full rewards
    n = len(y)
    k = max(y)+1
    full_rewards = np.zeros([n, k])
    full_rewards[np.arange(0,n),y] = 1
    contexts = X
    best_actions = y
    return contexts, full_rewards, best_actions

In [3]:
contexts, full_rewards, best_actions = get_fully_observed_bandit()
n, k = full_rewards.shape
_, d = contexts.shape
print(f"There are {k} actions, the context space is {d} dimensional, and there are {n} examples.")
print(f"For example, the first item has context vector:\n{contexts.iloc[0:1]}.")
print(f"The best action is {best_actions[0]}.  The reward for that action is 1 and all other actions get reward 0.")
print(f"The reward information is store in full_rewards as the row\n{full_rewards[0]}.")

There are 26 actions, the context space is 16 dimensional, and there are 20000 examples.
For example, the first item has context vector:
   x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  x10  x11  x12  x13  x14  x15
0   2   8   3   5   1   8  13   0   6   6   10    8    0    8    0    8.
The best action is 19.  The reward for that action is 1 and all other actions get reward 0.
The reward information is store in full_rewards as the row
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0.].


In [4]:
## Choose train/test indices
rng = default_rng(7)
train_frac = 0.5
train_size = round(train_frac * n)
train_idx = rng.choice(n, size = train_size, replace = False)
test_idx = np.setdiff1d(np.arange(n), train_idx, assume_unique=True)

In [5]:
X_train = torch.tensor(contexts.iloc[train_idx].to_numpy(), dtype=torch.float, requires_grad=True)
# y_train = torch.tensor(best_actions.iloc[train_idx].to_numpy(), requires_grad=True)

X_test = torch.tensor(contexts.iloc[test_idx].to_numpy(), dtype=torch.float, requires_grad=True)
# y_test = torch.tensor(best_actions.iloc[test_idx].to_numpy(), requires_grad=True)
full_rewards_test = torch.tensor(full_rewards[test_idx], requires_grad=True)

# R = torch.nn.Embedding.from_pretrained(full_rewards_test)

## Policy

In [6]:
class Policy(nn.Module):
    def __init__(self, num_actions=2):
        super(Policy, self).__init__()
        self.num_actions = num_actions

    @abc.abstractmethod
    def get_action_distribution(self, X):
        """   
        This method is intended to be overridden by each implementation of Policy.

        Args:
            X (pd.DataFrame): contexts

        Returns:
            2-dim numpy array with the same number of rows as X and self.num_actions columns. 
                Each rows gives the policy's probability distribution over actions conditioned on the context in the corresponding row of X
        """   
        raise NotImplementedError("Must override method")

    def get_action_propensities(self, X, actions):
        """   
        Args:
            X (pd.DataFrame): contexts, rows correspond to entries of actions
            actions (np.array): actions taken, represented by integers, corresponding to rows of X

        Returns:
            1-dim numpy array of probabilities (same size as actions) for taking each action in its corresponding context
        """   
        ## TODO
        Pi = self.get_action_distribution(X)
        propensities = torch.tensor([Pi[i][a] for i, a in enumerate(actions)], requires_grad=True)
        return propensities
    
    def select_actions(self, X, rng=default_rng(1)):
        """   
        Args:
            X (pd.DataFrame): contexts, rows correspond to entries of actions and propensities returned

        Returns:
            actions (np.array): 1-dim numpy array of length equal to the number of rows of X.  Each entry is an integer indicating the action selected for the corresponding context in X. 
                The action is selected randomly according to the policy, conditional on the context specified in the appropriate row of X.
            propensities (np.array): 1-dim numpy array of length equal to the number of rows of X; gives the propensity for each action selected in actions

        """   
        ## TODO
        Pi = self.get_action_distribution(X)
        actions = [np.random.choice(range(self.num_actions), p=Pi_i.detach().numpy()) for Pi_i in Pi]
        actions = torch.tensor(actions)
        propensities = self.get_action_propensities(X, actions)
        return actions, propensities
        
    def get_value_estimate(self, X, full_rewards):
        """   
        Args:
            X (pd.DataFrame): contexts, rows correspond to entries of full_rewards
            full_rewards (np.array): 2-dim numpy array with the same number of rows as X and self.num_actions columns; 
                each row gives the rewards that would be received for each action for the context in the corresponding row of X.
                This would only be known in a full-feedback bandit, or estimated in a direct method

        Returns:
            scalar value giving the expected average reward received for playing the policy for contexts X and the given full_rewards

        """   
        ## TODO
        Pi = self.get_action_distribution(X)
        value = torch.sum(Pi * full_rewards, axis=1).mean()
        return value


############################################################ 
class UniformActionPolicy(Policy):
    def __init__(self, num_actions=2):
        self.num_actions = num_actions

    def get_action_distribution(self, X):
        ## TODO
        p = 1 / self.num_actions
        Pi = torch.zeros([X.shape[0], self.num_actions]) + p
        return Pi
    
############################################################
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        outputs = self.linear(x)
        outputs = self.softmax(outputs)
        return outputs

class LogisticPolicy(Policy):
    def __init__(self, num_actions, num_features):
        super(LogisticPolicy, self).__init__()
        self.num_actions = num_actions
        self.model = LogisticRegression(num_features, num_actions)
    
    def get_action_distribution(self, X):
        Pi = self.model(X)
        return Pi
    

In [7]:
def get_rewards_vector(full_rewards, actions):
    return torch.tensor([full_rewards[i][a]for i, a in enumerate(actions)], requires_grad=True)

def snips_loss(pi_w, pi_0, r):
    return - torch.mean(r * pi_w / pi_0)

In [30]:
X = X_test

# initialize logging policy (Pi_0) and target policy (Pi_w)
uniform_policy = UniformActionPolicy(num_actions=k)
logistic_policy = LogisticPolicy(num_actions=k, num_features=X_train.shape[1])

# print('Weight at the beginning:')
# print(logistic_policy.model.linear.weight)
# print()
a = logistic_policy.model.linear.weight.clone()

# initialize optimizer
optimizer = torch.optim.SGD(logistic_policy.parameters(), lr=100)

# generate actions from logging policy & get Pi_0
actions, pi_0 = uniform_policy.select_actions(X)

# get Pi_w
pi_w = logistic_policy.get_action_propensities(X, actions)

# get rewords
r = get_rewards_vector(full_rewards_test, actions)

# calculate loss
loss = snips_loss(pi_w, pi_0, r)

# update parameters
optimizer.zero_grad()
for param in logistic_policy.parameters():
    param.retain_grad()
loss.backward()
optimizer.step()

# print('Weight after learning:')
# print(logistic_policy.model.linear.weight)
# print()
b = logistic_policy.model.linear.weight.clone()

In [31]:
print(loss.grad)

None


In [32]:
print(loss.data)

tensor(-0.0384, dtype=torch.float64)
