In [1]:
import numpy as np
import pandas as pd
import os 
import sys
import torch

DATASETS = '../datasets/'

def read_file(dir_name):
    main_df = pd.DataFrame()
    directory = DATASETS + dir_name
    for filename in os.listdir(directory):
        data = np.load(os.path.join(directory, filename))
        data_dict = {}
        for keys in data.keys():
            data_dict[keys] = list(data[keys])
        df = pd.DataFrame.from_dict(data_dict)
        # main_df = pd.concat([main_df, df])
    return df

if __name__ == '__main__':
    dir_name = 'mw/cls_basketball-v2'
    # print([name for name in os.listdir("../datasets/mw") if os.path.isdir(name)])
    read_file(dir_name)

In [2]:
data = read_file(dir_name)
data.reset_index(drop=True, inplace=True)

In [3]:
data.loc[data.done==True]

Unnamed: 0,obs,action,reward,done,discount
107,"[0.04225361, 0.6922199, 0.3393161, 0.6348106, ...","[0.20169806, 0.09580077, 0.34795704, 0.57062256]",10.0,True,1.0
217,"[0.03985697, 0.70180416, 0.34516203, 0.6245957...","[-0.092127666, 0.016195506, 0.04491708, 0.509507]",10.0,True,1.0
318,"[0.041954525, 0.69937974, 0.34240845, 0.627575...","[0.04885615, -0.13805023, 0.20181242, 0.6775277]",10.0,True,1.0
429,"[0.0426317, 0.6936216, 0.34391972, 0.6367297, ...","[0.079129174, -0.046790846, 0.15598248, 0.6076...",10.0,True,1.0
541,"[0.045894675, 0.6963733, 0.34360257, 0.635374,...","[0.08643522, -0.062440854, 0.2643433, 0.6480871]",10.0,True,1.0
660,"[0.038542774, 0.69685125, 0.3427321, 0.6403711...","[0.12917, 0.014268942, 0.23423716, 0.53333265]",10.0,True,1.0
767,"[0.046851985, 0.69781697, 0.34231716, 0.630933...","[-0.109134324, 0.047157288, 0.18217964, 0.6325...",10.0,True,1.0
877,"[0.045193467, 0.6921859, 0.34096196, 0.628385,...","[0.032395396, -0.05911544, 0.3193123, 0.7464293]",10.0,True,1.0
977,"[0.045409724, 0.6938885, 0.3422801, 0.62775636...","[0.038899023, 0.13717479, 0.46352172, 0.5921459]",10.0,True,1.0
1085,"[0.040127266, 0.69905657, 0.34206805, 0.624083...","[-0.08430083, -0.023767652, 0.39234892, 0.4676...",10.0,True,1.0


In [4]:
import itertools
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class YourModel(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(YourModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x


class YourDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class PreferenceMAML:
    def __init__(
        self,
        ml10,
        input_size,
        hidden_size1,
        hidden_size2,
        output_size,
        num_support=10,
        num_query=10,
        num_inner_steps=5,
        inner_lr=0.1,
        learn_inner_lr=True,
        **kwargs,
    ):
        self.inner_lr = inner_lr
        self.learn_inner_lr = learn_inner_lr
        self.ml10 = ml10
        self.reward_criterion = nn.BCELoss()
        self.num_support = num_support
        self.num_query = num_query
        self.num_inner_steps = num_inner_steps

        self.model = YourModel(input_size, hidden_size1, hidden_size2, output_size)

    def construct_episodes(self):
        episodes = []
        episode = []
        for _, row in self.ml10.iterrows():
            episode.append(row)
            if row['done']:
                episodes.append(episode)
                episode = []
        return episodes

    def form_sigma_groups(self, episode):
        sigmas = []
        for i in range(len(episode) - 1):
            sigma1 = episode[:i+1]
            sigma2 = episode[i+1:]
            sigmas.append((sigma1, sigma2))
        return sigmas

    def compare_probabilities(self, sigma1, sigma2):
        exp_sum_rewards_sigma1 = np.exp(sum(row['reward'] for row in sigma1))
        exp_sum_rewards_sigma2 = np.exp(sum(row['reward'] for row in sigma2))
        prob = exp_sum_rewards_sigma1 / (exp_sum_rewards_sigma1 + exp_sum_rewards_sigma2)
        return [0,1] if prob > 0.5 else [1,0]

    def prepare_data(self):
        X = []
        y = []
        episodes = self.construct_episodes()
        for episode in episodes:
            sigmas = self.form_sigma_groups(episode)
            for sigma1, sigma2 in sigmas:
                obs_action_sigma1 = []
                for row in sigma1:
                    obs_action = list(row['obs']) + list(row['action'])  # Concatenate obs and action
                    obs_action_sigma1.append(obs_action)
                X.append(obs_action_sigma1)
                # Target value (y) for each sigma comparison
                y.append([self.compare_probabilities(sigma1, sigma2)])  # Wrap in a list to match output shape
        return X, y




    def setup_optimizers(self, optim_class, optim_kwargs):
        self.optim = optim_class(self.model.parameters(), **optim_kwargs)

    def _train_step(self, X, y):
        self.optim.zero_grad()
        loss = self._outer_step(X, y)
        loss.backward()
        self.optim.step()
        return loss.item()

    def _outer_step(self, X, y):
        outer_losses = []
        for i in range(len(X)):
            loss = self._compute_loss(X[i], y[i])
            outer_losses.append(loss)
        return torch.mean(torch.stack(outer_losses))

    def _compute_loss(self, X, y):
        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor([y], dtype=torch.float32)
        output = self.model(X_tensor)

        output_flat = output.view(-1)
        y_flat = y_tensor.view(-1)

        loss = self.reward_criterion(output_flat[-2:], y_flat)
        return loss

# Example usage
ml10 = data.copy()  # Assume ml10 is a DataFrame with obs, action, reward, done columns
input_size = 43  # Assuming obs has 39 numbers and action has 4 numbers
hidden_size1 = 128
hidden_size2 = 128
output_size = 2
num_epochs = 20
model = PreferenceMAML(ml10, input_size, hidden_size1, hidden_size2, output_size)
model.setup_optimizers(optim.Adam, {"lr": 0.001})

# Prepare data
X, y = model.prepare_data()

# Train the model
for epoch in range(num_epochs):
    loss = model._train_step(X, y)
    print(f"Epoch {epoch+1}, Loss: {loss}")


Epoch 1, Loss: 0.6941280961036682
Epoch 2, Loss: 0.673733651638031
Epoch 3, Loss: 0.6542643904685974
Epoch 4, Loss: 0.6355992555618286
Epoch 5, Loss: 0.6176444888114929
Epoch 6, Loss: 0.600238561630249
Epoch 7, Loss: 0.5830855965614319
Epoch 8, Loss: 0.5663089752197266
Epoch 9, Loss: 0.549755871295929
Epoch 10, Loss: 0.5332930684089661
Epoch 11, Loss: 0.5169082880020142
Epoch 12, Loss: 0.500662088394165
Epoch 13, Loss: 0.48463568091392517
Epoch 14, Loss: 0.4688534736633301
Epoch 15, Loss: 0.4533584713935852
Epoch 16, Loss: 0.43828192353248596
Epoch 17, Loss: 0.42373624444007874
Epoch 18, Loss: 0.40986621379852295
Epoch 19, Loss: 0.39680245518684387
Epoch 20, Loss: 0.38462498784065247


In [5]:
# # def preprocess_df(df, num_segments):
# df = data.copy()
# num_segments = 4

# episodes = []
# current_episode = []

# for index, row in df.iterrows():
#     current_episode.append(row)
#     if row['done'] or index == len(df) - 1:
#         D = []
#         segments = np.array_split(current_episode, num_segments)
#         for i in range(len(segments) - 1):
#             for j in range(i, len(segments) - 1):
#                 sigma1 = pd.DataFrame(segments[i])
#                 sigma2 = pd.DataFrame(segments[j])
#                 reward_sum_sigma1 = sum(sigma1[2])
#                 reward_sum_sigma2 = sum(sigma2[2])
#                 p = torch.tensor([[np.exp(reward_sum_sigma1) / (np.exp(reward_sum_sigma1) + np.exp(reward_sum_sigma2))]])
#                 y = torch.tensor([1]) if p.item() >= 0.5 else torch.tensor([0])

#                 s1_obs = torch.tensor(sigma1[0])
#                 s1_act = torch.tensor(sigma1[1])
#                 s1_reward = torch.tensor(sigma1[2])

#                 s2_obs = torch.tensor(sigma2[0])
#                 s2_act = torch.tensor(sigma2[1])
#                 s2_reward = torch.tensor(sigma2[2])

                
                
#                 d = (p, y)
#                 D.append(d)

#                 # x_obs = torch.tensor(sigma1[0].tolist())
#                 # x_act = torch.tensor(sigma1[1].tolist())
#                 # y_support = torch.tensor([[1, 0]]) if p.item() > 0.5 else torch.tensor([[0, 1]])
#                 # x_query = torch.tensor(sigma2[0].tolist())
#                 # y_query = torch.tensor([0]) if p.item() > 0.5 else torch.tensor([1])
#                 # episodes.append(((x_obs, x_act), y_support, (x_obs, x_act), y_query))
#         episodes.append(D)
#         current_episode = []

#     # return episodes

In [6]:
# episodes[0]

In [7]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import numpy as np
# import torch.nn.functional as F

# # Define your neural network architecture
# class Policy(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(Policy, self).__init__()
#         self.fc1 = nn.Linear(input_size, hidden_size)
#         self.fc2 = nn.Linear(hidden_size, hidden_size)
#         self.fc3 = nn.Linear(hidden_size, output_size)
#         self.double()

#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.fc3(x)
#         return x

# # Define MAML algorithm
# def maml(epochs_inner, epochs_outer, episodes, alpha_inner, beta_outer, num_inner_updates):
#     # Initialize parameters
#     input_size =  1 # specify the input size of your neural network
#     output_size = 2 # specify the output size of your neural network
    
#     # Initialize policy network
#     policy_net = Policy(input_size, 128, output_size)
    
#     # Initialize optimizer
#     optimizer_outer = optim.Adam(policy_net.parameters(), lr=beta_outer)
    
#     # Define loss function
#     criterion = nn.CrossEntropyLoss()
    
#     # Outer loop
#     for epoch_outer in range(epochs_outer):
        
#         # Initialize meta-gradients
#         meta_gradients = []
        
#         # Inner loop
#         for episode in episodes:
#             # Clone policy network
#             policy_net_clone = Policy(input_size, 128, output_size)
#             policy_net_clone.load_state_dict(policy_net.state_dict())
            
#             # Initialize inner optimizer
#             optimizer_inner = optim.Adam(policy_net_clone.parameters(), lr=alpha_inner)
            
#             # Inner loop updates
#             for _ in range(num_inner_updates):
#                 input_data, target_data = ([episode[i][0] for i in range(len(episode))], [episode[i][1] for i in range(len(episode))])
                
#                 predictions = policy_net_clone(torch.tensor(np.array(input_data)))
#                 predictions = predictions.view(-1, predictions.shape[-1])

#                 loss = criterion(predictions, torch.tensor(target_data))
                
#                 optimizer_inner.zero_grad()


#                 loss.backward()
#                 optimizer_inner.step()
            
#             # Compute gradient of loss w.r.t. initial parameters
#             print(torch.autograd.grad(loss, policy_net.parameters(), allow_unused=False))
#             meta_gradients.append(torch.autograd.grad(loss, policy_net.parameters(), allow_unused=False))
#             print(meta_gradients)
        
#         # Update policy network using meta-gradients
#         mean_gradients = np.mean(meta_gradients, axis=0)
#         for param, grad in zip(policy_net.parameters(), mean_gradients):
#             param.data -= beta_outer * grad[0]
        
#         # Print loss after outer loop
#         print(f'Epoch {epoch_outer+1}/{epochs_outer}, Loss: {loss.item()}')


# # Define hyperparameters
# epochs_inner = 5
# epochs_outer = 100
# alpha_inner = 0.01
# beta_outer = 0.001
# num_inner_updates = 100

# # Run MAML
# model = maml(epochs_inner, epochs_outer, episodes, alpha_inner, beta_outer, num_inner_updates)


In [8]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import numpy as np

# # Define the reward function network
# class RewardFunction(nn.Module):
#     def __init__(self, state_dim, action_dim):
#         super(RewardFunction, self).__init__()
#         self.fc1 = nn.Linear(state_dim + action_dim, 64)
#         self.fc2 = nn.Linear(64, 1)
    
#     def forward(self, state, action):
#         x = torch.cat([state, action], dim=1)
#         x = torch.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x

# # Define the preference predictor
# class PreferencePredictor(nn.Module):
#     def __init__(self, state_dim, action_dim):
#         super(PreferencePredictor, self).__init__()
#         self.fc1 = nn.Linear(state_dim + action_dim, 64)
#         self.fc2 = nn.Linear(64, 1)
    
#     def forward(self, state1, action1, state2, action2):
#         x1 = torch.cat([state1, action1], dim=1)
#         x2 = torch.cat([state2, action2], dim=1)
#         x = torch.sigmoid(self.fc1(x1) - self.fc1(x2))
#         return x

# # Define the MAML algorithm
# def maml_update(reward_net, preference_predictor, task_data, alpha, beta):
#     reward_net_copy = RewardFunction(state_dim, action_dim)
#     reward_net_copy.load_state_dict(reward_net.state_dict())
#     optimizer = optim.Adam(reward_net_copy.parameters(), lr=alpha)
    
#     for _ in range(num_inner_updates):
#         optimizer.zero_grad()
#         loss = preference_loss(reward_net_copy, preference_predictor, task_data)
#         loss.backward()
#         optimizer.step()
    
#     outer_loss = preference_loss(reward_net_copy, preference_predictor, task_data)
#     reward_net_parameters = reward_net.parameters()
#     outer_grad = torch.autograd.grad(outer_loss, reward_net_parameters, create_graph=True, allow_unused=True)
#     for param, grad in zip(reward_net_parameters, outer_grad):
#         param.data -= beta * grad
    
#     return outer_loss.item()

# # Define the preference loss function
# def preference_loss(reward_net, preference_predictor, task_data):
#     loss = 0
#     for task in task_data:
#         states1, actions1, states2, actions2, labels = task
#         predicted_preferences = preference_predictor(states1, actions1, states2, actions2)
#         loss += -torch.mean(labels * torch.log(predicted_preferences + 1e-8) + (1 - labels) * torch.log(1 - predicted_preferences + 1e-8))
#     return loss


# # Parameters
# state_dim = 39
# action_dim = 4
# batch_size = 32
# num_inner_updates = 5
# alpha = 0.01  # Inner learning rate
# beta = 0.001  # Outer learning rate
# num_tasks = 100
# num_epochs = 10

# # Initialize reward function and preference predictor
# reward_net = RewardFunction(state_dim, action_dim)
# preference_predictor = PreferencePredictor(state_dim, action_dim)

# # Training loop
# for epoch in range(num_epochs):
#     total_loss = 0
#     for episode in episodes:
#         # Sample task data
        
#         # MAML update
#         loss = maml_update(reward_net, preference_predictor, episode, alpha, beta)
#         total_loss += loss
    
#     avg_loss = total_loss / num_tasks
#     print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}")


In [9]:
# episodes[0]

In [10]:
# import torch
# import torch.nn as nn
# import torch.optim as optim

# # Define your neural network model
# class Net(nn.Module):
#     def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
#         super(Net, self).__init__()
#         # Define your network architecture
#         self.fc1 = nn.Linear(input_size, hidden_size1)
#         self.relu1 = nn.ReLU()
#         self.fc2 = nn.Linear(hidden_size1, hidden_size2)
#         self.relu2 = nn.ReLU()
#         self.fc3 = nn.Linear(hidden_size2, output_size)
        
#     def forward(self, x):
#         x = self.relu1(self.fc1(x))
#         x = self.relu2(self.fc2(x))
#         x = self.fc3(x)
#         return x


In [11]:
# # Define MAML class
# class MAML:
#     def __init__(self, model, lr_inner=0.01, lr_outer=0.001):
#         self.model = model
#         self.lr_inner = lr_inner
#         self.lr_outer = lr_outer
#         self.optimizer_outer = optim.Adam(self.model.parameters(), lr=lr_outer)
        
#     def inner_loop(self, S1, S2, y):
#         # Inner loop training (fine-tuning)
#         criterion = nn.CrossEntropyLoss()
#         optimizer_inner = optim.SGD(self.model.parameters(), lr=self.lr_inner)
        
#         # Convert DataFrames to tensors
#         S1_input = torch.tensor(S1[[0, 1, 2]].values, dtype=torch.float32)
#         S2_input = torch.tensor(S2[[0, 1, 2]].values, dtype=torch.float32)
#         y_tensor = torch.tensor(y)
        
#         for _ in range(num_inner_updates=5):
#             # Concatenate S1 and S2 tensors
#             x = torch.cat([S1_input, S2_input], dim=0)
            
#             # Forward pass
#             logits = self.model(x)
            
#             # Compute loss
#             loss = criterion(logits, y_tensor)
            
#             # Backward pass
#             optimizer_inner.zero_grad()
#             loss.backward()
#             optimizer_inner.step()
    
#     def outer_loop(self, episodes, num_inner_updates=5):
#         # Outer loop meta-training
#         criterion = nn.CrossEntropyLoss()
#         for episode in episodes:
#             for S1, S2, y, _ in episode:
#                 # Inner loop (fine-tuning)
#                 self.inner_loop(S1, S2, y)
                
#                 # Convert DataFrames to tensors
#                 S1_input = torch.tensor(S1[[0, 1, 2]].values, dtype=torch.float32)
#                 S2_input = torch.tensor(S2[[0, 1, 2]].values, dtype=torch.float32)
#                 y_tensor = torch.tensor(y)
                
#                 # Concatenate S1 and S2 tensors
#                 x = torch.cat([S1_input, S2_input], dim=0)
                
#                 # Compute loss and update outer loop parameters
#                 logits = self.model(x)
#                 loss = criterion(logits, y_tensor)
#                 self.optimizer_outer.zero_grad()
#                 loss.backward()
#                 self.optimizer_outer.step()



In [12]:
# # Example usage
# input_size =  39# Define your input size
# hidden_size1 = 64  # Define your first hidden size
# hidden_size2 = 32  # Define your second hidden size
# output_size = 2  # Since you have binary classification
# model = Net(input_size, hidden_size1, hidden_size2, output_size)
# maml = MAML(model)

# # Assuming you have episodes as described in your scenario
# # Define your episodes
# num_inner_updates = 5  # Number of inner loop updates
# maml.outer_loop(episodes, num_inner_updates)

In [13]:
# # Example testing
# def test(model, test_data):
#     criterion = nn.CrossEntropyLoss()
#     correct = 0
#     total = 0
#     with torch.no_grad():
#         for S1, S2, y, _ in test_data:
#             x = torch.cat([S1, S2], dim=0)  # Assuming S1 and S2 are tensors
#             y = torch.tensor(y)
#             outputs = model(x)
#             _, predicted = torch.max(outputs.data, 1)
#             total += y.size(0)
#             correct += (predicted == y).sum().item()
#     accuracy = 100 * correct / total
#     print('Accuracy of the network on the test data: %d %%' % accuracy)

# # Assuming you have test data
# test_data = [...]  # Define your test data
# test(model, test_data)