In [1]:
import gym, os
from itertools import count, product
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import json
import matplotlib.pyplot as plt
import argparse
import warnings



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config_file = "/home/raghuram/Prateek_Codes/Final Off-Policy Experiments/MountainCar-Shaped/AC_TD0/train_config.json"

with open(config_file) as json_file:
    config = json.load(json_file)

def video_callable(episode_number):
    return episode_number%config['recording_frequency'] == 0

env = gym.make(config['env'])
ratio_estimation_env = gym.make(config['env'])

if config['record']:
    env = gym.wrappers.Monitor(env, config['recording_path'], force = True, video_callable=video_callable)

if config['episode_length'] is not None:
    env._max_episode_steps = config['episode_length']
    ratio_estimation_env._max_episode_steps = config['episode_length']

if config['numpy_seed'] is not None:
    np.random.seed(config['numpy_seed'])

if config['environment_seed'] is not None:
    env.seed(config['environment_seed'])
    ratio_estimation_env.seed(config['environment_seed'])

if config['pytorch_seed'] is not None:
    torch.manual_seed(config['pytorch_seed'])

state_size = env.observation_space.shape[0]
action_size = env.action_space.n
actor_h_layers_sizes = config['actor']['hidden_layer_neurons']
critic_h_layers_sizes = config['critic']['hidden_layer_neurons']
w_h_layers_sizes = config['w']['hidden_layer_neurons']
y_h_layers_sizes = config['y']['hidden_layer_neurons']
gamma = config['gamma']
lr_A = config['actor']['learning_rate']
lr_C = config['critic']['learning_rate']
lr_W = config['w']['learning_rate']
lr_Y = config['y']['learning_rate']
load_A = config['actor']['load']
load_C = config['critic']['load']
load_W = config['w']['load']
load_Y = config['y']['load']
random_behaviour = config['random_behaviour']
iterations = config['iterations']
estimation_samples = config['estimation_samples']

class Actor(nn.Module):
    def __init__(self, input_size, h_layers_sizes, output_size):
        super(Actor, self).__init__()
        self.input_size = input_size
        self.h_layers_sizes = h_layers_sizes
        self.all_layers_sizes = [input_size] + h_layers_sizes + [output_size]
        self.output_size = output_size
        self.linears = nn.ModuleList([nn.Linear(self.all_layers_sizes[i], self.all_layers_sizes[i+1], bias=False) for i in range(len(self.all_layers_sizes)-1)])

    def forward(self, state):
        output = torch.tanh(self.linears[0](state))
        for i in range(1,len(self.linears)-1):
            output = torch.tanh(self.linears[i](output))
        output = self.linears[-1](output)
        distribution = Categorical(F.softmax(output, dim=-1))
        return distribution


class Critic(nn.Module):
    def __init__(self, input_size, h_layers_sizes, output_size):
        super(Critic, self).__init__()
        self.input_size = input_size
        self.h_layers_sizes = h_layers_sizes
        self.all_layers_sizes = [input_size] + h_layers_sizes + [output_size]
        self.output_size = output_size
        self.linears = nn.ModuleList([nn.Linear(self.all_layers_sizes[i], self.all_layers_sizes[i+1]) for i in range(len(self.all_layers_sizes)-1)])

    def forward(self, state):
        value = F.relu(self.linears[0](state))
        for i in range(1,len(self.linears)-1):
            value = F.relu(self.linears[i](value))
        value = self.linears[-1](value)
        return value

class W(nn.Module):
    def __init__(self, input_size, h_layers_sizes, output_size):
        super(W, self).__init__()
        self.input_size = input_size
        self.h_layers_sizes = h_layers_sizes
        self.all_layers_sizes = [input_size] + h_layers_sizes + [output_size]
        self.output_size = output_size
        self.linears = nn.ModuleList([nn.Linear(self.all_layers_sizes[i], self.all_layers_sizes[i+1]) for i in range(len(self.all_layers_sizes)-1)])

    def forward(self, state):
        value = F.relu(self.linears[0](state))
        for i in range(1,len(self.linears)-1):
            value = F.relu(self.linears[i](value))
        value = torch.exp(self.linears[-1](value))
        return value

class Y(nn.Module):
    def __init__(self, input_size, h_layers_sizes, output_size):
        super(Y, self).__init__()
        self.input_size = input_size
        self.h_layers_sizes = h_layers_sizes
        self.all_layers_sizes = [input_size] + h_layers_sizes + [output_size]
        self.output_size = output_size
        self.linears = nn.ModuleList([nn.Linear(self.all_layers_sizes[i], self.all_layers_sizes[i+1]) for i in range(len(self.all_layers_sizes)-1)])

    def forward(self, state):
        value = F.relu(self.linears[0](state))
        for i in range(1,len(self.linears)-1):
            value = F.relu(self.linears[i](value))
        value = torch.exp(self.linears[-1](value))
        return value

def kernel(state1, state2, l = 1):
    d = torch.norm(state1 - state2)
    return torch.exp(-((d**2)/(2*(l**2))))

def estimate_W(target_policy, w, optimizerW, random_behaviour = True, behaviour_policy = None, iterations = 5000, samples = 1000):

    for iter in range(iterations):
        w_state1_list = []
        w_state2_list = []
        w_next_state1_list = []
        w_next_state2_list = []
        beta1_list = []
        beta2_list = []
        kernel_value_list = []
        z_w_state = 0

        training_data = []
        initial_state = ratio_estimation_env.reset()
        initial_state = torch.FloatTensor(initial_state).to(device)
        state = initial_state

        for i in range(samples):
            
            if random_behaviour == False:
                dist_behaviour = behaviour_policy(state)
                action = dist_behaviour.sample()
            else:
                action = torch.randint(0, action_size, (1,)).to(device)
                action = torch.squeeze(action)

            next_state, reward, done, _ = ratio_estimation_env.step(action.cpu().numpy())

            next_state = torch.FloatTensor(next_state).to(device)

            dist_target = target_policy(state)

            if random_behaviour == False:
                beta = (dist_target.probs[action]/dist_behaviour.probs[action]).detach()
            else:
                beta = dist_target.probs[action].detach()*action_size
            
            training_data.append([state,beta,next_state])

            state = next_state

            if done:
                break

        batch = [[None, None, initial_state]]

        for i in range(len(training_data)):
            d = np.random.uniform()
            if d < gamma**(i+1):
                batch.append(training_data[i])

        pairs = list(product(batch, repeat=2))

        for pair in pairs:
            sample1 = pair[0]
            sample2 = pair[1]

            if sample1[0] != None:
                w_state1 = w(sample1[0])
            else:
                w_state1 = None

            if sample2[0] != None:
                w_state2 = w(sample2[0])
            else:
                w_state2 = None

            beta1 = sample1[1]
            beta2 = sample2[1]

            w_next_state1 = w(sample1[2])
            w_next_state2 = w(sample2[2])

            kernel_value = kernel(sample1[2], sample2[2])

            w_state1_list.append(w_state1)
            w_state2_list.append(w_state2)
            w_next_state1_list.append(w_next_state1)
            w_next_state2_list.append(w_next_state2)
            beta1_list.append(beta1)
            beta2_list.append(beta2)
            kernel_value_list.append(kernel_value)

        for sample in batch[1:]:
            w_state = w(sample[0])
            z_w_state += w_state

        z_w_state /= len(batch)

        w_loss = 0

        for i in range(len(pairs)):
            if w_state1_list[i] == None and w_state2_list[i] == None:
                w_loss += (1 - (w_next_state1_list[i]/z_w_state))*(1 - (w_next_state2_list[i]/z_w_state))*kernel_value_list[i]

            elif w_state1_list[i] == None:
                w_loss += (1 - (w_next_state1_list[i]/z_w_state))*(beta2_list[i]*(w_state2_list[i]/z_w_state) - (w_next_state2_list[i]/z_w_state))*kernel_value_list[i]

            elif w_state2_list[i] == None:
                w_loss += (beta1_list[i]*(w_state1_list[i]/z_w_state) - (w_next_state1_list[i]/z_w_state))*(1 - (w_next_state2_list[i]/z_w_state))*kernel_value_list[i]

            else:
                w_loss += (beta1_list[i]*(w_state1_list[i]/z_w_state) - (w_next_state1_list[i]/z_w_state))*(beta2_list[i]*(w_state2_list[i]/z_w_state) - (w_next_state2_list[i]/z_w_state))*kernel_value_list[i]

        w_loss /= len(batch)
        optimizerW.zero_grad()
        w_loss.backward()
        optimizerW.step()
        optimizerW.zero_grad()

def estimate_Y(target_policy, y, optimizerY, random_behaviour = True, behaviour_policy = None, iterations = 5000, samples = 1000):

    for iter in range(iterations):
        y_state1_list = []
        y_state2_list = []
        y_next_state1_list = []
        y_next_state2_list = []
        beta1_list = []
        beta2_list = []
        kernel_value_list = []
        z_y_state = 0

        training_data = []
        state = ratio_estimation_env.reset()
        state = torch.FloatTensor(state).to(device)

        for i in range(samples):
            
            if random_behaviour == False:
                dist_behaviour = behaviour_policy(state)
                action = dist_behaviour.sample()
            else:
                action = torch.randint(0, action_size, (1,)).to(device)
                action = torch.squeeze(action)

            next_state, reward, done, _ = ratio_estimation_env.step(action.cpu().numpy())

            next_state = torch.FloatTensor(next_state).to(device)

            dist_target = target_policy(state)

            if random_behaviour == False:
                beta = (dist_target.probs[action]/dist_behaviour.probs[action]).detach()
            else:
                beta = dist_target.probs[action].detach()*action_size
            
            training_data.append([state,beta,next_state])

            state = next_state

            if done:
                break

        pairs = list(product(training_data, repeat=2))

        for pair in pairs:
            sample1 = pair[0]
            sample2 = pair[1]

            y_state1 = y(sample1[0])
            y_state2 = y(sample2[0])

            beta1 = sample1[1]
            beta2 = sample2[1]

            y_next_state1 = y(sample1[2])
            y_next_state2 = y(sample2[2])

            kernel_value = kernel(sample1[2], sample2[2])

            y_state1_list.append(y_state1)
            y_state2_list.append(y_state2)
            y_next_state1_list.append(y_next_state1)
            y_next_state2_list.append(y_next_state2)
            beta1_list.append(beta1)
            beta2_list.append(beta2)
            kernel_value_list.append(kernel_value)

        for sample in training_data:
            y_state = y(sample[0])
            z_y_state += y_state

        z_y_state /= len(training_data)

        y_loss = 0

        for i in range(len(pairs)):
            y_loss += (beta1_list[i]*(y_state1_list[i]/z_y_state) - (y_next_state1_list[i]/z_y_state))*(beta2_list[i]*(y_state2_list[i]/z_y_state) - (y_next_state2_list[i]/z_y_state))*kernel_value_list[i]

        y_loss /= len(training_data)
        optimizerY.zero_grad()
        y_loss.backward()
        optimizerY.step()
        optimizerY.zero_grad()

def lr_scheduler(optimizerA, optimizerC, total_reward):
    for schedule in config['learning_rate_scheduler']['schedule']:
        if total_reward >= schedule[0][0] and total_reward < schedule[0][1]:
            optimizerA.param_groups[0]['lr'] = schedule[1]['lr_A']
            optimizerC.param_groups[0]['lr'] = schedule[1]['lr_C']

def evaluate_policy(actor):
    state = env.reset()
    state = torch.FloatTensor(state).to(device)
    total_reward = 0

    for i in count():
        if config['render']:
            env.render()

        dist = actor(state)
        action = dist.sample()

        next_state, reward, done, _ = env.step(action.cpu().numpy())
        total_reward += reward

        next_state = torch.FloatTensor(next_state).to(device)
        state = next_state

        if done:
            break

    return total_reward

def trainIters(actor, critic, w, y, random_behaviour = True, behaviour_policy = None, n_iters = 5000):
    optimizerA = optim.Adam(actor.parameters(), lr=lr_A)
    optimizerC = optim.Adam(critic.parameters(), lr=lr_C)
    optimizerW = optim.Adam(w.parameters(), lr=lr_W)
    optimizerY = optim.Adam(y.parameters(), lr=lr_Y)
    running_total_reward = 0
    max_running_total_reward = -float('inf')
    reward_list = []

    for iter in range(n_iters):
        state = env.reset()
        state = torch.FloatTensor(state).to(device)
        estimate_W(actor, w, optimizerW, random_behaviour, behaviour_policy, iterations = 1, samples = estimation_samples)
        estimate_Y(actor, y, optimizerY, random_behaviour, behaviour_policy, iterations = 1, samples = estimation_samples)

        for i in count():
            if config['render']:
                env.render()
            
            if random_behaviour == False:
                dist_behaviour = behaviour_policy(state)
                action = dist_behaviour.sample()
            else:
                action = torch.randint(0, action_size, (1,)).to(device)
                action = torch.squeeze(action)

            dist_target = actor(state)

            next_state, reward, done, _ = env.step(action.cpu().numpy())
            
            if next_state[0] > 0.1 and next_state[0] < 0.4:
                reward += 10
            elif next_state[0] >= 0.4 and next_state[0] < 0.6:
                reward += 20
            if done == True and i < config['episode_length']-1:
                reward += 1000

            if random_behaviour == False:
                beta = (dist_target.probs[action]/dist_behaviour.probs[action]).detach()
            else:
                beta = dist_target.probs[action].detach()*action_size

            log_prob = dist_target.log_prob(action).unsqueeze(0)
            
            value = critic(state)

            next_state = torch.FloatTensor(next_state).to(device)
            next_value = critic(next_state)

            if done:                
                error = reward - value
                critic_loss = error.pow(2)
                optimizerC.zero_grad()
                critic_loss.backward()
                y_ratio = y(state).detach()
                for j in range(len(critic.all_layers_sizes)-1):
                    critic.linears[j].weight.grad = y_ratio*beta*critic.linears[j].weight.grad
                optimizerC.step()
                optimizerC.zero_grad()

                value = critic(state)
                next_value = critic(next_state)
                advantage = reward - value
                actor_loss = -log_prob * advantage.detach()
                optimizerA.zero_grad()
                actor_loss.backward()
                w_ratio = w(state).detach()
                for j in range(len(actor.all_layers_sizes)-1):
                    actor.linears[j].weight.grad = w_ratio*beta*actor.linears[j].weight.grad
                optimizerA.step()
                optimizerA.zero_grad()

                total_reward = evaluate_policy(actor)

                running_total_reward = total_reward if running_total_reward == 0 else running_total_reward * 0.9 + total_reward * 0.1
                print('Iteration: {}, Current Total Reward: {}, Running Total Reward: {}'.format(iter, total_reward, round(running_total_reward,2)))

                reward_list.append(running_total_reward)

                if max_running_total_reward <= running_total_reward:
                    torch.save(actor, config['actor']['final_save_path'])
                    torch.save(critic, config['critic']['final_save_path'])
                    max_running_total_reward = running_total_reward

                if config['learning_rate_scheduler']['required']:
                    lr_scheduler(optimizerA, optimizerC, max_running_total_reward)

                break
            else:
                error = reward + gamma * next_value.detach() - value
                critic_loss = error.pow(2)
                optimizerC.zero_grad()
                critic_loss.backward()
                y_ratio = y(state).detach()
                for j in range(len(critic.all_layers_sizes)-1):
                    critic.linears[j].weight.grad = y_ratio*beta*critic.linears[j].weight.grad
                optimizerC.step()
                optimizerC.zero_grad()

                value = critic(state)
                next_value = critic(next_state)
                advantage = reward + gamma * next_value.detach() - value
                actor_loss = -log_prob * advantage.detach()
                optimizerA.zero_grad()
                actor_loss.backward()
                w_ratio = w(state).detach()
                for j in range(len(actor.all_layers_sizes)-1):
                    actor.linears[j].weight.grad = w_ratio*beta*actor.linears[j].weight.grad
                optimizerA.step()
                optimizerA.zero_grad()

                state = next_state

    env.close()
    ratio_estimation_env.close()
    with open(config['rewards_path'], 'w') as fp:
        json.dump(reward_list, fp, indent=4)


if __name__ == '__main__':
    if load_A:
        path_A = config['actor']['load_path']
        actor = torch.load(path_A).to(device)
        print('Actor Model loaded')
    else:
        actor = Actor(state_size, actor_h_layers_sizes, action_size).to(device)
        torch.save(actor, config['actor']['initial_save_path'])
    
    if load_C:
        path_C = config['critic']['load_path']
        critic = torch.load(path_C).to(device)
        print('Critic Model loaded')
    else:    
        critic = Critic(state_size, critic_h_layers_sizes, 1).to(device)
        torch.save(critic, config['critic']['initial_save_path'])

    if load_W:
        path_W = config['w']['load_path']
        w = torch.load(path_W).to(device)
        print('W Model loaded')
    else:    
        w = W(state_size, w_h_layers_sizes, 1).to(device)
        torch.save(w, config['w']['initial_save_path'])

    if load_Y:
        path_Y = config['y']['load_path']
        y = torch.load(path_Y).to(device)
        print('Y Model loaded')
    else:    
        y = Y(state_size, y_h_layers_sizes, 1).to(device)
        torch.save(y, config['y']['initial_save_path'])

    if random_behaviour:
        behaviour_policy = None
    else:
        behaviour_policy = torch.load(config['behaviour_policy_path']).to(device)
        print('Behaviour Policy loaded')

    trainIters(actor, critic, w, y, random_behaviour, behaviour_policy, n_iters=iterations)

  return torch._C._cuda_getDeviceCount() > 0


Iteration: 0, Current Total Reward: -10000.0, Running Total Reward: -10000.0
Iteration: 1, Current Total Reward: -10000.0, Running Total Reward: -10000.0
Iteration: 2, Current Total Reward: -10000.0, Running Total Reward: -10000.0
Iteration: 3, Current Total Reward: -10000.0, Running Total Reward: -10000.0
Iteration: 4, Current Total Reward: -4343.0, Running Total Reward: -9434.3
Iteration: 5, Current Total Reward: -10000.0, Running Total Reward: -9490.87
Iteration: 6, Current Total Reward: -4419.0, Running Total Reward: -8983.68
Iteration: 7, Current Total Reward: -10000.0, Running Total Reward: -9085.31
Iteration: 8, Current Total Reward: -10000.0, Running Total Reward: -9176.78
Iteration: 9, Current Total Reward: -10000.0, Running Total Reward: -9259.1
Iteration: 10, Current Total Reward: -1835.0, Running Total Reward: -8516.69
Iteration: 11, Current Total Reward: -3741.0, Running Total Reward: -8039.12
Iteration: 12, Current Total Reward: -6957.0, Running Total Reward: -7930.91
Ite

Iteration: 107, Current Total Reward: -3542.0, Running Total Reward: -3174.65
Iteration: 108, Current Total Reward: -2332.0, Running Total Reward: -3090.38
Iteration: 109, Current Total Reward: -1756.0, Running Total Reward: -2956.95
Iteration: 110, Current Total Reward: -1871.0, Running Total Reward: -2848.35
Iteration: 111, Current Total Reward: -4405.0, Running Total Reward: -3004.02
Iteration: 112, Current Total Reward: -1432.0, Running Total Reward: -2846.81
Iteration: 113, Current Total Reward: -2338.0, Running Total Reward: -2795.93
Iteration: 114, Current Total Reward: -4884.0, Running Total Reward: -3004.74
Iteration: 115, Current Total Reward: -4062.0, Running Total Reward: -3110.47
Iteration: 116, Current Total Reward: -8776.0, Running Total Reward: -3677.02
Iteration: 117, Current Total Reward: -4841.0, Running Total Reward: -3793.42
Iteration: 118, Current Total Reward: -4614.0, Running Total Reward: -3875.48
Iteration: 119, Current Total Reward: -1973.0, Running Total Rew

Iteration: 213, Current Total Reward: -2452.0, Running Total Reward: -3064.08
Iteration: 214, Current Total Reward: -2572.0, Running Total Reward: -3014.87
Iteration: 215, Current Total Reward: -3089.0, Running Total Reward: -3022.28
Iteration: 216, Current Total Reward: -1600.0, Running Total Reward: -2880.06
Iteration: 217, Current Total Reward: -5243.0, Running Total Reward: -3116.35
Iteration: 218, Current Total Reward: -3812.0, Running Total Reward: -3185.92
Iteration: 219, Current Total Reward: -2150.0, Running Total Reward: -3082.32
Iteration: 220, Current Total Reward: -1817.0, Running Total Reward: -2955.79
Iteration: 221, Current Total Reward: -1331.0, Running Total Reward: -2793.31
Iteration: 222, Current Total Reward: -3248.0, Running Total Reward: -2838.78
Iteration: 223, Current Total Reward: -2424.0, Running Total Reward: -2797.3
Iteration: 224, Current Total Reward: -2156.0, Running Total Reward: -2733.17
Iteration: 225, Current Total Reward: -3818.0, Running Total Rewa

Iteration: 319, Current Total Reward: -2400.0, Running Total Reward: -2903.41
Iteration: 320, Current Total Reward: -1369.0, Running Total Reward: -2749.97
Iteration: 321, Current Total Reward: -1452.0, Running Total Reward: -2620.17
Iteration: 322, Current Total Reward: -5172.0, Running Total Reward: -2875.36
Iteration: 323, Current Total Reward: -2530.0, Running Total Reward: -2840.82
Iteration: 324, Current Total Reward: -1691.0, Running Total Reward: -2725.84
Iteration: 325, Current Total Reward: -2478.0, Running Total Reward: -2701.05
Iteration: 326, Current Total Reward: -2030.0, Running Total Reward: -2633.95
Iteration: 327, Current Total Reward: -2292.0, Running Total Reward: -2599.75
Iteration: 328, Current Total Reward: -3353.0, Running Total Reward: -2675.08
Iteration: 329, Current Total Reward: -5784.0, Running Total Reward: -2985.97
Iteration: 330, Current Total Reward: -2656.0, Running Total Reward: -2952.97
Iteration: 331, Current Total Reward: -4060.0, Running Total Rew

Iteration: 425, Current Total Reward: -2196.0, Running Total Reward: -2967.17
Iteration: 426, Current Total Reward: -3130.0, Running Total Reward: -2983.45
Iteration: 427, Current Total Reward: -2053.0, Running Total Reward: -2890.4
Iteration: 428, Current Total Reward: -1186.0, Running Total Reward: -2719.96
Iteration: 429, Current Total Reward: -1272.0, Running Total Reward: -2575.17
Iteration: 430, Current Total Reward: -3940.0, Running Total Reward: -2711.65
Iteration: 431, Current Total Reward: -2793.0, Running Total Reward: -2719.79
Iteration: 432, Current Total Reward: -1548.0, Running Total Reward: -2602.61
Iteration: 433, Current Total Reward: -1808.0, Running Total Reward: -2523.15
Iteration: 434, Current Total Reward: -2046.0, Running Total Reward: -2475.43
Iteration: 435, Current Total Reward: -762.0, Running Total Reward: -2304.09
Iteration: 436, Current Total Reward: -1593.0, Running Total Reward: -2232.98
Iteration: 437, Current Total Reward: -1368.0, Running Total Rewar