In [3]:
import torch
import sys
sys.path.insert(0,'..')
from copg_optim import CoPG
from torch.distributions import Categorical
import numpy as np
from bilineargame.bilinear_game import bilinear

from bilineargame.network import policy_g as policy

%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

## Competitive Policy Gradient (CoPG)

In [4]:
# Create one policy per player for the bilinear game and print every
# initial parameter (player 1 first, then player 2).
p1 = policy()
p2 = policy()
for agent_policy in (p1, p2):
    for param in agent_policy.parameters():
        print(param)

Parameter containing:
tensor([1.], requires_grad=True)
Parameter containing:
tensor([0.2000], requires_grad=True)
Parameter containing:
tensor([1.], requires_grad=True)
Parameter containing:
tensor([0.2000], requires_grad=True)


In [5]:
# Build the CoPG optimizer over both players' parameters, then the game environment.
optim = CoPG(
    p1.parameters(),
    p2.parameters(),
    lr=0.05,
)
env = bilinear()

In [6]:
# Training-run configuration.
batch_size = 2_000  # environment steps sampled per episode
num_episode = 400   # episodes; each episode ends with one optimizer update

In [9]:
### Competitive Policy Gradient (CoPG) training on the bilinear game
# Fresh policies, optimizer and environment are created here so that re-running
# this cell restarts training from the initial parameters. The cell still reads
# batch_size and num_episode from the configuration cell.
p1 = policy()
p2 = policy()
optim = CoPG(p1.parameters(), p2.parameters(), lr=0.1)
env = bilinear()

fig = plt.figure()
ax = fig.add_subplot(111)
plt.ion()

fig.show()
fig.canvas.draw()

# Per-episode mean of each player's sampled actions (plain floats).
# These histories are also read by the GDA cell to draw the comparison plot.
mat_ag1_cpg = []
mat_ag2_cpg = []

for t_eps in range(num_episode):
    mat_action = []
    mat_reward1 = []
    env.reset()

    # Data collection: sample one action per player, then step the environment.
    # Only the joint action and player 1's reward are needed for the update.
    for i in range(batch_size):
        action1 = p1().sample()
        action2 = p2().sample()
        action = np.array([action1, action2])
        mat_action.append(torch.FloatTensor(action))

        _, reward1, _, _, _ = env.step(action)
        mat_reward1.append(torch.FloatTensor([reward1]))

    # Stack once and reuse for both the statistics and the log-probabilities.
    action_both = torch.stack(mat_action)
    # .item() stores Python floats so np.array(...) below builds a numeric array
    # directly instead of converting a list of 0-d tensors.
    mat_ag1_cpg.append(torch.mean(action_both[:, 0]).item())
    mat_ag2_cpg.append(torch.mean(action_both[:, 1]).item())

    val1_p = torch.stack(mat_reward1).transpose(0, 1)
    if val1_p.size(0) != 1:
        # A real exception type: raising a bare string is itself a TypeError
        # in Python 3 and would hide the actual shape problem.
        raise ValueError(
            'expected val1_p.size(0) == 1, got shape {}'.format(tuple(val1_p.shape))
        )

    # Log-probabilities of the sampled actions under the current policies.
    log_probs1 = p1().log_prob(action_both[:, 0])
    log_probs2 = p2().log_prob(action_both[:, 1])

    objective = log_probs1 * log_probs2 * val1_p
    if objective.size(0) != 1:
        raise ValueError(
            'expected objective.size(0) == 1, got shape {}'.format(tuple(objective.shape))
        )

    # The three scalars consumed by the CoPG optimizer step.
    ob = objective.mean()
    lp1 = (log_probs1 * val1_p).mean()
    lp2 = (log_probs2 * val1_p).mean()

    optim.zero_grad()
    optim.step(ob, lp1, lp2)

    # Redraw the learning curves on the explicit axes object.
    ax.clear()
    ax.set_title("Agent's policy in bilinear game")
    ax.set_xlabel('$Iterations$')
    ax.set_ylabel(r'$\mu$')  # raw string: '\m' is not a valid escape sequence
    ax.plot(np.array(mat_ag1_cpg), label='player 1')
    ax.plot(np.array(mat_ag2_cpg), label='player 2')
    ax.legend(loc='upper left')
    fig.canvas.draw()


<IPython.core.display.Javascript object>

### The plot above shows the convergence of both agents' policies under Competitive Policy Gradient (CoPG)

## Gradient Descent Ascent (GDA)

In [11]:
# Gradient Descent Ascent (GDA) baseline on the same bilinear game.
# Each player gets an independent SGD optimizer. batch_size and num_episode come
# from the configuration cell, and the CoPG histories (mat_ag1_cpg / mat_ag2_cpg)
# from the CoPG cell are overlaid in the plot, so run those cells first.
p1 = policy()
p2 = policy()
for p in p1.parameters():
    print(p)
optim_p1 = torch.optim.SGD(p1.parameters(), lr=0.05)
optim_p2 = torch.optim.SGD(p2.parameters(), lr=0.05)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.ion()

env = bilinear()
# Per-episode mean of each player's sampled actions (plain floats).
mat_ag1_gda = []
mat_ag2_gda = []

for t_eps in range(num_episode):
    mat_action = []
    mat_reward1 = []
    env.reset()

    # Data collection: sample one action per player, then step the environment.
    # Only the joint action and player 1's reward are needed for the updates.
    for i in range(batch_size):
        action1 = p1().sample()
        action2 = p2().sample()
        action = np.array([action1, action2])
        mat_action.append(torch.FloatTensor(action))

        _, reward1, _, _, _ = env.step(action)
        mat_reward1.append(torch.FloatTensor([reward1]))

    action_both = torch.stack(mat_action)
    # .item() stores Python floats so np.array(...) below builds a numeric array
    # directly instead of converting a list of 0-d tensors.
    mat_ag1_gda.append(torch.mean(action_both[:, 0]).item())
    mat_ag2_gda.append(torch.mean(action_both[:, 1]).item())

    val1_p = torch.stack(mat_reward1).transpose(0, 1)

    # Player 1: SGD step on the surrogate loss -log_prob * reward.
    log_probs1 = p1().log_prob(action_both[:, 0])
    ob = (-log_probs1 * val1_p).mean()
    optim_p1.zero_grad()
    ob.backward()
    optim_p1.step()

    # Player 2: same update, but on the negated player-1 reward.
    val2_p = -val1_p
    log_probs2 = p2().log_prob(action_both[:, 1])
    ob2 = (-log_probs2 * val2_p).mean()
    optim_p2.zero_grad()
    ob2.backward()
    optim_p2.step()

    # Redraw both trajectories in the (mu^1, mu^2) plane.
    ax.clear()
    ax.plot(np.array(mat_ag1_gda), np.array(mat_ag2_gda), label='GDA')
    ax.plot(np.array(mat_ag1_cpg), np.array(mat_ag2_cpg), label='CoPG')
    ax.set_title("Agent's policy in bilinear game")
    ax.set_xlabel(r'$\mu^1$')  # raw strings: '\m' is not a valid escape sequence
    ax.set_ylabel(r'$\mu^2$')
    ax.legend(loc='upper left')
    fig.canvas.draw()

Parameter containing:
tensor([1.], requires_grad=True)
Parameter containing:
tensor([0.2000], requires_grad=True)


<IPython.core.display.Javascript object>

In the above plot, both CoPG and GDA start at (1,1). GDA diverges, whereas CoPG converges to the unique Nash equilibrium (0,0).