In [1]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [2]:
# Function to read expert data

def read_data(datasets_dir="./data", frac = 0.1):
    print("... read data")
    data_file = os.path.join(datasets_dir, 'transitions.pkl')
    data = np.load(data_file, allow_pickle=True)
    
    X = []
    y = []
    
    for i in range(len(data)):
        for j in range(len(data[i])):
            X.append(data[i][j][0])
            y.append(data[i][j][1])

    states = torch.cat(X).to(torch.float32)
    actions = torch.cat(y).to(torch.float32)
    print(states.shape)
    print(actions.shape)    
    return states, actions

In [3]:
# read data 

states, actions = read_data()
print(f"States: {states}, Actions: {actions}")



... read data
torch.Size([20000, 4])
torch.Size([20000, 1])
States: tensor([[-0.0227,  0.0431,  0.0228, -0.0495],
        [-0.0218, -0.1524,  0.0218,  0.2503],
        [-0.0249,  0.0424,  0.0268, -0.0354],
        ...,
        [-0.0908, -0.2321, -0.0044,  0.3062],
        [-0.0954, -0.0369,  0.0017,  0.0121],
        [-0.0962,  0.1581,  0.0020, -0.2801]]), Actions: tensor([[0.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [0.]])


In [4]:
# Visualize the data as panda's data frame

import pandas as pd
data= pd.DataFrame({"X1": states[:, 0], "X2": states[:, 1], "X3": states[:, 2], "X4": states[:, 3], "action": actions.flatten()})


data.head(10)

Unnamed: 0,X1,X2,X3,X4,action
0,-0.022675,0.043077,0.02281,-0.049454,0.0
1,-0.021814,-0.152365,0.021821,0.250337,1.0
2,-0.024861,0.042439,0.026827,-0.035384,1.0
3,-0.024012,0.237166,0.02612,-0.319483,0.0
4,-0.019269,0.041682,0.01973,-0.018679,1.0
5,-0.018435,0.236516,0.019357,-0.305072,0.0
6,-0.013705,0.041123,0.013255,-0.006348,1.0
7,-0.012882,0.236053,0.013128,-0.29482,0.0
8,-0.008161,0.040746,0.007232,0.001975,1.0
9,-0.007346,0.235764,0.007271,-0.288418,0.0


In [5]:
data.action.value_counts()

0.0    10004
1.0     9996
Name: action, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(states, actions, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

torch.Size([16000, 4])
torch.Size([16000, 1])
torch.Size([4000, 4])
torch.Size([4000, 1])


In [7]:
import torch
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"

device

'cuda'

In [8]:
class StateActionModel(nn.Module):

    def __init__(self,
                 input_size: int = 4,
                 hiddenLayer1_size: int = 128,
                 hiddenLayer2_size: int = 128,
                 output_size: int = 1) -> None: 
        
        super(StateActionModel, self).__init__()
        self.hiddenLayer1 = nn.Linear(in_features= input_size,
                             out_features= hiddenLayer1_size)
        self.hiddenLayer2 = nn.Linear(in_features= hiddenLayer1_size,
                             out_features= hiddenLayer2_size)
        self.output = nn.Linear(in_features= hiddenLayer2_size,
                             out_features= output_size)        
    def forward(self, x):
        x = F.relu(self.hiddenLayer1(x))
        x = F.relu(self.hiddenLayer2(x))
        x = self.output(x)
        return x

In [9]:
model = StateActionModel().to(device)
model

StateActionModel(
  (hiddenLayer1): Linear(in_features=4, out_features=128, bias=True)
  (hiddenLayer2): Linear(in_features=128, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
)

In [10]:
# Make predictions with the model
untrained_preds = model(X_test.to(device))
print(f"Length of predictions: {len(untrained_preds)}, Shape: {untrained_preds.shape}")
print(f"Length of test samples: {len(y_test)}, Shape: {y_test.shape}")
print(f"\nFirst 10 predictions:\n{untrained_preds[:10]}")
print(f"\nFirst 10 test labels:\n{y_test[:10]}")

Length of predictions: 4000, Shape: torch.Size([4000, 1])
Length of test samples: 4000, Shape: torch.Size([4000, 1])

First 10 predictions:
tensor([[-0.0623],
        [-0.0547],
        [-0.0653],
        [-0.0640],
        [-0.0608],
        [-0.0794],
        [-0.0617],
        [-0.0611],
        [-0.0679],
        [-0.0590]], device='cuda:0', grad_fn=<SliceBackward0>)

First 10 test labels:
tensor([[1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]])


In [11]:
# Create a loss function
# loss_fn = nn.BCELoss() # BCELoss = no sigmoid built-in
loss_fn = nn.BCEWithLogitsLoss() # BCEWithLogitsLoss = sigmoid built-in

# Create an optimizer
optimizer = torch.optim.SGD(params=model.parameters(), 
                            lr=0.001)

In [12]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

In [13]:
# View the frist 5 outputs of the forward pass on the test data
y_logits = model(X_test.to(device))
y_logits

tensor([[-0.0623],
        [-0.0547],
        [-0.0653],
        ...,
        [-0.1163],
        [-0.0594],
        [-0.0664]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [14]:
# Find the predicted labels (round the prediction probabilities)
y_preds = torch.round(y_logits)
print(f"predictions: {y_preds}")

# In full
y_pred_labels = torch.round((model(X_test.to(device))))

# Check for equality
print(torch.eq(y_preds.squeeze(), y_pred_labels.squeeze()))

# Get rid of extra dimension
y_preds.squeeze()

predictions: tensor([[-0.],
        [-0.],
        [-0.],
        ...,
        [-0.],
        [-0.],
        [-0.]], device='cuda:0', grad_fn=<RoundBackward0>)
tensor([True, True, True,  ..., True, True, True], device='cuda:0')


tensor([-0., -0., -0.,  ..., -0., -0., -0.], device='cuda:0',
       grad_fn=<SqueezeBackward0>)

In [15]:
y_test

tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [0.],
        [1.]])

In [19]:
torch.manual_seed(42)

epochs = 10000 # Train for longer

# Put data to target device
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

model.train()
for epoch in range(epochs):
    ### Training
    # 1. Forward pass
    y_logits = model(X_train)
    y_pred = torch.round(torch.sigmoid(y_logits)) # logits -> predicition probabilities -> prediction labels

    # 2. Calculate loss/accuracy
    loss = loss_fn(y_logits, y_train)
    acc = accuracy_fn(y_true=y_train, 
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model(X_test)
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Caculate loss/accuracy
        test_loss = loss_fn(test_logits,
                            y_test)
        test_acc = accuracy_fn(y_true=y_test,
                               y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 100 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")


Epoch: 0 | Loss: 0.41969, Accuracy: 80.16% | Test loss: 0.42026, Test acc: 80.47%
Epoch: 100 | Loss: 0.41758, Accuracy: 80.15% | Test loss: 0.41814, Test acc: 80.50%
Epoch: 200 | Loss: 0.41553, Accuracy: 80.12% | Test loss: 0.41608, Test acc: 80.53%
Epoch: 300 | Loss: 0.41353, Accuracy: 80.13% | Test loss: 0.41406, Test acc: 80.53%
Epoch: 400 | Loss: 0.41157, Accuracy: 80.12% | Test loss: 0.41210, Test acc: 80.55%
Epoch: 500 | Loss: 0.40967, Accuracy: 80.09% | Test loss: 0.41019, Test acc: 80.55%
Epoch: 600 | Loss: 0.40782, Accuracy: 80.10% | Test loss: 0.40833, Test acc: 80.47%
Epoch: 700 | Loss: 0.40602, Accuracy: 80.12% | Test loss: 0.40652, Test acc: 80.42%
Epoch: 800 | Loss: 0.40426, Accuracy: 80.12% | Test loss: 0.40476, Test acc: 80.42%
Epoch: 900 | Loss: 0.40255, Accuracy: 80.12% | Test loss: 0.40304, Test acc: 80.42%
Epoch: 1000 | Loss: 0.40089, Accuracy: 80.11% | Test loss: 0.40137, Test acc: 80.42%
Epoch: 1100 | Loss: 0.39927, Accuracy: 80.16% | Test loss: 0.39975, Test acc:

Epoch: 9700 | Loss: 0.34576, Accuracy: 80.24% | Test loss: 0.34604, Test acc: 80.60%
Epoch: 9800 | Loss: 0.34558, Accuracy: 80.25% | Test loss: 0.34586, Test acc: 80.60%
Epoch: 9900 | Loss: 0.34540, Accuracy: 80.25% | Test loss: 0.34568, Test acc: 80.58%


In [17]:
torch.save(model.state_dict(), os.path.join("model_0.pkl"))
print("Model saved in file")

Model saved in file


In [20]:


from __future__ import print_function

from datetime import datetime
import numpy as np
import gym
import os
import json
import torch


def run_episode(env, agent, rendering=True, max_timesteps=1000):
    
    episode_reward = 0
    step = 0

    state = env.reset()
    while True:

#         get action
        agent.eval()
        tensor_state = torch.from_numpy(state)
        tensor_action = agent(tensor_state)
        a = torch.round(torch.sigmoid(tensor_action)).squeeze().to(torch.int).detach().numpy()
#         a = np.random.randint(0, 1)
        next_state, r, done, info = env.step(a)   
        episode_reward += r       
        state = next_state
        step += 1
        
        if rendering:
            env.render()

        if done or step > max_timesteps: 
            break

    return episode_reward


if __name__ == "__main__":

    # important: don't set rendering to False for evaluation (you may get corrupted state images from gym)
    rendering = True                      
    
    n_test_episodes = 100                  # number of episodes to test

    # load agent
    agent = StateActionModel()
    agent.load_state_dict(torch.load("model_0.pkl"))

    env = gym.make('CartPole-v1').unwrapped

    episode_rewards = []
    for i in range(n_test_episodes):
        episode_reward = run_episode(env, agent, rendering=rendering)
        episode_rewards.append(episode_reward)

    # save results in a dictionary and write them into a .json file
    results = dict()
    results["episode_rewards"] = episode_rewards
    results["mean"] = np.array(episode_rewards).mean()
    results["std"] = np.array(episode_rewards).std()
    print(results)
            
    env.close()
    print('... finished')

{'episode_rewards': [79.0, 55.0, 66.0, 55.0, 85.0, 57.0, 89.0, 58.0, 72.0, 109.0, 117.0, 81.0, 54.0, 104.0, 104.0, 58.0, 98.0, 158.0, 54.0, 96.0, 60.0, 66.0, 57.0, 100.0, 82.0, 70.0, 80.0, 96.0, 142.0, 68.0, 81.0, 54.0, 87.0, 57.0, 76.0, 73.0, 78.0, 87.0, 56.0, 85.0, 67.0, 125.0, 48.0, 52.0, 52.0, 86.0, 106.0, 65.0, 75.0, 120.0, 64.0, 68.0, 133.0, 95.0, 83.0, 62.0, 91.0, 72.0, 50.0, 58.0, 98.0, 77.0, 84.0, 98.0, 72.0, 59.0, 113.0, 95.0, 110.0, 110.0, 144.0, 103.0, 61.0, 77.0, 73.0, 124.0, 68.0, 89.0, 50.0, 59.0, 89.0, 70.0, 91.0, 58.0, 58.0, 52.0, 99.0, 51.0, 68.0, 91.0, 74.0, 114.0, 85.0, 67.0, 56.0, 74.0, 92.0, 107.0, 78.0, 50.0], 'mean': 80.64, 'std': 23.47701855006295}
... finished
