In [1]:
import gym
import numpy as np
from keras.models     import Sequential
from keras.layers     import Dense
from keras.optimizers import Adam
import random

Using TensorFlow backend.


In [2]:
# Seed the Python and NumPy generators so random play can be repeated.
# Keras weight initialisation is not seeded here, so training results may
# still vary between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

env = gym.make('CartPole-v1')
env.seed(SEED)  # pre-0.26 Gym API, consistent with the 4-tuple env.step() used below
env.reset()

# Maximum number of steps played in a single game.
goal_steps = 500
# Minimum total reward a random game must reach to be kept as training data.
score_requirement = 60
# Number of random games played to collect training data.
# (Name is misspelled but kept as-is because modeldp() reads it.)
intial_games = 10000

In [3]:
def playgame():
    """Play one game with random actions, printing every step.

    Steps the module-level ``env`` for at most ``goal_steps`` steps, stopping
    early once the environment reports ``done``. The environment is expected to
    have been reset already; it is reset again before returning.
    """
    line_templates = (
        "action: {}",
        "observation: {}",
        "reward: {}",
        "done: {}",
        "info: {}",
    )
    for step_index in range(goal_steps):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)

        print("Step {}:".format(step_index))
        step_values = (action, observation, reward, done, info)
        for template, value in zip(line_templates, step_values):
            print(template.format(value))

        if done:
            break
    # Leave the environment ready for whoever uses it next.
    env.reset()

In [4]:
# Run a single random-action game and print each step's action, observation,
# reward, done flag and info.
playgame()

Step 0:
action: 1
observation: [ 0.00590369  0.15193039 -0.0484187  -0.28354717]
reward: 1.0
done: False
info: {}
Step 1:
action: 0
observation: [ 0.0089423  -0.04246872 -0.05408965 -0.00651977]
reward: 1.0
done: False
info: {}
Step 2:
action: 1
observation: [ 0.00809292  0.15338555 -0.05422004 -0.31576614]
reward: 1.0
done: False
info: {}
Step 3:
action: 1
observation: [ 0.01116063  0.34923622 -0.06053537 -0.6250433 ]
reward: 1.0
done: False
info: {}
Step 4:
action: 1
observation: [ 0.01814536  0.54514869 -0.07303623 -0.93616026]
reward: 1.0
done: False
info: {}
Step 5:
action: 1
observation: [ 0.02904833  0.7411756  -0.09175944 -1.25087029]
reward: 1.0
done: False
info: {}
Step 6:
action: 0
observation: [ 0.04387184  0.54734144 -0.11677684 -0.98828166]
reward: 1.0
done: False
info: {}
Step 7:
action: 0
observation: [ 0.05481867  0.35396012 -0.13654248 -0.73443968]
reward: 1.0
done: False
info: {}
Step 8:
action: 1
observation: [ 0.06189787  0.55067772 -0.15123127 -1.06678853]
reward:

In [5]:
def modeldp():
    """Collect training samples from games played with random actions.

    Plays ``intial_games`` games on the module-level ``env``, each for at most
    ``goal_steps`` steps. Every step records the observation the action was
    taken from together with the action itself, including the very first step,
    whose observation comes from ``env.reset()``. Only games whose total reward
    is at least ``score_requirement`` contribute samples.

    Prints the mean score of the accepted games, or a notice when no game
    qualified.

    Returns:
        list: ``[observation, one_hot_action]`` pairs, where the one-hot action
        is ``[1, 0]`` for action 0 and ``[0, 1]`` for action 1.
    """
    training_data = []
    accepted_scores = []

    # Keep the observation returned by reset: it is the state the first action
    # of a game is chosen from, so it must be paired with that action.
    observation = env.reset()
    for _ in range(intial_games):
        score = 0
        game_memory = []
        previous_observation = observation
        for _ in range(goal_steps):
            action = random.randrange(0, 2)
            observation, reward, done, info = env.step(action)
            # Pair the action with the observation it was taken from, not the
            # one it produced.
            game_memory.append([previous_observation, action])
            previous_observation = observation
            score += reward
            if done:
                break

        if score >= score_requirement:
            accepted_scores.append(score)
            for seen_observation, taken_action in game_memory:
                output = [0, 1] if taken_action == 1 else [1, 0]
                training_data.append([seen_observation, output])

        # Start the next game; this also leaves env freshly reset on return.
        observation = env.reset()

    if accepted_scores:
        print(sum(accepted_scores) / len(accepted_scores))
    else:
        # Avoid a ZeroDivisionError when the threshold was never reached.
        print("No game reached score_requirement={}".format(score_requirement))

    return training_data

In [6]:
# Gather [observation, one-hot action] samples from random play; modeldp()
# also prints the mean score of the games it accepted.
training_data = modeldp()

69.84705882352941


In [7]:
def build_model(input_size, output_size):
    """Build and compile the fully connected network used to pick actions.

    Args:
        input_size: Number of input features of the first layer.
        output_size: Number of units in the final (linear) layer.

    Returns:
        A compiled Keras ``Sequential`` model with three ReLU hidden layers
        (1024, 512, 128 units), trained with MSE loss and the Adam optimizer.
    """
    network = Sequential([
        Dense(1024, input_dim=input_size, activation='relu'),
        Dense(512, activation='relu'),
        Dense(128, activation='relu'),
        Dense(output_size, activation='linear'),
    ])
    network.compile(optimizer=Adam(), loss='mse')
    return network

In [8]:
def train_model(training_data, epochs=20):
    """Fit a fresh network on the collected samples.

    Args:
        training_data: Sequence of ``[observation, target]`` pairs. All
            observations must have the same length, as must all targets.
        epochs: Number of passes over the data given to ``model.fit``.
            Defaults to 20.

    Returns:
        The trained model produced by ``build_model``.

    Raises:
        ValueError: If ``training_data`` is empty, so there is nothing to infer
            the input/output sizes from.
    """
    if len(training_data) == 0:
        raise ValueError(
            "training_data is empty; no game met the score requirement, "
            "so there is nothing to train on"
        )

    # Layer sizes are inferred from the first sample.
    n_features = len(training_data[0][0])
    n_outputs = len(training_data[0][1])

    X = np.array([sample[0] for sample in training_data]).reshape(-1, n_features)
    y = np.array([sample[1] for sample in training_data]).reshape(-1, n_outputs)

    model = build_model(input_size=n_features, output_size=n_outputs)
    model.fit(X, y, epochs=epochs)
    return model

In [9]:
# Train the network on the samples collected from random play.
trained_model = train_model(training_data)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [10]:
# Evaluate the trained model: let it choose every action for a number of games
# and report the per-game scores and their average.
n_eval_games = 100
scores = []
choices = []

# Keep the observation returned by reset so the model also chooses the first
# action of each game, instead of falling back to a random one.
prev_obs = env.reset()
for each_game in range(n_eval_games):
    score = 0
    for step_index in range(goal_steps):
        # env.render()  # uncomment to watch the game
        model_input = np.asarray(prev_obs).reshape(1, -1)
        action = np.argmax(trained_model.predict(model_input)[0])
        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        score += reward
        if done:
            break
    # Start the next game from a fresh state (also leaves env reset at the end).
    prev_obs = env.reset()
    scores.append(score)
print(scores)
print('Average Score:', sum(scores)/len(scores))

[250.0, 500.0, 329.0, 352.0, 500.0, 500.0, 500.0, 243.0, 276.0, 500.0, 500.0, 500.0, 500.0, 245.0, 500.0, 500.0, 211.0, 343.0, 500.0, 500.0, 342.0, 281.0, 500.0, 500.0, 500.0, 343.0, 500.0, 500.0, 500.0, 309.0, 500.0, 500.0, 500.0, 318.0, 352.0, 500.0, 500.0, 500.0, 175.0, 500.0, 500.0, 500.0, 500.0, 196.0, 500.0, 210.0, 500.0, 500.0, 169.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 225.0, 500.0, 293.0, 193.0, 500.0, 213.0, 500.0, 500.0, 476.0, 500.0, 176.0, 500.0, 459.0, 500.0, 343.0, 221.0, 296.0, 500.0, 500.0, 192.0, 500.0, 203.0, 500.0, 201.0, 199.0, 209.0, 500.0, 289.0, 302.0, 237.0, 500.0, 289.0, 500.0, 500.0, 500.0, 500.0, 500.0, 400.0, 500.0, 500.0, 500.0, 500.0]
Average Score: 413.6
