<h2>OpenAI Gym CartPole experiment</h2>

In [1]:
import gym
import numpy as np

In [2]:
def simulate(weights, env, num_episodes=1, timesteps=200, render=False):
    """Score a linear policy by running it in `env`.

    At every step the action is ``int((sign(observation . weights) + 1) / 2)``,
    i.e. 1 when the projection is positive and 0 when it is negative or zero.

    Parameters
    ----------
    weights : array-like
        Weight vector that is dotted with each observation.
    env : object
        Environment exposing ``reset()``, ``step(action)`` (returning a
        4-tuple ``observation, reward, done, info``) and ``render()``.
    num_episodes : int
        Number of episodes to average over.
    timesteps : int
        Maximum number of steps per episode; also the normalisation constant.
    render : bool
        When true, render every step and print the episode length on termination.

    Returns
    -------
    Mean over episodes of (accumulated reward / timesteps). The divisor is
    always `timesteps`, even when an episode terminates early.
    """
    score_sum = 0
    for _ in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        for step_number in range(1, timesteps + 1):
            if render:
                env.render()
            projection = np.dot(state, weights)
            # Map sign {-1, 0, +1} to {0.0, 0.5, 1.0}; int() then truncates to 0 or 1.
            action = int((np.sign(projection) + 1) / 2.0)
            state, reward, done, _info = env.step(action)
            episode_reward += reward
            if not done:
                continue
            if render:
                print("Episode finished after {} timesteps.".format(step_number))
            break
        score_sum += episode_reward / timesteps
    return score_sum / num_episodes

In [3]:
def crossover(m1, m2, beta=0.5):
    """Combine two parent vectors around one randomly chosen gene.

    A single index is drawn with ``np.random.randint(0, m1.shape[0])``.
    The child takes the genes before that index from `m1`, the genes after it
    from `m2`, and at the index itself the blend
    ``beta * m1[i] + (1 - beta) * m2[i]``.

    Returns a new flattened array; neither parent is modified.
    """
    cut = np.random.randint(0, m1.shape[0])
    head = m1[:cut]
    blended_gene = beta * m1[cut] + (1 - beta) * m2[cut]
    tail = m2[cut + 1:]
    # Each piece is ravelled before joining, mirroring what np.append does.
    return np.concatenate((np.ravel(head), np.ravel(blended_gene), np.ravel(tail)))

In [4]:
def _tournament_winner(survivors, survivors_values, tournament_size):
    """Return the best of `tournament_size` survivors drawn uniformly with replacement."""
    members_inds = np.random.randint(0, survivors.shape[0], tournament_size)
    members_values = survivors_values[members_inds]
    # Kept as argsort()[-1] rather than argmax so ties resolve exactly as before.
    winner_position = np.argsort(members_values)[-1]
    return survivors[members_inds[winner_position]]


def evolve(env, population_size=20, generations = 100, selection_ratio = 0.5, tournament_size = 3, crossover_beta=0.5, mutation_ratio=0.05, num_episodes=20, timesteps=1000):
    """Search for linear-policy weights with a simple genetic algorithm.

    Each generation evaluates every individual with `simulate`, keeps the top
    `selection_ratio` fraction as survivors, refills the population with
    children produced by tournament selection + `crossover`, and finally
    replaces a few random individuals with fresh random vectors (mutation).

    Parameters
    ----------
    env : object
        Environment passed to `simulate`; `env.observation_space.shape[0]`
        gives the number of weights per individual.
    population_size : int
        Number of individuals per generation.
    generations : int
        Number of breeding rounds; the population is evaluated
        ``generations + 1`` times (generation 0 is the random initial one).
    selection_ratio : float
        Fraction of the population that survives unchanged.
    tournament_size : int
        Number of survivors sampled for each parent tournament.
    crossover_beta : float
        Blend factor forwarded to `crossover`.
    mutation_ratio : float
        Fraction of the population re-randomised each generation.
    num_episodes : int
        Episodes per fitness evaluation (forwarded to `simulate`).
    timesteps : int
        Step limit per episode (forwarded to `simulate`).

    Returns
    -------
    (best_value, best_individual) : the best fitness seen and its weights.

    Raises
    ------
    ValueError
        If `selection_ratio * population_size` leaves no survivors to breed from.

    Notes
    -----
    Only the path that exhausts all generations calls ``env.close()``; the
    early return taken when the fitness exceeds 0.99999 leaves `env` open.
    """
    new_population_size = int(selection_ratio * population_size)
    offsprings_size = population_size - new_population_size
    if new_population_size < 1:
        raise ValueError(
            "selection_ratio * population_size must be at least 1 "
            "(got %d survivors)" % new_population_size
        )
    num_vars = env.observation_space.shape[0]
    mutation_size = int(population_size * mutation_ratio)

    # Initial weights are uniform in [-1, 1).
    population = (np.random.rand(population_size, num_vars) * 2.0) - 1.0
    best_value = -np.inf
    best_individual = None

    for generation in range(generations + 1):
        # Evaluation through simulation.
        values = np.apply_along_axis(simulate, 1, population, env, num_episodes=num_episodes, timesteps=timesteps)

        # Rank individuals from best to worst.
        indices = np.argsort(values)[::-1]
        if values[indices[0]] > best_value:
            # Copy so the result never aliases a population buffer.
            best_individual = population[indices[0]].copy()
            best_value = values[indices[0]]
        print("Gen %d, best value: %.3f" % (generation, best_value))
        if best_value > 0.99999:
            return best_value, best_individual
        survivors = population[indices[0 : new_population_size]]
        survivors_values = values[indices[0 : new_population_size]]

        offsprings = np.empty(shape=(offsprings_size, num_vars))
        # Breed exactly one child per offspring slot. Iterating over the
        # survivor count instead would overrun the array when
        # selection_ratio > 0.5 and leave uninitialised rows when it is < 0.5.
        for child_index in range(offsprings_size):
            winner_1 = _tournament_winner(survivors, survivors_values, tournament_size)
            winner_2 = _tournament_winner(survivors, survivors_values, tournament_size)
            offsprings[child_index] = crossover(winner_1, winner_2, beta=crossover_beta)

        population = np.append(survivors, offsprings, axis=0)

        # Mutation: re-randomise a few individuals. Index 0 holds this
        # generation's top survivor, so sampling starts at 1 to spare it.
        if mutation_size > 0 and population_size > 1:
            to_mutate_inds = np.random.randint(1, population_size, mutation_size)
            for i in to_mutate_inds:
                population[i] = (np.random.rand(num_vars) * 2.0) - 1.0

    env.close()
    return best_value, best_individual

In [5]:
# Seed NumPy's global RNG: evolve() draws its initial population, tournament
# members and mutations from np.random, so this makes the run repeatable.
np.random.seed(1313)

# NOTE(review): `.env` presumably unwraps the wrapper that gym.make() puts
# around the raw environment — confirm against the installed gym version.
env = gym.make('CartPole-v0').env

# Genetic-algorithm hyper-parameters for this run.
ga_settings = dict(
    population_size=50,
    generations=10,
    tournament_size=8,
    mutation_ratio=0.2,
)
best_value, best_individual = evolve(env, **ga_settings)

Gen 0, best value: 0.868
Gen 1, best value: 0.901
Gen 2, best value: 1.000


In [6]:
# Replay the best individual for one rendered episode. The try/finally makes
# sure the environment is closed even if the simulation raises part-way.
try:
    ret = simulate(best_individual, env, num_episodes=1, timesteps=1000, render=True)
finally:
    env.close()
ret

1.0