In [1]:
import gym
import numpy as np

In [2]:
env = gym.make('CartPole-v0')
n_actions = env.action_space.n
theta = np.random.rand(4) # state has 4 dimensions = (car position, car speed, pole angle, angular speed)

def run_episode(theta):
    state = env.reset()
    ep_reward = 0
    done = False
    while not done:
        #env.render()
        action = 0 if np.dot(theta, state) >= 0 else 1
        state, reward, done, _ = env.step(action)
        ep_reward += reward
        #print("Reward received = ", reward)
        #print("New state =", new_state)
    #print("Episode reward = ", ep_reward)
    return ep_reward

def run_episodes(theta, no_episodes=100):
    total_reward = 0
    for _ in range(no_episodes):
        total_reward += run_episode(theta)
    
    #print("Total reward/no_episodes =", total_reward/no_episodes)
    #print("Used theta =", theta)
    return total_reward/no_episodes
    

#run_episodes(theta, 10)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
#Random search
theta_best = np.random.rand(4)
score_best = 0
for it in range(1000):
    #candidate solution
    theta = np.random.rand(4)
    score = run_episodes(theta, 100) # = fitness
    if score > score_best:
        score_best = score
        theta_best = theta
        print("NEW BEST:", score)
    if it % 200==0: print("Iteration:", it)

print("Final score:", score_best)
print("Final best theta:", theta_best)
        

NEW BEST: 9.24
Iteration: 0
NEW BEST: 9.76
NEW BEST: 31.39
NEW BEST: 31.55
NEW BEST: 31.95
NEW BEST: 34.91
NEW BEST: 37.81
Iteration: 200
NEW BEST: 38.99
NEW BEST: 42.09
Iteration: 400
Iteration: 600
Iteration: 800
Final score: 42.09
Final best theta: [0.03997316 0.68318706 0.00630608 0.00586531]


In [42]:
def init_candidates(n):
    cands = []
    for _ in range(n):
        cands.append(np.random.rand(4))
    return np.array(cands)

def evaluate_and_sort(cands):
    cands_and_scores = []
    for cand in cands:
        cands_and_scores.append(np.array([cand,run_episodes(cand)]))
        
    cands_and_scores = np.array(cands_and_scores)
    sorted_cands_and_scores = sorted(cands_and_scores,key=lambda x: x[1])[::-1] # sort according to 2nd column and then reverse the array
    return np.array(sorted_cands_and_scores)

def get_top_cands(sorted_cands_and_scores, selection_rate):
    #print("Sorted cands and scores:\n", sorted_cands_and_scores)
    n = (int)(selection_rate * len(sorted_cands_and_scores))
    top_cands = np.copy(sorted_cands_and_scores[0:n,0])
    size = (n,len(top_cands[0]))
    #print("Wanted size =", size)
    #print("Top cands:\n", top_cands)
    top_cands = np.concatenate(top_cands)
    top_cands = np.reshape(top_cands, size)
    #print("Top cands after reshape:\n", top_cands)
    return top_cands

def crossover(cand1, cand2):
    """
    50% chance to take gene from one of the parents, probability per gene.
    """
    child = np.copy(cand1)
    choices = np.random.randint(2, size=len(cand1))
    #print("choices=",choices)
    child *= choices
    choices*=2
    choices[choices==0] = 1
    choices[choices==2] = 0
    child += cand2*choices
    return child

def get_child(top_cands):
    """
    Parents selected randomly from top selection_rate*pop_size candidates.
    """
    parent_ids = np.random.randint(len(top_cands), size=2)
    parent1 = top_cands[parent_ids[0]]
    parent2 = top_cands[parent_ids[1]]
    return crossover(parent1, parent2)

def refill_population(top_cands, pop_size):
    """
    Refills population of top candidates with their children.
    """
    no_children = pop_size - len(top_cands)
    children = []
    for _ in range(no_children):
        children.append(get_child(top_cands))

    children = np.array(children)
    #print("Top=", top_cands)
    #print("Children=", children)
    return np.append(top_cands, children, axis=0)

def mutate(population, mut_rate):
    """
    Adds Gaussian noise to each candidate. Sigma = mutation rate.
    """
    mu, sigma = 0, mut_rate
    noise = np.random.normal(mu, sigma, population.shape)
    population += noise
    return population

def population_fitness(sorted_cands_and_scores):
    scores = sorted_cands_and_scores[:,1]
    mean = np.mean(scores)
    return mean

In [44]:
# Genetic algorithm
def run_GA(generations=100, population_size=10, selection_rate=0.5, mutation_rate=0.1):
    theta_best = np.random.rand(4)
    score_best = 0
    #candidate solutions
    population = init_candidates(n=population_size)
    for it in range(generations):
        sorted_cands_and_scores = evaluate_and_sort(population)
        top_cands = get_top_cands(sorted_cands_and_scores, selection_rate)

        best_cand_score = sorted_cands_and_scores[0,1]
        if best_cand_score > score_best:
            score_best = best_cand_score
            theta_best = sorted_cands_and_scores[0,0]
            print("NEW BEST:", score_best)

        population = refill_population(top_cands, population_size)
        population = mutate(population, mutation_rate)

        if it % 5==0: 
            print("Generation:", it, " mean fitness =", population_fitness(sorted_cands_and_scores))
            

    print("Final best score:", score_best)
    print("Final best theta:", theta_best)

run_GA()

NEW BEST: 30.25
Generation: 0  mean fitness = 15.941999999999998
NEW BEST: 43.89
NEW BEST: 53.46
NEW BEST: 57.79
NEW BEST: 65.2
Generation: 5  mean fitness = 48.094
NEW BEST: 93.9
NEW BEST: 99.21
NEW BEST: 109.0
NEW BEST: 111.32
Generation: 10  mean fitness = 95.031
NEW BEST: 118.73
NEW BEST: 127.66
NEW BEST: 135.36
NEW BEST: 149.73
NEW BEST: 198.67
Generation: 15  mean fitness = 136.45299999999997
NEW BEST: 200.0
Generation: 20  mean fitness = 197.19500000000002
Generation: 25  mean fitness = 182.248
Generation: 30  mean fitness = 196.805
Generation: 35  mean fitness = 200.0
Generation: 40  mean fitness = 200.0
Generation: 45  mean fitness = 199.655
Generation: 50  mean fitness = 200.0
Generation: 55  mean fitness = 196.261
Generation: 60  mean fitness = 196.247
Generation: 65  mean fitness = 200.0
Generation: 70  mean fitness = 200.0
Generation: 75  mean fitness = 200.0
Generation: 80  mean fitness = 200.0
Generation: 85  mean fitness = 200.0
Generation: 90  mean fitness = 181.035
Ge

In [3]:
theta = [ 0.62887362, -0.77817081, -0.28637888, -0.84578956]
state = env.reset()
ep_reward = 0
done = False
while not done:
    env.render()
    action = 0 if np.dot(theta, state) >= 0 else 1
    state, reward, done, _ = env.step(action)
    ep_reward += reward
    #print("Reward received = ", reward)
    #print("New state =", new_state)
print("Episode reward = ", ep_reward)
    

Episode reward =  200.0
