In [2]:
import numpy as np
import time
import random
import gym

In [25]:

import numpy as np
import pandas as pd
import time

np.random.seed(2)  # reproducible


N_STATES = 6   # the length of the 1 dimensional world
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.9   # greedy police
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 13   # maximum episodes
FRESH_TIME = 0.3    # fresh time for one move


def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),     # q_table initial values
        columns=actions,    # actions's name
    )
    # print(table)    # show table
    return table


def choose_action(state, q_table):
    # This is how to choose an action
    state_actions = q_table.iloc[state, :]
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):  # act non-greedy or state-action have no value
        action_name = np.random.choice(ACTIONS)
    else:   # act greedy
        action_name = state_actions.idxmax()    # replace argmax to idxmax as argmax means a different function in newer version of pandas
    return action_name


def get_env_feedback(S, A):
    # This is how agent will interact with the environment
    if A == 'right':    # move right
        if S == N_STATES - 2:   # terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:   # move left
        R = 0
        if S == 0:
            S_ = S  # reach the wall
        else:
            S_ = S - 1
    return S_, R


def update_env(S, episode, step_counter):
    # This is how environment be updated
    env_list = ['-']*(N_STATES-1) + ['T']   # '---------T' our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


def rl():
    # main part of RL loop
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:

            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # take action & get next state and reward
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_, :].max()   # next state is not terminal
            else:
                q_target = R     # next state is terminal
                is_terminated = True    # terminate this episode

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # update
            S = S_  # move to next state

            update_env(S, episode, step_counter+1)
            step_counter += 1
    return q_table


if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)

                                
Q-table:

       left     right
0  0.000000  0.004320
1  0.000000  0.025005
2  0.000030  0.111241
3  0.000000  0.368750
4  0.027621  0.745813
5  0.000000  0.000000


In [20]:

import numpy as np
import pandas as pd
import time

np.random.seed(2)  # reproducible


N_STATES = 6   # the length of the 1 dimensional world
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.9   # greedy police
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 10   # maximum episodes
FRESH_TIME = 0.3    # fresh time for one move


def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),     # q_table initial values
        columns=actions,    # actions's name
    )
    # print(table)    # show table
    return table


def choose_action(state, q_table):
    # This is how to choose an action
    state_actions = q_table.iloc[state, :]
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):  # act non-greedy or state-action have no value
        action_name = np.random.choice(ACTIONS)
    else:   # act greedy
        action_name = state_actions.idxmax()    # replace argmax to idxmax as argmax means a different function in newer version of pandas
    return action_name


def get_env_feedback(S, A):
    # This is how agent will interact with the environment
    if A == 'left':    # move LEFT
        if S == 1:   # terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S - 1
            R = 0
    else:   # move right
        R = 0
        if S == N_STATES-1:
            S_ = N_STATES-1  # reach the wall
        else:
            S_ = S + 1
    return S_, R


def update_env(S, episode, step_counter):
    # This is how environment be updated
    env_list =['T']+['_']*(N_STATES-1)   # 'T---------' our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'O'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


def rl():
    # main part of RL loop
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = N_STATES-1
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:

            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # take action & get next state and reward
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_, :].max()   # next state is not terminal
            else:
                q_target = R     # next state is terminal
                is_terminated = True    # terminate this episode

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # update
            S = S_  # move to next state

            update_env(S, episode, step_counter+1)
            step_counter += 1
    return q_table


if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)

                                
Q-table:

       left         right
0  0.000000  0.000000e+00
1  0.651322  2.268000e-03
2  0.244485  2.217284e-03
3  0.067385  0.000000e+00
4  0.012487  5.904900e-07
5  0.001490  0.000000e+00


In [24]:
rom tetrisrl.environment import Environment,Action
from tetrisrl.baseline import LowestCenterOfGravityAgent
from tetrisrl.agents import HumanAgent, RandomAgent
from tetrisrl.rl import QLearningAgent
from tetrisrl.serialize import ObservationSerializer
import json
import random
import sys
import logging
import os
import shutil
import time


class Globals(object):
    SCREEN_DIMS=(640,480)

class Colors(object):
    WHITE=(255,255,255)
    GRAY=(128,128,128)
    GREEN=(0,255,0)
    RED=(255,0,0)
    BLUE=(0,0,255)
    BLACK=(0,0,0)

class Engine(object):
    def __init__(self, environment,agent,config,output_dir):
        self.environment=environment
        self.s = self.environment.initial_state()
        self.total_pos_r = 0.0
        self.total_neg_r = 0.0
        self.agent=agent
        self.fps = config["fps"]
        self.show = config["show"]
        self.max_time = config["max_time"]
        self.output_dir = output_dir
        self.oserializer = ObservationSerializer()
        self.debug_mode = config["debug_mode"]

        if config["log_observations"]:
            self.obs_log_file = open("{}/observations.json".format(output_dir),"w")
        else:
            self.obs_log_file = None

        if config["replay_observations"]:
            for i in range(config["replay_count"]):
                for fn in config["replay_observations"]:
                    self.replay_observations(fn)

        if self.show:
            import pygame
            from pygame.locals import *
            pygame.init()
            self.font = pygame.font.SysFont(None, 28)    
            self.screen=pygame.display.set_mode(Globals.SCREEN_DIMS,0,32)
            self.clock = pygame.time.Clock()
            pygame.display.set_caption("Tetris")
            self.draw()

    def replay_observations(self, fn):
        with open(fn) as fin:
            print "Replaying from file: {}...".format(fn)
            for l in fin:
                s,a,r,sprime,pfbm = self.oserializer.deserialize_json(json.loads(l))
                self.agent.observe_sars_tuple(s,a,r,sprime,pfbm=pfbm)

    def detect_quit(self):
        if self.show:
            if pygame.event.peek(QUIT):
                pygame.quit()
                sys.exit()

    def loop(self):
        def bitmap_mean_active_column(b):
            if b.sum()==0:
                return 0
            else:
                cols = b.nonzero()[1]
                return cols.mean()
        self.draw()
        t = 0
        start = time.clock()
        while True:
            t += 1
            if t % 1000 == 0:
                self.agent.save_model("{}/model.{:06d}iters".format(self.output_dir, t))

            if t > self.max_time:
                break

            if self.show:
                self.clock.tick(self.fps)
            self.detect_quit()
            a,debug_info = self.agent.act(self.s,debug_mode=self.debug_mode)
            if self.debug_mode:
                for pfbm in sorted(debug_info["pfbms"], cmp=lambda x,y: cmp(bitmap_mean_active_column(x),bitmap_mean_active_column(y))):
                    self.clock.tick(6)
                    self.draw_bitmap(pfbm)
                    pygame.display.update()
                    self.clock.tick(6)
            sprime,r,pfbm,rcounts = self.environment.next_state_and_reward(self.s, a)
            if "rows_cleared" in rcounts:
                logging.info("ROWS_CLEARED: {}".format(rcounts["rows_cleared"]))
            if "game_over" in rcounts:
                logging.info("GAME_OVER")

            if r > 0:
                self.total_pos_r += r
            else:
                self.total_neg_r += r

            if self.obs_log_file:
                self.obs_log_file.write("{}\n".format(json.dumps(self.oserializer.serialize_json(self.s,a,r,sprime,pfbm=pfbm))))

            self.agent.observe_sars_tuple(self.s,a,r,sprime,pfbm=pfbm)
            self.s = sprime
            self.draw()
            duration = time.clock()-start
            print "Runtime={:.2f}s  T={}  Total Reward: {:.2f}  {:.2f}".format(duration, t, self.total_pos_r, self.total_neg_r)
            
    def draw(self):
        if not self.show:
            return

        self.screen.fill(Colors.BLACK)
        w = 20
        b = self.s.arena.bitmap
        ls = self.s.lshape

        text = self.font.render("Total Reward: {:.2f}  {:.2f}".format(self.total_pos_r, self.total_neg_r), True, Colors.WHITE, Colors.BLUE)
        textRect = text.get_rect()
        textRect.centerx = (w * b.shape[1]) + 250
        textRect.centery = self.screen.get_rect().centery
        self.screen.blit(text, textRect)
        
        self.draw_bitmap(b)
        self.draw_lshape(ls)

        pygame.display.update()

    def draw_bitmap(self, b):
        w = 20
        for r in range(b.shape[0]):
            for c in range(b.shape[1]):
                rect = (w+(w*c),w+(w*r),w,w)
                if b[r,c]:
                    pygame.draw.rect(self.screen, Colors.GREEN, rect)
                else:
                    pygame.draw.rect(self.screen, Colors.GRAY, rect)

    def draw_lshape(self, ls):
        w = 20
        for coord in ls.coords():
            r,c = (coord[0],coord[1])
            pygame.draw.rect(self.screen, Colors.BLUE, (w+(w*c),w+(w*r),w,w))




# Modifies in place
def override_config(c, o):
    for key,val in o.iteritems():
        keys = key.split("/")
        p = c
        for k in keys[:-1]:
            p = p[k]
        p[keys[-1]] = val

def process_config(cf, of, sf):
    with open(cf,"r") as fin:
        config = json.load(fin)

    if of.lower() != "none": 
        with open(of,"r") as fin:
            override = json.load(fin)
        override_config(config, override)
        
    with open(sf, "w") as fout:
        json.dump(config, fout, indent=4, sort_keys=True)

    return config



config_file = sys.argv[1]
config_override_file = sys.argv[2]
output_dir = sys.argv[3]

# output dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# config
saved_config_file = "{}/config".format(output_dir)
config = process_config(config_file, config_override_file, saved_config_file)

# log file
log_file = "{}/log".format(output_dir)
logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG)


# initialize everything
e = Environment(config["environment"])

agent_type = config["agent"]["type"]

if agent_type == "rl":
    agent = QLearningAgent(e,config["agent"])
elif agent_type == "lcog":
    agent = LowestCenterOfGravityAgent(e)
elif agent_type == "human":
    agent = HumanAgent()
    assert config["engine"]["show"]
elif agent_type == "random":
    agent = RandomAgent()
else:
    raise Exception("Unknown agent type: {}".format(config["agent"]["type"]))

engine = Engine(e,agent,config["engine"],output_dir)
engine.loop()


SyntaxError: invalid syntax (<ipython-input-24-5695e5fe7633>, line 1)

In [23]:
import matplotlib.pyplot as plt
from collections import OrderedDict
import argparse
import sys
import os

def windowed_average(a, wsize):
    results = []
    if not a:
        return [],[]
    s = sum(a[:wsize])
    xs = []
    for i in range(wsize-1,len(a)):
        xs.append(i)
        results.append(s / wsize)
        s -= a[i+1-wsize]
        if i < len(a)-1:
            s += a[i+1]
    return xs,results

def read_windowed_stats(fn, windows):
    r = []
    d = []
    absd = []

    with open(fn) as fin:
        for line in fin:
            tokens = line.strip().split()
            if line.startswith("INFO:root:DELTA:"):
                d.append(float(tokens[-1]))
                absd.append(abs(float(tokens[-1])))
            if line.startswith("INFO:root:REWARD:"):
                r.append(float(tokens[-1]))
    return {
        "reward": windowed_average(r, windows["reward"]), 
        "delta": windowed_average(d, windows["delta"]),
        "abs_delta": windowed_average(absd, windows["abs_delta"])
        }

def make_plot(fn, xlabel, ylabel, title, stats, stat_name):
    #line_styles = ["r--", "b-.", "g:", "o-", "p:"]
    line_styles = [
        {
            "color": "blue"
        },
        {
            "color": "green"
        },
        {
            "color": "orange"
        },
        {
            "color": "red"
        },
        {
            "color": "pink"
        },
    ]
    for i,name in enumerate(stats.keys()):
        print "name: {},   stat_name={}".format(name,stat_name)
        xs,ys = stats[name][stat_name]
        kwargs = line_styles[i]
        plt.plot(xs,ys,label=name, **kwargs)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel) 
    plt.title(title)
    plt.legend(loc=3)
    plt.savefig(fn)
    plt.clf()
    

def plot_results(output_dir, input_dirs, windows):

    stats = OrderedDict()
    for input_dir in input_dirs:
        name = input_dir.split("/")[-1]
        log = "{}/log".format(input_dir)
        stats[name] = read_windowed_stats(log, windows)

    make_plot("{}/delta.png".format(output_dir), "Timestep (block placements)", "Delta", "Trailing average of {} latest delta values".format(windows["delta"]), stats, "delta")
    make_plot("{}/abs_delta.png".format(output_dir), "Timestep (block placements)", "Delta magnitude", "Trailing average of {} latest delta magnitudes".format(windows["abs_delta"]), stats, "abs_delta")
    make_plot("{}/reward.png".format(output_dir), "Timestep (game ticks)", "Reward", "Trailing average of {} latest reward values".format(windows["reward"]), stats, "reward")


def main():
    output_dir = "plots/{}".format(sys.argv[1])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    windows = {"delta": 500,
               "abs_delta": 500,
               "reward": 30000}

    plot_results(output_dir, sys.argv[2:], windows)



if __name__ == "__main__":
    main()

SyntaxError: invalid syntax (<ipython-input-23-a2462e0d3442>, line 60)

In [None]:
f