# Part 1. Initial regression training

In [1]:
%load_ext Cython

In [2]:
%%cython

import sys
import time

import numpy as np
cimport numpy as np

import interface as bb
cimport interface as bb

from scipy.linalg.cython_blas cimport sgemm

cimport cython

# Scalars passed to sgemm in fast_target (BLAS computes C := alpha*A*B + beta*C).
cdef float alpha = 1.0, beta = 0.0
# s: one feature row, w: weight matrix, y: sgemm output (one value per action).
cdef float[::1,:] s, y, w

# Indices of state components whose squares are appended as extra features.
f_sq = [1,2]
# Indices of state components that are multiplied by state[35] as extra features.
f_intr = [1,4,11,12,13]
cdef: 
    int n_sq = len(f_sq)
    int n_intr = len(f_intr)

# Feature layout: 36 raw state values, n_sq squares, n_intr interactions, 1 constant term.
s = np.empty((1,37 + n_sq + n_intr), np.float32, order="F")
w = np.empty((37 + n_sq + n_intr, 4), np.float32, order="F")
y = np.empty((1,4), np.float32, order="F")

# NUM_CACHE: number of rows in the state/output cache used by fast_target.
# NUM_ROLLOUT: crollout takes one forced action plus up to NUM_ROLLOUT-1 policy steps.
cdef int NUM_CACHE = 51, NUM_ROLLOUT = 50
# cache_i: ring position, cache_n: number of rows counted as filled.
cdef int cache_i = 0, cache_n = 0
cdef float[::1,:] cache_s, cache_y

# cache_s holds the 36 raw state values, cache_y the 4 outputs computed for them.
cache_s = np.empty((NUM_CACHE,36), np.float32, order="F")
cache_y = np.empty((NUM_CACHE,4), np.float32, order="F")


# Compute the 4 per-action outputs for `state` and leave them in the
# module-level buffer `y`.
#
# state     -- pointer to at least 36 floats (the raw state vector).
# use_cache -- when 1, first look for an exact match of the 36 raw values in
#              the ring cache and reuse the stored outputs; on a miss the new
#              state and its outputs are stored in the ring.
#
# The ring is filled from row 0 upwards so that rows 0..cache_n-1 are exactly
# the rows the lookup scans.  (Previously the write position was advanced
# before the write, so the lookup scanned the never-written row 0 and skipped
# the most recent entry until the ring had wrapped once.)
@cython.boundscheck(False)
cdef void fast_target(float *state, int use_cache = 0):
    global cache_i, cache_n
    cdef int i, c, m, n, k, lda, ldb, ldc
    cdef int slot = -1  # cache row to fill after the product; -1 means "do not cache"

    if use_cache == 1:
        # Linear scan over the filled rows for an exact match on all 36 values.
        c = 0
        while c < cache_n:
            i = 0
            while i < 36:
                if cache_s[c,i] != state[i]:
                    break
                i += 1
            if i == 36:
                for i in xrange(4):
                    y[0,i] = cache_y[c,i]
                return
            c += 1

        # Miss: claim the current ring position, then advance it.
        slot = cache_i
        cache_i += 1
        if cache_i == NUM_CACHE:
            cache_i = 0
        if cache_n < NUM_CACHE:
            cache_n += 1
        for i in xrange(36):
            cache_s[slot,i] = state[i]

    # Build the feature row: raw values, squares, interactions with state[35], constant 1.
    for i in xrange(36):
        s[0,i] = state[i]
    for i in xrange(n_sq):
        s[0,i+36] = state[f_sq[i]]**2
    for i in xrange(n_intr):
        s[0,i+36 + n_sq] = state[f_intr[i]]*state[35]
    s[0,36 + n_sq + n_intr] = 1.

    # y (1 x 4) = s (1 x k) * w (k x 4); all buffers are Fortran-ordered.
    lda = 1
    ldb = 37 + n_sq + n_intr
    ldc = 1
    m = 1
    n = 4
    k = 37 + n_sq + n_intr
    sgemm("N", "N", &m, &n, &k, &alpha, &s[0,0], &lda, &w[0,0], &ldb, &beta, &y[0,0], &ldc)

    if slot >= 0:
        for i in xrange(4):
            cache_y[slot,i] = y[0,i]
    

# Return the index (0..3) of the action with the largest output for `state`,
# or -1 if no output exceeds -1e9.  `use_cache` is forwarded to fast_target.
@cython.boundscheck(False)
cdef int fast_action(float *state, int use_cache = 0):
    cdef int i, best_act = -1
    # Typed as a C float: an untyped `cdef` declares a Python object, which
    # forces boxing and Python-level comparison inside the loop.
    cdef float best_val = -1e9
    fast_target(state, use_cache)
    for i in xrange(4):
        if y[0,i] > best_val:
            best_val = y[0,i]
            best_act = i
    return best_act


# Return the largest of the 4 outputs for `state` (floor -1e9), always
# evaluating through the cache.
@cython.boundscheck(False)
cdef float fast_value(float *state):
    cdef int i
    # Typed as a C float: an untyped `cdef` declares a Python object.
    cdef float best_val = -1e9
    fast_target(state, 1)
    for i in xrange(4):
        if y[0,i] > best_val:
            best_val = y[0,i]
    return best_val

@cython.boundscheck(False)
def dump_weights(weights):
    """Copy `weights` element by element into the module-level buffer `w`.

    `weights` must be indexable as weights[row, col] for
    row < 37 + n_sq + n_intr and col < 4.
    """
    cdef int col, row
    cdef int n_rows = 37 + n_sq + n_intr
    for col in xrange(4):
        for row in xrange(n_rows):
            w[row,col] = weights[row,col]


def prepare_bbox(level='train', verbose=0):
    """Empty the state cache and (re)load the level file for `level`.

    The file read is '../levels/<level>_level.data'.  If a level is already
    loaded it is reset first.
    """
    global cache_i, cache_n
    cache_i = cache_n = 0
    if bb.is_level_loaded():
        bb.reset_level()
    level_path = '../levels/' + level + '_level.data'
    bb.load_level(level_path, verbose)


# Output buffers written by crollout and copied out by rollout:
# _rewards[a] is the accumulated return for first action a,
# _mask[a] is 1 when that action was evaluated and 0 otherwise.
cdef float _rewards[4]
cdef float _mask[4]

# Estimate a return for each of the 4 possible first actions from the current
# environment state and write the results into _rewards / _mask.
#
# epoch      -- 0: follow-up steps always use action 3 and no value estimate is
#               added; > 0: follow-up steps use fast_action and, if the episode
#               is still running after the rollout, fast_value is added.
# curriculum -- probability of evaluating an action that left state[35]
#               unchanged (actions that change state[35] are always evaluated).
#
# The environment is restored to the starting checkpoint after every action.
@cython.boundscheck(False)
cdef crollout(int epoch=0, float curriculum=0.7):
    cdef:
        # NOTE(review): has_change, next_state35_abs and prev_state35_abs are
        # declared but never used in this function.
        int i, a, action, has_next, checkpoint_id, has_change
        float r, prev_score, init_state35, next_state35, next_state35_abs, prev_state35_abs
        float *state
    
    init_state35 = bb.c_get_state()[35]
    checkpoint_id = bb.create_checkpoint()
   
    for a in xrange(4):
        
        # Default: action not evaluated.
        _rewards[a] = 0
        _mask[a] = 0
        
        prev_score = bb.c_get_score()
        has_next = bb.c_do_action(a)
        state = bb.c_get_state()
        next_state35 = state[35]  
        
        # Evaluate when the action changed state[35], otherwise only with
        # probability `curriculum`.
        if init_state35 != next_state35 or np.random.rand() < curriculum:
                        
            # r accumulates the score differences step by step.
            r = bb.c_get_score() - prev_score
            prev_score = bb.c_get_score()
            
            if has_next == 1:
                for i in xrange(NUM_ROLLOUT-1):
                    if epoch > 0:
                        action = fast_action(state, 1)
                    else:
                        action = 3

                    has_next = bb.c_do_action(action)
                    r += bb.c_get_score() - prev_score
                    state = bb.c_get_state()
                    prev_score = bb.c_get_score()
                    if has_next == 0:
                        break
                
                # Rollout was cut off by the step limit: add the current
                # estimate for the state that was reached.
                if has_next == 1 and epoch > 0:
                    r += fast_value(state)

            _rewards[a] = r
            _mask[a] = 1
        
        # Rewind so the next action starts from the same state.
        bb.load_from_checkpoint(checkpoint_id)
    bb.clear_all_checkpoints()


@cython.boundscheck(False)
def rollout(epoch=0, curriculum=0.7):
    """Run crollout and return its results as two float32 arrays of length 4.

    Returns (rewards, mask), copied from the module-level _rewards and _mask.
    """
    cdef int idx
    crollout(epoch, curriculum)
    out_rewards = np.empty(4, dtype=np.float32)
    out_mask = np.empty(4, dtype=np.float32)
    for idx in range(4):
        out_rewards[idx] = _rewards[idx]
        out_mask[idx] = _mask[idx]
    return out_rewards, out_mask


def solve_lsq(X, y, lmd = 1):
    """Solve the (optionally ridge-regularised) normal equations for X w = y.

    Solves (X^T X + lmd * D) w = X^T y, where D is the identity with its last
    diagonal entry set to 0, so the coefficient of the last column of X is not
    penalised (callers in this file put the constant column last).  With
    lmd <= 0 the plain normal equations X^T X w = X^T y are solved.

    The system is solved with np.linalg.solve rather than by forming the
    explicit inverse, which is more accurate and cheaper.

    Raises numpy.linalg.LinAlgError if the system matrix is singular.
    """
    gram = X.T.dot(X)
    rhs = X.T.dot(y)
    if lmd > 0:
        penalty = np.eye(gram.shape[0])
        penalty[-1, -1] = 0  # leave the last coefficient unregularised
        gram = gram + lmd * penalty
    return np.linalg.solve(gram, rhs)

def train_epoch(X, Y, M):
    """Fit one regression per action and return them as a float32 matrix.

    For action i only the rows where M[:, i] is true are used; the target is
    column i of Y.  The result has one column per action.
    """
    per_action = []
    for action in xrange(4):
        rows = M[:, action]
        per_action.append(solve_lsq(X[rows], Y[rows, action]))
    return np.array(per_action).T.astype(np.float32)

@cython.boundscheck(False)
def policy_iteration(n_epochs=20):
    """Alternate between collecting rollout data on the 'train' level and
    refitting the per-action regressions.

    In epoch 0 the driving action is random; in later epochs it is chosen by
    fast_action using the weights fitted in the previous epoch.  After each
    fit the weights are evaluated with test() and loaded for the next epoch.

    Returns the list of fitted weight matrices, one per epoch.
    """
    cdef int epoch, action

    X = []
    Y = []
    M = []

    weights_out = []
    for epoch in range(n_epochs):
        start = time.time()
        prepare_bbox('train')
        while True:
            # rollout() restores the environment, so the state read below is
            # the one the rollouts started from.
            rewards, mask = rollout(epoch)
            state = bb.get_state().copy()
            X.append(state)
            Y.append(rewards)
            M.append(mask)

            if epoch > 0:
                action = fast_action(bb.c_get_state(), 1)
            else:
                action = np.random.randint(4)

            if bb.c_do_action(action) == 0:
                bb.finish(verbose=0)
                break

        Xa = np.array(X).astype(np.float32)
        Ya = np.array(Y).astype(np.float32)
        # Builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
        # later removed.
        Ma = np.array(M).astype(bool)

        # Same feature expansion as fast_target: squares, interactions with
        # column 35, constant column last.
        Xa_sq = Xa[:,f_sq]**2
        Xa_intr = Xa[:, f_intr] * Xa[:, 35].reshape(-1,1)
        bias = np.ones((Xa.shape[0], 1), dtype=np.float32)
        Xa = np.concatenate([Xa, Xa_sq, Xa_intr, bias], axis = 1).astype(np.float32)

        del X[:]
        del Y[:]
        del M[:]

        weights = train_epoch(Xa, Ya, Ma)
        weights_out.append(weights)

        print 'Epoch: {}, time: {}'.format(epoch, int(time.time() - start)),
        test(weights)
        sys.stdout.flush()

        dump_weights(weights)

    return weights_out

def test(weights):
    """Load `weights`, play the 'train' and 'test' levels with the greedy
    policy (cache disabled), print the scores and return [train, test].
    """
    cdef:
        int action, has_next

    dump_weights(weights)
    scores = []
    for level_name in ('train', 'test'):
        prepare_bbox(level_name)
        has_next = 1
        while has_next != 0:
            action = fast_action(bb.c_get_state(), 0)
            has_next = bb.c_do_action(action)
        scores.append(bb.finish(verbose=0))
    print 'average  {:.2f}, test {:.2f}, train {:.2f}'.format(0.5*sum(scores), scores[1], scores[0])
    return scores

In [None]:
# Run 20 epochs; W is the returned list of weight matrices, one per epoch.
W = policy_iteration(20)

Epoch: 0, time: 35 average  -4915.00, test -5689.80, train -4140.20
Epoch: 1, time: 98 average  -1370.31, test -1743.54, train -997.07
Epoch: 2, time: 99 average  -421.89, test -392.04, train -451.74


In [10]:
import cPickle
# Persist the per-epoch weight list W.
with open('list_weights_reg1.pkl', 'wb') as f:
    cPickle.dump(W, f)

In [12]:
import numpy as np
# For every starting epoch s_i, evaluate the element-wise mean of the weight
# matrices from epoch s_i to the last one.
for s_i in xrange(len(W)):
    print 'start:', s_i,
    test(np.array(W[s_i:]).mean(axis =0))

start: 0 average  2741.61, test 2746.48, train 2736.74
start: 1 average  2905.43, test 2962.72, train 2848.14
start: 2 average  2854.27, test 2885.77, train 2822.78
start: 3 average  2888.65, test 2955.03, train 2822.27
start: 4 average  2932.34, test 2977.38, train 2887.30
start: 5 average  2930.94, test 2981.25, train 2880.62
start: 6 average  2879.30, test 2937.88, train 2820.72
start: 7 average  2941.56, test 2938.00, train 2945.12
start: 8 average  2943.39, test 2998.83, train 2887.95
start: 9 average  2915.78, test 2976.14, train 2855.42
start: 10 average  2928.78, test 3019.45, train 2838.12
start: 11 average  2925.29, test 3044.38, train 2806.19
start: 12 average  2913.74, test 3056.60, train 2770.87
start: 13 average  2905.89, test 3070.63, train 2741.15
start: 14 average  2904.88, test 3047.90, train 2761.86
start: 15 average  2870.33, test 3058.99, train 2681.67
start: 16 average  2826.53, test 2933.59, train 2719.48
start: 17 average  2779.25, test 2864.62, train 2693.88
st

In [13]:
import cPickle
# Average the weight matrices from epoch 7 onwards, evaluate the average and save it.
Wens = np.array(W[7:]).mean(axis =0)
test(Wens)
with open('weights_reg1_ens_p1.pkl', 'wb') as f:
    cPickle.dump(Wens, f)

average  2941.56, test 2938.00, train 2945.12


# Part 2. Policy improvement: separate regressions by the sign of state[35]

In [6]:
%%cython

import sys
import time

import numpy as np
cimport numpy as np

import interface as bb
cimport interface as bb

from scipy.linalg.cython_blas cimport sgemm

cimport cython

# Scalars passed to sgemm in fast_target (BLAS computes C := alpha*A*B + beta*C).
cdef float alpha = 1.0, beta = 0.0
# s: one feature row, y: sgemm output (one value per action).
# w0 / wpos / wneg: weight matrices used by fast_target when
# round(state[35]*10) is zero / positive / negative.
cdef float[::1,:] s, y, w0, wpos, wneg

# Indices of state components whose squares are appended as extra features.
f_sq = [1,2]
# Indices of state components that are multiplied by state[35] as extra features.
f_intr = [1,4,11,12,13]
cdef: 
    int n_sq = len(f_sq)
    int n_intr = len(f_intr)

# Feature layout: 36 raw state values, n_sq squares, n_intr interactions, 1 constant term.
s = np.empty((1,37 + n_sq + n_intr), np.float32, order="F")
w0 = np.empty((37 + n_sq + n_intr, 4), np.float32, order="F")
wpos = np.empty((37 + n_sq + n_intr, 4), np.float32, order="F")
wneg = np.empty((37 + n_sq + n_intr, 4), np.float32, order="F")

y = np.empty((1,4), np.float32, order="F")

# NUM_CACHE: number of rows in the state/output cache used by fast_target.
# NUM_ROLLOUT: crollout takes one forced action plus up to NUM_ROLLOUT-1 policy steps.
cdef int NUM_CACHE = 51, NUM_ROLLOUT = 50
# cache_i: ring position, cache_n: number of rows counted as filled.
cdef int cache_i = 0, cache_n = 0
cdef float[::1,:] cache_s, cache_y

# cache_s holds the 36 raw state values, cache_y the 4 outputs computed for them.
cache_s = np.empty((NUM_CACHE,36), np.float32, order="F")
cache_y = np.empty((NUM_CACHE,4), np.float32, order="F")


# Compute the 4 per-action outputs for `state` and leave them in the
# module-level buffer `y`.  The weight matrix is chosen from w0 / wpos / wneg
# according to the sign of round(state[35]*10).
#
# state     -- pointer to at least 36 floats (the raw state vector).
# use_cache -- when 1, first look for an exact match of the 36 raw values in
#              the ring cache and reuse the stored outputs; on a miss the new
#              state and its outputs are stored in the ring.
#
# The ring is filled from row 0 upwards so that rows 0..cache_n-1 are exactly
# the rows the lookup scans.  (Previously the write position was advanced
# before the write, so the lookup scanned the never-written row 0 and skipped
# the most recent entry until the ring had wrapped once.)
@cython.boundscheck(False)
cdef void fast_target(float *state, int use_cache = 0):
    global cache_i, cache_n
    cdef int i, c, m, n, k, lda, ldb, ldc
    cdef int slot = -1  # cache row to fill after the product; -1 means "do not cache"
    cdef float s35_sign

    if use_cache == 1:
        # Linear scan over the filled rows for an exact match on all 36 values.
        c = 0
        while c < cache_n:
            i = 0
            while i < 36:
                if cache_s[c,i] != state[i]:
                    break
                i += 1
            if i == 36:
                for i in xrange(4):
                    y[0,i] = cache_y[c,i]
                return
            c += 1

        # Miss: claim the current ring position, then advance it.
        slot = cache_i
        cache_i += 1
        if cache_i == NUM_CACHE:
            cache_i = 0
        if cache_n < NUM_CACHE:
            cache_n += 1
        for i in xrange(36):
            cache_s[slot,i] = state[i]

    # Build the feature row: raw values, squares, interactions with state[35], constant 1.
    for i in xrange(36):
        s[0,i] = state[i]
    for i in xrange(n_sq):
        s[0,i+36] = state[f_sq[i]]**2
    for i in xrange(n_intr):
        s[0,i+36 + n_sq] = state[f_intr[i]]*state[35]
    s[0,36 + n_sq + n_intr] = 1.

    # y (1 x 4) = s (1 x k) * W (k x 4); all buffers are Fortran-ordered.
    lda = 1
    ldb = 37 + n_sq + n_intr
    ldc = 1
    m = 1
    n = 4
    k = 37 + n_sq + n_intr

    s35_sign = round(state[35]*10)
    if s35_sign == 0:
        sgemm("N", "N", &m, &n, &k, &alpha, &s[0,0], &lda, &w0[0,0], &ldb, &beta, &y[0,0], &ldc)
    elif s35_sign > 0:
        sgemm("N", "N", &m, &n, &k, &alpha, &s[0,0], &lda, &wpos[0,0], &ldb, &beta, &y[0,0], &ldc)
    elif  s35_sign < 0:
        sgemm("N", "N", &m, &n, &k, &alpha, &s[0,0], &lda, &wneg[0,0], &ldb, &beta, &y[0,0], &ldc)

    if slot >= 0:
        for i in xrange(4):
            cache_y[slot,i] = y[0,i]
    

# Return the index (0..3) of the action with the largest output for `state`,
# or -1 if no output exceeds -1e9.  `use_cache` is forwarded to fast_target.
@cython.boundscheck(False)
cdef int fast_action(float *state, int use_cache = 0):
    cdef int i, best_act = -1
    # Typed as a C float: an untyped `cdef` declares a Python object, which
    # forces boxing and Python-level comparison inside the loop.
    cdef float best_val = -1e9
    fast_target(state, use_cache)
    for i in xrange(4):
        if y[0,i] > best_val:
            best_val = y[0,i]
            best_act = i
    return best_act


# Return the largest of the 4 outputs for `state` (floor -1e9), always
# evaluating through the cache.
@cython.boundscheck(False)
cdef float fast_value(float *state):
    cdef int i
    # Typed as a C float: an untyped `cdef` declares a Python object.
    cdef float best_val = -1e9
    fast_target(state, 1)
    for i in xrange(4):
        if y[0,i] > best_val:
            best_val = y[0,i]
    return best_val

@cython.boundscheck(False)
def dump_weights(weights):
    """Copy the matrices in the dict `weights` into the module-level buffers.

    Key 0 goes to w0, key -1 to wneg and key 1 to wpos; entries under any
    other key are ignored.  Each matrix must be indexable as mat[row, col]
    for row < 37 + n_sq + n_intr and col < 4.
    """
    cdef int col, row
    cdef int n_rows = 37 + n_sq + n_intr
    cdef float[::1,:] dst
    for key, mat in weights.iteritems():
        # Pick the destination buffer once per entry.
        if key == 0:
            dst = w0
        elif key == -1:
            dst = wneg
        elif key == 1:
            dst = wpos
        else:
            continue
        for col in xrange(4):
            for row in xrange(n_rows):
                dst[row,col] = mat[row,col]


def prepare_bbox(level='train', verbose=0):
    """Empty the state cache and (re)load the level file for `level`.

    The file read is '../levels/<level>_level.data'.  If a level is already
    loaded it is reset first.
    """
    global cache_i, cache_n
    cache_i = cache_n = 0
    if bb.is_level_loaded():
        bb.reset_level()
    level_path = '../levels/' + level + '_level.data'
    bb.load_level(level_path, verbose)


# Output buffers written by crollout and copied out by rollout:
# _rewards[a] is the accumulated return for first action a,
# _mask[a] is 1 when that action was evaluated and 0 otherwise.
cdef float _rewards[4]
cdef float _mask[4]

# Estimate a return for each of the 4 possible first actions from the current
# environment state and write the results into _rewards / _mask.
#
# epoch      -- 0: follow-up steps always use action 3 and no value estimate is
#               added; > 0: follow-up steps use fast_action and, if the episode
#               is still running after the rollout, fast_value is added.
# curriculum -- probability of evaluating an action that left state[35]
#               unchanged (actions that change state[35] are always evaluated).
#
# The environment is restored to the starting checkpoint after every action.
@cython.boundscheck(False)
cdef crollout(int epoch=0, float curriculum=0.7):
    cdef:
        # NOTE(review): has_change, next_state35_abs and prev_state35_abs are
        # declared but never used in this function.
        int i, a, action, has_next, checkpoint_id, has_change
        float r, prev_score, init_state35, next_state35, next_state35_abs, prev_state35_abs
        float *state
    
    init_state35 = bb.c_get_state()[35]
    checkpoint_id = bb.create_checkpoint()
   
    for a in xrange(4):
        
        # Default: action not evaluated.
        _rewards[a] = 0
        _mask[a] = 0
        
        prev_score = bb.c_get_score()
        has_next = bb.c_do_action(a)
        state = bb.c_get_state()
        next_state35 = state[35]  
        
        # Evaluate when the action changed state[35], otherwise only with
        # probability `curriculum`.
        if init_state35 != next_state35 or np.random.rand() < curriculum:
                        
            # r accumulates the score differences step by step.
            r = bb.c_get_score() - prev_score
            prev_score = bb.c_get_score()
            
            if has_next == 1:
                for i in xrange(NUM_ROLLOUT-1):
                    if epoch > 0:
                        action = fast_action(state, 1)
                    else:
                        action = 3

                    has_next = bb.c_do_action(action)
                    r += bb.c_get_score() - prev_score
                    state = bb.c_get_state()
                    prev_score = bb.c_get_score()
                    if has_next == 0:
                        break
                
                # Rollout was cut off by the step limit: add the current
                # estimate for the state that was reached.
                if has_next == 1 and epoch > 0:
                    r += fast_value(state)

            _rewards[a] = r
            _mask[a] = 1
        
        # Rewind so the next action starts from the same state.
        bb.load_from_checkpoint(checkpoint_id)
    bb.clear_all_checkpoints()


@cython.boundscheck(False)
def rollout(epoch=0, curriculum=0.7):
    """Run crollout and return its results as two float32 arrays of length 4.

    Returns (rewards, mask), copied from the module-level _rewards and _mask.
    """
    cdef int idx
    crollout(epoch, curriculum)
    out_rewards = np.empty(4, dtype=np.float32)
    out_mask = np.empty(4, dtype=np.float32)
    for idx in range(4):
        out_rewards[idx] = _rewards[idx]
        out_mask[idx] = _mask[idx]
    return out_rewards, out_mask


def solve_lsq(X, y, lmd = 1):
    """Solve the (optionally ridge-regularised) normal equations for X w = y.

    Solves (X^T X + lmd * D) w = X^T y, where D is the identity with its last
    diagonal entry set to 0, so the coefficient of the last column of X is not
    penalised (callers in this file put the constant column last).  With
    lmd <= 0 the plain normal equations X^T X w = X^T y are solved.

    The system is solved with np.linalg.solve rather than by forming the
    explicit inverse, which is more accurate and cheaper.

    Raises numpy.linalg.LinAlgError if the system matrix is singular.
    """
    gram = X.T.dot(X)
    rhs = X.T.dot(y)
    if lmd > 0:
        penalty = np.eye(gram.shape[0])
        penalty[-1, -1] = 0  # leave the last coefficient unregularised
        gram = gram + lmd * penalty
    return np.linalg.solve(gram, rhs)

def train_epoch(X, Y, M):
    """Fit separate per-action regressions for each sign bucket of column 35.

    Rows are bucketed by np.round(X[:, 35]*10): negative (key -1), zero
    (key 0) or positive (key 1).  Within a bucket, action i is fitted on the
    rows where M[:, i] is true, with column i of Y as target.

    Returns a dict {-1, 0, 1} -> float32 matrix with one column per action.

    NOTE(review): fast_target buckets with round() while this uses np.round;
    the two may treat exact .5 ties differently -- confirm whether state[35]
    can take such values.
    """
    bucket = np.round(X[:, 35]*10)
    selectors = {-1: bucket < 0, 0: bucket == 0, 1: bucket > 0}

    weights = {}
    for k in (-1, 0, 1):
        in_bucket = selectors[k]
        per_action = []
        for i in xrange(4):
            rows = M[:, i] & in_bucket
            per_action.append(solve_lsq(X[rows], Y[rows, i]))
        weights[k] = np.array(per_action).T.astype(np.float32)
    return weights

@cython.boundscheck(False)
def policy_iteration(n_epochs=20, weights = None):
    """Alternate between collecting rollout data and refitting the regressions.

    Each epoch collects rollout data on the 'train' level and then on the
    'test' level, fits new weights with train_epoch, evaluates them with
    test() and loads them for the next epoch.

    If `weights` is given it is loaded first and the epochs are numbered
    1..n_epochs, so the policy is greedy from the start; otherwise the epochs
    are numbered 0..n_epochs-1 and epoch 0 drives the environment with random
    actions.

    Returns the list of fitted weight dicts, one per epoch.
    """
    cdef int epoch, action

    s_epoch = 0
    if weights is not None:
        dump_weights(weights)
        s_epoch = 1
        n_epochs +=1

    X = []
    Y = []
    M = []

    weights_out = []
    for epoch in range(s_epoch, n_epochs):
        start = time.time()

        # The same collection procedure runs on both levels, in this order.
        for level in ('train', 'test'):
            prepare_bbox(level)
            while True:
                # rollout() restores the environment, so the state read below
                # is the one the rollouts started from.
                rewards, mask = rollout(epoch, 1)
                state = bb.get_state().copy()
                X.append(state)
                Y.append(rewards)
                M.append(mask)

                if epoch > 0:
                    action = fast_action(bb.c_get_state(), 1)
                else:
                    action = np.random.randint(4)

                if bb.c_do_action(action) == 0:
                    bb.finish(verbose=0)
                    break

        Xa = np.array(X).astype(np.float32)
        Ya = np.array(Y).astype(np.float32)
        # Builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
        # later removed.
        Ma = np.array(M).astype(bool)

        # Same feature expansion as fast_target: squares, interactions with
        # column 35, constant column last.
        Xa_sq = Xa[:,f_sq]**2
        Xa_intr = Xa[:, f_intr] * Xa[:, 35].reshape(-1,1)
        bias = np.ones((Xa.shape[0], 1), dtype=np.float32)
        Xa = np.concatenate([Xa, Xa_sq, Xa_intr, bias], axis = 1).astype(np.float32)

        del X[:]
        del Y[:]
        del M[:]

        weights = train_epoch(Xa, Ya, Ma)
        weights_out.append(weights)

        print 'Epoch: {}, time: {}'.format(epoch, int(time.time() - start)),
        test(weights)
        sys.stdout.flush()

        dump_weights(weights)

    return weights_out

def test(weights):
    """Load `weights`, play the 'train' and 'test' levels with the greedy
    policy (cache disabled), print the scores and return [train, test].
    """
    cdef:
        int action, has_next

    dump_weights(weights)
    scores = []
    for level_name in ('train', 'test'):
        prepare_bbox(level_name)
        has_next = 1
        while has_next != 0:
            action = fast_action(bb.c_get_state(), 0)
            has_next = bb.c_do_action(action)
        scores.append(bb.finish(verbose=0))
    print 'average  {:.2f}, test {:.2f}, train {:.2f}'.format(0.5*sum(scores), scores[1], scores[0])
    return scores

In [4]:
import cPickle
import numpy as np
# Load the averaged weight matrix written by the Part 1 cell that saves Wens.
with open('weights_reg1_ens_p1.pkl', 'rb') as f:
    Wens = cPickle.load(f)

In [7]:
# Start all three sign buckets from the same weight matrix.
Winit = {-1: Wens,
         0: Wens, 
         1: Wens}

# 5 epochs starting from Winit; W2 is the returned list of weight dicts.
W2 = policy_iteration(5, Winit)

Epoch: 1, time: 286 average  3117.89, test 3352.31, train 2883.46
Epoch: 2, time: 286 average  3269.66, test 3561.61, train 2977.70
Epoch: 3, time: 290 average  3232.97, test 3537.41, train 2928.52
Epoch: 4, time: 288 average  3240.33, test 3551.22, train 2929.44
Epoch: 5, time: 287 average  3221.94, test 3584.27, train 2859.62


In [15]:
# Average each bucket's weight matrix over all epochs in W2, then evaluate the result.
Wens2 = {-1: np.array([w[-1] for w in W2]).mean(axis =0),
          0: np.array([w[0] for w in W2]).mean(axis =0),
          1: np.array([w[1] for w in W2]).mean(axis =0)}
test(Wens2)

In [18]:
# Persist the averaged per-bucket weights.
with open('weights_reg1.pkl', 'wb') as f:
    cPickle.dump(Wens2,f)