# Part 1. Initial regression training

In [1]:
%load_ext Cython

In [2]:
%%cython

import sys
import time

import numpy as np
cimport numpy as np

import interface as bb
cimport interface as bb

from scipy.linalg.cython_blas cimport sgemm

cimport cython

# Scalars handed to BLAS sgemm (C = alpha * A * B + beta * C); with these
# values the call simply overwrites y with s * w.
cdef float alpha = 1.0, beta = 0.0
# s: one feature row, w: weight matrix (one column per action),
# y: the 4 per-action predictions. All Fortran-ordered for BLAS.
cdef float[::1,:] s, y, w

# Indices of raw state components that are additionally fed in squared.
f_sq = [0,1,2, 9, 10]
# Indices of raw state components that are additionally multiplied by state[35].
f_intr = [4, 11, 12, 13, 15, 18, 20]
cdef: 
    int n_sq = len(f_sq)
    int n_intr = len(f_intr)

# Feature layout: 36 raw state values, n_sq squares, n_intr interactions, 1 bias.
s = np.empty((1,37 + n_sq + n_intr), np.float32, order="F")
w = np.empty((37 + n_sq + n_intr, 4), np.float32, order="F")
y = np.empty((1,4), np.float32, order="F")

# NUM_CACHE: capacity of the prediction cache used by fast_target.
# NUM_ROLLOUT: total number of steps simulated per action in crollout.
cdef int NUM_CACHE = 51, NUM_ROLLOUT = 50
# cache_i: write cursor into the cache, cache_n: number of slots scanned on lookup.
cdef int cache_i = 0, cache_n = 0
# cache_s: cached raw states (36 values each), cache_y: their 4 predictions.
cdef float[::1,:] cache_s, cache_y

# np.empty leaves the contents uninitialised; validity is tracked via cache_n.
cache_s = np.empty((NUM_CACHE,36), np.float32, order="F")
cache_y = np.empty((NUM_CACHE,4), np.float32, order="F")


@cython.boundscheck(False)
cdef void fast_target(float *state, int use_cache = 0):
    # Compute the 4 per-action linear predictions for `state` and leave them
    # in the module-level buffer `y` (y = features(state) * w).
    #
    # state     -- pointer to at least 36 floats (indices 0..35 are read).
    # use_cache -- when 1, look the raw state up in a small ring buffer first
    #              and store the fresh prediction there on a miss.
    global cache_i, cache_n
    cdef int i, c, m, n, k, lda, ldb, ldc
    cdef int slot = 0

    if use_cache == 1:
        # Linear scan over the filled slots, comparing all 36 raw values.
        c = 0
        while c < cache_n:
            i = 0
            while i < 36:
                if cache_s[c,i] != state[i]:
                    break
                i += 1
            if i == 36:
                for i in xrange(4):
                    y[0,i] = cache_y[c,i]
                return
            c += 1

        # Miss: write into the slot the cursor points at and only then advance
        # it.  Advancing first would fill slots 1..cache_n while the scan
        # above covers 0..cache_n-1, hiding the newest entry and exposing a
        # stale/uninitialised slot 0.
        slot = cache_i
        cache_i += 1
        if cache_i == NUM_CACHE:
            cache_i = 0
        if cache_n < NUM_CACHE:
            cache_n += 1
        for i in xrange(36):
            cache_s[slot,i] = state[i]

    # Feature row: raw state, selected squares, selected products with
    # state[35], and a trailing bias term.
    for i in xrange(36):
        s[0,i] = state[i]
    for i in xrange(n_sq):
        s[0,i+36] = state[f_sq[i]]**2
    for i in xrange(n_intr):
        s[0,i+36 + n_sq] = state[f_intr[i]]*state[35]
    s[0,36 + n_sq + n_intr] = 1.

    # (1 x k) * (k x 4) -> (1 x 4), column-major leading dimensions.
    lda = 1
    ldb = 37 + n_sq + n_intr
    ldc = 1
    m = 1
    n = 4
    k = 37 + n_sq + n_intr
    sgemm("N", "N", &m, &n, &k, &alpha, &s[0,0], &lda, &w[0,0], &ldb, &beta, &y[0,0], &ldc)

    if use_cache == 1:
        for i in xrange(4):
            cache_y[slot,i] = y[0,i]
    

@cython.boundscheck(False)
cdef int fast_action(float *state, int use_cache = 0):
    # Return the index (0..3) of the action with the largest prediction for
    # `state`, or -1 if no prediction exceeds -1e9.
    # `use_cache` is forwarded unchanged to fast_target.
    cdef int i, best_act = -1
    # Typed as a C float: an untyped `cdef` name is a Python object and would
    # box every comparison in this loop.
    cdef float best_val = -1e9
    fast_target(state, use_cache)
    for i in xrange(4):
        if y[0,i] > best_val:
            best_val = y[0,i]
            best_act = i
    return best_act


@cython.boundscheck(False)
cdef float fast_value(float *state):
    # Return the largest of the 4 per-action predictions for `state`
    # (always evaluated through the cache), or -1e9 if none exceeds it.
    cdef int i
    # Typed as a C float: an untyped `cdef` name is a Python object and would
    # box every comparison in this loop.
    cdef float best_val = -1e9
    fast_target(state, 1)
    for i in xrange(4):
        if y[0,i] > best_val:
            best_val = y[0,i]
    return best_val

@cython.boundscheck(False)
def dump_weights(weights):
    """Copy `weights` element by element into the module-level buffer `w`.

    `weights` must be indexable as weights[row, col] for every feature row
    and each of the 4 action columns.
    """
    cdef int row, col
    cdef int n_rows = 37 + n_sq + n_intr
    for col in xrange(4):
        for row in xrange(n_rows):
            w[row,col] = weights[row,col]


def prepare_bbox(level='train', verbose=0):
    """Reset the prediction-cache counters and (re)load the named level.

    The level file is '../levels/<level>_level.data'; an already loaded
    level is reset before loading.
    """
    global cache_i, cache_n
    cache_i = cache_n = 0
    path = ''.join(('../levels/', level, '_level.data'))
    if bb.is_level_loaded():
        bb.reset_level()
    bb.load_level(path, verbose)


# Output buffers filled by crollout and copied out by rollout:
# _rewards[a] is the rollout return for action a, _mask[a] is 1 when that
# return was actually computed and 0 otherwise.
cdef float _rewards[4]
cdef float _mask[4]

@cython.boundscheck(False)
cdef crollout(int epoch=0, float curriculum=0.7):
    # For each of the 4 actions, simulate it from the current bbox state and
    # record the accumulated score change in _rewards[a] / _mask[a].
    #
    # epoch      -- 0: continue every rollout with the fixed action 3;
    #               >0: continue greedily via fast_action and add the
    #               fast_value estimate of the final state.
    # curriculum -- probability of still evaluating an action whose first
    #               step left state[35] unchanged.
    #
    # The bbox is restored from a checkpoint after every action, so the
    # caller's state is unchanged on return.
    cdef:
        int i, a, action, has_next, checkpoint_id
        float r, prev_score, init_state35, next_state35
        float *state
    
    init_state35 = bb.c_get_state()[35]
    checkpoint_id = bb.create_checkpoint()
   
    for a in xrange(4):
        
        _rewards[a] = 0
        _mask[a] = 0
        
        prev_score = bb.c_get_score()
        has_next = bb.c_do_action(a)
        state = bb.c_get_state()
        next_state35 = state[35]  
        
        # The random draw only happens when state[35] did not change
        # (short-circuit `or`).
        if init_state35 != next_state35 or np.random.rand() < curriculum:
                        
            r = bb.c_get_score() - prev_score
            prev_score = bb.c_get_score()
            
            if has_next == 1:
                # The first step is already done, hence NUM_ROLLOUT-1 more.
                for i in xrange(NUM_ROLLOUT-1):
                    if epoch > 0:
                        action = fast_action(state, 1)
                    else:
                        action = 3

                    has_next = bb.c_do_action(action)
                    r += bb.c_get_score() - prev_score
                    state = bb.c_get_state()
                    prev_score = bb.c_get_score()
                    if has_next == 0:
                        break
                
                # Bootstrap with the model's value of the last state when the
                # rollout was cut off by the step limit rather than level end.
                if has_next == 1 and epoch > 0:
                    r += fast_value(state)

            _rewards[a] = r
            _mask[a] = 1
        
        bb.load_from_checkpoint(checkpoint_id)
    bb.clear_all_checkpoints()


@cython.boundscheck(False)
def rollout(epoch=0, curriculum=0.7):
    """Run crollout and return its results as two float32 arrays of length 4.

    Returns (rewards, mask), copied from the module-level _rewards / _mask
    buffers.
    """
    cdef int a
    crollout(epoch, curriculum)

    rewards = np.empty(4, dtype=np.float32)
    for a in xrange(4):
        rewards[a] = _rewards[a]

    mask = np.empty(4, dtype=np.float32)
    for a in xrange(4):
        mask[a] = _mask[a]

    return rewards, mask


def solve_lsq(X, y, lmd = 1):
    """Solve the (optionally ridge-regularised) normal equations for X w = y.

    X   -- 2-D design matrix (rows are samples).
    y   -- target vector (or matrix) with one row per sample.
    lmd -- ridge strength; when > 0, lmd is added to the diagonal of X^T X
           except for the last entry, so the last column of X is not
           penalised.  When <= 0 plain least squares is solved.

    Returns the weight vector w.  Raises numpy.linalg.LinAlgError when the
    system matrix is singular.
    """
    gram = X.T.dot(X)
    if lmd > 0:
        penalty = np.eye(gram.shape[0])
        penalty[-1, -1] = 0
        gram = gram + lmd * penalty
    # Solve the linear system directly instead of forming an explicit
    # inverse: cheaper and numerically more stable.
    return np.linalg.solve(gram, X.T.dot(y))

def train_epoch(X, Y, M):
    """Fit one regression per action and return them as a float32 matrix.

    For action a only the rows where M[:, a] is set are used; the result has
    one column per action (4 columns).
    """
    per_action = []
    for a in xrange(4):
        rows = M[:, a]
        per_action.append(solve_lsq(X[rows], Y[rows, a]))
    return np.array(per_action).T.astype(np.float32)

@cython.boundscheck(False)
def policy_iteration(n_epochs=20):
    """Alternate data collection on the 'train' level with regression fits.

    Each epoch plays the whole 'train' level; at every step it stores the
    raw state, the per-action rollout returns and their mask, then fits new
    weights with train_epoch, evaluates them with test() and installs them.
    In epoch 0 the played action is uniformly random; afterwards it is the
    greedy action under the current weights.

    Returns the list of weight matrices, one per epoch.
    """
    cdef int epoch, action
    
    X = []
    Y = []
    M = []
    
    weights_out = []
    for epoch in range(n_epochs):
        start = time.time()
        prepare_bbox('train')
        while True:
            rewards, mask = rollout(epoch)
            state = bb.get_state().copy()
            X.append(state)
            Y.append(rewards)
            M.append(mask)
            
            if epoch > 0:
                action = fast_action(bb.c_get_state(), 1)
            else:
                action = np.random.randint(4)

            if bb.c_do_action(action) == 0:
                bb.finish(verbose=0)
                break
            
        Xa = np.array(X).astype(np.float32)
        Ya = np.array(Y).astype(np.float32)
        # Builtin bool: the np.bool alias no longer exists in recent NumPy.
        Ma = np.array(M).astype(bool)
        
        # Same feature layout as fast_target: raw, squares, interactions, bias.
        Xa_sq = Xa[:,f_sq]**2
        Xa_intr = Xa[:, f_intr] * Xa[:, 35].reshape(-1,1)
        bias = np.ones((Xa.shape[0], 1), dtype=np.float32)
        Xa = np.concatenate([Xa, Xa_sq, Xa_intr, bias], axis = 1).astype(np.float32)
        
        # Each epoch trains only on the data it collected itself.
        del X[:]
        del Y[:]
        del M[:]
        
        weights = train_epoch(Xa, Ya, Ma)
        weights_out.append(weights)
        
        print 'Epoch: {}, time: {}'.format(epoch, int(time.time() - start)),
        test(weights)
        sys.stdout.flush()
       
        dump_weights(weights)
    
    return weights_out

def test(weights):
    """Install `weights`, play 'train' and 'test' greedily and report scores.

    Prints the mean and both individual results and returns them as
    [train_result, test_result] (the values returned by bb.finish).
    """
    cdef:
        int action, has_next
    
    dump_weights(weights)
    results = []
    for lvl in ('train', 'test'):
        prepare_bbox(lvl)
        while True:
            # Evaluation bypasses the prediction cache.
            action = fast_action(bb.c_get_state(), 0)
            has_next = bb.c_do_action(action)
            if not has_next:
                break
        results.append(bb.finish(verbose=0))
    train_res = results[0]
    test_res = results[1]
    print 'average  {:.2f}, test {:.2f}, train {:.2f}'.format(0.5*sum(results), test_res, train_res)
    return results

In [3]:
# Run 20 epochs; W is the list of per-epoch weight matrices.
W = policy_iteration(20)

Epoch: 0, time: 26 average  -3476.55, test -3243.94, train -3709.15
Epoch: 1, time: 75 average  -986.11, test -1055.43, train -916.79
Epoch: 2, time: 74 average  -218.22, test -236.44, train -199.99
Epoch: 3, time: 75 average  992.28, test 958.10, train 1026.47
Epoch: 4, time: 75 average  1638.00, test 1470.77, train 1805.24
Epoch: 5, time: 75 average  1774.21, test 1815.31, train 1733.11
Epoch: 6, time: 75 average  1516.39, test 1283.21, train 1749.56
Epoch: 7, time: 75 average  2421.10, test 2415.18, train 2427.02
Epoch: 8, time: 75 average  2337.21, test 2376.96, train 2297.46
Epoch: 9, time: 76 average  2342.52, test 2296.61, train 2388.43
Epoch: 10, time: 75 average  2623.64, test 2695.70, train 2551.58
Epoch: 11, time: 74 average  2075.61, test 2057.62, train 2093.59
Epoch: 12, time: 75 average  1904.54, test 1736.04, train 2073.04
Epoch: 13, time: 74 average  2165.65, test 2085.42, train 2245.88
Epoch: 14, time: 75 average  2356.36, test 2295.65, train 2417.06
Epoch: 15, time: 7

In [4]:
# Persist the full list of per-epoch weights.
import cPickle
with open('list_weights_reg3.pkl', 'wb') as f:
    cPickle.dump(W, f)

In [5]:
# Evaluate the element-wise mean of the weights from epoch s_i onward,
# for every possible starting epoch.
for s_i in xrange(len(W)):
    print 'start i', s_i,
    test(np.array(W[s_i:]).mean(axis =0))

start i 0 average  2855.46, test 2875.96, train 2834.95
start i 1 average  2857.07, test 2914.41, train 2799.73
start i 2 average  2836.04, test 2882.52, train 2789.57
start i 3 average  2827.85, test 2894.01, train 2761.70
start i 4 average  2854.52, test 2911.03, train 2798.02
start i 5 average  2815.57, test 2881.23, train 2749.91
start i 6 average  2789.23, test 2842.81, train 2735.64
start i 7 average  2789.11, test 2848.41, train 2729.82
start i 8 average  2786.79, test 2818.85, train 2754.74
start i 9 average  2779.11, test 2774.93, train 2783.30
start i 10 average  2810.77, test 2783.60, train 2837.93
start i 11 average  2742.15, test 2658.12, train 2826.18
start i 12 average  2812.71, test 2722.24, train 2903.18
start i 13 average  2740.31, test 2610.78, train 2869.83
start i 14 average  2692.51, test 2505.62, train 2879.39
start i 15 average  2668.51, test 2524.30, train 2812.71
start i 16 average  2543.32, test 2445.99, train 2640.66
start i 17 average  2578.40, test 2480.21

In [6]:
import cPickle
# Ensemble = mean of the weights from epoch 4 onward; evaluate and save it.
Wens = np.array(W[4:]).mean(axis =0)
test(Wens)
with open('weights_reg3_ens_p1.pkl', 'wb') as f:
    cPickle.dump(Wens, f)

average  2854.52, test 2911.03, train 2798.02


# Part 2. Policy improvement: separate regressions by the sign of state[35]

In [2]:
%%cython

import sys
import time

import numpy as np
cimport numpy as np

import interface as bb
cimport interface as bb

from scipy.linalg.cython_blas cimport sgemm

cimport cython

# Scalars handed to BLAS sgemm (C = alpha * A * B + beta * C); with these
# values the call simply overwrites y with s * w.
cdef float alpha = 1.0, beta = 0.0
# s: one feature row, y: the 4 per-action predictions.
# w0 / wpos / wneg: weight matrices used when round(state[35]*10) is
# zero / positive / negative (see fast_target).
cdef float[::1,:] s, y, w0, wpos, wneg

# Indices of raw state components that are additionally fed in squared.
f_sq = [0,1,2, 9, 10]
# Indices of raw state components that are additionally multiplied by state[35].
f_intr = [4, 11, 12, 13, 15, 18, 20]
cdef: 
    int n_sq = len(f_sq)
    int n_intr = len(f_intr)

# Feature layout: 36 raw state values, n_sq squares, n_intr interactions, 1 bias.
s = np.empty((1,37 + n_sq + n_intr), np.float32, order="F")
w0 = np.empty((37 + n_sq + n_intr, 4), np.float32, order="F")
wpos = np.empty((37 + n_sq + n_intr, 4), np.float32, order="F")
wneg = np.empty((37 + n_sq + n_intr, 4), np.float32, order="F")

y = np.empty((1,4), np.float32, order="F")

# NUM_CACHE: capacity of the prediction cache used by fast_target.
# NUM_ROLLOUT: total number of steps simulated per action in crollout.
cdef int NUM_CACHE = 51, NUM_ROLLOUT = 50
# cache_i: write cursor into the cache, cache_n: number of slots scanned on lookup.
cdef int cache_i = 0, cache_n = 0
# cache_s: cached raw states (36 values each), cache_y: their 4 predictions.
cdef float[::1,:] cache_s, cache_y

# np.empty leaves the contents uninitialised; validity is tracked via cache_n.
cache_s = np.empty((NUM_CACHE,36), np.float32, order="F")
cache_y = np.empty((NUM_CACHE,4), np.float32, order="F")


@cython.boundscheck(False)
cdef void fast_target(float *state, int use_cache = 0):
    # Compute the 4 per-action linear predictions for `state` and leave them
    # in the module-level buffer `y`.  The weight matrix is w0, wpos or wneg
    # depending on whether round(state[35]*10) is zero, positive or negative.
    #
    # state     -- pointer to at least 36 floats (indices 0..35 are read).
    # use_cache -- when 1, look the raw state up in a small ring buffer first
    #              and store the fresh prediction there on a miss.
    global cache_i, cache_n
    cdef int i, c, m, n, k, lda, ldb, ldc
    cdef int slot = 0
    cdef float s35_sign

    if use_cache == 1:
        # Linear scan over the filled slots, comparing all 36 raw values.
        c = 0
        while c < cache_n:
            i = 0
            while i < 36:
                if cache_s[c,i] != state[i]:
                    break
                i += 1
            if i == 36:
                for i in xrange(4):
                    y[0,i] = cache_y[c,i]
                return
            c += 1

        # Miss: write into the slot the cursor points at and only then advance
        # it.  Advancing first would fill slots 1..cache_n while the scan
        # above covers 0..cache_n-1, hiding the newest entry and exposing a
        # stale/uninitialised slot 0.
        slot = cache_i
        cache_i += 1
        if cache_i == NUM_CACHE:
            cache_i = 0
        if cache_n < NUM_CACHE:
            cache_n += 1
        for i in xrange(36):
            cache_s[slot,i] = state[i]

    # Feature row: raw state, selected squares, selected products with
    # state[35], and a trailing bias term.
    for i in xrange(36):
        s[0,i] = state[i]
    for i in xrange(n_sq):
        s[0,i+36] = state[f_sq[i]]**2
    for i in xrange(n_intr):
        s[0,i+36 + n_sq] = state[f_intr[i]]*state[35]
    s[0,36 + n_sq + n_intr] = 1.

    # (1 x k) * (k x 4) -> (1 x 4), column-major leading dimensions.
    lda = 1
    ldb = 37 + n_sq + n_intr
    ldc = 1
    m = 1
    n = 4
    k = 37 + n_sq + n_intr

    # NOTE(review): train_epoch buckets samples with np.round; confirm that
    # `round` here agrees with it for values exactly halfway between integers.
    s35_sign = round(state[35]*10)
    if s35_sign == 0:
        sgemm("N", "N", &m, &n, &k, &alpha, &s[0,0], &lda, &w0[0,0], &ldb, &beta, &y[0,0], &ldc)
    elif s35_sign > 0:
        sgemm("N", "N", &m, &n, &k, &alpha, &s[0,0], &lda, &wpos[0,0], &ldb, &beta, &y[0,0], &ldc)
    elif  s35_sign < 0:
        sgemm("N", "N", &m, &n, &k, &alpha, &s[0,0], &lda, &wneg[0,0], &ldb, &beta, &y[0,0], &ldc)

    if use_cache == 1:
        for i in xrange(4):
            cache_y[slot,i] = y[0,i]
    

@cython.boundscheck(False)
cdef int fast_action(float *state, int use_cache = 0):
    # Return the index (0..3) of the action with the largest prediction for
    # `state`, or -1 if no prediction exceeds -1e9.
    # `use_cache` is forwarded unchanged to fast_target.
    cdef int i, best_act = -1
    # Typed as a C float: an untyped `cdef` name is a Python object and would
    # box every comparison in this loop.
    cdef float best_val = -1e9
    fast_target(state, use_cache)
    for i in xrange(4):
        if y[0,i] > best_val:
            best_val = y[0,i]
            best_act = i
    return best_act


@cython.boundscheck(False)
cdef float fast_value(float *state):
    # Return the largest of the 4 per-action predictions for `state`
    # (always evaluated through the cache), or -1e9 if none exceeds it.
    cdef int i
    # Typed as a C float: an untyped `cdef` name is a Python object and would
    # box every comparison in this loop.
    cdef float best_val = -1e9
    fast_target(state, 1)
    for i in xrange(4):
        if y[0,i] > best_val:
            best_val = y[0,i]
    return best_val

@cython.boundscheck(False)
def dump_weights(weights):
    """Copy a {-1, 0, 1} -> matrix mapping into wneg, w0 and wpos.

    Each matrix must be indexable as mat[row, col] for every feature row and
    each of the 4 action columns.  Entries under any other key are ignored.
    """
    cdef int row, col
    cdef float[::1,:] dst
    for key, mat in weights.iteritems():
        # Pick the destination buffer once per key; the assignment makes
        # `dst` refer to the same memory as the module-level buffer.
        if key == 0:
            dst = w0
        elif key == -1:
            dst = wneg
        elif key == 1:
            dst = wpos
        else:
            continue
        for col in xrange(4):
            for row in xrange(37 + n_sq + n_intr):
                dst[row,col] = mat[row,col]


def prepare_bbox(level='train', verbose=0):
    """Reset the prediction-cache counters and (re)load the named level.

    The level file is '../levels/<level>_level.data'; an already loaded
    level is reset before loading.
    """
    global cache_i, cache_n
    cache_i = cache_n = 0
    path = ''.join(('../levels/', level, '_level.data'))
    if bb.is_level_loaded():
        bb.reset_level()
    bb.load_level(path, verbose)


# Output buffers filled by crollout and copied out by rollout:
# _rewards[a] is the rollout return for action a, _mask[a] is 1 when that
# return was actually computed and 0 otherwise.
cdef float _rewards[4]
cdef float _mask[4]

@cython.boundscheck(False)
cdef crollout(int epoch=0, float curriculum=0.7):
    # For each of the 4 actions, simulate it from the current bbox state and
    # record the accumulated score change in _rewards[a] / _mask[a].
    #
    # epoch      -- 0: continue every rollout with the fixed action 3;
    #               >0: continue greedily via fast_action and add the
    #               fast_value estimate of the final state.
    # curriculum -- probability of still evaluating an action whose first
    #               step left state[35] unchanged.
    #
    # The bbox is restored from a checkpoint after every action, so the
    # caller's state is unchanged on return.
    cdef:
        int i, a, action, has_next, checkpoint_id
        float r, prev_score, init_state35, next_state35
        float *state
    
    init_state35 = bb.c_get_state()[35]
    checkpoint_id = bb.create_checkpoint()
   
    for a in xrange(4):
        
        _rewards[a] = 0
        _mask[a] = 0
        
        prev_score = bb.c_get_score()
        has_next = bb.c_do_action(a)
        state = bb.c_get_state()
        next_state35 = state[35]  
        
        # The random draw only happens when state[35] did not change
        # (short-circuit `or`).
        if init_state35 != next_state35 or np.random.rand() < curriculum:
                        
            r = bb.c_get_score() - prev_score
            prev_score = bb.c_get_score()
            
            if has_next == 1:
                # The first step is already done, hence NUM_ROLLOUT-1 more.
                for i in xrange(NUM_ROLLOUT-1):
                    if epoch > 0:
                        action = fast_action(state, 1)
                    else:
                        action = 3

                    has_next = bb.c_do_action(action)
                    r += bb.c_get_score() - prev_score
                    state = bb.c_get_state()
                    prev_score = bb.c_get_score()
                    if has_next == 0:
                        break
                
                # Bootstrap with the model's value of the last state when the
                # rollout was cut off by the step limit rather than level end.
                if has_next == 1 and epoch > 0:
                    r += fast_value(state)

            _rewards[a] = r
            _mask[a] = 1
        
        bb.load_from_checkpoint(checkpoint_id)
    bb.clear_all_checkpoints()


@cython.boundscheck(False)
def rollout(epoch=0, curriculum=0.7):
    """Run crollout and return its results as two float32 arrays of length 4.

    Returns (rewards, mask), copied from the module-level _rewards / _mask
    buffers.
    """
    cdef int a
    crollout(epoch, curriculum)

    rewards = np.empty(4, dtype=np.float32)
    for a in xrange(4):
        rewards[a] = _rewards[a]

    mask = np.empty(4, dtype=np.float32)
    for a in xrange(4):
        mask[a] = _mask[a]

    return rewards, mask


def solve_lsq(X, y, lmd = 1):
    """Solve the (optionally ridge-regularised) normal equations for X w = y.

    X   -- 2-D design matrix (rows are samples).
    y   -- target vector (or matrix) with one row per sample.
    lmd -- ridge strength; when > 0, lmd is added to the diagonal of X^T X
           except for the last entry, so the last column of X is not
           penalised.  When <= 0 plain least squares is solved.

    Returns the weight vector w.  Raises numpy.linalg.LinAlgError when the
    system matrix is singular.
    """
    gram = X.T.dot(X)
    if lmd > 0:
        penalty = np.eye(gram.shape[0])
        penalty[-1, -1] = 0
        gram = gram + lmd * penalty
    # Solve the linear system directly instead of forming an explicit
    # inverse: cheaper and numerically more stable.
    return np.linalg.solve(gram, X.T.dot(y))

def train_epoch(X, Y, M):
    """Fit separate per-action regressions for each bucket of X[:, 35].

    Samples are split by whether round(X[:, 35] * 10) is negative, zero or
    positive.  Returns a dict keyed -1 / 0 / 1 whose values are float32
    matrices with one column per action (4 columns).
    """
    bucket = np.round(X[:, 35]*10)
    selectors = (
        (-1, bucket < 0),
        (0, bucket == 0),
        (1, bucket > 0),
    )

    weights = {}
    for key, in_bucket in selectors:
        columns = []
        for a in xrange(4):
            rows = M[:, a] & in_bucket
            columns.append(solve_lsq(X[rows], Y[rows, a]))
        weights[key] = np.array(columns).T.astype(np.float32)
    return weights

@cython.boundscheck(False)
def _collect_episode(level, int epoch, X, Y, M):
    """Play `level` to its end, appending one (state, rewards, mask) row per step.

    In epoch 0 the played action is uniformly random; afterwards it is the
    greedy action under the currently installed weights.
    """
    cdef int action

    prepare_bbox(level)
    while True:
        rewards, mask = rollout(epoch, 1)
        state = bb.get_state().copy()
        X.append(state)
        Y.append(rewards)
        M.append(mask)

        if epoch > 0:
            action = fast_action(bb.c_get_state(), 1)
        else:
            action = np.random.randint(4)

        if bb.c_do_action(action) == 0:
            bb.finish(verbose=0)
            break


@cython.boundscheck(False)
def policy_iteration(n_epochs=20, weights = None, use_test = True):
    """Alternate data collection with bucketed regression fits.

    n_epochs -- number of epochs to run.
    weights  -- optional {-1, 0, 1} -> matrix dict to start from; when given
                it is installed first and epoch numbering starts at 1, so the
                greedy policy is used from the very first epoch.
    use_test -- when true, data is collected on the 'test' level as well as
                on 'train'.

    Returns the list of weight dicts, one per epoch.
    """
    cdef int epoch
    
    X = []
    Y = []
    M = []
    
    s_epoch = 0
    if weights is not None:
        dump_weights(weights)
        s_epoch = 1
        n_epochs +=1
    
    weights_out = []
    for epoch in range(s_epoch, n_epochs):
        start = time.time()
        
        _collect_episode('train', epoch, X, Y, M)
        if use_test:
            _collect_episode('test', epoch, X, Y, M)
            
        Xa = np.array(X).astype(np.float32)
        Ya = np.array(Y).astype(np.float32)
        # Builtin bool: the np.bool alias no longer exists in recent NumPy.
        Ma = np.array(M).astype(bool)
        
        # Same feature layout as fast_target: raw, squares, interactions, bias.
        Xa_sq = Xa[:,f_sq]**2
        Xa_intr = Xa[:, f_intr] * Xa[:, 35].reshape(-1,1)
        bias = np.ones((Xa.shape[0], 1), dtype=np.float32)
        Xa = np.concatenate([Xa, Xa_sq, Xa_intr, bias], axis = 1).astype(np.float32)
        
        # Each epoch trains only on the data it collected itself.
        del X[:]
        del Y[:]
        del M[:]
        
        weights = train_epoch(Xa, Ya, Ma)
        weights_out.append(weights)
        
        print 'Epoch: {}, time: {}'.format(epoch, int(time.time() - start)),
        test(weights)
        sys.stdout.flush()

        dump_weights(weights)
    
    return weights_out

def test(weights):
    """Install `weights`, play 'train' and 'test' greedily and report scores.

    Prints the mean and both individual results and returns them as
    [train_result, test_result] (the values returned by bb.finish).
    """
    cdef:
        int action, has_next
    
    dump_weights(weights)
    results = []
    for lvl in ('train', 'test'):
        prepare_bbox(lvl)
        while True:
            # Evaluation bypasses the prediction cache.
            action = fast_action(bb.c_get_state(), 0)
            has_next = bb.c_do_action(action)
            if not has_next:
                break
        results.append(bb.finish(verbose=0))
    train_res = results[0]
    test_res = results[1]
    print 'average  {:.2f}, test {:.2f}, train {:.2f}'.format(0.5*sum(results), test_res, train_res)
    return results

In [4]:
import cPickle
import numpy as np
# Load the Part 1 ensemble weights.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open('weights_reg3_ens_p1.pkl', 'rb') as f:
    Wens = cPickle.load(f)

In [9]:
# Start all three buckets from the same Part 1 ensemble weights.
Winit = {-1: Wens,
          0: Wens, 
          1: Wens}
# Four consecutive 3-epoch runs, each continuing from the last weights of the
# previous one and alternating between collecting data on train+test and on
# train only.
W2 = policy_iteration(3, Winit, True)
W2 = policy_iteration(3, W2[-1], False)
W2 = policy_iteration(3, W2[-1], True)
W2 = policy_iteration(3, W2[-1], False)

Epoch: 1, time: 207 average  3117.14, test 3296.73, train 2937.55
Epoch: 2, time: 209 average  3135.74, test 3336.25, train 2935.23
Epoch: 3, time: 209 average  3100.21, test 3346.91, train 2853.50
Epoch: 4, time: 209 average  3010.82, test 3168.37, train 2853.28
Epoch: 5, time: 209 average  3049.12, test 3246.55, train 2851.68
Epoch: 6, time: 209 average  3108.16, test 3308.54, train 2907.78
Epoch: 7, time: 208 average  3168.92, test 3407.94, train 2929.89
Epoch: 8, time: 207 average  3122.44, test 3249.72, train 2995.16
Epoch: 9, time: 207 average  3237.05, test 3547.54, train 2926.56
Epoch: 10, time: 209 average  3255.06, test 3502.18, train 3007.94


In [14]:
# Save the weights of the last epoch of the final run.
with open('weights_reg3.pkl', 'wb') as f:
    cPickle.dump(W2[-1],f)