In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

%matplotlib inline

In [2]:
# Scratch check: draw a single sample from a normal distribution with
# mean 2 and standard deviation 0.1 (the same noise scale the bandit uses
# for its rewards). The global RNG is unseeded here, so the value differs
# on every run.
np.random.normal(2, scale=0.1)

2.032566218796731

In [3]:
# k-armed bandit
class K_armed_bandit(object):
    """Stationary k-armed bandit with Gaussian rewards.

    Every arm has a fixed mean reward, drawn once at construction time from
    a standard normal distribution and rounded to two decimals. Pulling an
    arm returns a sample from a normal distribution centred on that arm's
    mean with standard deviation 0.1.
    """

    def __init__(self, num_k, seed= False):
        """Create a bandit with ``num_k`` arms.

        Parameters
        ----------
        num_k : int
            Number of arms.
        seed : bool, optional
            When truthy, NumPy's global random state is seeded with 1 before
            the arm means are drawn, which makes them reproducible.
        """
        if seed:
            np.random.seed(1)
        self.loc = np.round(np.random.normal(size= num_k), decimals= 2)
        # ``np.int`` was removed in NumPy 1.24 (deprecated since 1.20); the
        # builtin ``int`` is the documented drop-in replacement.
        self.actions_list = np.arange(num_k).astype(int)

    def __call__(self, action):
        """Pull arm ``action``; shorthand for :meth:`run`."""
        return self.run(action)

    def run(self, action):
        """Return one noisy reward for arm ``action``.

        The reward is sampled from a normal distribution whose mean is the
        arm's stored mean and whose standard deviation is 0.1.
        """
        return np.random.normal(self.loc[action], scale= 0.1)

    def get_parms(self):
        """Return the array of true mean rewards, one entry per arm."""
        return self.loc

    def get_actions_list(self):
        """Return the integer array of valid actions ``0 .. num_k - 1``."""
        return self.actions_list

In [4]:
class Agent(object):
    """Epsilon-greedy agent that learns action values with a constant step size.

    The agent keeps one value estimate per action. On each step it picks an
    action (uniformly at random with probability ``epsilon_greedy``,
    otherwise the action with the highest current estimate), queries the
    environment for a reward, and moves that action's estimate toward the
    reward by a fraction ``step_size`` of the error.
    """

    def __init__(self,
                 environment,
                 epsilon_greedy= 1e-1,
                 step_size = 1e-3,
                 initial= 0.):
        """Set up the agent.

        Parameters
        ----------
        environment : object
            Must provide ``get_actions_list()`` and be callable as
            ``environment(action)`` returning a numeric reward.
        epsilon_greedy : float, optional
            Probability of taking a uniformly random (exploratory) action.
        step_size : float, optional
            Constant step size used in the incremental value update.
        initial : float or int, optional
            Initial value estimate assigned to every action.
        """
        self.environment = environment
        self.actions_list = environment.get_actions_list()
        self.num_actions = len(self.actions_list)
        # Stored as the probability of acting greedily (1 - epsilon).
        self.greedy = 1.- epsilon_greedy
        self.step_size = step_size
        # Kept for interface compatibility; nothing in this class updates it.
        self.average_reward = 0
        # Force a float array: inferring the dtype from ``initial`` would
        # produce an integer array for e.g. ``initial=0`` and every
        # fractional update would be silently truncated on assignment.
        self.action_value_array = np.full(self.num_actions, initial, dtype=float)
        # Per-step history of chosen actions and received rewards.
        self.action_list = []
        self.reward_list = []

    def __call__(self, num_episodes= 1e3):
        """Run ``num_episodes`` steps; the count is truncated to an int."""
        for _ in range(int(num_episodes)):
            self.step()

    def action_selection(self):
        """Choose an action using the epsilon-greedy rule."""
        if np.random.uniform() > self.greedy:
            # Explore: any action, uniformly at random.
            action = np.random.choice(self.actions_list)
        else:
            # Exploit: the action with the highest current estimate.
            action = self.get_argmax_action_value()
        return action

    def action_reward(self, action):
        """Return the environment's reward for ``action``."""
        return self.environment(action)

    def get_argmax_action_value(self):
        """Return the action with the largest value estimate.

        Ties are resolved by ``np.argmax``, i.e. the first maximal index.
        """
        index = np.argmax(self.action_value_array)
        return self.actions_list[index]

    def get_next_action_value(self, action, reward):
        """Update the estimate for ``action`` in place and return ``None``.

        Applies ``Q <- Q + step_size * (reward - Q)``. ``action`` is used
        directly as an index into the value array, so actions are expected
        to be the integers ``0 .. num_actions - 1``.
        """
        action_value = self.action_value_array[action]
        next_action_value = action_value + self.step_size * (reward - action_value)
        self.action_value_array[action] = next_action_value

    def step(self):
        """Perform one select / act / update cycle and log the outcome."""
        action = self.action_selection()
        reward = self.action_reward(action)
        self.get_next_action_value(action, reward)
        # Record the training history.
        self.action_list.append(action)
        self.reward_list.append(reward)

In [5]:
# Build a 25-arm bandit. No seed is requested (seed defaults to False), so
# the arm means are drawn from the unseeded global RNG and change between
# kernel restarts.
environment = K_armed_bandit(25)
# Display the true mean reward of every arm for later comparison.
environment.get_parms()

array([-0.02, -1.52,  0.92, -2.14, -0.26, -0.73,  0.28,  0.85,  1.46,
       -0.75,  0.11,  0.32, -1.95,  0.32,  1.6 , -1.35, -1.39, -0.34,
        1.65, -0.59,  0.61, -0.65,  0.21,  1.29,  1.5 ])

In [6]:
# Epsilon-greedy agent with the default settings: epsilon 0.1, step size
# 1e-3, and initial value estimates of 0 for every action.
agent = Agent(environment)

In [7]:
# Train for one million steps; Agent.__call__ casts the float count to int.
agent(1e6)

In [8]:
# Learned value estimate per action; compare with the true arm means shown
# by environment.get_parms() above.
agent.action_value_array

array([-0.01880002, -1.48622212,  0.91994128, -2.09869382, -0.25448965,
       -0.71631271,  0.27327057,  0.83432282,  1.4343519 , -0.73318404,
        0.10619266,  0.31116251, -1.91339544,  0.31150261,  1.56916426,
       -1.32237158, -1.36305728, -0.33369344,  1.65575383, -0.57869061,
        0.59856524, -0.63780024,  0.20490615,  1.26386248,  1.47364477])