In [1]:
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import random
import time


class CliffWalkingEnv:
    """Cliff-walking grid world on a board of ``ncol`` columns by ``nrow`` rows.

    Coordinates use (0, 0) for the top-left cell, with ``x`` growing to the
    right and ``y`` growing downward. A state is the row-major cell index
    ``y * ncol + x``. The agent starts in the bottom-left cell; stepping onto
    any other cell of the bottom row ends the episode.
    """

    def __init__(self, ncol, nrow):
        self.nrow = nrow
        self.ncol = ncol
        # Current agent position (column, row); starts at the bottom-left cell.
        self.x = 0
        self.y = self.nrow - 1

    def step(self, action):
        """Apply one move and report what happened.

        Args:
            action: index into the move table -- 0: up, 1: down, 2: left,
                3: right.

        Returns:
            Tuple ``(next_state, reward, done)``. The reward is -1 for every
            move except landing on a bottom-row cell strictly between the
            first and last column, which yields -100. ``done`` is True when
            the agent lands on any bottom-row cell other than column 0.
        """
        moves = ((0, -1), (0, 1), (-1, 0), (1, 0))
        dx, dy = moves[action]
        last_col = self.ncol - 1
        last_row = self.nrow - 1

        # Moves that would leave the board are clamped to the edge.
        self.x = min(last_col, max(0, self.x + dx))
        self.y = min(last_row, max(0, self.y + dy))
        next_state = self.y * self.ncol + self.x

        landed_on_terminal = self.y == last_row and self.x > 0
        if not landed_on_terminal:
            return next_state, -1, False

        # Bottom-right corner keeps the ordinary step cost; the rest is penalised.
        reward = -1 if self.x == last_col else -100
        return next_state, reward, True

    def reset(self):
        """Move the agent back to the bottom-left start cell and return its state index."""
        self.x, self.y = 0, self.nrow - 1
        return self.y * self.ncol + self.x

In [2]:
from minirl.core.dynaQ import DynaQ
import hirlite

# Single hirlite handle that the training function passes to DynaQ as its
# model, score and history store.
# NOTE(review): "test.db" is a relative path, so it presumably resolves against
# the kernel's working directory -- confirm against the hirlite documentation.
tdb = hirlite.Rlite(encoding='utf8',path="test.db")

In [3]:
def DynaQ_CliffWalking(n_planning):
    """Train a DynaQ agent on a 12x4 cliff-walking grid.

    Args:
        n_planning: forwarded to ``DynaQ`` as ``N`` (number of steps in the
            planning phase).

    Returns:
        list: one undiscounted return (plain sum of rewards) per episode, in
        the order the episodes were run.
    """
    ncol = 12
    nrow = 4
    # Hyper-parameters that are actually handed to the agent.
    alpha = 0.7
    gamma = 0.9
    epsilon = 0.02
    model_id = "local_model"
    num_episodes = 300  # total number of episodes to run
    num_bars = 10  # the run is split across this many progress bars
    episodes_per_bar = int(num_episodes / num_bars)

    env = CliffWalkingEnv(ncol, nrow)
    agent = DynaQ(state_space=list(range(nrow * ncol)),
        actions=list(range(4)),
        alpha=alpha,
        gamma=gamma,
        random_seed=0,
        eps=epsilon,
        model_db=tdb,
        score_db=tdb,
        his_db=tdb,
        N=n_planning,  # no. of steps in planning phase
        n=2,)

    return_list = []  # return of every finished episode
    for i in range(num_bars):
        with tqdm(total=episodes_per_bar,
                  desc='Iteration %d' % i) as pbar:
            for i_episode in range(episodes_per_bar):
                episode_return = 0
                state = env.reset()

                done = False
                # NOTE(review): there is no per-episode step cap; the loop only
                # ends once the environment reports done.
                while not done:
                    action = int(agent.act(state, model_id, model_id,
                                           use_doubleQ=True))
                    next_state, reward, done = env.step(action)
                    episode_return += reward  # no discounting applied here
                    # NOTE(review): learn() receives the pre-step state and the
                    # reward but not next_state -- confirm this matches the
                    # DynaQ API.
                    agent.learn(state, reward, model_id, model_id,
                                use_doubleQ=True, use_dyna=True)
                    state = next_state
                return_list.append(episode_return)
                if (i_episode + 1) % 10 == 0:
                    # Every 10 episodes, show the mean return of the last 10.
                    pbar.set_postfix({
                        'episode':
                        '%d' % (episodes_per_bar * i + i_episode + 1),
                        'return':
                        '%.3f' % np.mean(return_list[-10:])
                    })
                pbar.update(1)
    return return_list

In [None]:
# Seed both the stdlib and the NumPy global generators before training.
random.seed(0)
np.random.seed(0)

n_planning_list = [2]
for n_planning in n_planning_list:
    print('Q-planning步数为：%d' % n_planning)
    # Short pause before the progress bars start
    # (presumably so the line above is shown first).
    time.sleep(0.5)
    return_list = DynaQ_CliffWalking(n_planning)
    episodes_list = list(range(len(return_list)))
    # One curve per planning-step setting: episode index vs. episode return.
    plt.plot(episodes_list, return_list,
             label=str(n_planning) + ' planning steps')

plt.title('Dyna-Q on {}'.format('Cliff Walking'))
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.legend()
plt.show()

Q-planning步数为：2
simple two layer neural network based on numpy
creating nn: #input:20 #hidden:64 #output:[0, 1, 2, 3]


Iteration 0:   7%|█▍                   | 2/30 [47:32:29<690:23:49, 88765.34s/it]

In [1]:
from datetime import datetime
from datetime import timezone
from datetime import timedelta
from hashlib import sha1

# Fixed UTC+08:00 offset labelled "Asia/Shanghai" (a constant offset, not a tz-database lookup).
SHA_TZ = timezone(timedelta(hours=8), name="Asia/Shanghai")

def get_week_day():
    """Return the current weekday index in ``SHA_TZ`` (UTC+8) time.

    Returns:
        int: ``datetime.weekday()`` of the current moment, where Monday is 0
        and Sunday is 6.
    """
    # NOTE: this name is defined again further down in the same cell; the later
    # definition is the one that stays bound.
    # datetime.now(tz) builds the aware timestamp directly, avoiding the
    # deprecated datetime.utcnow().
    return datetime.now(SHA_TZ).weekday()

import numpy as np


def get_bj_day():
    """Return the current calendar date in ``SHA_TZ`` (UTC+8) time.

    Returns:
        str: the date formatted as ``YYYY-MM-DD``, e.g. ``'2017-10-07'``.
    """
    # datetime.now(tz) builds the aware timestamp directly, avoiding the
    # deprecated datetime.utcnow().
    return datetime.now(SHA_TZ).strftime("%Y-%m-%d")


def get_week_day():
    """Return the current weekday index in ``SHA_TZ`` (UTC+8) time.

    Returns:
        int: ``datetime.weekday()`` of the current moment, where Monday is 0
        and Sunday is 6.
    """
    # datetime.now(tz) builds the aware timestamp directly, avoiding the
    # deprecated datetime.utcnow().
    return datetime.now(SHA_TZ).weekday()

In [2]:
# Display the current UTC+8 weekday index (Monday is 0).
get_week_day()

0

In [3]:
# Display the current UTC+8 date as a YYYY-MM-DD string.
get_bj_day()

'2023-05-29'

In [15]:
import gym
import gym_SnakeGame

ModuleNotFoundError: No module named 'gym_SnakeGame'

In [21]:
# Scratch check: ten times over, draw one float from np.random.random() and one
# integer from np.random.randint(3), then print both along with the integer's type.
for i in range(10):
    c=np.random.random()
    f=np.random.randint(3)
    print(c,f,type(f))

0.29179100482276166 0 <class 'int'>
0.9315005114590366 1 <class 'int'>
0.5337193218697627 0 <class 'int'>
0.2131580423994559 1 <class 'int'>
0.3405950469463369 2 <class 'int'>
0.6866293207374123 2 <class 'int'>
0.5915465669600858 0 <class 'int'>
0.7879784295343054 2 <class 'int'>
0.8327014251880721 2 <class 'int'>
0.9432362214537114 1 <class 'int'>


In [22]:
# Layer-by-layer network specification, in order from input to output.
# Each entry gives the layer's input width, output width and activation name;
# widths run 2 -> 25 -> 50 -> 50 -> 25 -> 1.
NN_ARCHITECTURE = [
    {"input_dim": 2, "output_dim": 25, "activation": "relu"},
    {"input_dim": 25, "output_dim": 50, "activation": "relu"},
    {"input_dim": 50, "output_dim": 50, "activation": "relu"},
    {"input_dim": 50, "output_dim": 25, "activation": "relu"},
    {"input_dim": 25, "output_dim": 1, "activation": "sigmoid"},
]

def init_layers(nn_architecture, seed = 99):
    """Create randomly initialised weight matrices and bias vectors per layer.

    Args:
        nn_architecture: iterable of layer specs; each spec is a mapping that
            provides at least ``"input_dim"`` and ``"output_dim"``.
        seed: value passed to ``np.random.seed``. This reseeds NumPy's global
            legacy generator as a side effect.

    Returns:
        dict: for layers numbered from 1, ``"W<k>"`` maps to an array of shape
        ``(output_dim, input_dim)`` and ``"b<k>"`` to an array of shape
        ``(output_dim, 1)``. All values are standard-normal draws scaled by 0.1.
    """
    np.random.seed(seed)
    params_values = {}

    # Layers are numbered from 1 in the parameter keys.
    for layer_idx, layer in enumerate(nn_architecture, start=1):
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        # W is drawn before b for every layer; keeping that order is what makes
        # a given seed reproduce the same values.
        params_values[f"W{layer_idx}"] = np.random.randn(
            layer_output_size, layer_input_size) * 0.1
        params_values[f"b{layer_idx}"] = np.random.randn(
            layer_output_size, 1) * 0.1

    return params_values

# Display the full parameter dict produced with the default seed.
init_layers(NN_ARCHITECTURE)

{'W1': array([[-0.01423588,  0.20572217],
        [ 0.02832619,  0.1329812 ],
        [-0.01546219, -0.00690309],
        [ 0.07551805,  0.08256466],
        [-0.01130692, -0.23678376],
        [-0.01670494,  0.0685398 ],
        [ 0.00235001,  0.04562013],
        [ 0.02704928, -0.14350081],
        [ 0.08828171, -0.05800817],
        [-0.05015653,  0.05909533],
        [-0.07316163,  0.02617555],
        [-0.08557956, -0.01875259],
        [-0.03734863, -0.0461971 ],
        [-0.08164661, -0.00451233],
        [ 0.01213278,  0.09259528],
        [-0.05738197,  0.00527031],
        [ 0.22073106,  0.03918219],
        [ 0.04827134,  0.0433334 ],
        [-0.17042917, -0.02439081],
        [-0.21397038,  0.08613227],
        [ 0.17002844, -0.05287848],
        [ 0.17634779, -0.11216078],
        [-0.11919342,  0.05527319],
        [-0.08159809, -0.04966468],
        [ 0.10862256, -0.09746753]]),
 'b1': array([[-0.02821358],
        [-0.01172141],
        [ 0.03785473],
        [ 0.07321

In [16]:
import numpy as np
import h5py
import os
from numba import jit
import random

In [14]:
# Scratch check: tag a fixed timestamp as UTC, convert it to SHA_TZ (UTC+8),
# then move it back by 11 hours.
datetime(2023, 5, 29, 3, 24, 43, 719715).replace(tzinfo=timezone.utc).astimezone(SHA_TZ)-timedelta(hours=11)



datetime.datetime(2023, 5, 29, 0, 24, 43, 719715, tzinfo=datetime.timezone(datetime.timedelta(seconds=28800), 'Asia/Shanghai'))

In [4]:
# Display the current UTC time as a naive datetime (the result carries no tzinfo).
datetime.utcnow()

datetime.datetime(2023, 5, 29, 3, 24, 43, 719715)

In [12]:
from datetime import datetime
from datetime import timezone
from datetime import timedelta
from hashlib import sha1

# Fixed UTC+08:00 offset labelled "Asia/Shanghai" (a constant offset, not a tz-database lookup).
SHA_TZ = timezone(timedelta(hours=8), name="Asia/Shanghai")

import numpy as np


def get_bj_day():
    """Return the current calendar date in ``SHA_TZ`` (UTC+8) time.

    Returns:
        str: the date formatted as ``YYYY-MM-DD``, e.g. ``'2017-10-07'``.
    """
    # datetime.now(tz) builds the aware timestamp directly, avoiding the
    # deprecated datetime.utcnow().
    return datetime.now(SHA_TZ).strftime("%Y-%m-%d")


def get_week_day():
    """Return the weekday index of "now minus 11 hours" in ``SHA_TZ`` (UTC+8).

    Because of the 11-hour shift, the returned index changes at 11:00 UTC+8
    rather than at midnight.
    NOTE(review): the reason for the 11-hour offset is not visible here --
    confirm the intended day boundary with the caller.

    Returns:
        int: ``datetime.weekday()`` of the shifted moment, where Monday is 0
        and Sunday is 6.
    """
    # datetime.now(tz) builds the aware timestamp directly, avoiding the
    # deprecated datetime.utcnow(). Subtracting a timedelta keeps the tzinfo.
    shifted_now = datetime.now(SHA_TZ) - timedelta(hours=11)
    return shifted_now.weekday()
# Display the weekday index returned by the definition directly above.
get_week_day()

0