In [5]:
import numpy as np
import scipy.stats as sps


class ShootEnv:
    """One-step projectile-range environment.

    ``reset`` samples a launch angle ``alpha`` (radians) and a target distance
    ``s``. ``step`` takes the launch speed as the action, computes the range
    ``action ** 2 * sin(2 * alpha) / g`` and rewards how close that range is
    to ``s``. Every episode ends after a single step.

    Parameters
    ----------
    eps : float
        Margin used for sampling: ``alpha`` is drawn from
        ``[eps, pi / 2 - eps)`` and ``s`` from ``[eps, max_distance)``.
    scale : float
        Standard deviation of the normal density used by the ``"norm"`` reward.
    radius : float
        Half-width of the hit window used by the ``"uniform"`` reward.
    reward_type : {"norm", "uniform"}
        ``"norm"``: reward is the normal pdf of the achieved range, centred on
        the target. ``"uniform"``: reward is 1 if the achieved range is within
        ``radius`` of the target, else 0.
    max_distance : float
        Upper bound for the sampled target distance (defaults to 50, the value
        that used to be hard-coded).

    Raises
    ------
    ValueError
        If ``reward_type`` is not one of the supported values.
    """

    _REWARD_TYPES = ("norm", "uniform")

    def __init__(self, eps=0.1, scale=3.0, radius=0.5, reward_type="norm",
                 max_distance=50):
        # Fail fast: an unknown reward type used to surface only inside
        # step(), as an UnboundLocalError on `reward`.
        if reward_type not in self._REWARD_TYPES:
            raise ValueError(
                f"unknown reward_type {reward_type!r}; "
                f"expected one of {self._REWARD_TYPES}"
            )
        self.eps = eps
        self.alpha = 0
        self.s = 0
        self.g = 9.8
        self.scale = scale
        self.radius = radius
        self.reward_type = reward_type
        self.max_distance = max_distance

    def reset(self):
        """Sample a new launch angle and target distance; return ``[alpha, s]``."""
        self.alpha = np.random.uniform(low=self.eps, high=np.pi / 2 - self.eps, size=1)[0]
        self.s = np.random.uniform(low=self.eps, high=self.max_distance, size=1)[0]
        return [self.alpha, self.s]

    def step(self, action):
        """Fire with launch speed ``action`` and score the shot.

        Returns
        -------
        tuple
            ``(next_state, reward, done, info)`` where ``next_state`` is the
            placeholder ``[-1, -1]``, ``done`` is always ``True`` and
            ``info["real_s"]`` is the achieved range.

        Raises
        ------
        ValueError
            If ``self.reward_type`` was changed to an unsupported value after
            construction.
        """
        s_calc = action ** 2 * np.sin(2 * self.alpha) / self.g
        if self.reward_type == "norm":
            reward = sps.norm.pdf(s_calc, loc=self.s, scale=self.scale)
        elif self.reward_type == "uniform":
            reward = 1 if np.abs(self.s - s_calc) <= self.radius else 0
        else:
            # Attributes are public, so re-check here instead of returning an
            # undefined reward.
            raise ValueError(
                f"unknown reward_type {self.reward_type!r}; "
                f"expected one of {self._REWARD_TYPES}"
            )
        return [-1, -1], reward, True, {"real_s": s_calc}
    

In [21]:
import torch
import torch.nn as nn

from torch.optim import Adam

import time


class CEM(nn.Module):
    """Cross-entropy-method agent with a small MLP policy for continuous actions.

    The network maps a state vector of size ``state_dim`` to ``action_n``
    action values. Exploration adds uniform noise of total width ``eps`` to
    the network output; ``eps`` starts at ``init_eps`` and is multiplied by
    ``0.99 ** epoch`` after every policy update. The policy is fitted to
    elite (state, action) pairs with an MSE loss and Adam.

    Parameters
    ----------
    state_dim : int
        Size of the state vector fed to the network.
    action_n : int
        Number of action components the network outputs.
    """

    def __init__(self, state_dim, action_n):
        super().__init__()
        self.state_dim = state_dim
        self.action_n = action_n

        self.network = nn.Sequential(
            nn.Linear(self.state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, self.action_n)
        )

        # Direct handles to the two linear layers of the network.
        self.layers_list = [self.network[0], self.network[2]]

        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        self.loss = nn.MSELoss()

        self.epoch = 0
        self.init_eps = 30.0
        self.eps = self.init_eps
        # Multiplicative decay applied to init_eps after `epoch` updates.
        self.lambda_eps = lambda epoch: 0.99 ** epoch

    def forward(self, _input):
        """Return the deterministic network output for ``_input``."""
        return self.network(_input)

    def get_action(self, state):
        """Return the network output for ``state`` plus exploration noise.

        The noise is uniform on ``[-eps / 2, eps / 2)`` and drawn independently
        for each of the ``action_n`` components (for ``action_n == 1`` this is
        the same single draw as before). The result is a NumPy array.
        """
        state = torch.as_tensor(np.asarray(state, dtype=np.float32))
        # Acting never needs gradients; skip building the autograd graph.
        with torch.no_grad():
            determ_action = self.forward(state).numpy()
        noise = np.random.uniform(low=-self.eps / 2, high=self.eps / 2,
                                  size=self.action_n)
        return determ_action + noise

    def update_policy(self, elite_trajectories):
        """Fit the network to the elite (state, action) pairs with one Adam step.

        ``elite_trajectories`` is an iterable of dicts with ``'states'`` and
        ``'actions'`` lists. If it yields no states, the call is a no-op:
        neither the weights nor the exploration schedule change.
        """
        elite_states = []
        elite_actions = []
        for trajectory in elite_trajectories:
            elite_states.extend(trajectory['states'])
            elite_actions.extend(trajectory['actions'])

        if not elite_states:
            # Nothing to learn from; an empty batch would otherwise fail
            # inside the first linear layer.
            return

        # One contiguous float32 array first: building a tensor straight from
        # a Python list of ndarrays takes a slow element-by-element path.
        elite_states = torch.as_tensor(np.asarray(elite_states, dtype=np.float32))
        elite_actions = torch.as_tensor(np.asarray(elite_actions, dtype=np.float32))

        # Clear gradients up front so a stray earlier backward() cannot leak
        # into this step.
        self.optimizer.zero_grad()
        pred_actions = self.forward(elite_states)
        loss = self.loss(pred_actions, elite_actions)
        loss.backward()
        self.optimizer.step()

        self.epoch += 1
        self.eps = self.init_eps * self.lambda_eps(self.epoch)
        
        
def get_trajectory(env, agent, trajectory_len = 1, visualize=False):
    """Roll out one episode and record it.

    Parameters
    ----------
    env
        Object with ``reset() -> state`` and
        ``step(action) -> (next_state, reward, done, info)``.
    agent
        Object with ``get_action(state) -> action``.
    trajectory_len : int
        Maximum number of steps to take.
    visualize : bool
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    dict
        ``{'states': [...], 'actions': [...], 'total_reward': ...}`` where
        ``states[i]`` is the state in which ``actions[i]`` was chosen, so the
        two lists always have the same length.
    """
    trajectory = {'states': [], 'actions': [], 'total_reward': 0}

    state = env.reset()

    for _ in range(trajectory_len):
        # Record the state the agent actually acts on, once per step, so
        # states and actions stay aligned in multi-step episodes.
        trajectory['states'].append(state)

        action = agent.get_action(state)
        trajectory['actions'].append(action)

        # Advance to the next state; previously the agent kept acting on the
        # initial state for the whole episode.
        state, reward, done, _ = env.step(action)

        # Accumulate the reward returned by the environment for this step.
        trajectory['total_reward'] += reward

        if done:
            break

    return trajectory

def get_elite_trajectories(trajectories, q_param):
    """Select the elite trajectories: those whose reward beats the q-quantile.

    Parameters
    ----------
    trajectories : list of dict
        Each dict must have a ``'total_reward'`` entry (a scalar or a
        one-element array).
    q_param : float
        Quantile in ``[0, 1]``; trajectories with a total reward strictly
        above the ``q_param``-quantile of the batch are kept.

    Returns
    -------
    list of dict
        The elite trajectories, in their original order. Empty when the input
        is empty or when all rewards are equal (no trajectory is better than
        any other, so there is nothing to imitate).

    Notes
    -----
    ``q_param`` used to be ignored and every trajectory with a positive reward
    was returned, which makes virtually the whole batch elite for strictly
    positive rewards. For 0/1 rewards the result is the same as before unless
    every trajectory ties: when the quantile itself equals the top reward, the
    tied best trajectories are returned instead of nothing.
    """
    if not trajectories:
        return []

    # .item() accepts plain numbers as well as one-element arrays.
    total_rewards = np.array(
        [np.asarray(trajectory['total_reward']).item() for trajectory in trajectories],
        dtype=float,
    )
    quantile = np.quantile(total_rewards, q=q_param)

    is_elite = total_rewards > quantile
    if not is_elite.any() and total_rewards.min() < total_rewards.max():
        # Ties at the top pushed the quantile up to the best reward; keep the
        # tied best trajectories rather than discarding the whole batch.
        is_elite = total_rewards >= quantile

    return [trajectory for trajectory, keep in zip(trajectories, is_elite) if keep]


def teach_model(env, agent, episode_n, trajectory_n, q_param):
    """Train ``agent`` on ``env`` for ``episode_n`` iterations.

    Each iteration samples ``trajectory_n`` trajectories, records their mean
    total reward, selects the elite ones with ``q_param`` and, if there are
    any, lets the agent update its policy on them. Progress and timings are
    printed as the loop runs.

    Returns
    -------
    tuple
        ``(agent, average_rewards)``: the trained agent and the list of
        per-iteration mean total rewards.
    """
    training_time_start = time.time()
    average_rewards = []

    for episode in range(episode_n):
        start = time.time()

        # Sample a fresh batch of rollouts with the current policy.
        trajectories = []
        for _ in range(trajectory_n):
            trajectories.append(get_trajectory(env, agent))

        batch_rewards = [rollout['total_reward'] for rollout in trajectories]
        mean_total_reward = np.mean(batch_rewards)
        average_rewards.append(mean_total_reward)

        elite_trajectories = get_elite_trajectories(trajectories, q_param)
        elite_count = len(elite_trajectories)
        print(elite_count)

        # Skip the update when no trajectory qualified as elite.
        if elite_count > 0:
            agent.update_policy(elite_trajectories)

        print(f'episode: {episode}, mean_total_reward = {mean_total_reward}  ### time: {time.time() - start}')

    print(f'##### total training time: {time.time() - training_time_start}')
    return agent, average_rewards
    


# Train the CEM agent on the hit/miss ("uniform") variant of the environment.
env = ShootEnv(reward_type="uniform")

state_dim = 2
action_n = 1

agent = CEM(state_dim, action_n)
episode_n = 1000
trajectory_n = 500
# Not forwarded: teach_model calls get_trajectory with its default length of 1.
trajectory_len = 1
q_param = 0.95

k_list = [20]
q_list = [0.5]

# teach_model returns (agent, average_rewards); unpack both so that `agent`
# stays the model instead of being rebound to the whole tuple.
agent, average_rewards = teach_model(env, agent, episode_n, trajectory_n, q_param)

10
episode: 0, mean_total_reward = 0.02  ### time: 0.1473388671875
7
episode: 1, mean_total_reward = 0.014  ### time: 0.12026095390319824
10
episode: 2, mean_total_reward = 0.02  ### time: 0.10442638397216797
8
episode: 3, mean_total_reward = 0.016  ### time: 0.10309481620788574
7
episode: 4, mean_total_reward = 0.014  ### time: 0.09738874435424805
7
episode: 5, mean_total_reward = 0.014  ### time: 0.116485595703125
11
episode: 6, mean_total_reward = 0.022  ### time: 0.25702357292175293
10
episode: 7, mean_total_reward = 0.02  ### time: 0.134596586227417
4
episode: 8, mean_total_reward = 0.008  ### time: 0.1154487133026123
16
episode: 9, mean_total_reward = 0.032  ### time: 0.09752273559570312
13
episode: 10, mean_total_reward = 0.026  ### time: 0.0976254940032959
9
episode: 11, mean_total_reward = 0.018  ### time: 0.2473607063293457
12
episode: 12, mean_total_reward = 0.024  ### time: 0.15346288681030273
14
episode: 13, mean_total_reward = 0.028  ### time: 0.10933947563171387
5
episod

In [13]:
sps.norm.pdf(0, loc=0, scale=1) # exact hit - this is the value we are aiming for

0.3989422804014327

In [43]:
# Standard normal density at 1 (np.clip(1, -1, 1) leaves the value at 1), multiplied by 10.
sps.norm.pdf(np.clip(1, -1, 1), loc=0, scale=1)*10

2.4197072451914337