In [1]:
import sys
sys.path.append('./abides/')
import numpy as np
import pandas as pd
import datetime as dt
# from dateutil.parser import parse
from Kernel import Kernel
from util import util
from util.order import LimitOrder
from util.oracle.SparseMeanRevertingOracle import SparseMeanRevertingOracle
from tqdm import tqdm
from agent.ExchangeAgent import ExchangeAgent
from agent.TradingAgent import TradingAgent
from agent.NoiseAgent import NoiseAgent
from agent.ValueAgent import ValueAgent
from agent.market_makers.AdaptiveMarketMakerAgent import AdaptiveMarketMakerAgent
from agent.examples.MomentumAgent import MomentumAgent
from agent.examples.ExampleExperimentalAgent import ExampleExperimentalAgentTemplate, ExampleExperimentalAgent
from model.LatencyModel import LatencyModel
from util.plotting.liquidity_telemetry import create_orderbooks
import os
root_path = 'abides'


In [43]:
bids[1]

[]

In [19]:
prices = []
bids = {1:[],2:[],3:[],4:[],5:[],6:[],7:[],8:[],9:[],10:[]}
asks = {1:[],2:[],3:[],4:[],5:[],6:[],7:[],8:[],9:[],10:[]}
class APIAgent(TradingAgent):

    def __init__(self, id, name, type, symbol=None, starting_cash=None, within=0.01,
                                random_state = None):
        # Base class init.
        super().__init__(id, name, type, starting_cash = starting_cash, random_state = random_state)

        self.symbol = symbol        # symbol to trade
        self.trading = True        # ready to trade
        self.traded = False         # has made its one trade
        # The amount of available "nearby" liquidity to consume when placing its order.
        # self.greed = greed            # trade this proportion of liquidity
        self.within = within        # within this range of the inside price


        # The agent begins in its "complete" state, not waiting for
        # any special event or condition.
        self.state = 'AWAITING_WAKEUP'



    def wakeup (self, currentTime):
        # Parent class handles discovery of exchange times and market_open wakeup call.
        super().wakeup(currentTime)

        if not self.mkt_open or not self.mkt_close:
            # TradingAgent handles discovery of exchange times.
            return
        else:
            if not self.trading:
                self.trading = False

                # Time to start trading!
                print ("{} is ready to start trading now.".format(self.name))
        self.getCurrentSpread()
        self.state = 'AWAITING_SPREAD'
        if self.last_trade.get(self.symbol) and self.mkt_closed == False:
            prices.append(self.last_trade[self.symbol])

        if self.known_bids.get(self.symbol):
            
            for i in range(min(10, len(self.known_bids[self.symbol]))):
                bids[i+1].append(self.known_bids[self.symbol][i])
            for i in range(min(10, len(self.known_asks[self.symbol]))):
                asks[i+1].append(self.known_asks[self.symbol][i])
        # Steady state wakeup behavior starts here.

        # First, see if we have received a MKT_CLOSED message for the day.    If so,
        # there's nothing to do except clean-up.
        self.setWakeup(currentTime + pd.Timedelta('1m'))
        if self.mkt_closed and (self.symbol in self.daily_close_price):
            # Market is closed and we already got the daily close price.
            return
        

        
        # If the market is closed and we haven't obtained the daily close price yet,
        # do that before we cease activity for the day.    Don't do any other behavior
#         # after market close.
#         if self.mkt_closed and (not self.symbol in self.daily_close_price):
#             self.getLastTrade()
#             self.state = 'AWAITING_LAST_TRADE'
#             print(self.state)
#             return
            

        # The impact agent will place one order based on the current spread.
        

    # Request the last trade price for our symbol.
    def getLastTrade (self):
        super().getLastTrade(self.symbol)


    # Request the spread for our symbol.
    def getCurrentSpread (self, depth=10):
        # Impact agent gets depth 10000 on each side (probably everything).
        super().getCurrentSpread(self.symbol, depth)


    def getWakeFrequency (self):
        return (pd.Timedelta('1ns'))

In [5]:
def get_data(seed=413):
#market config
    Exchange_Agent = 1
    POV_Market_Maker_Agent = 1
    Value_Agents = 150
    Momentum_Agents = 35
    Noise_Agents = 5000
    
    #agent config
    ticker = 'ABM'
    log_dir = f'log/experimental_agent_demo_short_2min_long_5min_{seed}'
    historical_date = 20200603
    start_time = dt.datetime.strptime('09:00:00','%H:%M:%S')
    end_time = dt.datetime.strptime('16:00:00','%H:%M:%S')

    verbose = False
    fund_vol = 1e-7
    experimental_agent = True
    ea_short_window = '2min'
    ea_long_window = '5min'



    # Print system banner.
    system_name = "ABIDES: Agent-Based Interactive Discrete Event Simulation"

    print ("=" * len(system_name))
    print (system_name)
    print ("=" * len(system_name))
    print ()

#     rgs, remaining_args = parser.parse_known_args()

    # if config_help:
    #     parser.print_help()
    #     sys.exit()

#     log_dir = log_dir  # Requested log directory.
#     seed = seed  # Random seed specification on the command line.
    if not seed: seed = int(pd.Timestamp.now().timestamp() * 1000000) % (2 ** 32 - 1)
    np.random.seed(seed)

    util.silent_mode = not verbose
    LimitOrder.silent_mode = not verbose

    exchange_log_orders = True
    log_orders = None
    book_freq = 0

    simulation_start_time = dt.datetime.now()
    print("Simulation Start Time: {}".format(simulation_start_time))
    print("Configuration seed: {}\n".format(seed))
    ########################################################################################################################
    ############################################### AGENTS CONFIG ##########################################################

    # Historical date to simulate.
    historical_date = pd.to_datetime(historical_date, format='%Y%m%d')
    historical_end = pd.to_datetime(20200610, format='%Y%m%d')
    mkt_open = historical_date + pd.to_timedelta(start_time.strftime('%H:%M:%S'))
    mkt_close = historical_date + pd.to_timedelta(end_time.strftime('%H:%M:%S'))
    agent_count, agents, agent_types = 0, [], []

    # Hyperparameters
    symbol = ticker
    starting_cash = 100000000  # Cash in this simulator is always in CENTS.

    r_bar = 1e5
    sigma_n = r_bar / 10
    kappa = 1.67e-15
    lambda_a = 7e-11

    # Oracle
    symbols = {symbol: {'r_bar': r_bar,
                        'kappa': 1.67e-16,
                        'sigma_s': 0,
                        'fund_vol': fund_vol,
                        'megashock_lambda_a': 2.77778e-18,
                        'megashock_mean': 1e3,
                        'megashock_var': 5e4,
                        'random_state': np.random.RandomState(seed=np.random.randint(low=0, high=2 ** 32, dtype='uint64'))}}

    oracle = SparseMeanRevertingOracle(mkt_open, mkt_close, symbols)

    # 1) Exchange Agent

    #  How many orders in the past to store for transacted volume computation
    # stream_history_length = int(pd.to_timedelta(mm_wake_up_freq).total_seconds() * 100)
    stream_history_length = 25000

    agents.extend([ExchangeAgent(id=0,
                                 name="EXCHANGE_AGENT",
                                 type="ExchangeAgent",
                                 mkt_open=mkt_open,
                                 mkt_close=mkt_close,
                                 symbols=[symbol],
                                 log_orders=exchange_log_orders,
                                 pipeline_delay=0,
                                 computation_delay=0,
                                 stream_history=stream_history_length,
                                 book_freq=book_freq,
                                 wide_book=True,
                                 random_state=np.random.RandomState(seed=np.random.randint(low=0, high=2 ** 32, dtype='uint64')))])
    agent_types.extend("ExchangeAgent")
    agent_count += 1

    # 2) Noise Agents
    num_noise = 5000
    noise_mkt_open = historical_date + pd.to_timedelta("09:00:00")  # These times needed for distribution of arrival times
                                                                    # of Noise Agents
    noise_mkt_close = historical_date + pd.to_timedelta("16:00:00")
    agents.extend([NoiseAgent(id=j,
                              name="NoiseAgent {}".format(j),
                              type="NoiseAgent",
                              symbol=symbol,
                              starting_cash=starting_cash,
                              wakeup_time=util.get_wake_time(noise_mkt_open, noise_mkt_close),
                              log_orders=log_orders,
                              random_state=np.random.RandomState(seed=np.random.randint(low=0, high=2 ** 32, dtype='uint64')))
                   for j in range(agent_count, agent_count + num_noise)])
    agent_count += num_noise
    agent_types.extend(['NoiseAgent'])

    # 3) Value Agents
    num_value = 100
    agents.extend([ValueAgent(id=j,
                              name="Value Agent {}".format(j),
                              type="ValueAgent",
                              symbol=symbol,
                              starting_cash=starting_cash,
                              sigma_n=sigma_n,
                              r_bar=r_bar,
                              kappa=kappa,
                              lambda_a=lambda_a,
                              log_orders=log_orders,
                              random_state=np.random.RandomState(seed=np.random.randint(low=0, high=2 ** 32, dtype='uint64')))
                   for j in range(agent_count, agent_count + num_value)])
    agent_count += num_value
    agent_types.extend(['ValueAgent'])

    # 4) Market Maker Agents

    """
    window_size ==  Spread of market maker (in ticks) around the mid price
    pov == Percentage of transacted volume seen in previous `mm_wake_up_freq` that
           the market maker places at each level
    num_ticks == Number of levels to place orders in around the spread
    wake_up_freq == How often the market maker wakes up

    """

    # each elem of mm_params is tuple (window_size, pov, num_ticks, wake_up_freq, min_order_size)
    mm_params = [('adaptive', 0.025, 10, '10S', 1),
                 ('adaptive', 0.025, 10, '10S', 1)
                 ]

    num_mm_agents = len(mm_params)
    mm_cancel_limit_delay = 50  # 50 nanoseconds

    agents.extend([AdaptiveMarketMakerAgent(id=j,
                                    name="ADAPTIVE_POV_MARKET_MAKER_AGENT_{}".format(j),
                                    type='AdaptivePOVMarketMakerAgent',
                                    symbol=symbol,
                                    starting_cash=starting_cash,
                                    pov=mm_params[idx][1],
                                    min_order_size=mm_params[idx][4],
                                    window_size=mm_params[idx][0],
                                    num_ticks=mm_params[idx][2],
                                    wake_up_freq=mm_params[idx][3],
                                    cancel_limit_delay=mm_cancel_limit_delay,
                                    skew_beta=0,
                                    level_spacing=5,
                                    spread_alpha=0.75,
                                    backstop_quantity=50000,
                                    log_orders=log_orders,
                                    random_state=np.random.RandomState(seed=np.random.randint(low=0, high=2 ** 32,
                                                                                              dtype='uint64')))
                   for idx, j in enumerate(range(agent_count, agent_count + num_mm_agents))])
    agent_count += num_mm_agents
    agent_types.extend('POVMarketMakerAgent')


    # 5) Momentum Agents
    num_momentum_agents = 25

    agents.extend([MomentumAgent(id=j,
                                 name="MOMENTUM_AGENT_{}".format(j),
                                 type="MomentumAgent",
                                 symbol=symbol,
                                 starting_cash=starting_cash,
                                 min_size=1,
                                 max_size=10,
                                 wake_up_freq='20s',
                                 log_orders=log_orders,
                                 random_state=np.random.RandomState(seed=np.random.randint(low=0, high=2 ** 32,
                                                                                           dtype='uint64')))
                   for j in range(agent_count, agent_count + num_momentum_agents)])
    agent_count += num_momentum_agents
    agent_types.extend("MomentumAgent")

    # 6) Experimental Agent

    #### Example Experimental Agent parameters
    
    if experimental_agent:
        experimental_agent = APIAgent(
            id=agent_count,
            name='data_AGENT',
            type='APIAgent',
            symbol=symbol,
            starting_cash=starting_cash,
#             levels=5,
#             subscription_freq=1e9,
#             wake_freq='1m',
#             order_size=100,
#             short_window=ea_short_window,
#             long_window=ea_long_window,
#             log_orders=True,
            random_state=np.random.RandomState(seed=np.random.randint(low=0, high=2 ** 32, dtype='uint64'))
        )
#     else:
#         experimental_agent = ExampleExperimentalAgentTemplate(
#             id=agent_count,
#             name='EXAMPLE_EXPERIMENTAL_AGENT',
#             type='ExampleExperimentalAgent',
#             symbol=symbol,
#             starting_cash=starting_cash,
#             levels=5,
#             subscription_freq=1e9,args, remaining_args = parser.parse_known_args()

    # if config_help:
    #     parser.print_help()
    #     sys.exit()


    experimental_agents = [experimental_agent]
    agents.extend(experimental_agents)
    agent_types.extend("ExperimentalAgent")
    agent_count += 1


    ########################################################################################################################
    ########################################### KERNEL AND OTHER CONFIG ####################################################

    kernel = Kernel("RMSC03 Kernel", random_state=np.random.RandomState(seed=np.random.randint(low=0, high=2 ** 32,
                                                                                                      dtype='uint64')))

    kernelStartTime = historical_date
    kernelStopTime = historical_date + pd.to_timedelta(end_time.strftime('%H:%M:%S')) + pd.to_timedelta('00:01:00')
    print(kernelStartTime,kernelStopTime)
    defaultComputationDelay = 50  # 50 nanoseconds

    # LATENCY

    latency_rstate = np.random.RandomState(seed=np.random.randint(low=0, high=2**32))
    pairwise = (agent_count, agent_count)

    # All agents sit on line from Seattle to NYC
    nyc_to_seattle_meters = 3866660
    pairwise_distances = util.generate_uniform_random_pairwise_dist_on_line(0.0, nyc_to_seattle_meters, agent_count,
                                                                            random_state=latency_rstate)
    pairwise_latencies = util.meters_to_light_ns(pairwise_distances)

    model_args = {
        'connected': True,
        'min_latency': pairwise_latencies
    }

    latency_model = LatencyModel(latency_model='deterministic',
                                 random_state=latency_rstate,
                                 kwargs=model_args
                                 )
    # KERNEL

    kernel.runner(agents=agents,
                  startTime=kernelStartTime,
                  stopTime=kernelStopTime,
                  agentLatencyModel=latency_model,
                  defaultComputationDelay=defaultComputationDelay,
                  oracle=oracle,
                  log_dir=log_dir)

    simulation_end_time = dt.datetime.now()
    print("Simulation End Time: {}".format(simulation_end_time))
    print("Time taken to run simulation: {}".format(simulation_end_time - simulation_start_time))

In [15]:
seed = 113
days = 60
columns = ['price'].append(['ask_'+idx for idx in range(1, len(bids))])
columns.append(['bid'+ ])
save_df = pd.Dataframe(columns=)
for i in range(0,days):
    get_data(seed)
    seed += days+i
    
    

ABIDES: Agent-Based Interactive Discrete Event Simulation

Simulation Start Time: 2022-03-16 13:47:55.057798
Configuration seed: 113

2020-06-03 00:00:00 2020-06-03 16:01:00

--- Simulation time: 2020-06-03 00:00:00, messages processed: 0, wallclock elapsed: 0 days 00:00:00.000020 ---


--- Simulation time: 2020-06-03 09:04:20.237670784, messages processed: 100000, wallclock elapsed: 0 days 00:00:07.295033 ---


--- Simulation time: 2020-06-03 09:15:49.444195683, messages processed: 200000, wallclock elapsed: 0 days 00:00:15.317221 ---


--- Simulation time: 2020-06-03 09:26:59.740688778, messages processed: 300000, wallclock elapsed: 0 days 00:00:25.430089 ---


--- Simulation time: 2020-06-03 09:38:12.102491070, messages processed: 400000, wallclock elapsed: 0 days 00:00:37.575406 ---


--- Simulation time: 2020-06-03 09:49:12.696674880, messages processed: 500000, wallclock elapsed: 0 days 00:00:50.976067 ---


--- Simulation time: 2020-06-03 10:00:17.929408347, messages processed: 

KeyboardInterrupt: 

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [13]:
df = pd.DataFrame(data = prices, 
                  index = range(1,len(prices))
                  columns = ['close'])


Unnamed: 0,close
0,99955
1,99957
2,99956
3,99983
4,100293
...,...
835,97916
836,97661
837,97390
838,97497


In [8]:
scl = MinMaxScaler()
pp = np.array(prices).reshape(-1,1)
scl.fit_transform(pp)


array([[0.47818648],
       [0.47837658],
       [0.47828153],
       [0.48084783],
       [0.51031271],
       [0.49786142],
       [0.48094288],
       [0.48769128],
       [0.50061781],
       [0.51278396],
       [0.50784146],
       [0.49396445],
       [0.48579032],
       [0.49301397],
       [0.50489497],
       [0.50793651],
       [0.50080791],
       [0.49539017],
       [0.47457466],
       [0.44682064],
       [0.44663055],
       [0.44862656],
       [0.44625036],
       [0.43893166],
       [0.42591008],
       [0.3942591 ],
       [0.42068244],
       [0.43560498],
       [0.43712575],
       [0.44244844],
       [0.43750594],
       [0.43646041],
       [0.43579508],
       [0.4417831 ],
       [0.45613535],
       [0.4558502 ],
       [0.42163292],
       [0.41792605],
       [0.42182302],
       [0.42106264],
       [0.4273358 ],
       [0.42828628],
       [0.4203973 ],
       [0.43018724],
       [0.43075753],
       [0.40537972],
       [0.39435415],
       [0.417

In [4]:
prices

[99955,
 100004,
 99997,
 99999,
 100010,
 100008,
 99997,
 99994,
 100014,
 100024,
 100002,
 99992,
 99982,
 99977,
 99983,
 99982,
 99971,
 99975,
 99955,
 99933,
 99930,
 99930,
 99932,
 99929,
 99921,
 99902,
 99912,
 99917,
 99922,
 99938,
 99930,
 99943,
 99957,
 99966,
 99975,
 99977,
 99956,
 99938,
 99948,
 99946,
 99949,
 99952,
 99942,
 99968,
 99964,
 99943,
 99921,
 99951,
 99959,
 99950,
 99921,
 99907,
 99901,
 99943,
 99937,
 99934,
 99934,
 99934,
 99931,
 99945,
 99947,
 99942,
 99932,
 99917,
 99891,
 99846,
 99831,
 99829,
 99834,
 99856,
 99854,
 99876,
 99888,
 99905,
 99924,
 99969,
 99987,
 99996,
 100003,
 100009,
 100016,
 100024,
 100031,
 100038,
 100045,
 100066,
 100057,
 100046,
 100009,
 99990,
 99990,
 99946,
 99928,
 99930,
 99912,
 99888,
 99874,
 99872,
 99867,
 99860,
 99844,
 99834,
 99842,
 99830,
 99845,
 99861,
 99848,
 99858,
 99872,
 99874,
 99877,
 99878,
 99904,
 99906,
 99881,
 99878,
 99881,
 99881,
 99892,
 99945,
 99953,
 99957,
 99954,

In [19]:
df = pd.read_csv('orderbook.csv',index_col=False)
df.columns=['ask1','ask1_v','bid1','bid1_v']
df.astype(np.int64)

Unnamed: 0,ask1,ask1_v,bid1,bid1_v
0,9999999999,9999999999,99981,39
1,9999999999,9999999999,99981,39
2,9999999999,9999999999,99953,29
3,9999999999,9999999999,99918,21
4,99970,37,99918,21
...,...,...,...,...
36094,99952,124,99946,15
36095,99952,124,99951,41
36096,99946,26,99945,20
36097,99945,10,99944,249


In [20]:
df

Unnamed: 0,ask1,ask1_v,bid1,bid1_v
0,1.000000e+10,1.000000e+10,99981.0,39.0
1,1.000000e+10,1.000000e+10,99981.0,39.0
2,1.000000e+10,1.000000e+10,99953.0,29.0
3,1.000000e+10,1.000000e+10,99918.0,21.0
4,9.997000e+04,3.700000e+01,99918.0,21.0
...,...,...,...,...
36094,9.995200e+04,1.240000e+02,99946.0,15.0
36095,9.995200e+04,1.240000e+02,99951.0,41.0
36096,9.994600e+04,2.600000e+01,99945.0,20.0
36097,9.994500e+04,1.000000e+01,99944.0,249.0


In [2]:
processed_orderbook, transacted_orders, cleaned_orderbook = create_orderbooks(os.path.join(root_path,'log/log/experimental_agent_demo_short_2min_long_5min_413/EXCHANGE_AGENT.bz2'), os.path.join(root_path,'log/log/experimental_agent_demo_short_2min_long_5min_413/ORDERBOOK_ABM_FULL.bz2'))

Constructing orderbook...


Processing order book: 100%|██████████| 36100/36100 [00:05<00:00, 6477.41it/s]


Orderbook construction complete!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transacted_orders['SIZE'] = transacted_orders['SIZE'] / 2


In [5]:
cleaned_orderbook

Unnamed: 0,ORDER_ID,PRICE,SIZE,BUY_SELL_FLAG,TYPE,ask_price_1,ask_size_1,bid_price_1,bid_size_1,MID_PRICE,SPREAD,ORDER_VOLUME_IMBALANCE,VWAP
1970-01-01 09:30:00.020425499,8,999.70,37,0,LIMIT_ORDER,99970.0,37.0,99918.0,21.0,999.440,0.52,0.637931,999.383563
1970-01-01 09:30:00.020700299,46,998.17,34,1,LIMIT_ORDER,99918.0,18.0,99817.0,34.0,998.675,1.01,0.346154,999.343981
1970-01-01 09:30:00.020703206,47,999.81,45,0,LIMIT_ORDER,99918.0,18.0,99817.0,34.0,998.675,1.01,0.346154,999.343981
1970-01-01 09:30:00.020707432,48,999.81,36,0,LIMIT_ORDER,99918.0,18.0,99817.0,34.0,998.675,1.01,0.346154,999.343981
1970-01-01 09:30:00.020713628,49,999.81,30,0,LIMIT_ORDER,99918.0,18.0,99817.0,34.0,998.675,1.01,0.346154,999.343981
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 10:29:59.708801583,51414,999.46,15,True,ORDER_EXECUTED,99946.0,26.0,99945.0,20.0,999.455,0.01,0.565217,999.377699
1970-01-01 10:29:59.898988886,51419,999.45,30,0,LIMIT_ORDER,99945.0,10.0,99944.0,249.0,999.445,0.01,0.038610,999.377699
1970-01-01 10:29:59.898988886,51419,999.45,20,0,ORDER_EXECUTED,99945.0,10.0,99944.0,249.0,999.445,0.01,0.038610,999.377700
1970-01-01 10:29:59.898988886,51409,999.45,20,True,ORDER_EXECUTED,99945.0,10.0,99944.0,249.0,999.445,0.01,0.038610,999.377701


In [20]:
prices = transacted_orders['PRICE']

1970-01-01 09:30:00.020755341     999.18
1970-01-01 09:30:00.020755341    1000.00
1970-01-01 09:30:00.020755341     999.70
1970-01-01 09:30:00.020755341    1000.00
1970-01-01 09:30:00.020884282     999.70
                                  ...   
1970-01-01 10:29:59.156436327     999.46
1970-01-01 10:29:59.708801583     999.46
1970-01-01 10:29:59.708801583     999.46
1970-01-01 10:29:59.898988886     999.45
1970-01-01 10:29:59.898988886     999.45
Name: PRICE, Length: 60604, dtype: float64

In [28]:
ddf = pd.read_csv('1minAllStock.csv')



0         False
1         False
2         False
3         False
4         False
          ...  
265672    False
265673    False
265674    False
265675    False
265676    False
Name: close, Length: 265677, dtype: bool

In [40]:
price_df = ddf[ddf['stock'] == 'AAL']['close']
price_df

0         28.8000
1         28.9000
2         28.9000
3         28.9000
4         28.8500
           ...   
265672    23.3400
265673    23.3499
265674    23.3400
265675    23.3500
265676    23.3600
Name: close, Length: 265677, dtype: float64

In [42]:
import gym
from gym import spaces
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

class CustomEnv(gym.Env):
    """Custom Environment that follows gym interface"""
    metadata = {'render.modes': ['human']}

    def __init__(self,):
        super(CustomEnv, self).__init__()
        # Define action and observation space
        # They must be gym.spaces objects
        # Example when using discrete actions:
        self.action_space = spaces.Discrete(3)
        # Example for using image as input (channel-first; channel-last also works):
        self.observation_space = spaces.Box(low=0, high=2, shape=(1, ), dtype=np.float64)
        self.idx = 0
        self.hold_price = -1
        self.price = price_df.to_numpy()
    def step(self, action):
        reward = 0
        if action == 0: #sell
            if self.hold_price != -1:
                reward = self.price[self.idx] - self.hold_price
                self.hold_price = -1
        if action == 2:
            if self.hold_price != -1:
                self.hold_price = self.hold_price + self.price[self.idx] / 2
            else:
                self.hold_price = self.price[self.idx]
                
                
        observation = self.price[self.idx]
        
        self.idx += 1
        done = (self.idx == len(self.price))
        if done:
            print(self.idx,'****************************************************')
#         print(observation)
        return observation, reward, done, {}
    def reset(self):
        self.idx = 0
        observation = self.price[self.idx]
#         print(observation)
        return observation  # reward, done, info can't be included
    def render(self, mode='human'):
        ...
    def close (self):
        ...

In [43]:
env = CustomEnv()
# Define and Train the agent
model = A2C('MlpPolicy', env, verbose=1).learn(total_timesteps=300000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 1505     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.629   |
|    explained_variance | 0.0489   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.00243  |
|    value_loss         | 1.48e-05 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1503     |
|    iterations         | 200      |
|    time_elapsed       | 0        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.323   |
|    explained_variance | -7.84    |
|    learning_rate      | 0.0007   |
|    n_updates    

------------------------------------
| time/                 |          |
|    fps                | 1479     |
|    iterations         | 1700     |
|    time_elapsed       | 5        |
|    total_timesteps    | 8500     |
| train/                |          |
|    entropy_loss       | -0.0139  |
|    explained_variance | -0.0022  |
|    learning_rate      | 0.0007   |
|    n_updates          | 1699     |
|    policy_loss        | 8.59e-07 |
|    value_loss         | 2.98e-07 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1482     |
|    iterations         | 1800     |
|    time_elapsed       | 6        |
|    total_timesteps    | 9000     |
| train/                |          |
|    entropy_loss       | -0.00885 |
|    explained_variance | 0.00195  |
|    learning_rate      | 0.0007   |
|    n_updates          | 1799     |
|    policy_loss        | 5.96e-07 |
|    value_loss         | 4.02e-07 |
-

------------------------------------
| time/                 |          |
|    fps                | 1498     |
|    iterations         | 3300     |
|    time_elapsed       | 11       |
|    total_timesteps    | 16500    |
| train/                |          |
|    entropy_loss       | -0.00612 |
|    explained_variance | -0.0341  |
|    learning_rate      | 0.0007   |
|    n_updates          | 3299     |
|    policy_loss        | 9.49e-08 |
|    value_loss         | 2.13e-08 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1498      |
|    iterations         | 3400      |
|    time_elapsed       | 11        |
|    total_timesteps    | 17000     |
| train/                |           |
|    entropy_loss       | -0.00607  |
|    explained_variance | -0.0472   |
|    learning_rate      | 0.0007    |
|    n_updates          | 3399      |
|    policy_loss        | -7.91e-07 |
|    value_loss         | 

------------------------------------
| time/                 |          |
|    fps                | 1512     |
|    iterations         | 4900     |
|    time_elapsed       | 16       |
|    total_timesteps    | 24500    |
| train/                |          |
|    entropy_loss       | -0.00724 |
|    explained_variance | 0.048    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4899     |
|    policy_loss        | 6.96e-07 |
|    value_loss         | 7.84e-07 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1514     |
|    iterations         | 5000     |
|    time_elapsed       | 16       |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -0.00723 |
|    explained_variance | 0.365    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4999     |
|    policy_loss        | 8.09e-07 |
|    value_loss         | 1.03e-06 |
-

-------------------------------------
| time/                 |           |
|    fps                | 1525      |
|    iterations         | 6500      |
|    time_elapsed       | 21        |
|    total_timesteps    | 32500     |
| train/                |           |
|    entropy_loss       | -0.0121   |
|    explained_variance | -0.0268   |
|    learning_rate      | 0.0007    |
|    n_updates          | 6499      |
|    policy_loss        | -6.88e-07 |
|    value_loss         | 2.36e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1526      |
|    iterations         | 6600      |
|    time_elapsed       | 21        |
|    total_timesteps    | 33000     |
| train/                |           |
|    entropy_loss       | -0.0142   |
|    explained_variance | -0.000551 |
|    learning_rate      | 0.0007    |
|    n_updates          | 6599      |
|    policy_loss        | -9.26e-07 |
|    value_l

------------------------------------
| time/                 |          |
|    fps                | 1534     |
|    iterations         | 8100     |
|    time_elapsed       | 26       |
|    total_timesteps    | 40500    |
| train/                |          |
|    entropy_loss       | -0.0102  |
|    explained_variance | -38.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 8099     |
|    policy_loss        | 1.22e-07 |
|    value_loss         | 2.07e-08 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1535      |
|    iterations         | 8200      |
|    time_elapsed       | 26        |
|    total_timesteps    | 41000     |
| train/                |           |
|    entropy_loss       | -0.00736  |
|    explained_variance | 0.256     |
|    learning_rate      | 0.0007    |
|    n_updates          | 8199      |
|    policy_loss        | -2.49e-07 |
|    value_loss         | 

-------------------------------------
| time/                 |           |
|    fps                | 1537      |
|    iterations         | 9700      |
|    time_elapsed       | 31        |
|    total_timesteps    | 48500     |
| train/                |           |
|    entropy_loss       | -0.00346  |
|    explained_variance | -0.41     |
|    learning_rate      | 0.0007    |
|    n_updates          | 9699      |
|    policy_loss        | -3.16e-08 |
|    value_loss         | 9.82e-09  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1536     |
|    iterations         | 9800     |
|    time_elapsed       | 31       |
|    total_timesteps    | 49000    |
| train/                |          |
|    entropy_loss       | -0.00346 |
|    explained_variance | -0.121   |
|    learning_rate      | 0.0007   |
|    n_updates          | 9799     |
|    policy_loss        | 1.66e-07 |
|    value_loss         

------------------------------------
| time/                 |          |
|    fps                | 1537     |
|    iterations         | 11300    |
|    time_elapsed       | 36       |
|    total_timesteps    | 56500    |
| train/                |          |
|    entropy_loss       | -0.0031  |
|    explained_variance | -0.442   |
|    learning_rate      | 0.0007   |
|    n_updates          | 11299    |
|    policy_loss        | 1.1e-07  |
|    value_loss         | 1.38e-07 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1537     |
|    iterations         | 11400    |
|    time_elapsed       | 37       |
|    total_timesteps    | 57000    |
| train/                |          |
|    entropy_loss       | -0.00347 |
|    explained_variance | -0.72    |
|    learning_rate      | 0.0007   |
|    n_updates          | 11399    |
|    policy_loss        | 2.58e-07 |
|    value_loss         | 5.88e-07 |
-

------------------------------------
| time/                 |          |
|    fps                | 1537     |
|    iterations         | 12900    |
|    time_elapsed       | 41       |
|    total_timesteps    | 64500    |
| train/                |          |
|    entropy_loss       | -0.00319 |
|    explained_variance | 0.14     |
|    learning_rate      | 0.0007   |
|    n_updates          | 12899    |
|    policy_loss        | 1.13e-07 |
|    value_loss         | 1.27e-07 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1538      |
|    iterations         | 13000     |
|    time_elapsed       | 42        |
|    total_timesteps    | 65000     |
| train/                |           |
|    entropy_loss       | -0.00317  |
|    explained_variance | 0.0201    |
|    learning_rate      | 0.0007    |
|    n_updates          | 12999     |
|    policy_loss        | -3.34e-07 |
|    value_loss         | 

-------------------------------------
| time/                 |           |
|    fps                | 1538      |
|    iterations         | 14500     |
|    time_elapsed       | 47        |
|    total_timesteps    | 72500     |
| train/                |           |
|    entropy_loss       | -0.00312  |
|    explained_variance | 0.0204    |
|    learning_rate      | 0.0007    |
|    n_updates          | 14499     |
|    policy_loss        | -1.57e-07 |
|    value_loss         | 2.54e-07  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1538     |
|    iterations         | 14600    |
|    time_elapsed       | 47       |
|    total_timesteps    | 73000    |
| train/                |          |
|    entropy_loss       | -0.00313 |
|    explained_variance | 0.0453   |
|    learning_rate      | 0.0007   |
|    n_updates          | 14599    |
|    policy_loss        | 1.26e-07 |
|    value_loss         

------------------------------------
| time/                 |          |
|    fps                | 1538     |
|    iterations         | 16100    |
|    time_elapsed       | 52       |
|    total_timesteps    | 80500    |
| train/                |          |
|    entropy_loss       | -0.00313 |
|    explained_variance | -0.0197  |
|    learning_rate      | 0.0007   |
|    n_updates          | 16099    |
|    policy_loss        | 1.4e-07  |
|    value_loss         | 2.05e-07 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1538     |
|    iterations         | 16200    |
|    time_elapsed       | 52       |
|    total_timesteps    | 81000    |
| train/                |          |
|    entropy_loss       | -0.00313 |
|    explained_variance | 0.275    |
|    learning_rate      | 0.0007   |
|    n_updates          | 16199    |
|    policy_loss        | 7.1e-08  |
|    value_loss         | 5.28e-08 |
-

-------------------------------------
| time/                 |           |
|    fps                | 1536      |
|    iterations         | 17700     |
|    time_elapsed       | 57        |
|    total_timesteps    | 88500     |
| train/                |           |
|    entropy_loss       | -0.00313  |
|    explained_variance | -0.0435   |
|    learning_rate      | 0.0007    |
|    n_updates          | 17699     |
|    policy_loss        | -2.56e-07 |
|    value_loss         | 6.88e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1536      |
|    iterations         | 17800     |
|    time_elapsed       | 57        |
|    total_timesteps    | 89000     |
| train/                |           |
|    entropy_loss       | -0.00314  |
|    explained_variance | 0.0443    |
|    learning_rate      | 0.0007    |
|    n_updates          | 17799     |
|    policy_loss        | -1.88e-07 |
|    value_l

------------------------------------
| time/                 |          |
|    fps                | 1530     |
|    iterations         | 19300    |
|    time_elapsed       | 63       |
|    total_timesteps    | 96500    |
| train/                |          |
|    entropy_loss       | -0.00363 |
|    explained_variance | 0.0256   |
|    learning_rate      | 0.0007   |
|    n_updates          | 19299    |
|    policy_loss        | 1.99e-07 |
|    value_loss         | 3.01e-07 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1530     |
|    iterations         | 19400    |
|    time_elapsed       | 63       |
|    total_timesteps    | 97000    |
| train/                |          |
|    entropy_loss       | -0.00363 |
|    explained_variance | -0.0574  |
|    learning_rate      | 0.0007   |
|    n_updates          | 19399    |
|    policy_loss        | 2.44e-07 |
|    value_loss         | 4.5e-07  |
-

-------------------------------------
| time/                 |           |
|    fps                | 1525      |
|    iterations         | 20900     |
|    time_elapsed       | 68        |
|    total_timesteps    | 104500    |
| train/                |           |
|    entropy_loss       | -0.00153  |
|    explained_variance | 0.0523    |
|    learning_rate      | 0.0007    |
|    n_updates          | 20899     |
|    policy_loss        | -3.09e-08 |
|    value_loss         | 4.99e-08  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1525      |
|    iterations         | 21000     |
|    time_elapsed       | 68        |
|    total_timesteps    | 105000    |
| train/                |           |
|    entropy_loss       | -0.000995 |
|    explained_variance | -16.9     |
|    learning_rate      | 0.0007    |
|    n_updates          | 20999     |
|    policy_loss        | 3.86e-10  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1523      |
|    iterations         | 22500     |
|    time_elapsed       | 73        |
|    total_timesteps    | 112500    |
| train/                |           |
|    entropy_loss       | -0.0011   |
|    explained_variance | 0.0285    |
|    learning_rate      | 0.0007    |
|    n_updates          | 22499     |
|    policy_loss        | -3.29e-08 |
|    value_loss         | 1.23e-07  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1524     |
|    iterations         | 22600    |
|    time_elapsed       | 74       |
|    total_timesteps    | 113000   |
| train/                |          |
|    entropy_loss       | -0.0011  |
|    explained_variance | -0.0143  |
|    learning_rate      | 0.0007   |
|    n_updates          | 22599    |
|    policy_loss        | 8.87e-08 |
|    value_loss         

-------------------------------------
| time/                 |           |
|    fps                | 1526      |
|    iterations         | 24100     |
|    time_elapsed       | 78        |
|    total_timesteps    | 120500    |
| train/                |           |
|    entropy_loss       | -0.000671 |
|    explained_variance | -0.227    |
|    learning_rate      | 0.0007    |
|    n_updates          | 24099     |
|    policy_loss        | 2.86e-08  |
|    value_loss         | 2.64e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1526      |
|    iterations         | 24200     |
|    time_elapsed       | 79        |
|    total_timesteps    | 121000    |
| train/                |           |
|    entropy_loss       | -0.000671 |
|    explained_variance | -0.0422   |
|    learning_rate      | 0.0007    |
|    n_updates          | 24199     |
|    policy_loss        | 2e-08     |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1528      |
|    iterations         | 25700     |
|    time_elapsed       | 84        |
|    total_timesteps    | 128500    |
| train/                |           |
|    entropy_loss       | -0.000671 |
|    explained_variance | -0.422    |
|    learning_rate      | 0.0007    |
|    n_updates          | 25699     |
|    policy_loss        | -5.45e-09 |
|    value_loss         | 8.89e-09  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1528      |
|    iterations         | 25800     |
|    time_elapsed       | 84        |
|    total_timesteps    | 129000    |
| train/                |           |
|    entropy_loss       | -0.000671 |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 25799     |
|    policy_loss        | -2.93e-08 |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1529      |
|    iterations         | 27300     |
|    time_elapsed       | 89        |
|    total_timesteps    | 136500    |
| train/                |           |
|    entropy_loss       | -0.000672 |
|    explained_variance | -0.394    |
|    learning_rate      | 0.0007    |
|    n_updates          | 27299     |
|    policy_loss        | 3.52e-09  |
|    value_loss         | 3.9e-09   |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1529      |
|    iterations         | 27400     |
|    time_elapsed       | 89        |
|    total_timesteps    | 137000    |
| train/                |           |
|    entropy_loss       | -0.000671 |
|    explained_variance | -0.146    |
|    learning_rate      | 0.0007    |
|    n_updates          | 27399     |
|    policy_loss        | 1.49e-08  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1528      |
|    iterations         | 28900     |
|    time_elapsed       | 94        |
|    total_timesteps    | 144500    |
| train/                |           |
|    entropy_loss       | -0.000792 |
|    explained_variance | -0.489    |
|    learning_rate      | 0.0007    |
|    n_updates          | 28899     |
|    policy_loss        | 1.87e-08  |
|    value_loss         | 8.02e-08  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1528      |
|    iterations         | 29000     |
|    time_elapsed       | 94        |
|    total_timesteps    | 145000    |
| train/                |           |
|    entropy_loss       | -0.000792 |
|    explained_variance | 0.0212    |
|    learning_rate      | 0.0007    |
|    n_updates          | 28999     |
|    policy_loss        | 3.41e-08  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 30500     |
|    time_elapsed       | 99        |
|    total_timesteps    | 152500    |
| train/                |           |
|    entropy_loss       | -0.000513 |
|    explained_variance | -0.739    |
|    learning_rate      | 0.0007    |
|    n_updates          | 30499     |
|    policy_loss        | 6.95e-09  |
|    value_loss         | 3.1e-08   |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 30600     |
|    time_elapsed       | 100       |
|    total_timesteps    | 153000    |
| train/                |           |
|    entropy_loss       | -0.000512 |
|    explained_variance | -0.278    |
|    learning_rate      | 0.0007    |
|    n_updates          | 30599     |
|    policy_loss        | -3.41e-08 |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 32100     |
|    time_elapsed       | 105       |
|    total_timesteps    | 160500    |
| train/                |           |
|    entropy_loss       | -0.000511 |
|    explained_variance | -0.0479   |
|    learning_rate      | 0.0007    |
|    n_updates          | 32099     |
|    policy_loss        | 1.42e-08  |
|    value_loss         | 1.18e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 32200     |
|    time_elapsed       | 105       |
|    total_timesteps    | 161000    |
| train/                |           |
|    entropy_loss       | -0.000512 |
|    explained_variance | -0.0747   |
|    learning_rate      | 0.0007    |
|    n_updates          | 32199     |
|    policy_loss        | 3.04e-08  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1528      |
|    iterations         | 33700     |
|    time_elapsed       | 110       |
|    total_timesteps    | 168500    |
| train/                |           |
|    entropy_loss       | -0.000512 |
|    explained_variance | 0.0115    |
|    learning_rate      | 0.0007    |
|    n_updates          | 33699     |
|    policy_loss        | 1.55e-08  |
|    value_loss         | 1.4e-07   |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1528      |
|    iterations         | 33800     |
|    time_elapsed       | 110       |
|    total_timesteps    | 169000    |
| train/                |           |
|    entropy_loss       | -0.000511 |
|    explained_variance | -0.0386   |
|    learning_rate      | 0.0007    |
|    n_updates          | 33799     |
|    policy_loss        | 2.01e-08  |
|    value_l

------------------------------------
| time/                 |          |
|    fps                | 1527     |
|    iterations         | 35300    |
|    time_elapsed       | 115      |
|    total_timesteps    | 176500   |
| train/                |          |
|    entropy_loss       | -0.00051 |
|    explained_variance | -0.0318  |
|    learning_rate      | 0.0007   |
|    n_updates          | 35299    |
|    policy_loss        | 2.36e-08 |
|    value_loss         | 3.26e-07 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 35400     |
|    time_elapsed       | 115       |
|    total_timesteps    | 177000    |
| train/                |           |
|    entropy_loss       | -0.000416 |
|    explained_variance | 0.0312    |
|    learning_rate      | 0.0007    |
|    n_updates          | 35399     |
|    policy_loss        | 1.24e-08  |
|    value_loss         | 

-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 36900     |
|    time_elapsed       | 120       |
|    total_timesteps    | 184500    |
| train/                |           |
|    entropy_loss       | -0.000416 |
|    explained_variance | 0.00953   |
|    learning_rate      | 0.0007    |
|    n_updates          | 36899     |
|    policy_loss        | 1.36e-08  |
|    value_loss         | 1.71e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 37000     |
|    time_elapsed       | 121       |
|    total_timesteps    | 185000    |
| train/                |           |
|    entropy_loss       | -0.000416 |
|    explained_variance | -0.00317  |
|    learning_rate      | 0.0007    |
|    n_updates          | 36999     |
|    policy_loss        | 1.61e-08  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 38500     |
|    time_elapsed       | 126       |
|    total_timesteps    | 192500    |
| train/                |           |
|    entropy_loss       | -0.000417 |
|    explained_variance | -0.00017  |
|    learning_rate      | 0.0007    |
|    n_updates          | 38499     |
|    policy_loss        | 1.53e-08  |
|    value_loss         | 2.11e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 38600     |
|    time_elapsed       | 126       |
|    total_timesteps    | 193000    |
| train/                |           |
|    entropy_loss       | -0.000417 |
|    explained_variance | 0.00473   |
|    learning_rate      | 0.0007    |
|    n_updates          | 38599     |
|    policy_loss        | 1.24e-08  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 40100     |
|    time_elapsed       | 131       |
|    total_timesteps    | 200500    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | 0.00375   |
|    learning_rate      | 0.0007    |
|    n_updates          | 40099     |
|    policy_loss        | 8.63e-09  |
|    value_loss         | 1.73e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1527      |
|    iterations         | 40200     |
|    time_elapsed       | 131       |
|    total_timesteps    | 201000    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | -0.000585 |
|    learning_rate      | 0.0007    |
|    n_updates          | 40199     |
|    policy_loss        | 8.52e-09  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1528      |
|    iterations         | 41700     |
|    time_elapsed       | 136       |
|    total_timesteps    | 208500    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | -0.0078   |
|    learning_rate      | 0.0007    |
|    n_updates          | 41699     |
|    policy_loss        | 9.28e-09  |
|    value_loss         | 2e-07     |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1528      |
|    iterations         | 41800     |
|    time_elapsed       | 136       |
|    total_timesteps    | 209000    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | -0.00713  |
|    learning_rate      | 0.0007    |
|    n_updates          | 41799     |
|    policy_loss        | 8.47e-09  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1529      |
|    iterations         | 43300     |
|    time_elapsed       | 141       |
|    total_timesteps    | 216500    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | 0.0105    |
|    learning_rate      | 0.0007    |
|    n_updates          | 43299     |
|    policy_loss        | 8.34e-09  |
|    value_loss         | 1.62e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1529      |
|    iterations         | 43400     |
|    time_elapsed       | 141       |
|    total_timesteps    | 217000    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | 0.0376    |
|    learning_rate      | 0.0007    |
|    n_updates          | 43399     |
|    policy_loss        | 1.04e-08  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 44900     |
|    time_elapsed       | 146       |
|    total_timesteps    | 224500    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | 0.0234    |
|    learning_rate      | 0.0007    |
|    n_updates          | 44899     |
|    policy_loss        | 9.5e-09   |
|    value_loss         | 2.09e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 45000     |
|    time_elapsed       | 147       |
|    total_timesteps    | 225000    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | 0.0131    |
|    learning_rate      | 0.0007    |
|    n_updates          | 44999     |
|    policy_loss        | 3.35e-09  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 46500     |
|    time_elapsed       | 151       |
|    total_timesteps    | 232500    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | 0.00142   |
|    learning_rate      | 0.0007    |
|    n_updates          | 46499     |
|    policy_loss        | 7.6e-09   |
|    value_loss         | 1.34e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 46600     |
|    time_elapsed       | 152       |
|    total_timesteps    | 233000    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | -0.00138  |
|    learning_rate      | 0.0007    |
|    n_updates          | 46599     |
|    policy_loss        | 7.82e-09  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1529      |
|    iterations         | 48100     |
|    time_elapsed       | 157       |
|    total_timesteps    | 240500    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | 0.00228   |
|    learning_rate      | 0.0007    |
|    n_updates          | 48099     |
|    policy_loss        | 8.61e-09  |
|    value_loss         | 1.72e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1529      |
|    iterations         | 48200     |
|    time_elapsed       | 157       |
|    total_timesteps    | 241000    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | -0.0111   |
|    learning_rate      | 0.0007    |
|    n_updates          | 48199     |
|    policy_loss        | 8.43e-09  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1529      |
|    iterations         | 49700     |
|    time_elapsed       | 162       |
|    total_timesteps    | 248500    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | -0.0115   |
|    learning_rate      | 0.0007    |
|    n_updates          | 49699     |
|    policy_loss        | 8.81e-09  |
|    value_loss         | 1.81e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1529      |
|    iterations         | 49800     |
|    time_elapsed       | 162       |
|    total_timesteps    | 249000    |
| train/                |           |
|    entropy_loss       | -0.000274 |
|    explained_variance | -0.0117   |
|    learning_rate      | 0.0007    |
|    n_updates          | 49799     |
|    policy_loss        | 8.39e-09  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 51300     |
|    time_elapsed       | 167       |
|    total_timesteps    | 256500    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | -0.0031   |
|    learning_rate      | 0.0007    |
|    n_updates          | 51299     |
|    policy_loss        | 7.98e-09  |
|    value_loss         | 1.48e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 51400     |
|    time_elapsed       | 167       |
|    total_timesteps    | 257000    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | -0.0429   |
|    learning_rate      | 0.0007    |
|    n_updates          | 51399     |
|    policy_loss        | 7.84e-09  |
|    value_l

-------------------------------------
| time/                 |           |
|    fps                | 1531      |
|    iterations         | 52900     |
|    time_elapsed       | 172       |
|    total_timesteps    | 264500    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | -0.000897 |
|    learning_rate      | 0.0007    |
|    n_updates          | 52899     |
|    policy_loss        | 8.28e-09  |
|    value_loss         | 1.59e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1531      |
|    iterations         | 53000     |
|    time_elapsed       | 173       |
|    total_timesteps    | 265000    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | -0.00265  |
|    learning_rate      | 0.0007    |
|    n_updates          | 52999     |
|    policy_loss        | 8.48e-09  |
|    value_l

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1531      |
|    iterations         | 54300     |
|    time_elapsed       | 177       |
|    total_timesteps    | 271500    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | -0.0003   |
|    learning_rate      | 0.0007    |
|    n_updates          | 54299     |
|    policy_loss        | -8.28e-09 |
|    value_loss         | 1.59e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1531      |
|    iterations         | 54400     |
|    time_elapsed       | 177       |
|    total_timesteps    | 272000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 55600     |
|    time_elapsed       | 181       |
|    total_timesteps    | 278000    |
| train/                |           |
|    entropy_loss       | -0.000273 |
|    explained_variance | 0.0102    |
|    learning_rate      | 0.0007    |
|    n_updates          | 55599     |
|    policy_loss        | -8.24e-09 |
|    value_loss         | 1.58e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 55700     |
|    time_elapsed       | 181       |
|    total_timesteps    | 278500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 56900     |
|    time_elapsed       | 185       |
|    total_timesteps    | 284500    |
| train/                |           |
|    entropy_loss       | -0.000199 |
|    explained_variance | 0.00619   |
|    learning_rate      | 0.0007    |
|    n_updates          | 56899     |
|    policy_loss        | -4.84e-09 |
|    value_loss         | 1.08e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 57000     |
|    time_elapsed       | 186       |
|    total_timesteps    | 285000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 58200     |
|    time_elapsed       | 190       |
|    total_timesteps    | 291000    |
| train/                |           |
|    entropy_loss       | -0.0002   |
|    explained_variance | -1.42     |
|    learning_rate      | 0.0007    |
|    n_updates          | 58199     |
|    policy_loss        | -3.95e-10 |
|    value_loss         | 7.85e-10  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1530      |
|    iterations         | 58300     |
|    time_elapsed       | 190       |
|    total_timesteps    | 291500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1531      |
|    iterations         | 59500     |
|    time_elapsed       | 194       |
|    total_timesteps    | 297500    |
| train/                |           |
|    entropy_loss       | -0.000199 |
|    explained_variance | 0.0279    |
|    learning_rate      | 0.0007    |
|    n_updates          | 59499     |
|    policy_loss        | -1.33e-09 |
|    value_loss         | 8.15e-09  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 2.66e+05  |
|    ep_rew_mean        | -1.07e+03 |
| time/                 |           |
|    fps                | 1531      |
|    iterations         | 59600     |
|    time_elapsed       | 194       |
|    total_timesteps    | 298000    |
| train/    

In [54]:
import torch
import torch as th
import torch.nn as nn

class Mlp(nn.Module):  

    def __init__(self, n_inputs=1, n_actions=3):
        nn.Module.__init__(self)

        self.fc1 = nn.Linear(n_inputs, 64)
        self.fc2 = nn.Linear(64, 64)      
        self.fc3 = nn.Linear(64, n_actions)      
        self.activ_fn = nn.Tanh()
        self.out_activ = nn.Softmax(dim=0)

    def forward(self, x):
        x = self.activ_fn(self.fc1(x.view(-1,)))
        x = self.activ_fn(self.fc2(x))
        x = self.out_activ(self.fc3(x))
        return x


def copy_mlp_weights(baselines_model):
    torch_mlp = Mlp(n_inputs=1, n_actions=3)
    model_params = baselines_model.get_parameters()

    policy_keys = [key for key in model_params.keys() if "pi" in key]
    policy_params = [model_params[key] for key in policy_keys]
    
    for (th_key, pytorch_param), key, policy_param in zip(torch_mlp.named_parameters(), policy_keys, policy_params):
        param = th.from_numpy(policy_param)    
        #Copies parameters from baselines model to pytorch model
        print(th_key, key)
        print(pytorch_param.shape, param.shape, policy_param.shape)
        pytorch_param.data.copy_(param.data.clone().t())
    
    return torch_mlp

In [55]:
th_model = copy_mlp_weights(model)
# device = torch.device("cuda" if True else "cpu")
# th_model.to(device)
device = 'cpu'

## Adversarial Attack

In [None]:
loss = nn.CrossEntropyLoss()
eps = 0.01

In [68]:
def fgsm_attack(model, loss, price_vec, actions, eps = 0.007) :
    
#     images = images.to(device)
#     labels = labels.to(device)
    price_vec.requires_grad = True
            
    outputs = model(price_vec).view(1,3)
    
    model.zero_grad()
#     print(outputs, actions)
#     print(outputs.size(), actions)

    cost = loss(outputs, actions)
    cost.backward()
    
    attack_price = price_vec + eps*price_vec.grad.sign()
#     attack_price = torch.clamp(attack_price)
    if price_vec.grad.sign() > 0:
        print(price_vec , eps*price_vec.grad.sign(),attack_price)
    return attack_price

In [71]:
print("Attack price & Predicted action")

# model.eval()

correct = 0
total = 0

for prices in price_df.to_numpy():
    prices = th.Tensor(np.array(prices))
    actions = th_model(prices).argmax().view(-1,)
#     print('original', prices, actions)
    prices = fgsm_attack(th_model, loss, prices, actions, eps)
    
    labels = actions
    outputs = th_model(prices).view(1,3)
#     print(outputs.argmax())
    _, pre = torch.max(outputs.data, 1)
    
    total += 1
    correct += (pre == labels).sum()
#     print('changed',prices,pre)
    
#     print(labels,outputs)
#     imshow(torchvision.utils.make_grid(images.cpu().data, normalize=True), [normal_data.classes[i] for i in pre])
    
print('Accuracy of test text: %f %%' % (100 * float(correct) / total))

Attack price & Predicted action
Accuracy of test text: 100.000000 %
