# Homework week 8 : Reinforcement learning
## Reading the data

In [1]:
import numpy as np
import pandas as pd

In [94]:
ge = pd.read_csv("ge.csv")[["Close"]]
boa = pd.read_csv("bank_of_america.csv")[["Close"]]

print(ge.head())
print(boa.head())

       Close
0  19.950001
1  20.070000
2  20.020000
3  19.959999
4  19.740000
   Close
0   9.53
1   9.57
2   9.68
3   9.49
4   9.20


## Splitting it into train and test

In [95]:
def split_train_test(data, proportion=0.8):
    # For this exercise, chronology matters. Therefore, we will not shuffle the data,
    # but instead, use the start of the data set for training and the end for testing
    train_data = data[:int(proportion * data.shape[0])]\
            .reset_index(drop=True)
    test_data = data[int(proportion * data.shape[0]):]\
            .reset_index(drop=True)
    
    return train_data, test_data

In [96]:
ge_train, ge_test = split_train_test(ge, 0.7)
print(ge_train.info())
print(ge_test.info())
boa_train, boa_test = split_train_test(boa, 0.7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 1 columns):
Close    879 non-null float64
dtypes: float64(1)
memory usage: 6.9 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 1 columns):
Close    378 non-null float64
dtypes: float64(1)
memory usage: 3.0 KB
None


## Create the environment
The purpose of our agent will be to invest money profitably. We will need to create an environment with the data which contains a portfolio that our agent will have to manage. This environment will be modeled inside a class.
### Reward function
To compute our reward, we will use the value of the portfolio. That value is the sum of total cash and total value of the actions owned by our agent. Let's define a fonction to compute that now.

In [5]:
def compute_portfolio(cash, n_actions, action_value):
    return cash + n_actions * action_value

### Actions
Our agent will have access to three different actions : buy, hold and sell. Buy and sell will sell "chunks" of actions whose constant size will remain the same throughout our experiment, ie the agent can only buy or sell a certain amount of actions each day.
* Each day, one of the three possible action will happen, then the function new_day will be called, updating the portfolio with the new value of the actions.

### Converting continuous to discrete
The decisions will be taken according to values in a Q matrix. Every elements in the Q matrix will be initialised to 0. In order to fit it in a Q matrix, we will need to make the data discrete. For this we will create an array which can hold a categorical value. This value can be of three types : "rise", "fall", or "stable" (coded as 1, -1 or 0). We could have any number of such columns, each looking at the difference between the present and a different day in the past.

In [8]:
def discretise(serie, threshold=0.001, n_days=1):
    """
    Takes a pd.Series, looks at the "Close" column and returns the type of difference between this 
    line ("rise", "fall" or "stable") and the line that is n_days behind it.
    """
    result = []
    for index, row in serie.iteritems():
        # If there hasn't been enough days since the start yet, we consider the value as "stable"
        if index < n_days:
            result.append(0)
            
        else:
            # Compute the size of the threshold given the value for this particular day
            thresh = threshold * row
            # If the difference is under the threshold
            if np.abs(row - serie[index-1]) < thresh:
                result.append(0)
            # Else, we give it rise or fall accordingly
            elif serie[index] > serie[index-1]:
                result.append(1)
            else:
                result.append(-1)
                
    return result
            

def apply_discretise(data, threshold=0.001, factors=[1]):
    """
    Takes a dataframe as a parameter. Returns a numpy matrix, 
    which, for each days, says if value of the "Close" column rose, fell or stayed stable.
    To be considered stable, the absolute difference in value has to be under threshold*current_value
    """
    result = np.zeros((data.shape[0], len(factors)))
    for i, days in enumerate(factors):
        result[:, i] = discretise(data["Close"], threshold=0.001, n_days=days)
    
    return result

In [9]:
ge_test = apply_discretise(ge, threshold=0.005, factors=[1, 2])

print(ge_test.shape)

(1257, 2)


In [68]:
class Environment():
    def __init__(self, data, chunk=10, factors=[1]):
        self.cash = 5000
        self.actions = 0
        self.portfolio = self.cash
        self.day = 0
        self.data = data
        self.threshold = 0.001
        # We discretise the data we receive
        self.discrete = apply_discretise(data, threshold=self.threshold, factors=factors)
        # Add the third column
        self.chunk = chunk
        # This variable will list the actions taken throughout the simulation
        self.decision_history = []
        
    def __getattr__(self, name):
        if name == "action_value":
            if self.day < self.data.shape[0]:
                return self.data.loc[self.day, 'Close']
            return self.data.loc[self.data.shape[0]-1, 'Close']
        elif name == "current_state":
            return self.discrete[self.day, :]
        elif name == "finished":
            if self.day >= self.data.shape[0]:
                return True
            return False
        else:
            super().__getattr__(name)
            
    def new_day(self):
        """
        This function will model the passing of a new day, and compute the value of the portfolio,
        taking into account the new value of actions
        """
        self.day += 1
        self.portfolio = compute_portfolio(self.cash, self.actions, self.action_value)
            
    """
    action functions
    """
    def buy(self):
        # If we don't have enough cash to buy a chunk of actions, hold instead
        if self.cash < self.chunk * self.action_value:
            self.hold()
            return
        self.cash -= self.chunk * self.action_value
        self.actions += self.chunk
        self.decision_history.append(0)
        self.new_day()
    
    def hold(self):
        
        self.decision_history.append(1)
        self.new_day()
        
    def sell(self):
        # If we don't have enough actions to sell, hold instead.
        if self.actions < self.chunk:
            self.hold()
            return
        self.cash += self.chunk * self.action_value
        self.actions -= self.chunk
        self.decision_history.append(2)
        self.new_day()
        
    def execute_decision(self, value):
        if value == 0:
            self.buy()
        elif value == 1:
            self.hold()
        elif value == 2:
            self.sell()
            
    """
    Evaluation functions
    """
    def evaluate_decision(self):
        """
        In this function, we compare the decision we took last day to the variation of the stock
        since that day. If the comparison holds, we return a positive value. Otherwise, we return
        a negative value.
        """
        decision = self.decision_history[-1]
        value_change = self.action_value - self.data.loc[self.day-1, 'Close']
        
        if decision == 0:
            # if we decided to buy, we gain points if the value change was positive
            return value_change
        elif decision == 2:
            # if we decided to sell, we get rewarded if the value went down
            return - value_change
        elif decision == 1:
            # if we decided to hold, we get punished or rewarded based on the intensity of the value
            # change
            thresh_multiplied = self.action_value * self.threshold
            if value_change > thresh_multiplied:
                return - np.abs(value_change - thresh_multiplied)
            else:
                return thresh_multiplied

## Creating the Q matrix
Our agent will behave according to values on a Q matrix. This Q matrix will be a 2D matrix, with its rows representing the state and its columns representing the decision.
* Each of its element is the score attributed to a certain decision in a certain situation.

In [57]:
class Q_matrix():
    def __init__(self, environment):
        self.matrix = np.zeros((np.power(3, environment.discrete.shape[1]), 3))
        
    def get_line(self, state):
        line_number = 0
        for i, discrete in enumerate(state):
            
            line_number += (3 ** i) * (discrete + 1)
        return int(line_number)
        
    def get_coordinates(self, decision, state):
        col_number = decision
        line_number = self.get_line(state)
            
        return line_number, col_number
        
    def get_square(self, decision, state):
        l, c = self.get_coordinates(decision, state)
        return self.matrix[l, c]
    
    def set_square(self, decision, state, value):
        l, c = self.get_coordinates(decision, state)
        self.matrix[l, c] = value
        
    def increment_square(self, decision, state, value):
        self.set_square(decision, state, value + self.get_square(decision, state))
        
    def make_choice(self, state):
        """
        This function will search our matrix for the right state, and make a decision based on
        the highest rating in our Q matrix for this state.
        If the decisions are tied, we pick one at random
        """
        line = self.matrix[self.get_line(state), :]
        
        argmax = np.argmax(line)
        if line[argmax] = 0
        
        
        return np.argmax(line)

In [33]:
environment = Environment(ge, chunk=10, factors=[1, 10])

Q = Q_matrix(environment)

print(Q.matrix)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


## Training
Now our agent needs to fill the Q matrix with values. We make it take random decisions, and for each, it will check the results of its actions and modify the Q matrix accordingly.

### Paradigm
We go with the assumption (which may or may not be correct) that if the value of the actions went up, then we should have bought, if the value went down we should have sold, and if the value was stable we should have held. This way, on each step we can evaluate the decision taken by our agent.

To evaluate the action, we check how much money we made + how much money we can expect to save next turn with a factor of gamma.

### Chosing the action
We create a variable, called epsilon, which varies between 0 and 1 and is the rate at which our action is taken at random (exploration) or is decided using existing Q values (exploitation).
* If we are doing exploration, we split our decision like so : 40% chance of buying, 40% chance of selling, and 20% of holding. This is because in this paradigm holding is very unlikely to be positively evaluated.

### Modifying the matrix
After each action we can evaluate a reward. That reward will be added to the matrix in the corresponding square.

In [83]:
def training(environment, n_episodes, gamma=0.8, stop_cond=50, verbose=False):
    epsilon = 1
    Q = Q_matrix(environment)
    
    for i in range(n_episodes):
        # Run the training for one episode, ie through all the data or until we run out
        # of money
        while not environment.finished and environment.portfolio > stop_cond:
            # Generate a random number between 0 and 1
            rand = np.random.rand()
            # Store how the state is before taking the decision
            decision_state = environment.current_state
            # Compare it against epsilon to decide wether we do exploration or exploitation
            if rand < epsilon:
                # In this case, we are doing exploration
                # Select an action randomly from the three that are available to us
                decision = np.random.rand()

                if decision < 0.4:
                    decision = 0
                elif decision < 0.6:
                    decision = 1
                else:
                    decision = 2
            else:
                # Here we are doing exploitation, so we just look at the Q matrix and pick the highest
                # rated possibility
                decision = Q.make_choice(current_state)

            environment.execute_decision(decision)

            # Now we need to evaluate our decision based on the variation of actions
            # A day has passed
            reward = environment.evaluate_decision()
            Q.increment_square(decision, decision_state, reward)
        
        # Now we decrease epsilon linearily
        epsilon -= 1/n_episodes
        
        # We print the value of the portfolio
        if verbose:
            print("Finished episode {}, the portfolio value was {} at the end."\
                  .format(i, environment.portfolio))
            
    # At the end of training, return the Q matrix and the environment in order to analyze the results
    return Q, environment

In [79]:
# Train a Q matrix on the bank of america training set
environment = Environment(boa_train, chunk=15, factors=[1, 5, 20])

Q, environment = training(environment, 100, verbose=False)
print(environment.portfolio)

6074.2997749999995


In [80]:
print(Q.matrix)
print(environment.actions)

[[-1.568509 -6.122508 -6.075945]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.11     -0.32439   0.59    ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.03      0.        0.0092  ]
 [ 0.667156 -0.869861  1.120951]
 [ 0.        0.       -0.09083 ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [-0.66     -0.29258  -0.06    ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [ 0.        0.        0.      ]
 [-1.730058 -7.8131   -2.659404]]
195


## Testing
* Now that we have trained our algorithm, we run it on the test set to see how it does on data it has never seen

In [89]:
def run_Q_matrix(environment, Q):
    while not environment.finished:
        decision = Q.make_choice(environment.current_state)
        environment.execute_decision(decision)
        
        print("actions : {}, portfolio : {}, cash : {}, action value : {}"\
              .format(environment.actions, environment.portfolio, environment.cash, environment.action_value))
        
    return environment

In [101]:
train_environment = Environment(boa_train, chunk=15, factors=[1, 5, 10])
Q, train_environment = training(train_environment, 100)

test_environment = Environment(boa_test, chunk=15, factors=[1, 5, 10])
test_environment = run_Q_matrix(test_environment, Q)

print(test_environment.portfolio)

actions : 15, portfolio : 4998.2, cash : 4767.95, action value : 15.35
actions : 30, portfolio : 5005.099999999999, cash : 4537.7, action value : 15.58
actions : 30, portfolio : 5004.2, cash : 4537.7, action value : 15.55
actions : 45, portfolio : 4996.55, cash : 4304.45, action value : 15.38
actions : 60, portfolio : 5015.15, cash : 4073.75, action value : 15.69
actions : 75, portfolio : 5015.15, cash : 3838.4, action value : 15.69
actions : 90, portfolio : 5020.55, cash : 3603.05, action value : 15.75
actions : 105, portfolio : 5020.55, cash : 3366.8, action value : 15.75
actions : 120, portfolio : 5000.15, cash : 3130.55, action value : 15.58
actions : 120, portfolio : 4992.95, cash : 3130.55, action value : 15.52
actions : 135, portfolio : 4992.95, cash : 2897.75, action value : 15.52
actions : 150, portfolio : 5010.95, cash : 2664.95, action value : 15.64
actions : 165, portfolio : 5101.700165, cash : 2430.35, action value : 16.190001000000002
actions : 180, portfolio : 5089.10016

# Genetic algorithm
We would like to try another method of reinforcement learning. This method would allow us to forego the whole "discretisation" process, and should also allow us to get better results by not needing to make assumption about wether buying or selling is a good move.
## Foundation
We will use the same Environment class that we already built.
## Method
For this method, an agent will be characterised not by a Q matrix but instead by paramaters. Each agent will take into account the value of stock in the last few days. Agents will get generated with a 