#### Learning Exercise Policies for American Option

The linear architechture represents the state action function, and LSTD is used to compute an approximation of action value function of the policy. 

In [1]:
# the data is assumed to be a trajectory in the form of <s1, a1, R1, s2, a2, R2, ..., >
import numpy as np
from numpy.linalg import inv
# collect data
MU = 0
SIGMA = 0.3

s_0 = 12 # initial price of the stock
it = 10
T = 15
K = 12

# assume the movement of stock is a brownian motion

def next_price(s_0, mu, sigma):
    s_t = s_0 + mu + (2 * np.random.rand(1) - 1)*sigma
    return s_t[0]

def generate_training_data(it, s_0, T = T):
    prices = []
    for i in range(it): # 
        trail = []
        for j in range(T):
            if j == 0:
                trail.append(s_0)
            else:
                trail.append(next_price(trail[-1], MU, SIGMA))
        prices.append(trail)
    return prices


In [2]:
prices = np.array(generate_training_data(it, s_0)) # the trajactory of the stock prices
print (prices)

[[12.         12.15182297 12.08632474 12.09861926 12.12388873 12.24386035
  12.17709239 12.03873184 11.8845405  11.76306183 11.57953051 11.74114006
  11.68418188 11.73374586 11.98458624]
 [12.         11.87214538 11.61242249 11.66131234 11.84925861 11.81507867
  11.90022835 12.12418025 11.93131697 11.68637294 11.61091583 11.33827057
  11.59987489 11.59903701 11.87543592]
 [12.         12.1756104  11.94977678 11.73382413 11.94781288 12.21384034
  11.97112163 11.98932461 11.71232212 11.89447042 12.00135057 11.79656391
  11.51693965 11.35256766 11.46035859]
 [12.         11.97584284 11.68999665 11.94446921 12.14000662 12.37890753
  12.16082972 12.14027458 12.36331091 12.62207585 12.54468684 12.60788902
  12.51874426 12.64315675 12.66593628]
 [12.         11.97416433 12.15493933 12.23576947 12.36146407 12.57074218
  12.41668167 12.45524528 12.26756535 12.34302599 12.37542955 12.23289888
  12.06363308 12.35869408 12.34464621]
 [12.         11.93281099 12.19762331 12.03271188 11.98621088 12.

In [3]:
# assume American Call option to price
def generate_label(traj, K): # K is the strike price
    num, T = traj.shape
    labels = np.zeros((num, T))
    for i in range(num):
        exec_price = 0
        exec_time = -1
        current_traj = traj[i]
        for j in range(T):
            if (current_traj[j] - K > exec_price):
                exec_price = current_traj[j] - K
                exec_time = j
        labels[i, exec_time] = 1
    return labels

In [4]:
labels = generate_label(prices, K)
print (labels)

[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


In [5]:
class LSPT:
    def __init__(self, traj, labels, mu, sigma, w_0 = None, hidden_size = 2):
        self.traj = traj
        self.labels = labels
        self.mu = mu
        self.sigma = sigma
        if w_0 is None:
            self.w = np.random.rand(hidden_size)
        else:
            self.w = w_0
        self.hidden_size = hidden_size
        self.EPS = 1e-2
        
    def get_features(self, price):
        features = [price, price*price]
        return np.array(features)
    
    def LSTD(self, i, j):
        # generate the return following the current policy
        pass
    
    def inv(self, a):
        return (1/a)
    
    def train(self):
        num, T = self.traj.shape
        for i in range(num):
            current_traj = self.traj[i]
            for j in range(T):
                current_state = self.get_features(current_traj[j])
                next_policy = np.dot(self.w, current_state)
                if next_policy - self.labels[i, j] <= self.EPS:
                    print ('finish training...')
                    return
                self.w = self.labels[i, j] * self.inv(current_state)
                
    def predict(self):
        nums, T = self.traj.shape
        pred = np.zeros(nums)
        for i in range(nums):
            exec_time = -1
            for j in range(T):
                if np.dot(self.w, self.get_features(self.traj[i, j])) == 1:
                    exec_time = j
            price = np.max(self.traj[i, exec_time] - K, 0)
            pred[i] = price
        return np.mean(pred)

In [6]:
lspt = LSPT(prices, labels, MU, SIGMA)

In [7]:
lspt.train()

finish training...


In [8]:
pred = lspt.predict()

In [11]:
# this is a very simple example and the hidden size if set to 2
# and in reality, more information about the market should be taken into consideration
print ('The price of such option is {}'.format(pred))

The price of such option is 0.1027055221822339
