In [166]:
import warnings
warnings.simplefilter(action='ignore')

import datetime as dt
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [167]:
import numpy as np
import pandas as pd
# pd.set_option('display.max_rows', None)

In [168]:
from alpaca.data.requests import NewsRequest
from alpaca.data import StockHistoricalDataClient, TimeFrame 
from alpaca.data.requests import StockQuotesRequest, StockBarsRequest

from alpaca.trading.client import TradingClient

KEY = "PKQ9XJDJLTX686HE3ZL9"
SECRET = "4ES985YvYlOWz0eAtzdhcETJ1asEPBSw3gq9ZXs7"
ENDP = "https://paper-api.alpaca.markets"

In [169]:
import tensorflow as tf
import keras

from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, GlobalAveragePooling1D, Conv2D, ConvLSTM2D, ConvLSTM1D, Input, Flatten, Reshape, TextVectorization, concatenate


In [170]:
INTERVAL = 30
LOOKBACK = 50
STOCKS = 2

In [171]:
def build_model(time_steps, items):
    input_volatility = Input(shape=(time_steps, items), name="input_vol")
    input_prem = Input(shape=(1,), name="input_prem")
    input_dte = Input(shape=(1,), name="input_dte")
    input_pc = Input(shape=(1,), name="input_pc")

    lstm1 = LSTM(units=32, return_sequences=True,input_shape=(time_steps, items))(input_volatility)
    lstm2 = LSTM(units=16, return_sequences=True)(lstm1)
    lstm3 = LSTM(units=8, return_sequences=False)(lstm2)
    concatted1 = concatenate([input_prem, input_dte, input_pc])
    dense1 = Dense(units=1, activation="relu")(concatted1)
    concatted2 = concatenate([dense1, lstm3])
    dense2 = Dense(units=1, activation="sigmoid")(concatted2)
    model = keras.Model(inputs = [input_volatility, input_prem, input_dte, input_pc], outputs=dense2)

    model.compile(optimizer="adam", loss="binary_crossentropy")

    return model

    

In [172]:
def collect_dfs(dir_path):
    directory_path = dir_path
    dfs = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            print(filename)
            file_path = os.path.join(directory_path, filename)

            # Read the text file into a DataFrame (adjust read_csv parameters based on your file format)
            df = pd.read_csv(file_path, sep=", ")
            df = df[(df["[STRIKE_DISTANCE]"] < 1) & (df["[DTE]"] > 25) & (df["[DTE]"] < 35)]
            # Append the DataFrame to the list
            dfs.append(df)
    full_df = pd.concat(dfs, ignore_index=True)
    return full_df

In [173]:
def get_data(stocks, start_date, end_date):
    data_client = StockHistoricalDataClient(KEY, SECRET)

    request_params = StockBarsRequest(
        symbol_or_symbols=stocks,
        timeframe=TimeFrame.Day,
        adjustment="split",
        start=start_date,
        end=end_date
        )



    bars_df = data_client.get_stock_bars(request_params).df.tz_convert('America/New_York', level=1)
    stock_data = pd.DataFrame(bars_df)
    # print(stock_data)
    stock_data = stock_data.groupby(['symbol', 'timestamp']).mean().unstack(level=0)
    open_prices = stock_data["open"]
    log_returns = np.log((open_prices.pct_change()+1).dropna())
    return log_returns, open_prices

In [174]:
def compute_vol(raw_data, interval):
   
    data = raw_data
    while(len(data)%interval != 0):
        data.drop(data.tail(1).index,inplace=True)
   
    # var_data = np.array(data[len(data)%interval:]).reshape(len(data)//interval, interval,-1).var(axis=1)
    var_data = data.rolling(interval).var()
    var_data = var_data.dropna()
    return np.sqrt(var_data)

Plan: 

For every option:
    - Get the vol of past 5 months before option was released
    - Get price now, get price at exp. Get profit (if profit positive, "long" else "short")

In [175]:
def build_train_data():
    options_data = collect_dfs("./aapl")
    # quit()


    today = dt.datetime.now()
    returns, prices = get_data(["AAPL", "MSFT", "GOOG"], today - dt.timedelta(7000), today)
    vol_inputs = []
    prem_inputs = []
    dte_inputs = []
    labels = []
    put_call_vol_ratios = []
    for index, row in options_data.iterrows():
        try:
            quote_date = row["[QUOTE_DATE]"]
            exp_date   = row["[EXPIRE_DATE]"]

            strike = row["[STRIKE]"]

            call_last = row["[C_LAST]"]
            put_last = row["[P_LAST]"]

            dte = row["[DTE]"]

            civ = row["[C_IV]"]
            piv = row["[P_IV]"]

            put_vol = row["[P_VOLUME]"]
            call_vol = row["[C_VOLUME]"]
          

            quote_price = prices.loc[quote_date, "AAPL"]
            exp_price = prices.loc[exp_date, "AAPL"]
            # print(type(strike))
        
            if (exp_price-strike-call_last-put_last).item() > 0:
                labels.append(1)
            elif (strike-exp_price-call_last-put_last).item() > 0:
                labels.append(1)
            else:
                labels.append(0)
            
            now = dt.datetime.strptime(quote_date, '%Y-%m-%d').date()
            start = now -dt.timedelta(3000)

            # print(prices.loc[start:now], 30)

            temp = compute_vol(returns.loc[start:now], INTERVAL)
            temp = temp[-LOOKBACK:]
            # print(temp)
            temp["avg"] = temp.mean(axis=1)
            # print("test", temp)
            temp = temp[["AAPL", "avg"]]
            
            
            # print(temp)
            vol_inputs.append(temp)
            # prem_inputs.append(((call_last+put_last)/strike))
            prem_inputs.append(piv+civ)
            dte_inputs.append(dte)
            put_call_vol_ratios.append(call_vol/put_vol)
        except:
            continue
        # break
    # print(inputs)
    print(vol_inputs)
    return vol_inputs, prem_inputs, dte_inputs, put_call_vol_ratios, labels




    

In [176]:
def train_model(model):
    inputs, prem_inputs, dte_inputs, pc_ratio, labels = build_train_data()
    
    # labels = np.array(labels)
    # print(np.array(inputs).shape, np.array(labels).shape)
    # print(np.array(prem_inputs).shape, np.array(dte_inputs).shape)
    # print(inputs)
    
    inputs_train, inputs_test, prem_inputs_train, prem_inputs_test, dte_inputs_train, dte_inputs_test, pc_train, pc_test,y_train, y_test \
    = train_test_split(inputs, prem_inputs, dte_inputs, pc_ratio, labels, test_size=0.2, random_state=42)
    

  
    model.fit(
        [np.array(inputs_train),
        np.array(prem_inputs_train),
        np.array(dte_inputs_train),
        np.array(pc_train)],
        np.array(y_train),
        epochs=20,
        batch_size=16,
    )

    return inputs_test, prem_inputs_test, dte_inputs_test, pc_test, y_test



In [177]:
def make_prediction(model, vol, prem, dte):
    return model.predict([vol, prem, dte])

In [178]:
def backtest(model):
   
    options_data = collect_dfs("./test")

    today = dt.datetime.now()
    returns, prices = get_data(["AAPL", "MSFT", "GOOG"], today - dt.timedelta(7000), today)
    score = 0
    count = 0
    profits = []
    predicts = []
    values = []
    for index, row in options_data.iterrows():
        try:
            quote_date = row["[QUOTE_DATE]"]
            exp_date   = row["[EXPIRE_DATE]"]

            strike = row["[STRIKE]"]

            call_last = row["[C_LAST]"]
            put_last = row["[P_LAST]"]

            dte = row["[DTE]"]

            civ = row["[C_IV]"]
            piv = row["[P_IV]"]

            put_vol = row["[P_VOLUME]"]
            call_vol = row["[C_VOLUME]"]


            quote_price = prices.loc[quote_date, "AAPL"]
            exp_price = prices.loc[exp_date, "AAPL"]
            # print(type(strike))
        
            
            now = dt.datetime.strptime(quote_date, '%Y-%m-%d').date()
            start = now -dt.timedelta(3000)

            # print(prices.loc[start:now], 30)
            temp = compute_vol(returns.loc[start:now], INTERVAL)
            temp = temp[-LOOKBACK:]
            temp["avg"] = temp.mean(axis=1)
            temp = temp[["AAPL", "avg"]]
            # prem_input = (call_last+put_last)/strike
            prem_input = piv+civ
            prediction = model.predict([np.array([temp]), np.array([prem_input]), np.array([dte]), np.array([call_vol/put_vol])])
            
            prediction = prediction.flatten()[0]
            values.append(prediction)
            print(prediction)
            if prediction > 0.5:
                prediction = 1
            else:
                prediction = 0
            
            
            if prediction == 1:
                profit = max((exp_price-strike-call_last-put_last).item(), (strike-exp_price-call_last-put_last).item())
            else:
                # profit = min((strike+put_last+call_last-exp_price).item(), (exp_price-strike+put_last+call_last).item())
                profit = max((exp_price-strike-call_last-put_last).item(), (strike-exp_price-call_last-put_last).item())*-1
            profits.append(profit)
            
            if (exp_price-strike-call_last-put_last).item() > 0:
                actual = 1
            elif (strike-exp_price-call_last-put_last).item() > 0:
                actual = 1
            else:
                actual = 0
            predicts.append((actual, prediction))
            count += 1
            if actual == prediction:
                score +=1
            
        except:
            continue    
        # break
    
    print("score:", score/count)
    print("count", count)
    print("profit", np.array(profits).mean())
    # print("cost", call_last+put_last)
    print(predicts)
    print(values)



In [179]:
def simple_test(model, X_test, y_test):
    print("this is a test")
    print(model.evaluate(X_test, y_test, batch_size=16, verbose=0))
    # print(results)

In [180]:
model = build_model(LOOKBACK,STOCKS)
inputs_test, prem_inputs_test, dte_inputs_test, pc_test ,y_test = train_model(model)


# simple_test(model, [np.array(inputs_test), np.array(prem_inputs_test), np.array(dte_inputs_test)], np.array(y_test))
backtest(model)  

# print(input)



aapl_eod_202207.txt
aapl_eod_202209.txt
aapl_eod_202212.txt
aapl_eod_202204.txt
aapl_eod_202210.txt
aapl_eod_202202.txt
aapl_eod_202201.txt
aapl_eod_202305.txt
aapl_eod_202304.txt
aapl_eod_202208.txt
aapl_eod_202302.txt
aapl_eod_202205.txt
aapl_eod_202211.txt
aapl_eod_202203.txt
aapl_eod_202206.txt
[symbol                         AAPL       avg
timestamp                                    
2022-03-30 00:00:00-04:00  0.027228  0.025462
2022-03-31 00:00:00-04:00  0.027237  0.025439
2022-04-01 00:00:00-04:00  0.027536  0.025661
2022-04-04 00:00:00-04:00  0.027501  0.025470
2022-04-05 00:00:00-04:00  0.027051  0.025018
2022-04-06 00:00:00-04:00  0.027666  0.025608
2022-04-07 00:00:00-04:00  0.022904  0.022723
2022-04-08 00:00:00-04:00  0.019048  0.018618
2022-04-11 00:00:00-04:00  0.019352  0.019175
2022-04-12 00:00:00-04:00  0.019301  0.019139
2022-04-13 00:00:00-04:00  0.019313  0.019512
2022-04-14 00:00:00-04:00  0.019106  0.019387
2022-04-18 00:00:00-04:00  0.020021  0.019823
2022-04-1