In [1]:
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import datetime as dt

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import time
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [3]:
def generate_next_day_stats(options_chain, next_day_date, an_number):
    print("This is a snapshot of the options chain passed in:")
    print(options_chain.head(5))
    print("\n")
    
    sorted_options = options_chains.sort_values(by="Vol/OI", ascending=False).drop(
        columns=["DTE", "Bid", "Midpoint", "Ask", "Last", "IV", "Time"])
    
    print("These options have the highest Volume to Open Interest ratings of the day:")
    print(sorted_options.head(an_number))
    print("\n")
    
    next_day_options = sorted_options.loc[sorted_options["Exp Date"] == next_day_date]
    print("These are the most unusually active tickers for the open tomorrow:")
    active_tickers = set(next_day_options.head(an_number)["Symbol"])
    print(list(active_tickers))
    print("\n")

    next_day_options = next_day_options.loc[next_day_options["Symbol"].isin(active_tickers)]
    active_next_day = next_day_options
    next_day_options = next_day_options.groupby(by=["Symbol", "Type"])
    next_day_options = next_day_options.sum().drop(
                            columns=["Price","Strike", "Vol/OI"])
    next_day_options = next_day_options.assign(Aggregate_Vol_OI = next_day_options["Volume"]/next_day_options["Open Int"])
    print("These is an overview of the most active options of the day:")
    print(next_day_options)
    print("\n")
    
    return active_tickers, active_next_day, next_day_options

In [4]:
def generate_ml_data(active_next_day, aggregate_voloi, start_date, start_data):
    active_sorted = active_next_day

    active_sorted['CSAP'] = np.where((active_sorted['Strike']>=active_sorted['Price']) & 
                                            (active_sorted['Type'] == "Call"),
                                            active_sorted['Vol/OI'], 0)
    active_sorted['CSBP'] = np.where((active_sorted['Strike']<active_sorted['Price']) & 
                                            (active_sorted['Type'] == "Call"),
                                            active_sorted['Vol/OI'], 0)
    active_sorted['PSAP'] = np.where((active_sorted['Strike']>active_sorted['Price']) & 
                                            (active_sorted['Type'] == "Put"),
                                            active_sorted['Vol/OI'], 0)
    active_sorted['PSBP'] = np.where((active_sorted['Strike']<=active_sorted['Price']) & 
                                            (active_sorted['Type'] == "Put"),
                                            active_sorted['Vol/OI'], 0)

    active_sorted = active_sorted.drop(columns=["Price", "Strike", "Exp Date", "Volume", "Open Int", "Vol/OI"])

    ml_options = pd.DataFrame(columns=['Ticker','CSAP', 'CSBP', 'PSAP', 'PSBP', "Call Aggregate", "Put Aggregate"])

    for idx, ticker in enumerate(active_tickers):
        temp = active_sorted.loc[active_sorted["Symbol"] == ticker]
        temp = temp.set_index(["Symbol", "Type"]).sort_values(by=["Symbol", "Type"])

        summary = temp.sum()
        df = web.DataReader(ticker, 'yahoo', start_date)
        df = web.DataReader(ticker, 'yahoo', start_data)
        df = df.loc[df.index == start]

        ml_options = ml_options.append({'Ticker': ticker, 'CSAP': summary["CSAP"], 
                           'CSBP': summary["CSBP"], 'PSAP': summary["PSAP"], 'PSBP': summary["PSBP"], 
                            "Call Aggregate": aggregate_voloi["Aggregate_Vol_OI"][ticker]["Call"] 
                                                 if "Call" in aggregate_voloi["Aggregate_Vol_OI"][ticker] else 0,
                            "Put Aggregate": aggregate_voloi["Aggregate_Vol_OI"][ticker]["Put"]
                                                if "Put" in aggregate_voloi["Aggregate_Vol_OI"][ticker] else 0,
                                       "Open": df["Open"][start],
                                       "Close": df["Adj Close"][start]},
                            ignore_index=True)

    return ml_options

In [10]:
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=1000),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "Neural Net": MLPClassifier(alpha = 1),
    "Naive Bayes": GaussianNB(),
    #"AdaBoost": AdaBoostClassifier(),
    #"QDA": QuadraticDiscriminantAnalysis(),
    #"Gaussian Process": GaussianProcessClassifier()
}

In [11]:
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 5, verbose = True):
    """
    This method, takes as input the X, Y matrices of the Train and Test set.
    And fits them on all of the Classifiers specified in the dict_classifier.
    The trained models, and accuracies are saved in a dictionary. The reason to use a dictionary
    is because it is very easy to save the whole dictionary with the pickle module.
    
    Usually, the SVM, Random Forest and Gradient Boosting Classifier take quiet some time to train. 
    So it is best to train them on a smaller dataset first and 
    decide whether you want to comment them out or not based on the test accuracy score.
    """
    
    dict_models = {}
    for classifier_name, classifier in list(dict_classifiers.items())[:no_classifiers]:
        t_start = time.clock()
        classifier.fit(X_train, Y_train)
        t_end = time.clock()
        
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        
        dict_models[classifier_name] = {'model': classifier, 'train_score': train_score, 'test_score': test_score, 'train_time': t_diff}
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))
    return dict_models

In [12]:
def display_dict_models(dict_models, sort_by='test_score'):
    cls = [key for key in dict_models.keys()]
    test_s = [dict_models[key]['test_score'] for key in cls]
    training_s = [dict_models[key]['train_score'] for key in cls]
    training_t = [dict_models[key]['train_time'] for key in cls]
    
    df_ = pd.DataFrame(data=np.zeros(shape=(len(cls),4)), columns = ['classifier', 'train_score', 'test_score', 'train_time'])
    for ii in range(0,len(cls)):
        df_.loc[ii, 'classifier'] = cls[ii]
        df_.loc[ii, 'train_score'] = training_s[ii]
        df_.loc[ii, 'test_score'] = test_s[ii]
        df_.loc[ii, 'train_time'] = training_t[ii]
    
    display(df_.sort_values(by=sort_by, ascending=False))

In [31]:
def preprocess_data(ml_options):
    ml_df = ml_options
    ml_df["Log Return"] = np.log(ml_df["Close"]) - np.log(ml_df["Open"])
    ml_df["Positive Return"] = (ml_df["Log Return"] > 0).astype(int)
    ml_X = ml_df.drop(columns=["Ticker", "Close", "Open", "Log Return", "Positive Return"])
    ml_Y = ml_df.drop(columns=["Ticker", "CSAP", "CSBP", "PSAP", "PSBP", "Call Aggregate",
                           "Put Aggregate", "Close", "Open", "Log Return"])
    
    mm_scaler = preprocessing.MinMaxScaler()
    ml_X = mm_scaler.fit_transform(ml_X, ml_Y)
    print(ml_X)
    X_train, X_test, y_train, y_test = train_test_split(ml_X, ml_Y, test_size = 0.2)
    
    return X_train, X_test, y_train, y_test

In [9]:
next_day = "9/6/2019"
active_ticker_num = 50
filename = "20190905_Unusual_Activity.csv"
options_chains = pd.read_csv(filename)

start_date = dt.datetime(2019, 9, 6)
start_data = dt.datetime(2019, 9, 3)

active_tickers, active_next_day, aggregate_voloi = generate_next_day_stats(options_chains, next_day, active_ticker_num)
ml_options = generate_ml_data(active_next_day, aggregate_voloi, start_date, start_data)

This is a snapshot of the options chain passed in:
  Symbol   Price  Type  Strike    Exp Date    DTE   Bid  Midpoint   Ask  Last  \
0   TWTR   45.30  Call    47.0   9/13/2019    8.0  0.32      0.33  0.34  0.35   
1   NVDA  179.74   Put   175.0    9/6/2019    1.0  0.25      0.26  0.27  0.26   
2    MNK    1.59  Call     5.0   1/15/2021  498.0  0.55      0.63  0.70  0.65   
3    GPN  174.84   Put   170.0   9/20/2019   15.0  1.10      1.33  1.55  1.06   
4    RTN  185.27  Call   200.0  10/18/2019   43.0  0.72      0.79  0.87  0.76   

    Volume  Open Int  Vol/OI       IV      Time  
0  17990.0     120.0  149.92   35.46%  9/5/2019  
1  11242.0     146.0   77.00   44.68%  9/5/2019  
2  40488.0     870.0   46.54  152.99%  9/5/2019  
3   9111.0     199.0   45.78   20.77%  9/5/2019  
4   5173.0     115.0   44.98   18.70%  9/5/2019  


These options have the highest Volume to Open Interest ratings of the day:
   Symbol    Price  Type  Strike    Exp Date   Volume  Open Int  Vol/OI
0    TWTR    

In [32]:
X_train, X_test, y_train, y_test = preprocess_data(ml_options)

[[1.         0.13891444 0.         0.47345976 0.25679013 0.14016905]
 [0.02736876 0.         0.53361567 0.         0.14400736 0.48496528]
 [0.         0.25636308 0.         0.         0.703125   0.        ]
 [0.04081306 0.         0.51561673 0.01615098 0.21490907 0.19770001]
 [0.64036492 0.06531739 1.         0.07812358 0.4678595  0.63377613]
 [0.50432138 0.08095676 0.         0.         0.47542354 0.        ]
 [0.12596031 0.7908617  0.         0.21713093 0.27991422 0.15610751]
 [0.02608835 0.         0.47856008 0.         0.13734124 0.43475669]
 [0.         0.         0.43250397 0.         0.         0.39315888]
 [0.14884763 0.         0.         0.12076944 0.34780192 0.64003785]
 [0.05041613 0.         0.28639492 0.10915525 0.26464797 0.33636027]
 [0.18886044 0.         0.         0.         0.42298871 0.        ]
 [0.49631882 0.33149341 0.         0.83132202 0.48729627 0.49837153]
 [0.         0.         0.46956061 0.         0.         0.42684533]
 [0.         0.23489727 0.        

In [20]:
dict_models = batch_classify(X_train, y_train, X_test, y_test, no_classifiers = 8)
display_dict_models(dict_models)

  from ipykernel import kernelapp as app
  y = column_or_1d(y, warn=True)
  from ipykernel import kernelapp as app
  app.launch_new_instance()
  from ipykernel import kernelapp as app
  y = column_or_1d(y, warn=True)
  from ipykernel import kernelapp as app
  y = column_or_1d(y, warn=True)
  from ipykernel import kernelapp as app


trained Logistic Regression in 0.00 s
trained Nearest Neighbors in 0.00 s
trained Linear SVM in 0.00 s
trained Gradient Boosting Classifier in 0.21 s


  from ipykernel import kernelapp as app
  app.launch_new_instance()


trained Decision Tree in 0.00 s


  from ipykernel import kernelapp as app
  y = column_or_1d(y, warn=True)


trained Random Forest in 1.11 s
trained Neural Net in 0.16 s
trained Naive Bayes in 0.00 s


  from ipykernel import kernelapp as app
  y = column_or_1d(y, warn=True)


Unnamed: 0,classifier,train_score,test_score,train_time
7,Naive Bayes,0.791667,0.714286,0.001017
0,Logistic Regression,0.75,0.571429,0.001195
1,Nearest Neighbors,0.75,0.571429,0.002826
2,Linear SVM,0.75,0.571429,0.001234
3,Gradient Boosting Classifier,1.0,0.571429,0.211855
6,Neural Net,0.75,0.571429,0.161878
5,Random Forest,1.0,0.428571,1.110793
4,Decision Tree,1.0,0.285714,0.001349


In [27]:
dict_models["Decision Tree"]["model"].predict(X_test)

array([1., 0., 0., 0., 1., 1., 1.])

In [22]:
y_test

array([[1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.]])