In [1]:
from typing import Dict, Tuple

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold

# models:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# evaluation functions:
from sklearn.metrics import precision_score, recall_score, f1_score



pd.set_option('display.max_columns', 100)


In [2]:
# notebook goal: Setup a basic machine learning framework that cleans data, standardizes features,
#  evaluates feature impt, shap values, and a myriad of ML algorithms
# TODO: add the day-of-week as a feature
# TODO: Add in target date versus historic reference dates
# TODO: Add in volume-based feature functionality
# TODO: Evaluate standardizing features per stock or one model per stock - may not be enough data realistically
# TODO: Check bol-range-pct calculation - only giving zero value

In [3]:
# functions:

def clean_stock_data(dataframe: pd.DataFrame) -> pd.DataFrame :

    '''removes nulls and in the future will be built out to do any additonal cleaning on the dataframe that is necessary
    Args:
        dataframe: pandas dataframe containing all of the potential features
        parameters: 
            calculation_field: field on which all of the features are built

    Returns:
        dataframe: dataset that is ready to load into a machine learning framework
    '''

    #TODO: In pipeline write this output to the 
    # remove records the preceed the target period to have complete information:
    dataframe.dropna(inplace = True)
    #dataframe = dataframe.reset_index(drop = True) # we won't reset the index for now for traceability back to the date, ticker combination later after training

    # set the date as an index to us post-forecasting: This is a bad idea, come back to the concept
    #dataframe.set_index(keys = 'date', verify_integrity = False, inplace = True) # verify integrity Fale to allow duplicates**
    
    # remove fields that will not be used as predictive features (can be hardcoded since dataframe structure will be the same):
    dataframe = dataframe.drop(columns = [ 'date', 'high', 'low', 'open', 'volume', 'adj_close'])
    

    return dataframe


def identify_fields_to_standardize(dataframe: pd.DataFrame, parameters: Dict) -> np.array :

    '''creates a list of the continuous fields to standardize by dimension within the predictive model; NOTE: this is used within the standardizer
    
    Args:
        dataframe: dataframe that contains all of the fields of interest to be used in the calculations
        parameters:
            continuous_feature_cutoff: ratio of unique values to record count to be used to codify continuous features -> removes records from the standardization process which don't have enough data to standardize (e.g., boolean)

    Returns: list of continuous fields to use in the standardization process based on user's specifications of "uniqueness" threshold    

    '''

    numeric_fields = dataframe.select_dtypes(include = 'number').columns
    records = len(dataframe)

    record_summary = pd.DataFrame(dataframe[numeric_fields].nunique(), columns = ['unique_values'])
    record_summary['rows_in_df'] = records
    record_summary['value_to_record_ratio'] = record_summary['unique_values']/ record_summary['rows_in_df']

    # filter for a threshold specified by the user:
    record_summary = record_summary[record_summary['value_to_record_ratio'] > parameters['continuous_feature_cutoff']]

    # remove percentage features # TODO: later add in functionality to remove percentage based features

    return record_summary.index


# Justification for approach on scaling - the argument can be made that since our approach will generalize movemements across multiple securities that we need to standardize each security to its own price range.  Therefore, any features with price-relative values will be scaled per the security's price values to avoid odd splits in tree-based algos
# the concern with standardization is generally focused on not letting any one feature have considerably more weight in a model than another; however in this case, 


def standardize_continuous_features(dataframe: pd.DataFrame, parameters: Dict) -> pd.DataFrame:

    '''function that identifies the continuious features in the dataframe and standardizes each feature by equity to enable scaling relative to each equity
    
    Args:
        Dataframe: Pandas dataframe to be used in machine learning
        Parameters:
            stock_field: field indicating the stock for the window function to scan
            calculation_field: field for which the target is being calculated (used for drop in main row merge)
    
    Returns:
        Dataframe: containing the standardized data fields
    
    '''

    continuous_fields = list(identify_fields_to_standardize(dataframe = dataframe, parameters = parameters))

    # add in the ticker for grouping next:
    continuous_fields.append(parameters['stock_field'])

    # downselect to the fields that will be used to standardize:
    continuous_dataframe = dataframe[continuous_fields]

    # calculate z-scores: --> Standardizes within each feature to scale accordingly
    z_scores = (continuous_dataframe - continuous_dataframe.groupby(by = parameters['stock_field']).transform('mean')) / continuous_dataframe.groupby(by = parameters['stock_field']).transform('std')

    # drop the null ticker (not needed post groupby): 
    z_scores.drop(columns = [ parameters['stock_field'], parameters['calculation_field'] ], inplace = True)

    # rename the fields to indicate standardization:
    z_scores.columns = z_scores.columns + '_std'

    # drop original continuous fields # TODO: coming back after calculation checks:
    if parameters['drop_original_fields'] == True:
        continuous_fields.remove(parameters['stock_field'])
        dataframe.drop(columns = continuous_fields, inplace = True)

    # append the fields back into the core dataframe:
    z_scores = pd.concat([dataframe, z_scores], axis = 1)

    # remove the standardized target field:
    z_scores.drop(columns = z_scores.columns[z_scores.columns.str.contains('target')][1], inplace = True)

    # remove unnecessary items:
    del continuous_fields, continuous_dataframe

    return z_scores



def one_hot_encode_tickers(dataframe: pd.DataFrame, parameters: Dict) -> pd.DataFrame:

    '''Returns one-hot encoded features to the predictive dataset NOTE: May not work, but this retains some of the information in the original dataframe while also potentially giving the global model a nudge
       Note: we choose not to drop first for now, even though it's a trap; Can be used post processing or as model features
    Args:
        dataframe: core dataset that has been augmented with additional features
        parameters:
            stock_field: text field containing the 
    Returns:   
        dataframe with augmented columns
    
    '''

    dataframe = pd.get_dummies(data = dataframe, prefix = "ind", columns = [parameters['stock_field']], drop_first = False)

    return dataframe



def create_training_test_splits(dataframe: pd.DataFrame, parameters: Dict) :

    '''Function that splits out training and test sets for machine learning; for the purposes of this model the way we piose the problem allows for random train test split
    Args:
        dataframe: pandas dataframe containing only the target field and the features to be used by the classifier
        parameters:
            test_ratio: proportion of samples in the dataframe to be used as a test set once the models are tuned and evaluated

    '''

    # define Y and x:
    target_feature = list(dataframe.columns[dataframe.columns.str.contains('target')])

    y = dataframe[target_feature]
    X = dataframe.drop(columns = target_feature)

    # create the training and test splits:
    X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=parameters['test_size'], random_state=parameters['seed'], stratify = y)

    return X_train, X_test, y_train, y_test




In [4]:
df = catalog.load('combined_modeling_input')

In [5]:
# test: clean stock data:

df = clean_stock_data(dataframe = df)

In [6]:
parameters = {'continuous_feature_cutoff' : 0.6,
              'stock_field' : 'ticker',
              'calculation_field' : 'close',
              'drop_original_fields' : True,
              'drop_stock_field': True, # keep this fixed 
              'test_size' : 0.20,
              'seed' : 1187,
              'cross_val_splits' : 5,
              'c' : 1.0,
              'kernel' : 'rbf',
              'gamma' : 'scale'
              
              }

In [7]:
# test: standardize features:
test = standardize_continuous_features(dataframe = df, parameters = parameters)




In [8]:
# one-hot encode: 
test = one_hot_encode_tickers(dataframe = test, parameters= parameters)

In [9]:
# train-test splits:

X_train, X_test, y_train, y_test = create_training_test_splits(dataframe=test, parameters= parameters)

In [10]:
##################################### - Function development HERE

In [113]:
# classifiers to use: support vector machine, decision tree, random forest, xgboost, adaboost

def train_models(X_train: pd.DataFrame, y_train: pd.DataFrame, parameters) -> pd.DataFrame:

    '''Trains a series of machine learning model outputs for evaluation by the user
    
    Args:
        X_train: inputs from train-test split function
        y_train: y-series from the train-test split function

    Returns:
        Summarized output of all ML models tried
    
    '''

    # define all of the models to be used:
    classifiers = {
    "LogisticRegression": LogisticRegression(n_jobs = -1, max_iter = 100000),
    "RandomForestClassifier": RandomForestClassifier(n_jobs = -1),
    "SVC": SVC(C = parameters['c'], kernel =parameters['kernel'], gamma = parameters['gamma']),
    "AdaBoostClassifier": AdaBoostClassifier()
    }

    # create a readable representation of the target:
    y_train = y_train.iloc[:, 0].values
    X_train = X_train.values

    #TODO: add precision, recall, f-measure on all sets
    #accuracies = {}
    names = []
    model = []
    fold = []
    training_samples = []

    #TODO: Rename test to validation **

    train_accuracies = []
    test_accuracies = []

    train_precisions = []
    test_precisions = []

    train_recalls = []
    test_recalls = []

    train_f_measures = []
    test_f_measures = []

    # iterate through the models:
    for name, classifier in classifiers.items():

        clf = classifier
        print(name)
        print (clf)

        # iterate through the folds: ->> not ideal to nest the loops here
        cv = StratifiedKFold(n_splits=parameters['cross_val_splits'], shuffle=True, random_state=parameters['seed']).split(X_train, y_train)

        for k, (fold_train, fold_test) in enumerate(cv):

             # append model name into list:
            model.append(clf)
            
            clf.fit(X_train[fold_train],y_train[fold_train])
            print(clf)

            # create predictions:
            train_pred = clf.predict(X = X_train[fold_train])
            test_pred = clf.predict(X = X_train[fold_test])

            # calculate accuracies:
            train_accuracy = clf.score(X_train[fold_train], y_train[fold_train])
            test_accuracy = clf.score(X_train[fold_test], y_train[fold_test])
    
            # calculate precision:
            train_precision = precision_score(y_train[fold_train], train_pred)
            test_precision = precision_score(y_train[fold_test], test_pred)
 
            # calculate recall:
            train_recall = recall_score(y_train[fold_train], train_pred)
            test_recall = recall_score(y_train[fold_test], test_pred)

            # calculate f-measure:
            train_f = f1_score(y_train[fold_train], train_pred)
            test_f = f1_score(y_train[fold_test], test_pred)
            

            # append name:
            names.append(name)
            
            # append training sample size:
            training_samples.append( len(X_train[fold_train]) )

            # append fold number to the list:
            fold.append(k+1)

            # append score into list:
            train_accuracies.append(train_accuracy)
            test_accuracies.append(test_accuracy)

            # append precisions to the list:
            train_precisions.append(train_precision)
            test_precisions.append(test_precision)

            # append recalls to the list:
            train_recalls.append(train_recall)
            test_recalls.append(test_recall)

            # append f-measures to the list:
            train_f_measures.append(train_f)
            test_f_measures.append(test_f)
    
    results_df = pd.DataFrame({
                "names" : names,
                "model" : model,
                "fold" : fold,
                "training_samples" : training_samples,
                "train_accuracy": train_accuracies,
                "test_accuracy": test_accuracies,
                "train_precision": train_precisions,
                "test_precision": test_precisions,
                "train_recall": train_recalls,
                "test_recall": test_recalls,
                "train_f_measures": train_f_measures,
                "test_f_measures": test_f_measures
                
                })
                        
    
    return results_df #names, model, fold, train_accuracies, test_accuracies, train_precisions, test_precisions



In [114]:
##################################### - Testing functions HERE

In [115]:

#names, model, fold, train_accuracies, test_accuracies, train_precisions, test_precisions 

outputs = train_models(X_train = X_train, y_train = y_train, parameters = parameters)
    

LogisticRegression
LogisticRegression(max_iter=100000, n_jobs=-1)
LogisticRegression(max_iter=100000, n_jobs=-1)
LogisticRegression(max_iter=100000, n_jobs=-1)
LogisticRegression(max_iter=100000, n_jobs=-1)
LogisticRegression(max_iter=100000, n_jobs=-1)
LogisticRegression(max_iter=100000, n_jobs=-1)
RandomForestClassifier
RandomForestClassifier(n_jobs=-1)
RandomForestClassifier(n_jobs=-1)
RandomForestClassifier(n_jobs=-1)
RandomForestClassifier(n_jobs=-1)
RandomForestClassifier(n_jobs=-1)
RandomForestClassifier(n_jobs=-1)
SVC
SVC()
SVC()
SVC()
SVC()
SVC()
SVC()
AdaBoostClassifier
AdaBoostClassifier()
AdaBoostClassifier()
AdaBoostClassifier()
AdaBoostClassifier()
AdaBoostClassifier()
AdaBoostClassifier()


In [112]:
outputs.head(10)

Unnamed: 0,names,model,fold,training_samples,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f_measures,test_f_measures
0,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",1,1696,0.635024,0.647059,0.655222,0.654891,0.869896,0.912879,0.74745,0.762658
1,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",2,1697,0.654095,0.639151,0.668353,0.659942,0.878443,0.867424,0.75913,0.749591
2,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",3,1697,0.643489,0.608491,0.661163,0.641399,0.873814,0.836502,0.752758,0.726073
3,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",4,1697,0.636417,0.622642,0.651002,0.641096,0.893738,0.889734,0.753299,0.745223
4,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",5,1697,0.641131,0.653302,0.655052,0.662921,0.891841,0.897338,0.755323,0.76252
5,RandomForestClassifier,"(DecisionTreeClassifier(max_features='auto', r...",1,1696,1.0,0.861176,1.0,0.864769,1.0,0.920455,1.0,0.891743
6,RandomForestClassifier,"(DecisionTreeClassifier(max_features='auto', r...",2,1697,1.0,0.858491,1.0,0.849315,1.0,0.939394,1.0,0.892086
7,RandomForestClassifier,"(DecisionTreeClassifier(max_features='auto', r...",3,1697,1.0,0.865566,1.0,0.86014,1.0,0.935361,1.0,0.896175
8,RandomForestClassifier,"(DecisionTreeClassifier(max_features='auto', r...",4,1697,1.0,0.867925,1.0,0.873646,1.0,0.920152,1.0,0.896296
9,RandomForestClassifier,"(DecisionTreeClassifier(max_features='auto', r...",5,1697,1.0,0.891509,1.0,0.915709,1.0,0.908745,1.0,0.912214


In [25]:
classifiers = {
    "LogisticRegression": LogisticRegression(n_jobs = -1),
    "RandomForestClassifier": RandomForestClassifier(),
    "SVC": SVC(C = parameters['c'], kernel =parameters['kernel'], gamma = parameters['gamma']),
    "AdaBoostClassifier": AdaBoostClassifier()
    }

for name, classifier in classifiers.items():
    print(name)
    print(classifier)

LogisticRegression
LogisticRegression(n_jobs=-1)
RandomForestClassifier
RandomForestClassifier()
SVC
SVC()
AdaBoostClassifier
AdaBoostClassifier()


In [107]:
del outputs

In [161]:
X_train


Unnamed: 0,above_7_close_sma_ind,above_14_close_sma_ind,above_21_close_sma_ind,cum_days_above_above_7_close_sma_ind,cum_days_above_above_14_close_sma_ind,cum_days_above_above_21_close_sma_ind,bol_range_pct,14_close_sma_std,14_close_sma_pct_diff_std,14_close_std_std,21_close_sma_std,21_close_sma_pct_diff_std,21_close_std_std,7_close_sma_std,7_close_sma_pct_diff_std,7_close_std_std,bol_pct_from_bottom_std,bol_pct_from_top_std,bol_range_std,lower_bollinger_band_std,upper_bollinger_band_std,ind_AAPL,ind_XLE,ind_XLF
1761,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.728141,0.139125,0.782738,1.719754,0.035315,0.079286,1.599400,-0.287772,1.743248,-0.404477,0.124680,0.079286,1.692394,1.674234,0,1,0
262,1.0,1.0,1.0,29.0,29.0,29.0,0.0,-0.728668,-0.711695,-0.381173,-0.758257,-0.920902,-0.225646,-0.694784,-0.310634,-0.691154,1.066961,0.530723,-0.225646,-0.782218,-0.733163,1,0,0
1029,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.567690,0.355732,0.341547,0.619375,0.371276,-0.099164,0.438060,-0.138839,-0.240886,-0.622113,-0.238312,-0.099164,0.636195,0.577099,0,1,0
176,1.0,1.0,1.0,8.0,8.0,12.0,0.0,-1.264857,-0.525942,-0.745971,-1.260670,-0.450913,-1.036227,-1.245967,-0.048816,-0.696182,0.118982,0.564380,-1.036227,-1.233283,-1.278463,1,0,0
1168,1.0,1.0,1.0,10.0,10.0,10.0,0.0,0.422295,-0.394909,-0.441860,0.417621,-0.337386,-0.727393,0.478704,-0.252009,-0.651419,-0.328984,0.752636,-0.727393,0.566915,0.255262,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.064453,0.116919,-0.496792,0.024288,-0.058123,-0.344177,0.022235,-0.046768,0.153614,-0.319966,0.206530,-0.344177,0.096018,-0.046328,0,1,0
1251,0.0,1.0,1.0,0.0,3.0,12.0,0.0,-1.610367,-0.154934,0.285971,-1.719921,-0.634328,0.542578,-1.547813,0.374899,0.030315,1.789231,-0.508841,0.542578,-1.822437,-1.548376,0,1,0
1644,1.0,1.0,1.0,3.0,3.0,35.0,0.0,0.287773,-0.274151,-0.850403,0.273001,-0.273709,-0.845754,0.288895,-0.395678,-0.677833,-0.431693,0.743571,-0.845754,0.447924,0.091836,0,1,0
1428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.110356,0.411976,-0.347172,-1.092770,0.397075,-0.499153,-1.174179,0.017428,-1.068286,-0.581143,-0.323350,-0.499153,-0.981658,-1.154787,0,1,0


In [146]:
def logistic_regression(X, y):
  """Trains a logistic regression classifier on the given data using stratified k-fold.

  Args:
    X: The Pandas DataFrame of features.
    y: The Pandas DataFrame of target labels.

  Returns:
    The trained logistic regression classifier.
  """
  X = X
  y = y_train.iloc[:, 0]

  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  classifier = LogisticRegression(max_iter = 10000)
  #scores = []

  classifier.fit(X, y)
  '''
  for fold_train, fold_test in cv.split(X, y):
    classifier.fit(X.loc[fold_train], y[fold_train])
    scores.append(classifier.score(X.loc[fold_test], y[fold_test]))
    '''
  return classifier #, np.mean(score)

In [147]:
classifier = logistic_regression(X = X_train, y = y_train)

In [149]:
classifier


LogisticRegression(max_iter=10000)

In [133]:
y_train[