In [21]:
from typing import Dict, Tuple

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold

# visualization tools:
import matplotlib.pyplot as plt
import seaborn as sns

# models:
from xgboost import XGBClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

# evaluation functions:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix



pd.set_option('display.max_columns', 100)


In [2]:
# notebook goal: Setup a basic machine learning framework that cleans data, standardizes features,
#  evaluates feature impt, shap values, and a myriad of ML algorithms
# TODO: add the day-of-week as a feature
# TODO: Add in target date versus historic reference dates
# TODO: Add in volume-based feature functionality
# TODO: Evaluate standardizing features per stock or one model per stock - may not be enough data realistically
# TODO: Check bol-range-pct calculation - only giving zero value
# TODO: Add profit point forecast

In [3]:
# functions:

def clean_stock_data(dataframe: pd.DataFrame) -> pd.DataFrame :

    '''removes nulls and in the future will be built out to do any additonal cleaning on the dataframe that is necessary
    Args:
        dataframe: pandas dataframe containing all of the potential features
        parameters: 
            calculation_field: field on which all of the features are built

    Returns:
        dataframe: dataset that is ready to load into a machine learning framework
    '''

    #TODO: In pipeline write this output to the catalogue
    # remove records the preceed the target period to have complete information:
    dataframe = dataframe.dropna() 
    #dataframe = dataframe.reset_index(drop = True) # we won't reset the index for now for traceability back to the date, ticker combination later after training

    # set the date as an index to us post-forecasting: This is a bad idea, come back to the concept
    #dataframe.set_index(keys = 'date', verify_integrity = False, inplace = True) # verify integrity Fale to allow duplicates**
    
    # remove fields that will not be used as predictive features (can be hardcoded since dataframe structure will be the same):
    dataframe = dataframe.drop(columns = [ 'date', 'high', 'low', 'open', 'volume', 'adj_close'])
    

    return dataframe


def identify_fields_to_standardize(dataframe: pd.DataFrame, parameters: Dict) -> np.array :

    '''creates a list of the continuous fields to standardize by dimension within the predictive model; NOTE: this is used within the standardizer
    
    Args:
        dataframe: dataframe that contains all of the fields of interest to be used in the calculations
        parameters:
            continuous_feature_cutoff: ratio of unique values to record count to be used to codify continuous features -> removes records from the standardization process which don't have enough data to standardize (e.g., boolean)

    Returns: list of continuous fields to use in the standardization process based on user's specifications of "uniqueness" threshold    

    '''

    numeric_fields = dataframe.select_dtypes(include = 'number').columns
    records = len(dataframe)

    record_summary = pd.DataFrame(dataframe[numeric_fields].nunique(), columns = ['unique_values'])
    record_summary['rows_in_df'] = records
    record_summary['value_to_record_ratio'] = record_summary['unique_values']/ record_summary['rows_in_df']

    # filter for a threshold specified by the user:
    record_summary = record_summary[record_summary['value_to_record_ratio'] > parameters['continuous_feature_cutoff']]

    # remove percentage features # TODO: later add in functionality to remove percentage based features

    return record_summary.index


# Justification for approach on scaling - the argument can be made that since our approach will generalize movemements across multiple securities that we need to standardize each security to its own price range.  Therefore, any features with price-relative values will be scaled per the security's price values to avoid odd splits in tree-based algos
# the concern with standardization is generally focused on not letting any one feature have considerably more weight in a model than another; however in this case, 


def standardize_continuous_features(dataframe: pd.DataFrame, parameters: Dict) -> pd.DataFrame:

    '''function that identifies the continuious features in the dataframe and standardizes each feature by equity to enable scaling relative to each equity
    
    Args:
        Dataframe: Pandas dataframe to be used in machine learning
        Parameters:
            stock_field: field indicating the stock for the window function to scan
            calculation_field: field for which the target is being calculated (used for drop in main row merge)
    
    Returns:
        Dataframe: containing the standardized data fields
    
    '''

    continuous_fields = list(identify_fields_to_standardize(dataframe = dataframe, parameters = parameters))

    # add in the ticker for grouping next:
    continuous_fields.append(parameters['stock_field'])

    # downselect to the fields that will be used to standardize:
    continuous_dataframe = dataframe[continuous_fields]

    # calculate z-scores: --> Standardizes within each feature to scale accordingly
    z_scores = (continuous_dataframe - continuous_dataframe.groupby(by = parameters['stock_field']).transform('mean')) / continuous_dataframe.groupby(by = parameters['stock_field']).transform('std')

    # drop the null ticker (not needed post groupby): 
    z_scores.drop(columns = [ parameters['stock_field'], parameters['calculation_field'] ], inplace = True)

    # rename the fields to indicate standardization:
    z_scores.columns = z_scores.columns + '_standardized'

    # drop original continuous fields # TODO: coming back after calculation checks:
    if parameters['drop_original_fields'] == True:
        continuous_fields.remove(parameters['stock_field'])
        dataframe.drop(columns = continuous_fields, inplace = True)

    # append the fields back into the core dataframe:
    z_scores = pd.concat([dataframe, z_scores], axis = 1)

    # remove the standardized target field:
    z_scores.drop(columns = z_scores.columns[z_scores.columns.str.contains('target')][1], inplace = True)

    # remove unnecessary items:
    del continuous_fields, continuous_dataframe

    return z_scores



def one_hot_encode_tickers(dataframe: pd.DataFrame, parameters: Dict) -> pd.DataFrame:

    '''Returns one-hot encoded features to the predictive dataset NOTE: May not work, but this retains some of the information in the original dataframe while also potentially giving the global model a nudge
       Note: we choose not to drop first for now, even though it's a trap; Can be used post processing or as model features
    Args:
        dataframe: core dataset that has been augmented with additional features
        parameters:
            stock_field: text field containing the 3 letter ticker of the dataset
    Returns:   
        dataframe with augmented columns
    
    '''

    dataframe = pd.get_dummies(data = dataframe, prefix = "ind", columns = [parameters['stock_field']], drop_first = False)

    return dataframe


def profile_target_variable(dataframe: pd.DataFrame, parameters: dict):


    '''Function that looks at the target variable and creates an output for the user to review and decide whether rebalancing will help classification task
    Args:
        dataframe: Main resulting dataframe from all data conversion steps
        parameters:
            
    
    '''
    # isolate the target variable:
    target_field = list(dataframe.columns[dataframe.columns.str.contains('target')])

    # create simple value count outputs:
    target_summary_table = pd.DataFrame(dataframe[target_field].value_counts()).reset_index()
    target_summary_table.rename(columns = {0 : 'counts'}, inplace = True)
    target_summary_table['proportion'] = target_summary_table['counts'] / target_summary_table['counts'].sum()

    # create bargraph and save it:
    ''' TODO : resolve ability to output a matplotlib plot in kedro catalog
    sns.countplot(x=target_field, data=dataframe)
    plt.title("Class Distribution")
    plt.show() '''
    target_field =', '.join(dataframe.columns[dataframe.columns.str.contains('target')].str.replace(r'\[|\]', ''))
    positive_proportion = target_summary_table[target_summary_table[target_field].astype(int) == 1]['proportion'].to_list()
   

    print('Classification target: ' + str(target_field) + " contains a class balance of: " + str(positive_proportion) + " in the positive case")
           

    return target_summary_table # TODO: Write this to the catalogue as a reporting output for the users



def create_training_test_splits(dataframe: pd.DataFrame, parameters: Dict) :

    '''Function that splits out training and test sets for machine learning; for the purposes of this model the way we piose the problem allows for random train test split
    Args:
        dataframe: pandas dataframe containing only the target field and the features to be used by the classifier
        parameters:
            test_ratio: proportion of samples in the dataframe to be used as a test set once the models are tuned and evaluated
    
    Returns:
        X_train: training set for use in ML process
        X_test: test set to be held out until all cross-validation is completed
        y_train: training set for target variables
        y_test: test target to be held out until all cross-validation is completed

    '''

    # define Y and x:
    target_feature = list(dataframe.columns[dataframe.columns.str.contains('target')])

    y = dataframe[target_feature]
    X = dataframe.drop(columns = target_feature)

    # create the training and test splits:
    X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=parameters['test_size'], random_state=parameters['seed'], stratify = y)

    #y_train = y_train.values.ravel()
    #y_test = y_test.values.ravel()

    return X_train, X_test, y_train, y_test


def custom_recall_score(confusion_matrix: np.array) -> np.int64 :

    recall_value = confusion_matrix[1,1] / (confusion_matrix[1,1] + confusion_matrix[1,0])

    return recall_value


def custom_precision_score(confusion_matrix: np.array) -> np.int64 : 

    precision_value = confusion_matrix[1,1] / (confusion_matrix[1,1] + confusion_matrix[0,1])

    return precision_value




In [4]:
#df = catalog.load('combined_modeling_input')
df = pd.read_csv('../data/03_primary/combined_modeling_input.csv')

In [5]:
# setup the parameters for the model: 

parameters = {'continuous_feature_cutoff' : 0.6,
              'stock_field' : 'ticker',
              'calculation_field' : 'close',
              'drop_original_fields' : True,
              'drop_stock_field': True, # keep this fixed 
              'test_size' : 0.20, # proportion of the dataset held out as the test set
              'seed' : 1187,
              'cross_val_splits' : 5,
              'c' : 1.0,
              'kernel' : 'rbf',
              'gamma' : 'scale'
            }




In [31]:
# remove the null values from the dataset and drop un-needed columns for the classifier:
test = clean_stock_data(dataframe= df)
# test: standardize features:
test = standardize_continuous_features(dataframe = test, parameters = parameters)
# one-hot encode: 
test = one_hot_encode_tickers(dataframe = test, parameters= parameters)
# create training and test sets
X_train, X_test, y_train, y_test = create_training_test_splits(dataframe=test, parameters= parameters)






In [7]:
def train_model(X_train: np.array,X_test: np.array, y_train: np.array, y_test: np.array, parameters: dict ):


    ''' Function that trains ML models via cross validation specified by the user; recommends best model for classification of target task

    Args:
        X_train: pre split training set
        X_test: pre split testing set - To actually be held out, validation will be completed within the training set

    '''

    


    return 



In [8]:
# start with just creating a logistic regression and getting a confusion matrix, then scale to K-fold

y_train = y_train.values.ravel()
#y_train = np.array(y_train).reshape(-1, 1)

clf = LogisticRegression(penalty = 'l2', max_iter= 100000, random_state = 1187, C = 1.0, n_jobs = -1)


clf.fit(X = X_train, y = y_train)
y_pred = clf.predict(X_train)

# show the confusion matrix and the accuracy measures:

accuracy = clf.score(X = X_train, y = y_train)
precision = precision_score(y_true = y_train, y_pred = y_pred)
recall = recall_score(y_true = y_train, y_pred = y_pred)
conf_matrix = confusion_matrix(y_pred = y_pred, y_true = y_train)




In [25]:
classifiers = { 'Logistic_regression' : LogisticRegression(penalty = 'l2', max_iter= 100000, random_state = 1187, C = 1.0, n_jobs = -1),
                'Random_forest' : RandomForestClassifier(n_estimators= 200, criterion= 'gini', min_samples_split = 2, max_features = 'sqrt', n_jobs = -1, random_state= 1187),
                'Support_vector_classifier' : SVC(random_state= 1187),
                'XGBoost' :  XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs = -1),
                'K_nearest_neighbors' : KNeighborsClassifier()
               

                }



In [41]:
def train_models(X_train: np.array, X_test: np.array, y_train: np.array, y_test: np.array, parameters: dict) -> pd.DataFrame:

    '''WRITE DOCUMENTATION '''
    
    # standardize the y_train set:
    y_train = y_train.values.ravel()

    # instantiate the classifiers: TODO: create parameter arguments for the clfs
    classifiers = { 'Logistic_regression' : LogisticRegression(penalty = 'l2', max_iter= 100000, random_state = 1187, C = 1.0, n_jobs = -1),
                'Random_forest' : RandomForestClassifier(n_estimators= 200, criterion= 'gini', min_samples_split = 2, max_features = 'sqrt', n_jobs = -1, random_state= 1187),
                'Support_vector_classifier' : SVC(random_state= 1187),
                'XGBoost' :  XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs = -1),
                'K_nearest_neighbors' : KNeighborsClassifier()
                }
    
    cv = StratifiedKFold(n_splits=parameters['cross_val_splits'], shuffle=True, random_state=parameters['seed']).split(X_train, y_train)


    # create storage points for the model: NOTE: not doing k-fold yet and only showing "train" outputs:
    model = []
    fold = []
    precisions = []
    recalls = []
    accuracies = []
    true_positives = []
    true_negatives = []
    false_positives = []
    false_negatives = []


    for name, classifier in classifiers.items():

        clf = classifier

        # fit and predict:
        print('training: ' + name + ' classifier')


            
        clf.fit(X = X_train, y = y_train)
        y_pred = clf.predict(X_train)
        clf_confusion_matrix = confusion_matrix(y_true = y_train, y_pred = y_pred)

        # calculate scores and append to arrays:
        clf_accuracy = clf.score(X = X_train, y = y_train)
        clf_precision = precision_score(y_true = y_train, y_pred = y_pred)
        clf_recall = recall_score(y_true = y_train, y_pred = y_pred)
        clf_tp = clf_confusion_matrix[1,1]
        clf_tn = clf_confusion_matrix[0,0]
        clf_fp = clf_confusion_matrix[0,1]
        clf_fn = clf_confusion_matrix[1,0]

        # append model performnance values to arrays:
        model.append(name)
        precisions.append(clf_precision)
        recalls.append(clf_recall)
        accuracies.append(clf_accuracy)
        true_positives.append(clf_tp)
        true_negatives.append(clf_tn)
        false_positives.append(clf_fp)
        false_negatives.append(clf_fn)
        
    results_df = pd.DataFrame(
                    {
                    "model" : model,
                    "precision" : precisions,
                    "recall" : recalls,
                    "accuracy" : accuracies,
                    "true_positive" : true_positives,
                    "true_negatives" : true_negatives,
                    "false_positives" : false_positives,
                    "false_negatives" : false_negatives
                    }
                )
    
    return results_df

    
     

In [54]:
# version with CV splits: 

def train_models(X_train: np.array, X_test: np.array, y_train: np.array, y_test: np.array, parameters: dict) -> pd.DataFrame:

    '''WRITE DOCUMENTATION '''
    
    # standardize the y_train set:
    y_train = y_train.iloc[:, 0].values
    X_train = X_train.values

    # instantiate the classifiers: TODO: create parameter arguments for the clfs, pre-HP tuning
    classifiers = { 'Logistic_regression' : LogisticRegression(penalty = 'l2', max_iter= 100000, random_state = 1187, C = 1.0, n_jobs = -1),
                'Random_forest' : RandomForestClassifier(n_estimators= 200, criterion= 'gini', min_samples_split = 2, max_features = 'sqrt', n_jobs = -1, random_state= 1187),
                'Support_vector_classifier' : SVC(random_state= 1187),
                'XGBoost' :  XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs = -1),
                'K_nearest_neighbors' : KNeighborsClassifier()
                }
    
    cv = StratifiedKFold(n_splits=parameters['cross_val_splits'], shuffle=True, random_state=parameters['seed']).split(X_train, y_train)


    # create storage points for the model: NOTE: not doing k-fold yet and only showing "train" outputs:
    model = []
    fold = []
    ###
    train_precisions = []
    test_precisions = []
    ###
    train_recalls = []
    test_recalls = []
    ###
    train_f_scores = []
    test_f_scores = []
    ###
    train_accuracies = []
    test_accuracies = []
    ###
    train_true_positives = []
    test_true_positives = []
    ###
    train_true_negatives = []
    test_true_negatives = []
    ##
    train_false_positives = []
    test_false_positives = []
    ###
    train_false_negatives = []
    test_false_negatives = []

    for name, classifier in classifiers.items():

        clf = classifier

        # fit and predict:
        print('training: ' + name + ' classifier')
        cv = StratifiedKFold(n_splits=parameters['cross_val_splits'], shuffle=True, random_state=parameters['seed']).split(X_train, y_train)


        for k, (fold_train, fold_test) in enumerate(cv):


            # fit the model on the fold
            clf.fit(X = X_train[fold_train], y = y_train[fold_train])
            
            # create predictions:
            train_pred = clf.predict(X = X_train[fold_train])
            test_pred = clf.predict(X = X_train[fold_test])


            train_confusion_matrix = confusion_matrix(y_true = y_train[fold_train], y_pred = train_pred)
            test_confusion_matrix = confusion_matrix(y_true = y_train[fold_test], y_pred = test_pred)

            # calculate performance metrics:
            train_accuracy = clf.score(X = X_train[fold_train], y = y_train[fold_train])
            test_accuracy = clf.score(X = X_train[fold_test], y = y_train[fold_test])
    
            # calculate precision:
            train_precision = precision_score(y_true = y_train[fold_train], y_pred = train_pred)
            test_precision = precision_score(y_true = y_train[fold_test], y_pred = test_pred)
 
            # calculate recall:
            train_recall = recall_score(y_true = y_train[fold_train], y_pred = train_pred)
            test_recall = recall_score(y_true = y_train[fold_test], y_pred = test_pred)

            # calculate f-measure:
            train_f = f1_score(y_true = y_train[fold_train], y_pred = train_pred)
            test_f = f1_score(y_true = y_train[fold_test], y_pred = test_pred)

            # calculate true positives:
            train_tp = train_confusion_matrix[1,1]
            test_tp = test_confusion_matrix[1,1]

            # calculate true negatives:
            train_tn = train_confusion_matrix[0,0]
            test_tn = test_confusion_matrix[0,0]

            # calculate false positives:
            train_fp = train_confusion_matrix[0,1]
            test_fp = test_confusion_matrix[0,1]

            # calculate false negatives:
            train_fn = train_confusion_matrix[1,0]
            test_fn = test_confusion_matrix[1,0]

            # append model performnance values to arrays:
            model.append(name)
            fold.append(k)
            train_accuracies.append(train_accuracy)
            test_accuracies.append(test_accuracy)
            ###
            train_precisions.append(train_precision)
            test_precisions.append(test_precision)
            ###
            train_recalls.append(train_recall)
            test_recalls.append(test_recall)
            ###
            train_f_scores.append(train_f)
            test_f_scores.append(test_f)
            ###
            train_true_positives.append(train_tp)
            test_true_positives.append(test_tp)
            ###
            train_true_negatives.append(train_tn)
            test_true_negatives.append(test_tn)
            ###
            train_false_positives.append(train_fp)
            test_false_positives.append(test_fp)
            ###
            train_false_negatives.append(train_fn)
            test_false_negatives.append(test_fn)
            
        
    results_df = pd.DataFrame(
                    {
                    "model" : model,
                    "fold" : fold,
                    "train_accuracy" : train_accuracies,
                    "test_accuracy" : test_accuracies,
                    ###
                    "train_precision" : train_precisions,
                    "test_precision" : test_precisions,
                    ###
                    "train_recall" : train_recalls,
                    "test_recall" : test_recalls,
                    ###
                    "train_true_positives" : train_true_positives,
                    "test_true_positives" : test_true_positives,
                    ###
                    "train_true_negatives" : train_true_negatives,
                    "test_true_negatives" : test_true_negatives,
                    ###
                    "train_false_positives" : train_false_positives,
                    "test_false_positives" : test_false_positives,
                    ###
                    "train_false_negatives" : train_false_negatives,
                    "test_false_negatives" : test_false_negatives,
                    
                    }
                )
    
    return results_df


In [55]:
output = train_models(X_train = X_train, X_test = X_test, y_train = y_train, y_test= y_test, parameters = parameters)

training: Logistic_regression classifier
training: Random_forest classifier
training: Support_vector_classifier classifier
training: XGBoost classifier
training: K_nearest_neighbors classifier


In [57]:
output



Unnamed: 0,model,fold,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_true_positives,test_true_positives,train_true_negatives,test_true_negatives,train_false_positives,test_false_positives,train_false_negatives,test_false_negatives
0,Logistic_regression,0,0.645428,0.636792,0.660511,0.654494,0.883191,0.882576,930,233,164,37,478,123,123,31
1,Logistic_regression,1,0.634218,0.658019,0.651293,0.666667,0.88509,0.901515,932,238,143,41,499,119,121,26
2,Logistic_regression,2,0.638938,0.636792,0.655197,0.654391,0.885199,0.878327,933,231,150,39,491,122,121,32
3,Logistic_regression,3,0.632448,0.632075,0.649965,0.652422,0.886148,0.870722,934,229,138,39,503,122,120,34
4,Logistic_regression,4,0.64092,0.612293,0.657913,0.632708,0.879507,0.897338,927,236,160,23,482,137,127,27


In [53]:
output.shape

(5, 16)

In [12]:
##################################### - Function development HERE

In [13]:
############################ - Machine learning loop

In [20]:
# classifiers to use: support vector machine, decision tree, random forest, xgboost, adaboost

# TODO: Add in the confusion matrix values to make output analysis more straightforward

def train_models(X_train: pd.DataFrame, y_train: pd.DataFrame, parameters) -> pd.DataFrame:

    '''Trains a series of machine learning model outputs for evaluation by the user
    
    Args:
        X_train: inputs from train-test split function
        y_train: y-series from the train-test split function

    Returns:
        Summarized output of all ML models tried
    
    '''

    # define all of the models to be used:
    classifiers = {
    "LogisticRegression": LogisticRegression(n_jobs = -1, max_iter = 100000),
   # "RandomForestClassifier": RandomForestClassifier(n_jobs = -1),
    "SVC": SVC(C = parameters['c'], kernel =parameters['kernel'], gamma = parameters['gamma']),
    #"AdaBoostClassifier": AdaBoostClassifier()
    }

    # create a readable representation of the target:
    y_train = y_train.iloc[:, 0].values
    X_train = X_train.values

    #TODO: add precision, recall, f-measure on all sets
    #accuracies = {}
    names = []
    models = []
    fold = []
    training_samples = []

    #TODO: Rename test to validation **

    train_accuracies = []
    test_accuracies = []

    train_precisions = []
    test_precisions = []

    train_recalls = []
    test_recalls = []

    train_f_measures = []
    test_f_measures = []

    # iterate through the models:
    for name, classifier in classifiers.items():

        clf = classifier
        print(name)
        print (clf)

        # iterate through the folds: ->> not ideal to nest the loops here
        cv = StratifiedKFold(n_splits=parameters['cross_val_splits'], shuffle=True, random_state=parameters['seed']).split(X_train, y_train)

        for k, (fold_train, fold_test) in enumerate(cv):

             # append model name into list:
            models.append(str(classifier))
            
            clf.fit(X_train[fold_train],y_train[fold_train])
        

            # create predictions:
            train_pred = clf.predict(X = X_train[fold_train])
            test_pred = clf.predict(X = X_train[fold_test])

            # calculate accuracies:
            train_accuracy = clf.score(X_train[fold_train], y_train[fold_train])
            test_accuracy = clf.score(X_train[fold_test], y_train[fold_test])
    
            # calculate precision:
            train_precision = precision_score(y_train[fold_train], train_pred)
            test_precision = precision_score(y_train[fold_test], test_pred)
 
            # calculate recall:
            train_recall = recall_score(y_train[fold_train], train_pred)
            test_recall = recall_score(y_train[fold_test], test_pred)

            # calculate f-measure:
            train_f = f1_score(y_train[fold_train], train_pred)
            test_f = f1_score(y_train[fold_test], test_pred)
            

            # append name:
            names.append(name)
            
            # append training sample size:
            training_samples.append( len(X_train[fold_train]) )

            # append fold number to the list:
            fold.append(k+1)

            # append score into list:
            train_accuracies.append(train_accuracy)
            test_accuracies.append(test_accuracy)

            # append precisions to the list:
            train_precisions.append(train_precision)
            test_precisions.append(test_precision)

            # append recalls to the list:
            train_recalls.append(train_recall)
            test_recalls.append(test_recall)

            # append f-measures to the list:
            train_f_measures.append(train_f)
            test_f_measures.append(test_f)
    
    results_df = pd.DataFrame({
                "names" : names,
                "model" : models,
                "fold" : fold,
                "training_samples" : training_samples,
                "train_accuracy": train_accuracies,
                "test_accuracy": test_accuracies,
                "train_precision": train_precisions,
                "test_precision": test_precisions,
                "train_recall": train_recalls,
                "test_recall": test_recalls,
                "train_f_measures": train_f_measures,
                "test_f_measures": test_f_measures
                
                })

    # create aggregated results df:
    aggregated_results_df = results_df.drop(columns = ['fold']).groupby(by = ['names', 'model']).mean()
                            
    
    return results_df, aggregated_results_df #names, model, fold, train_accuracies, test_accuracies, train_precisions, test_precisions



In [21]:
##################################### - Testing functions HERE

In [22]:

#names, model, fold, train_accuracies, test_accuracies, train_precisions, test_precisions 

output, aggregated_output= train_models(X_train = X_train, y_train = y_train, parameters = parameters)
    

LogisticRegression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


SVC
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [81]:
output

Unnamed: 0,names,model,fold,training_samples,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f_measures,test_f_measures
0,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",1,1696,0.635024,0.647059,0.655222,0.654891,0.869896,0.912879,0.74745,0.762658
1,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",2,1697,0.654095,0.639151,0.668353,0.659942,0.878443,0.867424,0.75913,0.749591
2,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",3,1697,0.643489,0.608491,0.661163,0.641399,0.873814,0.836502,0.752758,0.726073
3,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",4,1697,0.636417,0.622642,0.651002,0.641096,0.893738,0.889734,0.753299,0.745223
4,LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",5,1697,0.641131,0.653302,0.655052,0.662921,0.891841,0.897338,0.755323,0.76252
5,RandomForestClassifier,RandomForestClassifier(n_jobs=-1),1,1696,1.0,0.858824,1.0,0.854167,1.0,0.931818,1.0,0.891304
6,RandomForestClassifier,RandomForestClassifier(n_jobs=-1),2,1697,1.0,0.863208,1.0,0.857639,1.0,0.935606,1.0,0.894928
7,RandomForestClassifier,RandomForestClassifier(n_jobs=-1),3,1697,1.0,0.860849,1.0,0.861702,1.0,0.923954,1.0,0.891743
8,RandomForestClassifier,RandomForestClassifier(n_jobs=-1),4,1697,1.0,0.849057,1.0,0.856631,1.0,0.908745,1.0,0.881919
9,RandomForestClassifier,RandomForestClassifier(n_jobs=-1),5,1697,1.0,0.903302,1.0,0.914179,1.0,0.931559,1.0,0.922787


In [68]:
aggregated_output

Unnamed: 0_level_0,Unnamed: 1_level_0,training_samples,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f_measures,test_f_measures
names,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AdaBoostClassifier,AdaBoostClassifier(),1696.8,0.752711,0.702968,0.749999,0.713708,0.902813,0.871641,0.819266,0.784399
LogisticRegression,"LogisticRegression(max_iter=100000, n_jobs=-1)",1696.8,0.642031,0.634129,0.658158,0.65205,0.881546,0.880775,0.753592,0.749213
RandomForestClassifier,RandomForestClassifier(n_jobs=-1),1696.8,1.0,0.865162,1.0,0.866935,1.0,0.925576,1.0,0.895081
SVC,SVC(),1696.8,0.639322,0.631301,0.636062,0.631517,0.980446,0.975697,0.771486,0.766647


In [147]:
classifier = logistic_regression(X = X_train, y = y_train)

### Next to-dos:  
1.) Add parameters for all Classifiers to the parameters model  
2.) Add Select "n" best logic to the outputs
3.) Add in feature importances and feature selection before modeling run
3.) Add in Hypterparameter tuning
4.) Run with more positions/equity holdings