In [164]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    roc_curve,
    average_precision_score
)
from sklearn.linear_model import LogisticRegression
import json

In [165]:
def preprocess(trainDF, testDF):
    scaler = StandardScaler()
    trainDF = scaler.fit_transform(trainDF)
    testDF = scaler.fit_transform(testDF)
    return trainDF, testDF

In [166]:
def eval_gridsearch(clf, pgrid, xTrain, yTrain, xTest, yTest):

    cv = GridSearchCV(clf, param_grid=pgrid, cv=10)
    cv.fit(xTrain, yTrain)

    clf = cv.best_estimator_
    best_params = cv.best_params_

    yHat = cv.predict(xTest)
    yHat_proba = cv.predict_proba(xTest)[:,1]

    auc = roc_auc_score(yTest, yHat_proba)

    auprc = average_precision_score(yTest, yHat_proba)

    f1 = f1_score(yTest, yHat)

    fpr, tpr, _ = roc_curve(yTest, yHat_proba)

    return {'AUC': auc, 'AUPRC': auprc, 'F1': f1}, {'fpr': fpr, 'tpr': tpr}, best_params


In [167]:
def eval_randomsearch(clf, pgrid, xTrain, yTrain, xTest, yTest):
    permutations = np.prod([len(v) for v in pgrid.values()])

    cv = RandomizedSearchCV(clf, param_distributions=pgrid, n_iter=int(permutations*0.33), cv=10)
    cv.fit(xTrain, yTrain)

    clf = cv.best_estimator_
    best_params = cv.best_params_

    yHat = cv.predict(xTest)
    yHat_proba = cv.predict_proba(xTest)[:,1]

    auc = roc_auc_score(yTest, yHat_proba)

    auprc = average_precision_score(yTest, yHat_proba)

    f1 = f1_score(yTest, yHat)

    fpr, tpr, _ = roc_curve(yTest, yHat_proba)

    return {'AUC': auc, 'AUPRC': auprc, 'F1': f1}, {'fpr': fpr, 'tpr': tpr}, best_params


In [168]:
def get_parameter_grid(mName):
    if mName == 'LR (None)':
        return {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'tol': [0.0001, 0.0004]}
    elif mName == 'LR (L1)':
        return {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'tol': [0.0001, 0.0004]}
    elif mName == 'LR (L2)':
        return {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'tol': [0.0001, 0.0004]}


In [169]:
def eval_searchcv(clfName, clf, clfGrid,
                  xTrain, yTrain, xTest, yTest,
                  perfDict, rocDF, bestParamDict):
    # evaluate grid search and add to perfDict
    cls_perf, cls_roc, gs_p  = eval_gridsearch(clf, clfGrid, xTrain,
                                               yTrain, xTest, yTest)
    perfDict[clfName + " (Grid)"] = cls_perf
    # add to ROC DF
    rocRes = pd.DataFrame(cls_roc)
    rocRes["model"] = clfName
    rocDF = pd.concat([rocDF, rocRes], ignore_index=True)
    # evaluate random search and add to perfDict
    clfr_perf, _, rs_p  = eval_randomsearch(clf, clfGrid, xTrain,
                                            yTrain, xTest, yTest)
    perfDict[clfName + " (Random)"] = clfr_perf
    bestParamDict[clfName] = {"Grid": gs_p, "Random": rs_p}
    return perfDict, rocDF, bestParamDict

In [None]:
def main():
    df = pd.read_csv("Models/Data/data.csv")

    # make classes
    X = df.drop(columns=['close', 'otc'])
    y = pd.DataFrame(columns=['price'])
    for i, row in df.iterrows():
        if row['open'] - row['close'] > 0:
            y.loc[i] = 1 # decrease
        else:
            y.loc[i] = 0 # increase

    y = y.to_numpy().flatten()

    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.2)
    
    xTrain, xTest = preprocess(xTrain, xTest)

    perfDict = {}
    rocDF = pd.DataFrame()
    bestParamDict = {}

    print("Tuning Unregularized Logistic Regression --------")
    # logistic regression (unregularized)
    unregLrName = "LR (None)"
    unregLrGrid = get_parameter_grid(unregLrName)
    # fill in
    lrClf = LogisticRegression()
    perfDict, rocDF, bestParamDict = eval_searchcv(unregLrName, lrClf, unregLrGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, rocDF, bestParamDict)
    # logistic regression (L1)
    print("Tuning Logistic Regression (Lasso) --------")
    lassoLrName = "LR (L1)"
    lassoLrGrid = get_parameter_grid(lassoLrName)
    # fill in
    lassoClf = LogisticRegression(penalty='l1', solver='liblinear', max_iter=300)
    perfDict, rocDF, bestParamDict = eval_searchcv(lassoLrName, lassoClf, lassoLrGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, rocDF, bestParamDict)
    # Logistic regression (L2)
    print("Tuning Logistic Regression (Ridge) --------")
    ridgeLrName = "LR (L2)"
    ridgeLrGrid = get_parameter_grid(ridgeLrName)
    # fill in
    ridgeClf = LogisticRegression(penalty='l2')
    perfDict, rocDF, bestParamDict = eval_searchcv(ridgeLrName, ridgeClf, ridgeLrGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, rocDF, bestParamDict)

    perfDF = pd.DataFrame.from_dict(perfDict, orient='index')
    print(perfDF)
    # save roc curves to data
    rocDF.to_csv('out', index=False)
    # store the best parameters
    with open('best', 'w') as f:
        json.dump(bestParamDict, f)

In [171]:
def holdout(model, xFeat, y, testSize):
    xTrain, xTest, yTrain, yTest = train_test_split(xFeat, y, test_size=testSize)
    resultDict = eval_randomsearch(model, xTrain, yTrain, xTest, yTest)
    return resultDict

In [172]:
main()

Tuning Unregularized Logistic Regression --------
Tuning Logistic Regression (Lasso) --------
Tuning Logistic Regression (Ridge) --------
                         AUC     AUPRC        F1
LR (None) (Grid)    0.897898  0.849056  0.816901
LR (None) (Random)  0.900901  0.851657  0.828571
LR (L1) (Grid)      0.891141  0.846480  0.828571
LR (L1) (Random)    0.890390  0.858577  0.816901
LR (L2) (Grid)      0.897898  0.849056  0.816901
LR (L2) (Random)    0.895646  0.846632  0.816901


AttributeError: 'str' object has no attribute 'rocOutput'