In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
from sklearn.preprocessing import MinMaxScaler
# Import the linear models from scikit-learn
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# load data
path = '/Users/carolynliu/capp30254_fight_potatoes/data processing/window_data_features/'
# os.listdir returns bare file names in arbitrary order; sort them so the
# load order (and df_dict's key order) is the same on every run.
csv_files = sorted(os.listdir(path))

# Map each CSV file name (e.g. 'BIO_Group0_training.csv') to its DataFrame,
# with the file's first column used as the index.
df_dict = {}
for df_name in csv_files:
    # Match on the extension itself so entries that merely contain ".csv"
    # (backups such as 'x.csv.bak', editor swap files) are not parsed.
    if df_name.endswith(".csv"):
        df_dict[df_name] = pd.read_csv(os.path.join(path, df_name), index_col=[0])

In [3]:
# Sanity check: show which files were loaded and how many.
df_dict.keys(), len(df_dict)

(dict_keys(['ZTS_Group4_testing.csv', 'BIO_Group3_testing.csv', 'ZTS_Group1_testing.csv', 'BIO_Group3_training.csv', 'BIO_Group4_training.csv', 'ZTS_Group2_testing.csv', 'BIO_Group0_testing.csv', 'ZTS_Group2_training.csv', 'BIO_Group2_training.csv', 'ZTS_Group3_training.csv', 'ZTS_Group4_training.csv', 'BIO_Group0_training.csv', 'BIO_Group2_testing.csv', 'ZTS_Group0_testing.csv', 'ZTS_Group1_training.csv', 'BIO_Group1_training.csv', 'ZTS_Group0_training.csv', 'BIO_Group4_testing.csv', 'ZTS_Group3_testing.csv', 'BIO_Group1_testing.csv']),
 20)

In [4]:
def normalize(X_train, X_test):
    '''
    Normalize features using sklearn MinMaxScaler.

    The scaler is fitted on X_train only and then applied to both frames, so
    no information from the test set leaks into the scaling.

    Inputs:
        X_train, X_test: pandas DataFrames of feature columns
    Returns:
        (X_train_norm, X_test_norm): DataFrames with the same columns AND the
        same index as the corresponding inputs
    '''
    scaler = MinMaxScaler().fit(X_train)
    # transform() returns a bare ndarray; restore both the column labels and
    # the row index so the scaled features stay aligned with target Series
    # taken from the original frames.
    X_train_norm = pd.DataFrame(scaler.transform(X_train),
                                columns=X_train.columns, index=X_train.index)
    X_test_norm = pd.DataFrame(scaler.transform(X_test),
                               columns=X_test.columns, index=X_test.index)
    return X_train_norm, X_test_norm

In [5]:
# process and normalize data
def process(train, test):
    '''
    Split a train/test pair of DataFrames into normalized features and targets.

    Every column from position 3 onward is treated as a feature (the first
    three columns are skipped -- presumably date/target columns; verify
    against the CSV layout). 'Y_boolean' is the classification target and
    'Stock price' the regression target.

    Returns:
        X_train, X_test, y_train_class, y_train_price, y_test_class, y_test_price
    '''
    feature_cols = slice(3, None)
    X_train, X_test = normalize(train.iloc[:, feature_cols],
                                test.iloc[:, feature_cols])

    # Targets in order: train class, train price, test class, test price.
    y_train_class, y_train_price, y_test_class, y_test_price = [
        frame[target]
        for frame in (train, test)
        for target in ('Y_boolean', 'Stock price')
    ]
    return X_train, X_test, y_train_class, y_train_price, y_test_class, y_test_price

In [6]:
# train model
def _hyperparameter_grid(model_name):
    '''Return (param_name, values) describing the sweep for one model key.'''
    step_sizes = [.00001, 0.0001, .001, .01, .1, .2, .3, .4, .5, 1]
    if model_name == 'logit':
        return 'C', [.00001, .0001, .001, .01, .5, 1, 2.5, 5]
    if model_name in ('sgdc', 'sgdr'):
        return 'eta0', step_sizes
    if model_name == 'GradientBoostingClassifier':
        return 'learning rate', step_sizes
    if model_name in ('rfc', 'rfr'):
        return 'n_estimators', list(range(5, 250, 5))
    if model_name == 'svm':
        return 'C', [.001, .01, .5, 1, 2.5, 5]
    # Any other key (including 'knn') is treated as k-nearest neighbours.
    return 'n_neighbors', list(range(5, 100, 5))


def _build_model(model_name, val):
    '''Instantiate the estimator for model_name with its swept parameter set to val.'''
    if model_name == 'logit':
        return LogisticRegression(C=val, max_iter=20000, random_state=123)
    if model_name == 'sgdc':
        # NOTE(review): learning_rate is left at its default here; per the
        # scikit-learn docs the default 'optimal' schedule does not use eta0,
        # so this sweep may have no effect -- confirm intent.
        return SGDClassifier(alpha=0.0001, epsilon=0.01, eta0=val, max_iter=20000, random_state=123)
    if model_name == 'sgdr':
        return SGDRegressor(alpha=0.0001, epsilon=0.01, eta0=val, max_iter=20000, random_state=123)
    if model_name == 'GradientBoostingClassifier':
        return GradientBoostingClassifier(learning_rate=val, random_state=123)
    if model_name == 'rfc':
        return RandomForestClassifier(n_estimators=val, random_state=123)
    if model_name == 'rfr':
        return RandomForestRegressor(n_estimators=val, random_state=123)
    if model_name == 'svm':
        return SVC(C=val, random_state=123)
    # Fallback kept for backward compatibility: unknown keys behave like 'knn'.
    return KNeighborsClassifier(n_neighbors=val)


def train_model(model_name, X_train, X_test, y_train_class, y_train_price, y_test_class, y_test_price):
    '''
    Train one model family over a grid of values for a single hyperparameter.

    Inputs:
        model_name: one of 'logit', 'sgdc', 'sgdr', 'GradientBoostingClassifier',
            'rfc', 'rfr', 'svm', 'knn' (any unrecognised name is treated as 'knn')
        X_train, X_test: feature matrices
        y_train_class, y_test_class: targets used by the classifiers
        y_train_price, y_test_price: targets used by the regressors ('sgdr', 'rfr')
    Returns:
        train_results: list of train scores, one per hyperparameter value
        test_results: list of test scores, one per hyperparameter value
        params: the hyperparameter values tried
        param_name: display name of the swept hyperparameter
        y_test_pred: for regressors, test-set predictions of the model fitted
            with the LAST value in params (not the best-scoring one);
            None for classifiers
        y_test: the test target that was scored against

    Scores are R^2 for regressors and the estimator's own score() for classifiers.
    '''
    is_regressor = model_name in ('sgdr', 'rfr')
    if is_regressor:
        y_train, y_test = y_train_price, y_test_price
    else:
        y_train, y_test = y_train_class, y_test_class

    param_name, params = _hyperparameter_grid(model_name)

    train_results = []
    test_results = []
    y_test_pred = None
    for val in params:
        # A fresh estimator per value; nothing is carried over between fits.
        model = _build_model(model_name, val)
        model.fit(X_train, y_train)

        if is_regressor:
            # Predictions are only needed for regressors (they are returned
            # for the price plots); classifiers are scored directly.
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)
            train_acc = metrics.r2_score(y_train, y_train_pred)
            test_acc = metrics.r2_score(y_test, y_test_pred)
        else:
            train_acc = model.score(X_train, y_train)
            test_acc = model.score(X_test, y_test)
        train_results.append(train_acc)
        test_results.append(test_acc)
    return train_results, test_results, params, param_name, y_test_pred, y_test


In [7]:
# function to run models
# Short model key -> human-readable name used in plot titles.
# The insertion order is the order in which the models are trained.
models = dict([
    ('logit', 'Logistic Regression'),
    ('sgdc', 'SGD Classifier'),
    ('sgdr', 'SGD Regression'),
    ('GradientBoostingClassifier', 'Gradient Boosting Classifier'),
    ('rfc', 'Random Forest Classifier'),
    ('rfr', 'Random Forest Regression'),
    ('svm', 'Support Vector Classification'),
    ('knn', 'KNN'),
])
def run_models(train, test, name, plot_dir='/Users/carolynliu/capp30254_fight_potatoes/phase2/plots'):
    '''
    Train every model in `models` on one train/test group and save one
    score-vs-hyperparameter plot per model.

    Inputs:
        train, test: keys into df_dict for the training / testing DataFrames
        name: group label used in plot titles and file names (e.g. 'BIO0')
        plot_dir: directory the PNGs are written to (created if missing);
            defaults to the project's original plots folder
    Returns:
        dict mapping each model key to the list
        [train_results, test_results, params, param_name, y_test_pred, y_test]
        as produced by train_model
    '''
    # Create the output folder up front so a bad path fails before training.
    os.makedirs(plot_dir, exist_ok=True)

    X_train, X_test, y_train_class, y_train_price, y_test_class, y_test_price = process(df_dict[train], df_dict[test])

    group_results = {}
    for m in models:
        group_results[m] = list(train_model(m, X_train, X_test, y_train_class, y_train_price,
                                            y_test_class, y_test_price))

    # plot results
    for model, (train_scores, test_scores, params, param_name, _, _) in group_results.items():
        # The regressors are scored with R^2, not accuracy; label them accordingly.
        metric = 'R^2' if model in ('sgdr', 'rfr') else 'accuracy'

        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12,6))
        ax.plot(params, train_scores, color="steelblue", label=f'Train {metric}', ls='dashed')
        ax.plot(params, test_scores, color="lightblue", label=f'Test {metric}')
        ax.legend(loc="center right", fontsize=12)
        ax.set_xlabel(param_name, fontsize=12)
        ax.set_ylabel(metric, fontsize=12)

        # Mark the hyperparameter value with the highest test score
        # (first occurrence when there are ties).
        best_score = max(test_scores)
        best_param = params[test_scores.index(best_score)]
        fig.suptitle(f'{models[model]} for {name}')
        ax.annotate(f'max {metric}: {round(best_score, 3)}', xy=(best_param, best_score),
                    arrowprops=dict(facecolor='black'),)
        fig.savefig(os.path.join(plot_dir, f'{model}_{name}.png'), dpi=fig.dpi, bbox_inches='tight')
        # Close each figure so a long run does not accumulate open figures.
        plt.close(fig)
    return group_results

In [8]:
# run models
# Group labels and file names follow a fixed pattern: two tickers (BIO, ZTS),
# five groups each (0-4), with one training and one testing CSV per group.
names = [f'{ticker}{group}' for ticker in ('BIO', 'ZTS') for group in range(5)]
train = [f'{ticker}_Group{group}_training.csv' for ticker in ('BIO', 'ZTS') for group in range(5)]
test = [f'{ticker}_Group{group}_testing.csv' for ticker in ('BIO', 'ZTS') for group in range(5)]

# results[label] is a one-element list holding the dict returned by run_models.
results = {name: [] for name in names}
for tr, te, name in zip(train, test, names):
    group_output = run_models(tr, te, name)
    results[name].append(group_output)

In [100]:
# plot sgd/rf regression actual vs predicted price for best performing model and group for bio and zts

def plot_prices(name, df_name, model_name, results, plot_dir='/Users/carolynliu/capp30254_fight_potatoes/phase2/plots'):
    '''
    Plot actual vs predicted price over time for one group and one regression
    model, and save the figure as 'price_<name>_<model_name>.png'.

    Inputs:
        name: group label, a key of results (e.g. 'BIO1')
        df_name: key into df_dict for the testing DataFrame; its
            'Report Date' column supplies the x axis
        model_name: model key whose stored predictions are plotted
        results: dict of label -> [dict of model key -> result list], where
            the last two items of each result list are the predicted and
            true test prices
        plot_dir: directory the PNG is written to (created if missing);
            defaults to the project's original plots folder
    Raises:
        ValueError: if no predictions are stored for model_name (the
            classification models store None)
    '''
    result = results[name][0][model_name]
    y_pred, y_true = result[-2], result[-1]
    if y_pred is None:
        # Fail with a clear message instead of an obscure matplotlib error.
        raise ValueError(
            f"no stored predictions for model '{model_name}' in group '{name}'; "
            "only regression models keep test-set predictions"
        )
    dates = pd.to_datetime(df_dict[df_name]['Report Date'])

    os.makedirs(plot_dir, exist_ok=True)
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12,6))
    ax.plot(dates, y_pred, color="steelblue", label='Predicted Price', ls='dashed')
    ax.plot(dates, y_true, color="lightblue", label='Actual Price')
    ax.legend(loc="center right", fontsize=12)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel('Price', fontsize=12)

    fig.suptitle(f'Actual vs Predicted Price for {name} using {models[model_name]}')
    fig.savefig(os.path.join(plot_dir, f'price_{name}_{model_name}.png'), dpi=fig.dpi, bbox_inches='tight')
    plt.close(fig)

In [101]:
# create predicted vs true price plots
# Group label -> regression model whose predictions are plotted for that group.
names = {'BIO1': 'sgdr', 'ZTS0': 'sgdr', 'BIO3': 'rfr', 'ZTS3': 'rfr'}
# Testing CSV for each label, in the same order: 'BIO1' -> 'BIO_Group1_testing.csv'.
df_names = [f'{label[:3]}_Group{label[3:]}_testing.csv' for label in names]

for (name, model_name), df_name in zip(names.items(), df_names):
    plot_prices(name, df_name, model_name, results)