In [2]:
import pandas as pd
import numpy as np
import datetime
#from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from skfeature.function.similarity_based import fisher_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score 
%matplotlib inline

In [3]:
BIO = pd.read_csv("phase3_data/bio.csv")
ZTS = pd.read_csv("phase3_data/zts.csv")
BIO = BIO.loc[:, 'Report Date': 's_diff']
ZTS = ZTS.loc[:, 'Report Date': 's_diff']
dates = ZTS['Report Date']

datetime.datetime(2022, 2, 22, 0, 0)

In [4]:
# function to select data given date window
def window(start, end, df, date):
    '''
    Given a start and end date, return df with data only
    from that period
    
    Inputs:
        start/end: start and end dates
            ex: start = '2022-01-23' ('YYYY-MM-DD')
        df: pd dataframe
        date: pd series with dates of all possible dates in data
    
    Returns: pd dataframe
    '''
    date = date[date.between(start, end, inclusive='both')]
    # concat dates and dataframe
    df = pd.concat([date, df], axis = 1, join="inner")
    return df

In [5]:
def feature_selection_FS(training_dataset, testing_dataset, bar):
    """
    find the important feature using Fishers' score
    """
    X = training_dataset.iloc[:, 3:]
    Y_price = training_dataset['Stock price']
    Y_dummy = training_dataset['Y_boolean']
    date_train = training_dataset['Report Date']
    date_test = testing_dataset['Report Date']
    
    ranks = fisher_score.fisher_score(X.to_numpy(), Y_dummy)
    feat_importances = pd.Series(ranks, training_dataset.columns[3:])
    feat_importances.plot(kind='barh', color ="teal")
    plt.show()
    
    feat_selected = feat_importances.to_frame('importance')
    # select importance > than bar
    feat_selected = feat_selected[feat_selected['importance'] > bar]
    select_feature_lst = list(feat_selected.index)
    X1 = X[np.intersect1d(X.columns, select_feature_lst)]
    Y_true_price = testing_dataset['Stock price']
    Y_true = testing_dataset['Y_boolean']
    testing_dataset = testing_dataset[np.intersect1d(testing_dataset.columns, select_feature_lst)]
    
    new_training_dataset = pd.merge(Y_price, Y_dummy, left_index=True, right_index=True)
    new_training_dataset = pd.merge(new_training_dataset, X1, left_index=True, right_index=True)
    new_training_dataset = pd.merge(date_train, new_training_dataset, left_index=True, right_index=True)
    
    new_testing_dataset = pd.merge(Y_true_price, Y_true, left_index=True, right_index=True)
    new_testing_dataset = pd.merge(new_testing_dataset, testing_dataset,left_index=True, right_index=True)
    new_testing_dataset = pd.merge(date_test, new_testing_dataset, left_index=True, right_index=True)
    
    return new_training_dataset, new_testing_dataset
    
    

In [6]:
def feature_selection_IG(training_dataset, testing_dataset, bar):
    """
    find the important feature using IG method
    """

    X = training_dataset.iloc[:, 3:]
    Y_price = training_dataset['Stock price']
    Y_dummy = training_dataset['Y_boolean']
    date_train = training_dataset['Report Date']
    date_test = testing_dataset['Report Date']
    
    importances = mutual_info_classif(X, Y_dummy, random_state=1241)
    feat_importances = pd.Series(importances, training_dataset.columns[3:])
    feat_importances.plot(kind='barh', color ="teal")
    plt.show()
    
    feat_selected = feat_importances.to_frame('importance')
    # select importance > than bar
    feat_selected = feat_selected[feat_selected['importance'] > bar]
    select_feature_lst = list(feat_selected.index)
    X1 = X[np.intersect1d(X.columns, select_feature_lst)]
    Y_true_price = testing_dataset['Stock price']
    Y_true = testing_dataset['Y_boolean']
    testing_dataset = testing_dataset[np.intersect1d(testing_dataset.columns, select_feature_lst)]
    
    new_training_dataset = pd.merge(Y_price, Y_dummy, left_index=True, right_index=True)
    new_training_dataset = pd.merge(new_training_dataset, X1, left_index=True, right_index=True)
    new_training_dataset = pd.merge(date_train, new_training_dataset, left_index=True, right_index=True)
    
    new_testing_dataset = pd.merge(Y_true_price, Y_true, left_index=True, right_index=True)
    new_testing_dataset = pd.merge(new_testing_dataset, testing_dataset,left_index=True, right_index=True)
    new_testing_dataset = pd.merge(date_test, new_testing_dataset, left_index=True, right_index=True)
    
    return new_training_dataset, new_testing_dataset
    

In [7]:
def test_feature_select(training_dataset, validate_dataset):
    """
    check the performance of the current selected: use only training dataset accuracy 
    """
    # run baseline logit
    logit = LogisticRegression(solver = 'lbfgs', random_state=1241)
    X = training_dataset.iloc[:, 3:]
    Y_dummy = training_dataset['Y_boolean']
    logit.fit(X, Y_dummy)
    Y_dummy_validate = validate_dataset['Y_boolean']
    X_validate = validate_dataset.iloc[:, 3:]
    Y_pred = logit.predict(X_validate)
    # pred = lr.predict(testing_dataset)
#     score = logit.score(testing_dataset, Y_true)
    
    score = accuracy_score(Y_dummy_validate, Y_pred)
    return score

In [8]:
# collect all date
date_starts = []
date = datetime.datetime(2020, 3, 2)
date_starts.append(date)

for idx in range(722):
    date += datetime.timedelta(days=1)
    date_starts.append(date)


In [9]:
# currently using BIO as example
# loop for feature selection and modeling
for train_date_start in date_starts:
    train_date_end = train_date_start + datetime.timedelta(days=90)
    validate_date_start = train_date_end + datetime.timedelta(days=1)
    validate_date_end = validate_date_start + datetime.timedelta(days=30)
    test_date_start = validate_date_end + datetime.timedelta(days=1)
    test_date_end = test_date_start + datetime.timedelta(days=30)
    
    train_date_start = str(train_date_start)[:10]
    train_date_end = str(train_date_end)[:10]
    validate_date_start = str(validate_date_start)[:10]
    validate_date_end = str(validate_date_end)[:10]
    test_date_start = str(test_date_start)[:10]
    test_date_end = str(test_date_end)[:10]
    
    train_data = window(train_date_start, train_date_end, BIO, date)
    validate_data = window(validate_date_start, validate_date_end, BIO, date)
    test_data = window(test_date_start, test_date_end, BIO, date)
    
    bars_FS = [0,1,2,3,4,5,6,7,8,9,10]
    bars_IG = [0,0.001,0.005,0.01,0.015,0.02,0.025, 0.03,0.035,0.04,0.045,0.05,0.055,0.06,0.065,0.07, 0.075, 0.8,0.085, 0.9,0.095,0.1]
    best_accuracy = 0
    best_bar = []
    
    for bar_FS in bars_FS:
        for bar_IG in bars_IG:
            try:
                train_BIO_selected, validate_BIO_selected = feature_selection_FS(train_data, validate_data, bar_FS)
                train_BIO_selected, validate_BIO_selected = feature_selection_IG(train_BIO_selected, validate_BIO_selected, bar_IG)
            # validate_testing:
                score = test_feature_select(training_dataset, validate_dataset)
                if score > best_accuracy:
                    best_accuracy = score
                    best_bar = [bar_FS, bar_IG]
            except:
                continue
   

AttributeError: 'datetime.datetime' object has no attribute 'between'