In [40]:
import pandas as pd
import numpy as np
from itertools import product
import matplotlib.pyplot as plt

# Raise pandas' column display limit to 100 so wide frames are printed without being truncated.

pd.set_option('display.max_columns', 100)




In [20]:
# Load the combined raw stock pull (path is relative to the notebook's folder).
df = pd.read_csv('../data/01_raw/combined_stock_pulls/combined_raw_stock_data.csv')


In [21]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,date,high,low,open,close,volume,adj_close,ticker
0,2019-01-02,58.869999,56.150002,56.439999,58.48,24892600.0,48.071712,XLE
1,2019-01-03,58.860001,57.240002,58.650002,57.900002,18024100.0,47.594952,XLE
2,2019-01-04,60.049999,58.560001,58.900002,59.869999,21351500.0,49.214333,XLE
3,2019-01-07,61.200001,59.52,60.32,60.759998,18056700.0,49.945919,XLE
4,2019-01-08,61.75,60.900002,61.610001,61.23,18692300.0,50.332279,XLE


In [22]:
def calculate_rolling_means(dataframe: pd.DataFrame,
                            stock_field: str,
                            date_field: str,
                            calculation_field: str,
                            day_ranges: list,
                            exponential = False) -> pd.DataFrame:

    '''Return a copy of the dataset, sorted by stock and date, with one moving-average column per window

    The averages are computed separately for every value of ``stock_field`` so that a window
    never mixes prices of two different securities.

    Args:
        dataframe: main dataset holding one row per stock and date; it is not modified
        stock_field: name of the column identifying the stock/ticker
        date_field: name of the column used to order the rows of each stock
        calculation_field: name of the column on which the moving averages are calculated
        day_ranges: list of window lengths in rows (e.g., [7, 14, 21])
        exponential: If True calculates exponential moving averages (span = window length)
            instead of simple moving averages

    Returns: new dataframe sorted by ``stock_field`` then ``date_field`` with one extra column
        per window named ``<days>_<calculation_field>_ema``. The first ``days - 1`` rows of
        every stock are NaN.

    NOTE(review): the simple moving average is also written to a column suffixed ``_ema``.
    Other code in this notebook looks the columns up under that name, so the suffix is kept;
    renaming it to ``_sma`` requires updating those callers at the same time.
    '''

    # sort_values returns a new frame, so the caller's dataframe is left untouched
    dataframe = dataframe.sort_values(by=[stock_field, date_field])

    # a single groupby replaces the per-ticker boolean filtering of the whole frame
    prices_by_stock = dataframe.groupby(stock_field, sort=False)[calculation_field]

    for days in day_ranges:

        if exponential:
            averages = prices_by_stock.transform(
                lambda prices: prices.ewm(span=days, min_periods=days).mean())
        else:
            averages = prices_by_stock.transform(
                lambda prices: prices.rolling(days).mean())

        # transform keeps the row order of the sorted frame, so a positional assignment is safe
        dataframe[str(days) + '_' + calculation_field + '_' + 'ema'] = averages.to_numpy()

    return dataframe


def calculate_rolling_std(dataframe: pd.DataFrame, field: str, day_ranges: list, stock_field: str = None) -> pd.DataFrame:

    '''Add one rolling standard deviation column per window for a given field

    Args:
        dataframe: dataset to which the columns are added; it is modified in place and returned
        field: name of the column on which the rolling standard deviations are calculated
        day_ranges: list of window lengths in rows (e.g., [7, 14, 21])
        stock_field: optional name of the column identifying the stock/ticker. When given, the
            windows are computed separately per stock so that the first rows of one stock do
            not include prices of the previous one. When omitted (default) the window runs over
            the whole frame in its current row order.

    Returns: the same dataframe with one extra column per window named ``<days>_<field>_std``

    The rows are used in their current order, so sort by stock and date before calling.
    '''

    for days in day_ranges:

        if stock_field is None:
            rolling_std = dataframe[field].rolling(days).std()
        else:
            rolling_std = dataframe.groupby(stock_field, sort=False)[field].transform(
                lambda prices: prices.rolling(days).std())

        dataframe[str(days) + '_' + field + '_' + 'std'] = rolling_std

    return dataframe

def fix_columns(columns: list) -> list:

    '''function that takes the column labels and modifies them to be easier to read -- assign to df.columns

    Every string label is stripped of surrounding whitespace, lower-cased, and has each of
    the substrings in ``column_string_replace`` replaced by an underscore (in list order).
    Labels that are not strings are passed through unchanged.

    Args:
        columns: the column labels, either a plain list or a pandas Index such as df.columns

    Returns: the cleaned labels; a pandas Index (same name) when an Index was passed,
        otherwise a list
    '''

    column_string_replace = ['\n','@',' ','__', '/', '-']

    def _clean(label):
        # non-string labels (e.g. integers) have no strip/lower/replace to apply
        if not isinstance(label, str):
            return label

        label = label.strip().lower()
        for string in column_string_replace:
            label = label.replace(string, '_')
        return label

    cleaned = [_clean(label) for label in columns]

    # keep returning an Index for Index input so existing `df.columns = fix_columns(df.columns)` is unchanged
    if isinstance(columns, pd.Index):
        return pd.Index(cleaned, name=columns.name)

    return cleaned


def create_above_below_indicator_fields(dataframe: pd.DataFrame,
                                        parameters: dict) -> pd.DataFrame:

    #TODO: Come back and format functions to be in proper format with black/linting with proper documentation

    '''Function that adds indicator fields or calculates percentage differences or both depending on arguments

    A moving-average column is any column whose name contains 'sma' or 'ema'. Columns this
    function produces itself (names ending in '_ind' or '_pct_diff') are skipped, so calling
    the function again does not stack indicators on top of indicators.

    Args:
        dataframe: input as a pandas dataframe with the moving-average fields already included;
            it is modified in place and returned
        parameters: dictionary with the keys
            calculation_field: name of the price column the moving averages are compared with
            indicator_return_type: 'boolean' adds only the indicator fields, 'percentage' adds
                only the percentage differences, any other value adds both

    Returns: dataframe that includes, per moving-average column,
        above_<column>_ind: 1 when the price is above the moving average, 0 otherwise and NaN
            while the moving average is not yet available
        <column>_pct_diff: moving average divided by the price minus 1 (NaN while the moving
            average is not yet available)
    '''

    target_columns = [column for column in dataframe.columns
                      if ('sma' in column or 'ema' in column)
                      and not column.endswith(('_ind', '_pct_diff'))]

    if not target_columns:
        return dataframe

    return_type = parameters['indicator_return_type']
    add_indicator = return_type != 'percentage'
    add_pct_diff = return_type != 'boolean'

    price = dataframe[parameters['calculation_field']]

    for column in target_columns:

        moving_average = dataframe[column]

        if add_indicator:
            dataframe['above_' + column + '_ind'] = np.where(
                moving_average.isna(), np.nan, np.where(moving_average < price, 1, 0))

        if add_pct_diff:
            dataframe[column + '_pct_diff'] = np.where(
                moving_average.isna(), np.nan, moving_average / price - 1)

    return dataframe


def create_bollinger_bands(dataframe: pd.DataFrame, global_parameters:dict, function_parameters: dict) -> pd.DataFrame:

    #TODO: evolve model to include functionality to include multiple sets of bollinger bands

    '''function that returns bollinger bands for each equity in the datasets sent to the model

    The bands are the chosen moving average plus/minus ``number_of_std`` times the rolling
    standard deviation of the same window. Both inputs must already exist as columns named
    ``<moving_average_used>_<calculation_field>_<sma|ema>`` and
    ``<moving_average_used>_<calculation_field>_std``.

    Args:
        dataframe: pandas dataframe containing the equities for which the bollinger bands are
            calculated; it is modified in place and returned
        global_parameters: dictionary with the key
            calculation_field: field used to calculate all features
        function_parameters: dictionary with the keys
            moving_average_used: window length of the moving average / std columns to use
            number_of_std: number of standard deviations from the mean for the upper and lower bands
            use_sma: True to use the '_sma' column, False to use the '_ema' column; with any
                other value the dataframe is returned unchanged
            return_top_distance: (optional) whether to add bol_pct_from_top, price / upper band - 1
            return_bottom_distance: (optional) whether to add bol_pct_from_bottom, price / lower band - 1
            return_gap: (optional) whether to add bol_range (upper - lower band) and
                bol_range_pct (bol_range relative to the price)

    Returns: dataframe with upper_bollinger_band, lower_bollinger_band and the requested extras

    Raises:
        AssertionError: if the required std or moving-average column is missing from ``dataframe``
    '''

    calculation_field = global_parameters['calculation_field']
    column_prefix = str(function_parameters['moving_average_used']) + '_' + calculation_field + '_'
    std_column = column_prefix + 'std'

    # validate against the frame that was passed in, not a notebook-level variable
    assert std_column in dataframe.columns, \
    'please ensure the moving average number of days is in the days parameter'

    if function_parameters['use_sma'] == True:
        average_column = column_prefix + 'sma'
        assert average_column in dataframe.columns, \
            'Please ensure the moving average calculated is SMA'

    elif function_parameters['use_sma'] == False:
        average_column = column_prefix + 'ema'
        assert average_column in dataframe.columns, \
            'Please ensure the moving average calculated is EMA'

    else:
        return dataframe

    band_width = function_parameters['number_of_std'] * dataframe[std_column]
    price = dataframe[calculation_field]

    dataframe['upper_bollinger_band'] = dataframe[average_column] + band_width
    dataframe['lower_bollinger_band'] = dataframe[average_column] - band_width

    if function_parameters.get('return_top_distance', False):
        dataframe['bol_pct_from_top'] = price / dataframe['upper_bollinger_band'] -1

    if function_parameters.get('return_bottom_distance', False):
        dataframe['bol_pct_from_bottom'] = price / dataframe['lower_bollinger_band'] -1

    if function_parameters.get('return_gap', False):
        dataframe['bol_range'] = dataframe['upper_bollinger_band'] - dataframe['lower_bollinger_band']
        # width of the band relative to the price (upper minus lower, not upper minus upper)
        dataframe['bol_range_pct'] = dataframe['bol_range'] / price

    return dataframe



def calculate_cumulative_days_above(dataframe: pd.DataFrame, parameters: dict) -> pd.DataFrame:

    '''Function that calculates the cumulative days spent above a given moving average(s)
    CAUTION: Must contain indicator field for each respective moving average -- requires running in the pipeline to remove from pipeline, recalculate features separately

    For every indicator column (name containing 'close_ema_ind' or 'close_sma_ind') the result
    counts the consecutive rows, within one stock, for which the indicator has been 1. The
    count restarts at 0 whenever the indicator is 0 or missing, and restarts when the stock
    changes. Columns created by an earlier call (prefix 'cum_days_above_') are not counted again.

    Args:
        dataframe: dataframe containing the indicator fields across different equity tickers;
            it is not modified
        parameters: dictionary with the keys
            stock_field: name of the column identifying the stock/ticker
            date_field: name of the column used to order the rows of each stock

    Returns: new pandas dataframe, sorted by stock and date with a fresh 0..n-1 index, containing
        one extra column ``cum_days_above_<indicator column>`` per indicator field
    '''

    stock_field = parameters['stock_field']

    fields_to_calc = [column for column in dataframe.columns
                      if ('close_ema_ind' in column or 'close_sma_ind' in column)
                      and not column.startswith('cum_days_above_')]

    # sort and renumber the rows up front: the run ids and the running totals below are then
    # computed on exactly the index they are assigned back to, so no value can land on the wrong row
    dataframe = dataframe.sort_values(by=[stock_field, parameters['date_field']]).reset_index(drop=True)

    # True on the first row of every stock
    stock_changed = dataframe[stock_field] != dataframe[stock_field].shift()

    for field in fields_to_calc:

        indicator = dataframe[field].fillna(0) # missing indicator counts as "not above"

        # a new run starts whenever the stock or the indicator value changes
        run_id = (stock_changed | (indicator != indicator.shift())).cumsum()

        dataframe['cum_days_above_' + field] = indicator.groupby(run_id).cumsum()

    return dataframe





  

In [23]:
# Add the rolling features to the raw data, one set of columns per window length below.

day_ranges = [7, 14, 21]

# exponential=False computes simple moving averages; note that calculate_rolling_means
# still names the resulting columns '<days>_close_ema'. The returned frame is sorted by
# ticker, then date.
df = calculate_rolling_means(dataframe = df, 
                            stock_field = 'ticker', 
                            date_field= 'date',
                            calculation_field= 'close',
                            day_ranges = day_ranges,
                            exponential= False)

# NOTE(review): called like this, the rolling std runs over the whole frame without
# separating tickers, so the first rows of each ticker include the previous ticker's closes.
df = calculate_rolling_std(dataframe= df, field = 'close', day_ranges = day_ranges)

In [24]:
# Preview the frame with the rolling columns; the first rows of a ticker are NaN until a full window is available.
df.head()


Unnamed: 0,date,high,low,open,close,volume,adj_close,ticker,7_close_ema,14_close_ema,21_close_ema,7_close_std,14_close_std,21_close_std
1848,2019-01-02,39.712502,38.557499,38.7225,39.48,148158800.0,38.168354,AAPL,,,,,,
1849,2019-01-03,36.43,35.5,35.994999,35.547501,365248800.0,34.366493,AAPL,,,,,,
1850,2019-01-04,37.137501,35.950001,36.1325,37.064999,234428400.0,35.833588,AAPL,,,,,,
1851,2019-01-07,37.2075,36.474998,37.174999,36.982498,219111200.0,35.753819,AAPL,,,,,,
1852,2019-01-08,37.955002,37.130001,37.389999,37.6875,164101200.0,36.435398,AAPL,,,,,,


In [168]:
# Settings read by create_bollinger_bands, calculate_cumulative_days_above and create_target_classifier.
parameters = {
    "moving_average_used" : 21,
    "date_field": 'date',
    "stock_field" : 'ticker',
    "calculation_field" : 'close', # duplicated in global_parameters below -- keep the two values in sync
    "number_of_std" : 2,
    "use_sma": False, 
    "return_top_distance" : True,
    "return_bottom_distance" : True,
    "return_gap" : True,
    "prediction_horizon" : 20
}

# Settings shared by the feature functions; an indicator_return_type other than
# 'boolean' or 'percentage' makes create_above_below_indicator_fields add both kinds of field.
global_parameters = {
    "calculation_field" : 'close',
    "indicator_return_type": 'boolean_and_percentage'
}

In [26]:
# Add the Bollinger band columns (requires the 21-day moving average and std columns created above).
df = create_bollinger_bands(dataframe = df, global_parameters = global_parameters, function_parameters = parameters)


In [27]:
# Add the above/below indicator and percentage-difference fields for every moving-average column.
df = create_above_below_indicator_fields(dataframe = df, parameters = global_parameters)

In [31]:
# Subset for spot-checking the 7-day indicator against the close price and its moving average.
test = df[['date', 'ticker', 'close', '7_close_ema', 'above_7_close_ema_ind', ]]

Unnamed: 0,date,ticker,close,7_close_ema,above_7_close_ema_ind
1848,2019-01-02,AAPL,39.480000,,
1849,2019-01-03,AAPL,35.547501,,
1850,2019-01-04,AAPL,37.064999,,
1851,2019-01-07,AAPL,36.982498,,
1852,2019-01-08,AAPL,37.687500,,
...,...,...,...,...,...
1843,2022-08-25,XLF,34.759998,34.784285,0.0
1844,2022-08-26,XLF,33.720001,34.511428,0.0
1845,2022-08-29,XLF,33.480000,34.198571,0.0
1846,2022-08-30,XLF,33.299999,33.961428,0.0


In [158]:
# Try out the cumulative-days-above logic; the result is kept in `test` so `df` stays unchanged.
test= calculate_cumulative_days_above(dataframe = df, parameters = parameters )


In [176]:
# generate the target feature for classification:

def create_target_classifier(dataframe: pd.DataFrame, parameters: dict) -> pd.DataFrame:

    '''Function that creates the target feature for the predictive model(s)

    Args:
        dataframe: main dataset containing the outputs of the feature engineering pipeline;
            it is not modified
        parameters: dictionary with the keys
            calculation_field: field from which the target feature is to be generated
            stock_field: field containing the stock/ticker symbol(s)
            date_field: field used to order the rows of each stock
            prediction_horizon: number of rows ahead, within the same stock, from which to
                take the future value (e.g., 20 days out)

    Returns: new dataframe, sorted by stock and date with a fresh index, containing
        target_<horizon>_days_ahead: value of the calculation field ``horizon`` rows later
        target_<horizon>_days_ahead_ind: 1 when that future value is above the current value,
            0 when it is not, and NaN when either value is missing. The last ``horizon``
            rows of every stock have no future value yet, so their label is NaN rather than
            0; drop those rows before training.
    '''

    stock_field = parameters['stock_field']
    calculation_field = parameters['calculation_field']
    prediction_horizon = parameters['prediction_horizon']
    target_column = 'target_'+ str(prediction_horizon)+"_days_ahead"

    # always start by sorting and resetting the index:
    dataframe = dataframe.sort_values(by =[stock_field, parameters['date_field'] ]).reset_index(drop = True)

    current_value = dataframe[calculation_field]
    # shifting inside the groupby keeps one stock from reading another stock's prices
    future_value = dataframe.groupby(by = stock_field)[calculation_field].shift(-prediction_horizon)

    dataframe[target_column] = future_value

    # create boolean for classification; unknown outcomes stay missing instead of becoming 0
    outcome_unknown = current_value.isna() | future_value.isna()
    dataframe[target_column + "_ind"] = np.where(outcome_unknown, np.nan,
                                                 np.where(current_value < future_value, 1 , 0 ))

    return dataframe
    
    



    



In [177]:
# Build the classification target (close price 20 rows ahead per ticker, per `parameters`).
test = create_target_classifier(dataframe = df, parameters= parameters)

In [178]:
# Copy the result to the system clipboard for inspection outside the notebook.
test.to_clipboard()