# Imports

In [1]:
# Utils
from datetime import datetime
from datetime import timedelta
import os
import sys
import time
from itertools import combinations
from itertools import permutations
from functools import partial

# Data management
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller

# Data fetching
import yfinance as yf

# Spread generation
# from sklearn.linear_model import LinearRegression

  from pandas import Int64Index as NumericIndex


# Utils

## Stonk price data download

### Input ticker names by industry

In [2]:
def get_tickers_by_industry(industries=None, data_dir=None, filename=None):
    '''
    Load the ticker/subindustry CSV and return the rows that belong to the requested subindustries.

    -Args:
        industries (List(string)): subindustries to keep; when None every row of the CSV is returned.
            The list can contain:
            'technology_hardware_and_equipment'
            'software_and_services'
            'media_and_entertainment'
            'retailing'
            'automobiles_and_components'
            'semiconductors_and_semiconductor_equipment'
            'health_care_equipment_and_services'
            'banks'
            'pharmaceuticals_biotechnology_and_life_sciences'
            'food_and_staples_retailing'
            'oil_gas_and_consumable_fuels'
            'food_beverage_and_tobacco'
            'telecommunication_services'
            'consumer_durables_and_apparel'
            'consumer_services'
            'transportation'
            'diversified_financials'
            'utilities'
            'capital_goods'
            'insurance'
            'chemicals'
            'metals_and_mining'
            'commercial_and_professional_services'
            'containers_and_packaging'
            'energy_equipment_and_services'
            'construction_materials'
            'paper_and_forest_products'
        data_dir (string): folder holding the CSV; 'data' when None
        filename (string): name of the CSV file; 'stonk_list.csv' when None

    -Returns:
        tickers (pandas DataFrame): the selected CSV rows, indexed by the 'ticker' column
    '''
    if filename is None:
        filename = 'stonk_list.csv'
    if data_dir is None:
        data_dir = 'data'

    listing = pd.read_csv(os.path.join(data_dir, filename))

    # Narrow down to the requested subindustries only when a filter was supplied
    if industries is not None:
        listing = listing[listing['subindustry'].isin(industries)]

    return listing.set_index('ticker')

In [3]:
def download_stonk_prices(stonk_list, period_years=3, date_from=None, date_to=None, interval='1d', source='yfinance', data_dir='data', proxy=False):
    '''
    Download historical adjusted close prices for the selected stonks and save raw and cleaned copies as CSV.

    -Args:
        stonk_list (List(string)): List of stonk identifiers as strings, case insensitive
        period_years (float): How many years of data to download until date_to, can be a floating point number
    -Optional:
        date_from (datetime): Start date for stonk data (use instead of period_years)
        date_to (datetime): End date for stonk data; defaults to the current time
        interval (string): Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
        source (string): Where to source data from. Valid sources: yfinance
        data_dir (string): Folder name where to output downloaded data (created if missing)
        proxy (boolean): Accepted for interface compatibility; not used by this function

    -Returns:
        (stonks, clean_stonks) (tuple of Pandas Dataframes): the raw downloaded prices and the cleaned
        prices (sparse tickers/days dropped, gaps forward filled, no NA values left)

    -Raises:
        ValueError: if source is not a supported data source
    '''
    # Reject unknown sources before doing any date arithmetic or network access
    if source.lower() != 'yfinance':
        raise ValueError('Unsupported data source')

    date_to = datetime.now() if date_to is None else date_to
    date_from = date_to - timedelta(days=int(365 * period_years)) if date_from is None else date_from

    stonks = yf.download(list(stonk_list), start=date_from, end=date_to, interval=interval, group_by='column', threads=True, rounding=True)['Adj Close']

    # Build new frames instead of mutating a slice of the download in place.
    # Sorting by the index does not depend on what the download names its date index.
    stonks = stonks.dropna(axis=0, how='all').sort_index()

    stonks.index = pd.to_datetime(stonks.index).date
    stonks.index.name = 'date'

    # Keep tickers with prices on at least 99% of the days, then days with prices for at least 99% of
    # the remaining tickers. Only `thresh` is passed: pandas does not allow `how` together with `thresh`.
    clean_stonks = stonks.dropna(axis=1, thresh=int(len(stonks.index) * 0.99))
    clean_stonks = clean_stonks.dropna(axis=0, thresh=int(len(clean_stonks.columns) * 0.99))

    # Forward fill each ticker column down the dates
    clean_stonks = clean_stonks.ffill(axis=0)

    # A ticker that still has a gap (nothing before it to fill from) is dropped entirely
    clean_stonks = clean_stonks.dropna(axis=1, how='any')

    # Must be no NA values left
    assert clean_stonks.isna().sum().sum() == 0

    def stonks_to_csv(prices, clean):
        # File name encodes the first and last date present in the frame
        filename = 'stonks_{from_date}_to_{to_date}.csv'.format(from_date=prices.index[0], to_date=prices.index[-1])

        if clean:
            filename = 'clean_' + filename

        prices.to_csv(path_or_buf=os.path.join(data_dir, filename), header=True, index=True, na_rep='NaN')

    # Make sure the output folder exists so a finished download is not lost to a missing directory
    os.makedirs(data_dir, exist_ok=True)

    stonks_to_csv(stonks, clean=False)
    stonks_to_csv(clean_stonks, clean=True)

    return (stonks, clean_stonks)

## Stock price data input

In [4]:
def read_stonk_data(date_from, date_to, clean=True, date_index=False, data_dir=None):
    '''
    Read a previously saved price CSV named '<prefix>_<date_from>_to_<date_to>.csv'.

    -Args:
        date_from, date_to: values substituted into the file name (first and last date of the file)
        clean (boolean): read the 'clean_stonks' file and assert it has no NA values; otherwise read the 'stonks' file
        date_index (boolean): if True return the frame as stored in the CSV (first CSV column as index);
            if False return its transpose
        data_dir (string): folder holding the CSV; 'data' when None

    -Returns:
        stonks (pandas DataFrame): price data in the requested orientation
    '''
    folder = data_dir if data_dir is not None else 'data'

    prefix = 'stonks'
    if clean:
        prefix = 'clean_stonks'

    csv_name = '{}_{}_to_{}.csv'.format(prefix, date_from, date_to)
    prices = pd.read_csv(os.path.join(folder, csv_name), header=0, index_col=0)

    # Cleaned files are expected to be gap free
    if clean:
        assert prices.isna().sum().sum() == 0

    return prices if date_index else prices.T

In [5]:
def get_stonk_data_by_industry(date_from, date_to, clean=True, date_index=False, industries=None, stonk_list_filename=None, data_dir=None):
    '''
    Read the CSV file containing all stonk price data and return the tickers from the selected subindustries.

    -Args:
        date_from, date_to: first and last date of the saved price file (forwarded to read_stonk_data)
        clean (boolean): read the cleaned price file instead of the raw one
        date_index (boolean): if True the result has dates as rows and tickers as columns; otherwise tickers as rows
        industries (List(string)): if not given (or empty), return all tickers; else the list can contain:
            'technology_hardware_and_equipment'
            'software_and_services'
            'media_and_entertainment'
            'retailing'
            'automobiles_and_components'
            'semiconductors_and_semiconductor_equipment'
            'health_care_equipment_and_services'
            'banks'
            'pharmaceuticals_biotechnology_and_life_sciences'
            'food_and_staples_retailing'
            'oil_gas_and_consumable_fuels'
            'food_beverage_and_tobacco'
            'telecommunication_services'
            'consumer_durables_and_apparel'
            'consumer_services'
            'transportation'
            'diversified_financials'
            'utilities'
            'capital_goods'
            'insurance'
            'chemicals'
            'metals_and_mining'
            'commercial_and_professional_services'
            'containers_and_packaging'
            'energy_equipment_and_services'
            'construction_materials'
            'paper_and_forest_products'
        stonk_list_filename (string): ticker/subindustry CSV file name (forwarded to get_tickers_by_industry)
        data_dir (string): folder holding both CSV files

    -Returns:
        stonks (pandas DataFrame): selected tickers' price data
    '''
    all_stonks = read_stonk_data(date_from, date_to, date_index=date_index, data_dir=data_dir, clean=clean)

    if industries is None or not industries:
        return all_stonks

    all_tickers = get_tickers_by_industry(industries=None, data_dir=data_dir, filename=stonk_list_filename)

    if date_index:
        # Tickers are the COLUMNS in this orientation, so joining on the (date) index would match
        # nothing; pick the wanted ticker columns instead.
        wanted = all_tickers.index[all_tickers['subindustry'].isin(industries)]
        return all_stonks.loc[:, all_stonks.columns.isin(wanted)]

    # Tickers are the rows: attach each ticker's subindustry, filter on it, then drop the helper column
    all_stonks = all_stonks.join(all_tickers, how='inner')
    return all_stonks[all_stonks['subindustry'].isin(industries)].drop(columns='subindustry')

In [6]:
# TODO: make combinations with numpy
def combine_stonk_pairs(stonks_prices):
    '''
    Build every unordered pair of tickers and return the two sides of the pairs as aligned frames.

    -Args:
        stonks_prices (pandas DataFrame): one row per ticker; ticker names in the index must be unique
            and there must be fewer than 300 rows

    -Returns:
        (first, second) (tuple of pandas DataFrames): row i of each frame holds the prices of the first
        and second ticker of the i-th combination
    '''
    tickers = stonks_prices.index.unique()

    # All ticker names must be unique
    assert all(tickers == stonks_prices.index)
    # Keep the number of generated pairs manageable
    assert len(stonks_prices) < 300

    pair_names = np.asarray(list(combinations(tickers, 2)))
    first_names, second_names = pair_names[:, 0], pair_names[:, 1]

    return stonks_prices.loc[first_names], stonks_prices.loc[second_names]

## Linear regression residuals

In [7]:
def get_residuals_many(X, Y):
    '''
    Fit one univariate least-squares regression (with intercept) per row and return the residuals.
        Args:
        - X (numpy array of shape (n_pairs, d_time)): regressor values; row i is regressed against row i of Y
        - Y (numpy array of shape (n_pairs, d_time)): response values; row i corresponds to row i of X
        Returns:
        - residuals (numpy array of shape (n_pairs, d_time)): Y minus the fitted values, rounded to 2 decimals
        - betas (numpy array of shape (n_pairs,)): slope coefficient of each regression
    '''
    n_pairs, n_days = np.shape(X)[0], np.shape(X)[1]

    # Give every regression its own (d_time, 1) column matrix
    x_cols = X.reshape(n_pairs, n_days, -1)
    y_cols = Y.reshape(np.shape(Y)[0], np.shape(Y)[1], -1)

    # Design matrix (x, 1): the column of ones carries the intercept
    intercept = np.ones((n_pairs, n_days, 1))
    design = np.concatenate([x_cols, intercept], axis=2)
    design_t = design.transpose(0, 2, 1)

    # Normal equations, batched over the first axis; weights[i] = (beta, intercept)
    gram_inverse = np.linalg.inv(design_t @ design)
    weights = gram_inverse @ (design_t @ y_cols)

    fitted = design @ weights
    residuals = (y_cols - fitted).round(2)

    return (residuals[:, :, 0], weights[:, 0, 0])

In [8]:
def get_rolling_residuals(X, Y, l_reg, l_roll, dt, write_csv=False, data_dir='data'):
    '''
    Calculates rolling window residuals in vectorized form. Returns the result as a frame that repeats each pair for the number of regressions calculated.
    For example, if the inputs are (Pair A, Pair B, Pair C) and 3 windows are calculated per pair, then the returned results will have the form as follows:
    (Pair A, Pair A, Pair A, Pair B, Pair B, Pair B, Pair C, Pair C, Pair C)
    Works best when l_reg and l_roll are integers.
        Args:
        - X, Y (DataFrame of shape (n_pairs, >= l_reg + l_roll)): matrix of LR inputs X, Y; each row containing at least the complete data period for rolling regressions (can be longer); both must have identical columns (dates)
        - l_reg (float): length of each LR to calculate residuals, in years; will be multiplied by the adjusted number of days in a trading year
        - l_roll (float): length of rolling window, in years; will be multiplied by the adjusted number of days in a trading year
        - dt (int): rolling window step size, in trading days; total trading year days will be reduced to be divisible by dt (by not more than the value of dt)
        - write_csv (bool): if True, also write the residuals, betas and window dates to time-stamped CSV files in data_dir
        - data_dir (str): folder used when write_csv is True
        Returns:
        - res (DataFrame with n_pairs * n_windows rows, where n_windows = l_roll_days // dt + 1): index is '<Y ticker>_<X ticker>', first column 'dates' is '<window start>_<window end>', followed by one residual column per regression day (l_reg_days columns)
        - betas (DataFrame with the same index and 'dates' column): one additional column (0) holding the beta coefficient of each regression
    '''

    _DAYS_IN_TRADING_YEAR = 252

    # Adjust days in a year so that the number is divisible by dt
    _DAYS_IN_TRADING_YEAR = _DAYS_IN_TRADING_YEAR - (_DAYS_IN_TRADING_YEAR % dt)
    l_reg_days = int(_DAYS_IN_TRADING_YEAR * l_reg)
    l_roll_days = int(_DAYS_IN_TRADING_YEAR * l_roll)
    total_days = l_reg_days + l_roll_days

    # Number of regressions for each pair: one per dt step across the rolling period, plus the initial window
    n_windows = (l_roll_days // dt) + 1

    # Number of pairs (rows of X)
    n_x = X.shape[0]

    # Take the dates of the last total_days columns, create an empty array for (start, end) dates of each window
    date_index = X.columns[-total_days:]
    date_index_windowed = np.empty(shape=(n_x*n_windows, 2), dtype='O')

    # Repeat each ticker name times n_windows
    X_index = np.repeat(X.index, n_windows)
    Y_index = np.repeat(Y.index, n_windows)

    # X and Y must have the same dates
    assert np.array_equal(X.columns, Y.columns)

    X = X.to_numpy(dtype=np.float32)
    Y = Y.to_numpy(dtype=np.float32)

    # Rolling window length must be divisible by dt
    assert (l_roll_days % dt) == 0

    # There has to be enough days' worth of data in X (and Y) and their shapes must match
    assert X.shape == Y.shape and X.shape[1] >= total_days

    # Keep only the last total_days columns; any columns before them are cut off
    X = X[:, -total_days:]
    Y = Y[:, -total_days:]

    # Create empty arrays that will contain windowed slices of our data
    X_windows = np.empty(shape=(n_x*n_windows, l_reg_days))
    Y_windows = np.empty(shape=(n_x*n_windows, l_reg_days))

    # Take windowed slices and place them into the created empty arrays
    for n in range(n_x):
        for i in range(n_windows):
            # Output row: windows of the same pair are stored contiguously
            n_i = (n*n_windows)+i
            # Window i covers columns [t_i, t_y), i.e. l_reg_days columns starting i*dt columns in
            t_i = i*dt
            t_y = t_i + l_reg_days

            X_windows[n_i] = X[n, t_i:t_y]
            Y_windows[n_i] = Y[n, t_i:t_y]
            date_index_windowed[n_i, 0] = date_index[t_i]
            date_index_windowed[n_i, 1] = date_index[t_y-1]

    # Make sure we've got the windowing dimensions right
    assert X_windows.shape == (n_x*n_windows, l_reg_days) and Y_windows.shape == (n_x*n_windows, l_reg_days)

    # Sanity checks: the last window of the first and of the last pair must end on the last available day
    assert all([
        X[0, -1] == X_windows[n_windows-1, -1],
        Y[0, -1] == Y_windows[n_windows-1, -1],
        X[-1, -1] == X_windows[-1, -1],
        Y[-1, -1] == Y_windows[-1, -1],
    ])

    # Construct ticker pair index column ('<Y ticker>_<X ticker>'); the join requires string ticker names
    pair_index = np.array(pd.DataFrame(np.array([Y_index, X_index])).apply('_'.join, axis=0, raw=True))

    # Construct regression date range index column ('<start>_<end>'); the join requires string column labels
    date_index = np.array(pd.DataFrame(np.array([date_index_windowed[:, 0], date_index_windowed[:, 1]])).apply('_'.join, axis=0, raw=True))

    # Lengths of indexes must match
    assert len(pair_index) == len(date_index)

    # Calculate and return the residuals
    res, betas = get_residuals_many(X_windows, Y_windows)

    res = pd.DataFrame(res, index=pair_index)
    res.insert(0, 'dates', date_index)
    betas = pd.DataFrame(betas, index=pair_index)
    betas.insert(0, 'dates', date_index)

    if write_csv:
        # NOTE: this local name shadows the imported `time` module, inside this function only
        time = datetime.now().time()
        res.to_csv(os.path.join(data_dir, time.strftime('residuals_%H%M%S.csv')), header=False, index=True)
        betas.to_csv(os.path.join(data_dir, time.strftime('betas_%H%M%S.csv')), header=False, index=True)
        pd.Series(date_index).to_csv(os.path.join(data_dir, time.strftime('dates_%H%M%S.csv')), header=False, index=False)

    return res, betas

## ADF testing

In [9]:
def get_adfs(residuals, adf_regression):
    '''
    Return the ADF test p-value of each row of the residuals array.

    -Args:
        residuals (numpy array): one residual series per row
        adf_regression (string): passed to adfuller as its `regression` argument

    -Returns:
        p_values (numpy array): one p-value per row of residuals
    '''
    def row_p_value(series):
        # No autolag (maxlag always used); element [1] of the result is the p-value
        return adfuller(series, regression=adf_regression, autolag=None)[1]

    return np.apply_along_axis(row_p_value, axis=1, arr=residuals)

In [10]:
def get_aggregate_adfs(residuals, betas=None, cutoff=0.1, adf_regression='c', write_csv=False, data_dir='data'):
    '''
    Compute per-regression ADF p-values and, per ticker pair, the share of regressions passing the cutoff.

    -Args:
        residuals (pandas DataFrame): one regression per row, indexed by pair name, with a 'dates' column
            followed by the residual values; rows of the same pair must be contiguous and equally many
        betas (pandas DataFrame): optional, same index as residuals, beta values in column 0;
            regressions with beta <= 0 get 1 added to their p-value
        cutoff (float): a regression passes when its (adjusted) p-value is <= cutoff
        adf_regression (string): forwarded to get_adfs
        write_csv (boolean): also write both results to time-stamped CSV files in data_dir
        data_dir (string): folder used when write_csv is True

    -Returns:
        adfs (pandas DataFrame): pass rate per unique pair, in original pair order
        adfs_raw (pandas DataFrame): (adjusted) p-value per regression, indexed like residuals
    '''
    residual_values = residuals.drop(columns='dates').to_numpy(dtype=np.float32)
    p_values = get_adfs(residual_values, adf_regression=adf_regression).reshape((-1, 1))

    if betas is not None:
        # One beta per regression, in the same order as the residual rows
        assert p_values.shape[0] == betas.shape[0]
        assert np.all(residuals.index == betas.index)

        # Adding 1 pushes regressions with a non-positive beta above the cutoff
        non_positive_beta = betas[0].to_numpy() <= 0
        p_values = p_values + non_positive_beta.reshape((-1, 1))

    # Per-regression values are kept for returning and CSV output
    adfs_raw = pd.DataFrame(p_values.copy(), index=residuals.index)

    # Unique pairs in original order and the number of regressions each of them has
    pair_names = residuals.index.unique()
    n_pairs = len(pair_names)
    windows_per_pair = len(residuals) // n_pairs

    # Group the regressions of each pair along the second axis and average the pass flags
    passed = p_values.reshape((n_pairs, windows_per_pair, 1)) <= cutoff
    pass_rate = passed.mean(axis=1)

    # Probably always true, but just in case
    assert pass_rate.shape[0] == n_pairs

    adfs = pd.DataFrame(pass_rate, index=pair_names)

    if write_csv:
        stamp = datetime.now().time()
        adfs.to_csv(os.path.join(data_dir, stamp.strftime('adfs_%H%M%S.csv')), header=False, index=True)
        adfs_raw.to_csv(os.path.join(data_dir, stamp.strftime('adfs-raw_%H%M%S.csv')), header=False, index=True)

    return adfs, adfs_raw

## Standardized residuals

In [11]:
def get_last_pairs(pairs):
    '''
    Return the last row of each ticker pair from a frame that stores the same number of
    contiguous rows for every pair.

    -Args:
        pairs (pandas DataFrame): rows indexed by pair name, equally many consecutive rows per pair

    -Returns:
        last_pairs (pandas DataFrame): copy of the final row of every pair, in original pair order
    '''
    # Unique pair names, order preserved
    names = pairs.index.unique()

    rows_per_pair = len(pairs) // len(names)

    # Every pair must contribute the same number of rows
    assert rows_per_pair * len(names) == len(pairs)

    # Start at the last row of the first pair and step one pair at a time
    tail_rows = pairs.iloc[rows_per_pair - 1::rows_per_pair].copy()

    # Make sure we got the slices right
    assert np.all(tail_rows.index == names) and np.all(pairs.iloc[-1] == tail_rows.iloc[-1])

    return tail_rows

In [12]:
def get_standardized_residuals(residuals, write_csv=False, data_dir='data'):
    '''
    Standardize (zero mean, unit standard deviation per row) the residuals of the last regression of each pair.

    -Args:
        residuals (pandas DataFrame): rolling residuals indexed by pair name, with a 'dates' column
        write_csv (boolean): also write the result to a time-stamped CSV file in data_dir
        data_dir (string): folder used when write_csv is True

    -Returns:
        std_residuals (pandas DataFrame): one standardized residual row per unique pair
    '''
    # Dates aren't needed anymore, only the latest regression of each pair is used
    latest = get_last_pairs(residuals.drop(columns='dates'))
    pair_names = latest.index

    values = latest.to_numpy(dtype=np.float32)

    # Row-wise standardization
    row_mean = values.mean(axis=1, keepdims=True)
    row_std = values.std(axis=1, keepdims=True)
    standardized = pd.DataFrame((values - row_mean) / row_std, index=pair_names)

    if write_csv:
        stamp = datetime.now().time()
        standardized.to_csv(os.path.join(data_dir, stamp.strftime('std-residuals_%H%M%S.csv')), header=False, index=True)

    return standardized

## Mean residual magnitude

In [13]:
def get_mean_residual_magnitude(std_residuals, dt):
    '''
    For each of the last dt columns take the largest absolute value over all rows, then average those maxima.

    -Args:
        std_residuals (pandas DataFrame): one row per spread, at least dt columns
        dt (int): number of trailing columns (days) to consider

    -Returns:
        mean_magnitude (float): mean of the per-day maximum absolute residual
    '''
    # There must be enough days' worth of data for averaging over dt days
    assert std_residuals.shape[1] >= dt

    recent = std_residuals.to_numpy(dtype=np.float32)[:, -dt:]
    daily_peak = np.abs(recent).max(axis=0)

    return daily_peak.mean()

## Trade returns test

In [14]:
def get_spreads_returns(prices_X, prices_Y, betas_YX, buy_X):
    '''
    Compute the relative profit/loss of each spread trade for every day after entering at the first column.

    -Args:
        prices_X, prices_Y (pandas DataFrames of equal shape): prices of the two legs, one spread per row;
            the first column is the entry price
        betas_YX (pandas DataFrame): indexed by spread name, with a 'dates' column and the beta values
        buy_X (pandas Series of booleans): True where X is bought and Y is shorted, False for the opposite

    -Returns:
        trade_returns (pandas DataFrame): profit/loss relative to the initial trade value, indexed like betas_YX
    '''
    # Spread names are restored on the result at the end
    spread_names = betas_YX.index

    beta = betas_YX.drop(columns='dates').to_numpy()
    long_X = buy_X.values

    # Sanity checks
    assert prices_X.shape == prices_Y.shape and len(long_X) == beta.shape[0]

    x = prices_X.to_numpy()
    y = prices_Y.to_numpy()

    # Entering prices at t=0, kept as column vectors
    x_entry = x[:, [0]].copy()
    y_entry = y[:, [0]].copy()

    # Initial proportional value of each trade, X leg scaled by beta
    invested = (x_entry * beta) + y_entry

    # Price moves of both legs since entry, X leg scaled by beta
    pnl_X = beta * (x - x_entry)
    pnl_Y = y - y_entry

    # The shorted leg earns the negated price move
    short_X = ~long_X
    pnl_X[short_X] = -pnl_X[short_X]
    pnl_Y[long_X] = -pnl_Y[long_X]

    # Combined move of both legs as a fraction of the initial trade value
    return pd.DataFrame((pnl_X + pnl_Y) / invested, index=spread_names)

## Data collection pipeline

In [15]:
def data_collection(stonk_prices, industries, l_reg, l_roll, dt, adf_pass_cutoff=0.1, trade_length_months=3, trading_interval_months=1):
    '''
    Placeholder for the full data collection pipeline; not implemented yet.
    Ignores all of its arguments and returns None.
    '''
    return None

### Data collection pipeline step

In [16]:
def data_collection_step(X, Y, l_reg, l_roll, dt, adf_pval_cutoff, adf_pass_rate_filter, trade_length_months):
    '''
    Run one step of the data collection pipeline: fit rolling regressions on the data before the trade
    date, select spreads by ADF pass rate and measure the returns of trading them afterwards.

    -Args:
        X, Y (pandas DataFrames of equal shape): prices of the two legs, one spread per row, dates as columns;
            the last trade_length_months * 20 columns are the trade period
        l_reg, l_roll, dt: forwarded to get_rolling_residuals
        adf_pval_cutoff (float): p-value cutoff forwarded to get_aggregate_adfs
        adf_pass_rate_filter (float): spreads with an ADF pass rate >= this value are selected
        trade_length_months (int): length of the trade period in 20-day trading months; at least 1

    -Returns:
        output (dict of arrays, one entry per selected spread): 'adf_residuals', 'last_residual', 'ticker_x',
        'ticker_y', 'return_one_month', 'beta', 'residual_mean_max', 'trade_date', 'return_two_month',
        'return_three_month'. Return horizons longer than the trade period are filled with NaN.
    '''
    assert X.shape == Y.shape

    _DAYS_IN_TRADING_MONTH = 20

    trade_length_days = trade_length_months*_DAYS_IN_TRADING_MONTH

    # History used for the regressions; its last column is the trade (entry) date
    X_until_T = X.iloc[:, :-trade_length_days]
    Y_until_T = Y.iloc[:, :-trade_length_days]

    # Trade period, starting at the trade date itself (hence the extra column)
    X_from_T = X.iloc[:, -trade_length_days-1:]
    Y_from_T = Y.iloc[:, -trade_length_days-1:]

    # X and Y dimensions must match
    assert X_from_T.shape == Y_from_T.shape and X_until_T.shape == Y_until_T.shape

    # Check whether enough data was given (the two parts overlap in exactly one column)
    assert X_from_T.shape[1] == trade_length_days+1 and X.shape[1] == X_until_T.shape[1] + X_from_T.shape[1] - 1

    residuals, betas = get_rolling_residuals(X=X_until_T, Y=Y_until_T, l_reg=l_reg, l_roll=l_roll, dt=dt)

    adfs, adfs_raw = get_aggregate_adfs(residuals, betas=betas, cutoff=adf_pval_cutoff)

    std_residuals = get_standardized_residuals(residuals)

    last_betas = get_last_pairs(betas)

    assert np.all(std_residuals.index == last_betas.index)
    assert np.all(adfs.index == std_residuals.index)

    # Select spreads that reach the specified ADF pass rate. The pass rates live in the single
    # column 0 of adfs; comparing that column gives a plain 1-D boolean mask with one entry per spread.
    selected = (adfs[0] >= adf_pass_rate_filter).to_numpy()

    selected_std_residuals = std_residuals.loc[selected]

    # A maximum over zero spreads is undefined (numpy raises), so report NaN when nothing was selected
    if selected.any():
        selected_residuals_max_mean = get_mean_residual_magnitude(selected_std_residuals, dt=trade_length_days)
    else:
        selected_residuals_max_mean = np.nan

    selected_betas = last_betas.loc[selected]

    # Buy X (and short Y) where the last standardized residual is positive
    selected_buys_X = selected_std_residuals.iloc[:, -1] > 0

    selected_X_from_T = X_from_T.loc[selected]
    selected_Y_from_T = Y_from_T.loc[selected]

    selected_trade_returns = get_spreads_returns(prices_X=selected_X_from_T, prices_Y=selected_Y_from_T, betas_YX=selected_betas, buy_X=selected_buys_X)

    n_selected = len(selected_std_residuals)

    def return_after_months(months):
        # Column k of the returns is the return k trading days after entry
        day = months*_DAYS_IN_TRADING_MONTH
        if day <= trade_length_days:
            return selected_trade_returns.iloc[:, day].values
        return np.full(n_selected, np.nan)

    output = {}
    output['adf_residuals'] = adfs.loc[selected, 0].values
    output['last_residual'] = selected_std_residuals.iloc[:, -1].values
    output['ticker_x'] = selected_X_from_T.index
    output['ticker_y'] = selected_Y_from_T.index
    output['return_one_month'] = return_after_months(1)
    output['beta'] = selected_betas[0].values
    output['residual_mean_max'] = np.full(n_selected, selected_residuals_max_mean)
    output['trade_date'] = np.full(n_selected, X_from_T.columns[0])
    output['return_two_month'] = return_after_months(2)
    output['return_three_month'] = return_after_months(3)

    return output

In [None]:
# data_collection_step takes eight arguments; the original positional call omitted adf_pass_rate_filter.
# 0.5 mirrors the 50% ADF pass-rate threshold used by the inline version of this step.
a = data_collection_step(X, Y, l_reg=3, l_roll=1, dt=20, adf_pval_cutoff=0.1, adf_pass_rate_filter=0.5, trade_length_months=3)

In [None]:
# Scratch cell: a bare tuple expression (the two price frames plus five settings); it is only displayed
X, Y, 3, 1, 20, 0.1, 3

In [17]:
# Settings used by the inline pipeline run
l_reg = 3                  # regression length, in years
l_roll = 1                 # rolling period, in years
dt = 20                    # rolling step, in trading days
adf_pass_cutoff = 0.1      # ADF p-value cutoff
trade_length_months = 3    # trade period, in 20-day trading months

In [79]:
# Display the Y prices that precede the trade period (the regression history)
Y_until_T

Unnamed: 0,2017-04-17,2017-04-18,2017-04-19,2017-04-20,2017-04-21,2017-04-24,2017-04-25,2017-04-26,2017-04-27,2017-04-28,...,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-10,2022-01-11,2022-01-12,2022-01-13,2022-01-14
ADBE,129.99,129.81,130.22,131.42,131.52,132.89,133.49,132.79,133.38,133.74,...,564.37,554.00,514.43,514.12,510.70,525.83,529.89,532.37,516.90,520.60
ADP,91.66,91.72,91.85,92.72,92.41,93.71,94.49,94.36,94.63,94.03,...,242.80,243.93,241.85,240.19,237.66,231.73,234.11,233.47,230.78,227.62
ADS,223.10,222.63,225.10,243.78,242.75,244.02,246.02,245.38,243.71,233.50,...,68.43,70.79,68.81,69.14,70.20,71.60,72.08,71.73,74.66,72.16
ADSK,85.99,86.41,87.93,88.41,88.58,90.14,90.32,90.23,90.63,90.07,...,283.72,278.19,264.32,264.11,262.32,262.39,270.63,269.60,260.17,259.10
AKAM,58.82,58.95,59.68,60.20,60.28,61.39,61.80,62.24,59.75,60.94,...,117.51,116.95,115.48,111.88,110.54,112.65,113.42,114.11,112.91,112.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YEXT,13.55,14.01,14.04,13.81,13.44,13.74,13.72,13.81,14.63,14.84,...,10.00,9.67,8.95,9.00,9.03,9.07,9.44,9.33,8.95,8.67
ZEN,28.41,28.62,28.10,28.37,28.33,28.64,28.68,28.40,28.55,28.75,...,103.53,102.25,96.87,98.36,97.92,99.99,100.45,101.95,99.15,100.41
YEXT,13.55,14.01,14.04,13.81,13.44,13.74,13.72,13.81,14.63,14.84,...,10.00,9.67,8.95,9.00,9.03,9.07,9.44,9.33,8.95,8.67
ZEN,28.41,28.62,28.10,28.37,28.33,28.64,28.68,28.40,28.55,28.75,...,103.53,102.25,96.87,98.36,97.92,99.99,100.45,101.95,99.15,100.41


In [80]:
# Display the Y prices of the trade period; the first column is the trade (entry) date
Y_from_T

Unnamed: 0,2022-01-14,2022-01-18,2022-01-19,2022-01-20,2022-01-21,2022-01-24,2022-01-25,2022-01-26,2022-01-27,2022-01-28,...,2022-03-30,2022-03-31,2022-04-01,2022-04-04,2022-04-05,2022-04-06,2022-04-07,2022-04-08,2022-04-11,2022-04-12
ADBE,520.60,513.34,516.58,510.85,499.91,519.66,502.72,500.81,493.05,518.16,...,460.06,455.62,458.19,468.81,458.58,444.33,452.72,445.34,434.44,426.77
ADP,227.62,225.32,223.71,219.22,216.05,217.16,215.80,196.48,195.45,198.28,...,227.76,227.54,233.50,235.18,238.00,237.79,238.50,237.71,230.96,231.06
ADS,72.16,68.86,65.50,65.35,64.43,66.98,66.43,66.27,65.86,67.41,...,57.05,56.15,56.80,55.83,54.89,52.61,51.83,52.70,52.90,53.28
ADSK,259.10,252.03,254.35,252.41,239.19,245.35,233.78,233.28,228.66,239.54,...,221.00,214.35,213.04,218.77,211.55,203.94,204.77,202.25,199.03,196.14
AKAM,112.62,112.08,113.17,113.27,112.61,114.19,112.56,109.94,110.19,112.17,...,121.11,119.39,120.51,120.46,120.41,119.59,119.34,118.36,118.01,117.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YEXT,8.67,8.50,8.47,8.22,7.81,7.94,7.92,7.59,7.41,7.57,...,7.09,6.89,6.83,7.07,6.77,6.50,6.45,6.25,6.21,6.19
ZEN,100.41,96.88,97.57,97.69,94.74,95.77,93.72,92.03,90.52,94.51,...,119.69,120.29,123.81,125.17,124.76,122.58,123.25,122.50,121.99,123.38
YEXT,8.67,8.50,8.47,8.22,7.81,7.94,7.92,7.59,7.41,7.57,...,7.09,6.89,6.83,7.07,6.77,6.50,6.45,6.25,6.21,6.19
ZEN,100.41,96.88,97.57,97.69,94.74,95.77,93.72,92.03,90.52,94.51,...,119.69,120.29,123.81,125.17,124.76,122.58,123.25,122.50,121.99,123.38


In [24]:
# Inline (cell-level) run of the data collection step, leaving every intermediate result
# available as a global for inspection.
assert X.shape == Y.shape

_DAYS_IN_TRADING_MONTH = 20
output = {}

trade_length_days = trade_length_months*_DAYS_IN_TRADING_MONTH

# History used for the regressions; its last column is the trade (entry) date
X_until_T = X.iloc[:, :-trade_length_days]
Y_until_T = Y.iloc[:, :-trade_length_days]

# Trade period, starting at the trade date itself (hence the extra column)
X_from_T = X.iloc[:, -trade_length_days-1:]
Y_from_T = Y.iloc[:, -trade_length_days-1:]

# X and Y dimensions must match
assert X_from_T.shape == Y_from_T.shape and X_until_T.shape == Y_until_T.shape

# Check whether enough data was given (the two parts overlap in exactly one column)
assert X_from_T.shape[1] == trade_length_days+1 and X.shape[1] == X_until_T.shape[1] + X_from_T.shape[1] - 1

residuals, betas = get_rolling_residuals(X=X_until_T, Y=Y_until_T, l_reg=l_reg, l_roll=l_roll, dt=dt)

adfs, adfs_raw = get_aggregate_adfs(residuals, betas=betas, cutoff=adf_pass_cutoff)

std_residuals = get_standardized_residuals(residuals)

last_betas = get_last_pairs(betas)

assert np.all(std_residuals.index == last_betas.index)
assert np.all(adfs.index == std_residuals.index)

# Select spreads with >50% ADF pass rate. The pass rates live in the single column 0 of adfs;
# comparing that column gives a plain 1-D boolean mask with one entry per spread.
adf_pass_rate_filter = 0.5
selected = (adfs[0] > adf_pass_rate_filter).to_numpy()

selected_std_residuals = std_residuals.loc[selected]

# A maximum over zero spreads is undefined (numpy raises), so report NaN when nothing was selected
if selected.any():
    selected_residuals_max_mean = get_mean_residual_magnitude(selected_std_residuals, dt=trade_length_days)
else:
    selected_residuals_max_mean = np.nan

selected_betas = last_betas.loc[selected]

# Buy X (and short Y) where the last standardized residual is positive
selected_buys_X = selected_std_residuals.iloc[:, -1] > 0

# selected_separated_pairs = separate_pair_index(selected_std_residuals.index)
selected_X_from_T = X_from_T.loc[selected]
selected_Y_from_T = Y_from_T.loc[selected]

selected_trade_returns = get_spreads_returns(prices_X=selected_X_from_T, prices_Y=selected_Y_from_T, betas_YX=selected_betas, buy_X=selected_buys_X)

# Column k of the returns is the return k trading days after entry
one_month_day = 1*_DAYS_IN_TRADING_MONTH
two_month_day = 2*_DAYS_IN_TRADING_MONTH
three_month_day = 3*_DAYS_IN_TRADING_MONTH

output['adf_residuals'] = adfs.loc[selected, 0].values
output['last_residual'] = selected_std_residuals.iloc[:, -1].values
output['ticker_x'] = selected_X_from_T.index
output['ticker_y'] = selected_Y_from_T.index
output['return_one_month'] = selected_trade_returns.iloc[:, one_month_day].values
output['beta'] = selected_betas[0].values
output['residual_mean_max'] = np.full(len(selected_std_residuals), selected_residuals_max_mean)
output['trade_date'] = np.full(len(selected_std_residuals), X_from_T.columns[0])

# Horizons longer than the trade period are filled with NaN
if trade_length_months > 1:
    output['return_two_month'] = selected_trade_returns.iloc[:, two_month_day].values
else:
    output['return_two_month'] = np.full(len(selected_std_residuals.index), np.nan)

if trade_length_months > 2:
    output['return_three_month'] = selected_trade_returns.iloc[:, three_month_day].values
else:
    output['return_three_month'] = np.full(len(selected_std_residuals.index), np.nan)


In [64]:
# One row per selected spread, one column per key of the collected output dict
df = pd.DataFrame(output)

In [51]:
# Integer position of the spread with the largest last standardized residual
df['last_residual'].argmax()

542

In [75]:
# Mean two-month return of spreads whose last standardized residual exceeds 3 in absolute value
df[df['last_residual'].abs() > 3]['return_two_month'].mean()

0.010174763797510648

In [76]:
# Mean two-month return of the remaining spreads (absolute last standardized residual below 3)
df[df['last_residual'].abs() < 3]['return_two_month'].mean()

0.002693945187356014

## Other

### Utility functions

In [19]:
def measure_time(func):
    '''
    Call func() once, print the elapsed wall-clock time in whole seconds and return func's result.
    '''
    started = time.time()
    result = func()
    elapsed_seconds = int(time.time() - started)
    print("Time: {}s".format(elapsed_seconds))
    return result

In [20]:
def separate_pair_index(indexes):
    '''
    Split underscore-joined pair labels into their two parts.

    -Args:
        indexes (iterable of string): labels; the text before the first underscore is returned
            under 'y' and the text between the first and second underscore under 'x'

    -Returns:
        dict: {'y': numpy array, 'x': numpy array}, both in the order of the input labels
    '''
    labels = pd.Series(indexes)
    # Split every label once and pick the pieces from the resulting lists
    pieces = labels.map(lambda label: label.split('_'))
    first_parts = pieces.map(lambda parts: parts[0])
    second_parts = pieces.map(lambda parts: parts[1])
    return {'y': np.array(first_parts), 'x': np.array(second_parts)}

## Pipeline example tutorial

#### 0. Import tickers from given custom list

In [19]:
# filename = 'data/selected_spreads.csv'
# ticker_pairs = pd.read_csv(filename, header=0)
# ticker_pairs.set_index('spreads', inplace=True)

# separated_indexes = separate_pair_index(ticker_pairs.index)
# ticker_pairs['x'] = separated_indexes['x']
# ticker_pairs['y'] = separated_indexes['y']

# ticker_pairs = ticker_pairs.loc[~ticker_pairs.index.str.contains(r'CIT|LORL|ENBL|MDP')]

#### 1. Download stock daily prices

In [20]:
# Gets the ticker names of the selected industries (with no argument given, all tickers are returned)
ticker_list = get_tickers_by_industry(['software_and_services'])

In [20]:
# ticker_list = set(list(ticker_pairs['x']) + list(ticker_pairs['y']))

In [40]:
# Specific date - 1st of March 2022 (Y, M, D)
# date_to = datetime(2022, 3, 1)
# Date of today
date_to = datetime.today()
# How many years of data to download (going backwards from date_to). The number of years can be a floating point number
period_years = 5

In [41]:
# Download ticker price data for the tickers selected above (saved to .csv automatically)
df, df_clean = download_stonk_prices(ticker_list.index, period_years=period_years, date_to=date_to)

[*********************100%***********************]  219 of 219 completed


#### 2. Read stock data

In [26]:
# Get stock data of ALL industries (all tickers) - no arguments specified
# stonks = get_stonk_data_by_industry('2018-04-26', '2022-04-12')

In [21]:
# Get stock data from selected industries only
stonks = get_stonk_data_by_industry('2017-04-17', '2022-04-12', industries=['software_and_services'])

In [22]:
X, Y = combine_stonk_pairs(stonks)

#### 3. Select spreads

In [None]:
# X = stonks.loc[ticker_pairs['x']]
# Y = stonks.loc[ticker_pairs['y']]

##### 3.1. (IF PREVIOUS CELL FAILED) Remove ticker pairs with failed downloads and retry previous operation (CTRL + / to uncomment lines)

In [22]:
# failed_list = '|'.join(['SITE', 'INFO', 'RRD', 'HWM', 'OR', 'NGVT', 'VRS', 'TWLO', 'NUAN', 'PI', 'RRR', 'TPB', 'USFD', 'GMS', 'ENIC'])
# ticker_pairs = ticker_pairs[~ticker_pairs.index.str.contains(failed_list)]
# X = stonks.loc[ticker_pairs['x']]
# Y = stonks.loc[ticker_pairs['y']]

In [41]:
# Passed as write_csv to the rolling residual, ADF and standardized residual steps below
_WRITE_RESULTS_TO_CSV = False

#### 4. Calculate rolling residuals

In [42]:
residuals, betas = measure_time(partial(get_rolling_residuals, X=X, Y=Y, l_reg=3, l_roll=1, dt=5, write_csv=_WRITE_RESULTS_TO_CSV))

Time: 37s


#### 5. Calculate ADF test results using the residuals returned above. Betas are optionally given to invalidate ADF test results where betas are negative

In [43]:
adfs, adfs_raw = measure_time(partial(get_aggregate_adfs, residuals, betas=betas, write_csv=_WRITE_RESULTS_TO_CSV))

Time: 615s


#### 6. Calculate the standardized residuals of the regression from the last time period

In [44]:
std_residuals = get_standardized_residuals(residuals, write_csv=_WRITE_RESULTS_TO_CSV)

#### 8. Calculate selected trade returns

In [55]:
# selected_std_residuals = std_residuals[(adfs > 0.5).values]

#### 7. Calculate the mean residual trade making magnitude cutoff over the last *dt* days

In [65]:
# mean_residual_magnitude = get_mean_residual_magnitude(selected_std_residuals, dt=30)
# mean_residual_magnitude

In [74]:
# mean_residual_magnitude

4.7123723

##### 8.1. Select which trades to make based on the last standardized residual

In [67]:
# trade_YX = selected_std_residuals[selected_std_residuals.iloc[:, -1].abs() >= mean_residual_magnitude]

##### 8.2. Get betas for the last regressions and for the selected pairs

In [25]:
# last_betas = get_last_pairs(betas)
# betas_YX = last_betas.loc[trade_YX.index]

##### 8.3. Select long/short stocks

In [27]:
# buy_X = trade_YX.iloc[:, -1].apply(lambda x: x > 0)

##### 8.4. Separate spread pairs

In [28]:
# separated_pairs = separate_pair_index(trade_YX.index)
# tickers_X = separated_pairs['x']
# tickers_Y = separated_pairs['y']

##### 8.5. Calculate returns for the trades

### Slow residual functions (for testing)

In [154]:
# def get_rolling_slow_residuals(X, Y, l_reg, l_roll, dt):
#     _DAYS_IN_TRADING_YEAR = (252) - (252 % dt)
#     l_reg_days = _DAYS_IN_TRADING_YEAR * l_reg
#     l_roll_days = _DAYS_IN_TRADING_YEAR * l_roll
#     total_days = l_reg_days + l_roll_days
#     n_windows = l_roll_days // dt
#     n_x = X.shape[0]
    
#     assert (l_roll_days % dt) == 0
#     assert X.shape[1] >= total_days and Y.shape[1] >= total_days
    
#     X = X[:, -total_days:]
#     Y = Y[:, -total_days:]
    
#     # First window
#     X_windows = np.empty(shape=(n_x*n_windows, l_reg_days))
#     Y_windows = np.empty(shape=(n_x*n_windows, l_reg_days))
    
#     for n in range(n_x):
#         for i in range(n_windows):
#             X_windows = np.concatenate(( X_windows, X[n, i*dt:l_reg_days+(i*dt)] ))
#             Y_windows = np.concatenate(( Y_windows, Y[n, i*dt:l_reg_days+(i*dt)] ))
    
#     assert X_windows.shape == (n_x*n_windows, l_reg_days) and Y_windows.shape == (n_x*n_windows, l_reg_days)
    
#     return get_slow_residuals_many(X_windows, Y_windows)

In [104]:
# def get_slow_residuals_many(X, Y, n_jobs=-1):
#     lr = LinearRegression(n_jobs=n_jobs, fit_intercept=True)
#     X = X.reshape((X.shape[0], X.shape[1], -1))
#     Y = Y.reshape((Y.shape[0], Y.shape[1], -1))
    
#     preds = []
#     res = []
#     betas = []
#     for i in range(X.shape[0]):
#         lr.fit(X[i], Y[i])
#         preds.append(lr.predict(X[i]).round(2))
#         res.append(Y[i]-preds[-1])
#         betas.append(lr.coef_[0][0])
#     return (np.asarray(res)[:,:,0], np.asarray(preds)[:,:,0], np.asarray(betas))

In [28]:
# t1_fast = time.time()
# res, betas, preds  = get_rolling_residuals(X, Y, l_reg=2, l_roll=1, dt=5)
# t2_fast = time.time()

# t1_slow = time.time()
# res_slow, preds_slow = get_rolling_slow_residuals(X, Y, l_reg=2, l_roll=1, dt=5)
# t2_slow = time.time()

# print("Time slow: " + str(t2_slow-t1_slow))
# print("Time fast: " + str(t2_fast-t1_fast))

In [29]:
# t1_fast = time.time()
# res, preds, betas = get_residuals_many(X, Y)
# t2_fast = time.time()

# t1_slow = time.time()
# res_slow, preds_slow, betas_slow = get_slow_residuals_many(X, Y)
# t2_slow = time.time()

# print("Time slow: " + str(t2_slow-t1_slow))
# print("Time fast: " + str(t2_fast-t1_fast))

### Stock list preprocessing

In [153]:
def preprocess_stock_list(raw_data_path='data/raw_stonk_list.xls', output_path='data/stonk_list.csv'):
    '''
    Parses a raw excel file from CapitalIQ containing ticker names and their subindustries, validates
    unusual ticker names with Yahoo Finance, saving the processed data in CSV format.

    Tickers of the form 'ABC.D' are tried in three spellings ('ABC.D', 'ABC-D', 'ABCD'); only the
    spellings for which Yahoo Finance returns a price are kept.

        Parameters:
            Optional:
                raw_data_path (string):
                    Path to the raw excel file.
                output_path (string):
                    Path where to save the parsed data.
                
        Returns:
            Nothing
    '''
    # Matches tickers made of capital letters, a dot and one trailing capital letter
    dotted_pattern = r'[A-Z]*\.[A-Z]'
    
    df = pd.read_excel(io=raw_data_path)
    
    # Drop NA rows
    df.dropna(axis=0, inplace=True)
    
    # Reset index and drop the first row
    df.reset_index(inplace=True, drop=True)
    df.drop(index=0, axis=0, inplace=True)
    
    # Drop unwanted columns (positions 0 and 6 are the ones kept)
    df.drop(columns=df.columns[[1, 2, 3, 4, 5, 7, 8, 9]], inplace=True)
    
    # Rename remaining columns
    df.columns = ['ticker', 'subindustry']
    
    # NOTE: regex=True / regex=False is passed explicitly on every str.replace call below.
    # The default of `regex` differs between pandas versions, and with regex=False the
    # escaped patterns would be searched for literally and silently match nothing.
    
    # Remove the '(Primary)' tag from subindustries
    df['subindustry'] = df['subindustry'].str.replace(r' \(Primary\)', '', regex=True)
    
    # Remove everything until (and including) the last colon for tickers
    df['ticker'] = df['ticker'].str.replace(r'(.*:)', '', regex=True)
    
    df['ticker'] = df['ticker'].str.replace(r' WI', '.VI', regex=True)
    df['ticker'] = df['ticker'].str.replace(r'\.WI', '.VI', regex=True)
    
    # Replace the ticker endings for a Yahoo finance supported format
    df['ticker'] = df['ticker'].str.replace(r'\.PR', '-P', regex=True)
    # df['ticker'] = df['ticker'].str.replace(r' PR', '-P')
    
    # Take all remaining tickers that have a dot
    dotted = df[df['ticker'].str.fullmatch(dotted_pattern)]
    
    # Replace the dots with dashes
    dashed = dotted.copy()
    dashed['ticker'] = dashed['ticker'].str.replace(r'\.', '-', regex=True)
    
    # Remove the dots
    undotted = dotted.copy()
    undotted['ticker'] = undotted['ticker'].str.replace(r'\.', '', regex=True)

    # Combine all variants together
    all_variants = pd.concat([dotted, dashed, undotted])
    
    if all_variants.empty:
        # No dotted tickers: nothing to validate, so skip the Yahoo Finance download
        valid_tickers = all_variants
    else:
        # Run all of these through Yahoo finance, get last day's price
        # NOTE(review): '1m' is listed by yfinance as an interval value; its documented
        # period values spell one month as '1mo' — confirm the intended period
        stonks = yf.download(list(all_variants['ticker'].astype('string').values), period='1m', interval='1d', group_by='column')
        
        # Drop all NA tickers (that failed to download)
        valid_tickers = stonks['Adj Close'].iloc[-1].dropna(axis=0).to_frame().reset_index()
        
        # Rename columns
        valid_tickers.columns = ['ticker', 'price']
        
        # Add subindustries to the remaining valid tickers
        valid_tickers = valid_tickers.join(all_variants.set_index('ticker'), on='ticker')
        
        # Drop the price column
        valid_tickers.drop(columns=valid_tickers.columns[[1]], inplace=True)
    
    # Remove all tickers that have a dot from main dataframe
    df = df[~df['ticker'].str.fullmatch(dotted_pattern)]
    
    # Add the validated tickers back
    df = pd.concat([df, valid_tickers], axis=0, ignore_index=True)
    
    # Make the subindustry strings more code friendly
    df['subindustry'] = df['subindustry'].str.replace(' ', '_', regex=False)
    df['subindustry'] = df['subindustry'].str.lower()
    df['subindustry'] = df['subindustry'].str.replace(',', '', regex=False)
    
    df.to_csv(path_or_buf=output_path, header=True, index=False)