# Imports

In [61]:
# Data management
import numpy as np
import pandas as pd

# Data fetching
import yfinance as yf

# Spread generation
from sklearn.linear_model import LinearRegression

# Backtesting

# ML

# Utils
from datetime import datetime
from datetime import timedelta
import os
import numba
import time

# Utils

## Data fetch

In [90]:
def get_stonk_data(stonk_list, period_years=3, date_from=None, date_to=None, interval='1d', source='yfinance', data_dir='data', file_prefix='stonks', proxy=False):
    '''
    Downloads historical adjusted-close prices for the selected stonks, saves them
    as a CSV file inside data_dir and returns them.

    -Args:
        stonk_list (string, list): Stonk identifiers, case insensitive. Either an iterable of
            strings or a single string of comma/space separated identifiers
        period_years (float): How many years of data to download until date_to, can be a floating point number
    -Optional:
        date_from (datetime): Start date for stonk data (use instead of period_years)
        date_to (datetime): End date for stonk data. Defaults to the moment of the call
        interval (string): Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
        source (string): Where to source data from. Valid sources: yfinance
        data_dir (string): Folder name where to output downloaded data (created if missing)
        file_prefix (string): Prefix of CSV file containing downloaded data inside data_dir
        proxy (boolean): Currently unused; kept for interface compatibility

    -Returns:
        stonks (Pandas Dataframe): Pandas Dataframe containing requested ticker prices

    -Raises:
        ValueError: If source is not supported or no price data was returned
    '''

    if source.lower() != 'yfinance':
        raise ValueError('Unsupported data source')

    # Resolve the end date at call time; a datetime.now() default in the signature
    # would be frozen at the moment the function was defined
    if date_to is None:
        date_to = datetime.now()

    if date_from is None:
        date_from = date_to - timedelta(days=int(365 * period_years))

    # list('googl') would split a single string into characters, so handle strings separately
    if isinstance(stonk_list, str):
        tickers = stonk_list.replace(',', ' ').split()
    else:
        tickers = list(stonk_list)

    stonks = yf.download(tickers, start=date_from, end=date_to, interval=interval, group_by='column', threads=True, rounding=True)['Adj Close']

    # Drop dates on which no ticker has a price (avoid in-place edits on a column slice)
    stonks = stonks.dropna(axis=0, how='all')

    if stonks.empty:
        raise ValueError('No price data returned for the requested tickers and date range')

    # The file name records the date range that was actually downloaded
    from_date_string = stonks.index[0].strftime('%Y-%m-%d')
    to_date_string = stonks.index[-1].strftime('%Y-%m-%d')

    filename = '{prefix}_{from_date}_to_{to_date}.csv'.format(prefix=file_prefix, from_date=from_date_string, to_date=to_date_string)
    file_path = os.path.join(data_dir, filename)

    # to_csv does not create missing folders
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)

    stonks.to_csv(path_or_buf=file_path, header=True, index=True, na_rep='NaN')

    return stonks

In [142]:
# stonks = get_stonk_data(["googl", "tsla", "ffs"], period_years=0.01)

In [63]:
# Load the ticker list; the 'ticker' column of this frame is what gets downloaded below
stonk_list = pd.read_csv('data/stonk_list.csv')

In [91]:
# Download three years of price history for every listed ticker (also written to CSV by get_stonk_data)
df = get_stonk_data(stonk_list['ticker'], period_years=3)

[*********************100%***********************]  2283 of 2283 completed

12 Failed downloads:
- WFC PRN: No data found, symbol may be delisted
- SNX.VI: No data found, symbol may be delisted
- ET-PE: No data found for this date range, symbol may be delisted
- FTAI-PA: No data found for this date range, symbol may be delisted
- ET-PD: No data found for this date range, symbol may be delisted
- AZEK: Error occurred while retrieving timeseries from Redis, keys: [RedisKey [key=AZEK, cluster=finance]]
- ALL-PB: No data found for this date range, symbol may be delisted
- FHN PRA: No data found, symbol may be delisted
- ET-PC: No data found for this date range, symbol may be delisted
- WCC-PA: No data found for this date range, symbol may be delisted
- RXN.VI: No data found, symbol may be delisted
- NRZ-PD: No data found for this date range, symbol may be delisted


In [3]:
# help(yf.download)

In [2]:
# help(yf.Ticker)

## Stock list preprocessing

In [9]:
def preprocess_stock_list(raw_data_path='data/raw_stock_list.xls', output_path='data/stonk_list.csv'):
    '''
    Parses a raw excel file from CapitalIQ containing ticker names and their subindustries, validates
    unusual ticker names with Yahoo Finance, saving the processed data in CSV format.

        Parameters:
            Optional:
                raw_data_path (string):
                    Path to the raw excel file.
                output_path (string):
                    Path where to save the parsed data (columns: ticker, subindustry).

        Returns:
            Nothing
    '''

    df = pd.read_excel(io=raw_data_path)

    # Drop NA rows
    df = df.dropna(axis=0)

    # Reset index and drop the first row
    df = df.reset_index(drop=True)
    df = df.drop(index=0)

    # Drop unwanted columns
    df = df.drop(columns=df.columns[[1, 2, 3, 4, 5, 7, 8, 9]])

    # Rename remaining columns
    df.columns = ['ticker', 'subindustry']

    # NOTE: regex=True is passed explicitly wherever the pattern is a regular expression.
    # Since pandas 2.0 str.replace treats patterns as literal text by default, which would
    # silently leave the data untouched.

    # Remove the '(Primary)' tag from subindustries
    df['subindustry'] = df['subindustry'].str.replace(r' \(Primary\)', '', regex=True)

    # Remove everything until (and including) the colon for tickers
    df['ticker'] = df['ticker'].str.replace(r'(.*:)', '', regex=True)

    df['ticker'] = df['ticker'].str.replace(' WI', '.VI', regex=False)
    df['ticker'] = df['ticker'].str.replace(r'\.WI', '.VI', regex=True)

    # Replace the ticker endings for a Yahoo finance supported format
    df['ticker'] = df['ticker'].str.replace(r'\.PR', '-P', regex=True)

    # Take all remaining tickers that have a dot followed by a single letter
    has_dot = df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]')
    dotted = df[has_dot]

    # Replace the dots with dashes
    dashed = dotted.copy()
    dashed['ticker'] = dashed['ticker'].str.replace('.', '-', regex=False)

    # Remove the dots
    undotted = dotted.copy()
    undotted['ticker'] = undotted['ticker'].str.replace('.', '', regex=False)

    # Combine all variants together
    all_variants = pd.concat([dotted, dashed, undotted])

    if all_variants.empty:
        # Nothing to validate; avoid calling Yahoo finance with an empty ticker list
        valid_tickers = all_variants
    else:
        # Run all of these through Yahoo finance, get last day's price.
        # '1mo' is the one-month period; '1m' is a (one minute) interval, not a period.
        stonks = yf.download(list(all_variants['ticker'].astype('string').values), period='1mo', interval='1d', group_by='column')

        # Drop all NA tickers (that failed to download)
        valid_tickers = stonks['Adj Close'].iloc[-1].dropna(axis=0).to_frame().reset_index()

        # Rename columns
        valid_tickers.columns = ['ticker', 'price']

        # Add subindustries to the remaining valid tickers
        valid_tickers = valid_tickers.join(all_variants.set_index('ticker'), on='ticker')

        # Drop the price column, it was only needed to detect valid tickers
        valid_tickers = valid_tickers.drop(columns='price')

    # Remove all tickers that have a dot from main dataframe
    df = df[~has_dot]

    # Add the validated tickers back
    df = pd.concat([df, valid_tickers], axis=0, ignore_index=True)

    # Make the subindustry strings more code friendly
    df['subindustry'] = df['subindustry'].str.replace(' ', '_', regex=False)
    df['subindustry'] = df['subindustry'].str.lower()
    df['subindustry'] = df['subindustry'].str.replace(',', '', regex=False)

    df.to_csv(path_or_buf=output_path, header=True, index=False)
    

In [9]:
# Lift the row limit so DataFrames are displayed in full instead of being truncated
pd.set_option('display.max_rows', None)

## Linear regression residuals

In [126]:
def get_residuals(X, Y):
    '''
    Vectorized calculation of residuals from multiple univariate linear regressions
    (with intercept), one independent regression per row pair of X and Y.

        Args:
        - X (array-like of shape (n_pairs, d_time)): X variable for LR
        - Y (array-like of shape (n_pairs, d_time)): Y variable for LR
        Returns:
        - residuals (numpy array of shape (n_pairs, d_time)): matrix of resulting residuals between vectorized pairs of X and Y,
          computed against the rounded predictions
        - Y_hat (numpy array of shape (n_pairs, d_time)): predictions of Y using X, rounded to 2 decimals
        Raises:
        - numpy.linalg.LinAlgError: if a regression is singular (e.g. a constant row in X)
    '''
    # Accept lists and other array-likes, not only numpy arrays
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Stack 2D matrices into 3D matrices
    X = X.reshape(np.shape(X)[0], np.shape(X)[1], -1)
    Y = Y.reshape(np.shape(Y)[0], np.shape(Y)[1], -1)

    # Add bias
    Z = np.concatenate([X, np.ones((np.shape(X)[0], np.shape(X)[1], 1))], axis=2)

    # Solve the normal equations (Z^T Z) W = Z^T Y for every pair at once.
    # Solving the system directly is more stable and cheaper than forming the inverse.
    Z_t = Z.transpose(0, 2, 1)
    W = np.linalg.solve(np.matmul(Z_t, Z), np.matmul(Z_t, Y))

    # Predictions and residuals
    Y_hat = np.matmul(Z, W).round(2)
    residuals = (Y - Y_hat)

    return (residuals[:, :, 0], Y_hat[:, :, 0])

In [None]:
def get_slow_residuals(X, Y, n_jobs=None):
    '''
    Reference (loop based) calculation of linear regression residuals, fitting one
    scikit-learn LinearRegression with intercept per row pair of X and Y.

        Args:
        - X (numpy array of shape (n_pairs, d_time)): X variable for LR
        - Y (numpy array of shape (n_pairs, d_time)): Y variable for LR
        - n_jobs (int or None): forwarded to LinearRegression
        Returns:
        - residuals (numpy array of shape (n_pairs, d_time)): Y minus the rounded predictions
        - predictions (numpy array of shape (n_pairs, d_time)): predictions of Y using X, rounded to 2 decimals
    '''
    # Give every pair a trailing feature/target axis, as expected by scikit-learn
    features = X.reshape((X.shape[0], X.shape[1], -1))
    targets = Y.reshape((Y.shape[0], Y.shape[1], -1))

    model = LinearRegression(n_jobs=n_jobs, fit_intercept=True)

    fitted = []
    errors = []
    for pair_idx in range(features.shape[0]):
        x_pair = features[pair_idx]
        y_pair = targets[pair_idx]

        # The same estimator is refitted for every pair
        model.fit(x_pair, y_pair)
        y_hat = model.predict(x_pair).round(2)

        fitted.append(y_hat)
        errors.append(y_pair - y_hat)

    # Drop the trailing axis again
    return (np.asarray(errors)[:, :, 0], np.asarray(fitted)[:, :, 0])

In [56]:
# Inspect the dimensions of X (presumably (n_pairs, d_time), as get_residuals documents)
np.shape(X)

(1815, 757)

In [122]:
def read_stonk_data(date_from, date_to, data_dir='data', data_prefix='stonks'):
    '''
    Reads a price CSV named '{data_prefix}_{date_from}_to_{date_to}.csv' from data_dir,
    removes sparse tickers and dates, and forward-fills the remaining gaps.

        Args:
        - date_from (string): start date part of the file name
        - date_to (string): end date part of the file name
        - data_dir (string): folder containing the CSV file
        - data_prefix (string): prefix of the CSV file name
        Returns:
        - numpy array of shape (n_tickers, n_dates) and dtype float64. Gaps at the very
          start of a ticker's history have no earlier price and stay NaN.
    '''
    path = os.path.join(data_dir, '{}_{}_to_{}.csv'.format(data_prefix, date_from, date_to))
    stonks = pd.read_csv(path, index_col=0)

    # Keep tickers (columns) that have prices on at least 95% of the dates.
    # Only thresh is passed: combining how= and thresh= is rejected by recent pandas.
    min_dates = int(np.ceil(stonks.shape[0] * 0.95))
    stonks = stonks.dropna(axis=1, thresh=min_dates)

    # Keep dates (rows) on which at least 95% of the remaining tickers have a price.
    # The threshold for rows has to be based on the number of columns.
    min_tickers = int(np.ceil(stonks.shape[1] * 0.95))
    stonks = stonks.dropna(axis=0, thresh=min_tickers)

    # Fill gaps with the previous price of the same ticker (along time, not across tickers)
    stonks = stonks.ffill(axis=0)

    return stonks.to_numpy().T.astype(np.float64)

In [123]:
# Load the cleaned price matrix from data/stonks_2019-02-25_to_2022-02-24.csv
stonks = read_stonk_data('2019-02-25', '2022-02-24')

In [127]:
# Build test pairs: row i of X is regressed against row i of Y, where Y is the
# same matrix with its row order reversed
X = stonks
Y = np.flipud(stonks)

In [128]:
# Benchmark the vectorized implementation against the scikit-learn loop.
# perf_counter is a monotonic, high resolution clock meant for measuring durations;
# time.time() can jump when the system clock is adjusted.
t1_fast = time.perf_counter()
res, preds = get_residuals(X, Y)
t2_fast = time.perf_counter()

t1_slow = time.perf_counter()
res_slow, preds_slow = get_slow_residuals(X, Y)
t2_slow = time.perf_counter()

print("Time slow: " + str(t2_slow-t1_slow))
print("Time fast: " + str(t2_fast-t1_fast))

Time slow: 1.8449993133544922
Time fast: 0.2609989643096924


In [134]:
# Check that both implementations return exactly the same residuals, element by element
np.all(np.equal(res, res_slow))

True

### TODO: check which inputs/outputs from X, Y generate mismatching results