# Imports

In [2]:
# Data management
import numpy as np
import pandas as pd

# Data fetching
import yfinance as yf

# Spread generation
from sklearn.linear_model import LinearRegression

# Backtesting

# ML

# Utils
from datetime import datetime
from datetime import timedelta
import os

# Utils

## Data fetch

In [14]:
def get_stonk_data(stonk_list, period_years=3, date_from=None, date_to=datetime.now(), interval='1d', source='yfinance', data_dir='data', file_prefix='stonks', proxy=False):    
    '''
    Returns historical price data for the selected stonks.

        Parameters:
            Required:
                stonk_list (string, list):
                    List of stonk identifiers as strings, case unsensitive
                period_years (float):
                    How many years of data to download until date_to, can be a floating point number
            Optional:
                date_from (datetime):
                    Start date for stonk data (use instead of period_years)
                date_to (datetime):
                    End date for stonk data
                interval (string):
                    Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
                source (string):
                    Where to source data from. Valid sources: yfinance
                data_dir (string):
                    Folder name where to output downloaded data
                file_prefix (string):
                    Prefix of CSV file containing downloaded data inside data_dir
                proxy (boolean):
                    Whether to use a proxy connection to avoid API limits/blocks
                
        Returns:
            stonk_data (Pandas Dataframe): Pandas Dataframe containing requested data
    '''
    
    if date_from is None:
        date_from = date_to-(timedelta(days=int(365*period_years)))
        
    if source.lower() == 'yfinance':
        stonks = yf.download(list(stonk_list), start=date_from, end=date_to, interval=interval, group_by='column', threads=True, rounding=True)['Adj Close']
        stonks.dropna(axis=0, how='all', thresh=len(stonks.columns) // 2, inplace=True)
        stonks.dropna(axis=1, how='any', inplace=True)
    else:
        raise ValueError('Unsupported data source')
        
    from_date_string = stonks.index[0].strftime('%Y-%m-%d')
    to_date_string = stonks.index[-1].strftime('%Y-%m-%d')
    
    filename = '{prefix}_{from_date}_to_{to_date}.csv'.format(prefix=file_prefix, from_date=from_date_string, to_date=to_date_string)
    file_path = os.path.join(data_dir, filename)
    
    stonks.to_csv(path_or_buf=file_path, header=True, index=True)
    
    return stonks

In [7]:
stonks = yf.download(["googl", "tsla", "ffs"], period='2d', interval='1d', group_by='column', threads=True, rounding=True)

[*********************100%***********************]  3 of 3 completed

1 Failed download:
- FFS: No data found for this date range, symbol may be delisted


In [20]:
stonks = get_stonk_data(["googl", "tsla", "ffs"], period_years=0.01)

[*********************100%***********************]  3 of 3 completed

1 Failed download:
- FFS: No data found for this date range, symbol may be delisted


In [3]:
stonks = pd.read_csv('data/stonks_2022-02-15_to_2022-02-18.csv')

In [7]:
stonks

Unnamed: 0,Date,A,AA,AAL,AAON,AAP,AAPL,AAWW,AAXJ,AB,...,ZIONP,ZIP,ZM,ZNGA,ZNTL,ZS,ZTS,ZUMZ,ZUO,ZWS
0,2022-02-15,,,,,,,,,,...,,,,,,,,,,
1,2022-02-16,135.15,77.84,18.82,59.69,222.39,172.55,79.07,82.76,46.46,...,22.56,21.49,138.51,8.9,51.57,280.87,195.09,45.89,16.34,34.69
2,2022-02-17,130.25,76.47,18.22,58.14,220.41,168.88,77.1,82.01,45.25,...,23.0,20.56,129.38,8.88,49.57,261.94,193.07,43.82,15.22,33.88
3,2022-02-18,132.05,78.2,17.87,58.22,218.8,167.3,78.47,81.33,44.99,...,23.0,19.77,126.96,8.84,47.66,255.22,191.32,43.98,14.62,33.37


In [18]:
stonks.index[0].strftime('%Y-%m-%d')

'2022-02-16'

In [11]:
stonk_list = pd.read_csv('data/stonk_list.csv')

In [12]:
stonk_list[stonk_list['ticker'].str.contains('PFE')]

Unnamed: 0,ticker,subindustry
30,PFE,pharmaceuticals_biotechnology_and_life_sciences
31,PFE.VI,pharmaceuticals_biotechnology_and_life_sciences


In [15]:
df = get_stonk_data(stonk_list['ticker'], period_years=0.01)

[*********************100%***********************]  2283 of 2283 completed

11 Failed downloads:
- SNX.VI: No data found, symbol may be delisted
- FHN PRA: No data found, symbol may be delisted
- NRZ-PD: No data found for this date range, symbol may be delisted
- WFC PRN: No data found, symbol may be delisted
- FTAI-PA: No data found for this date range, symbol may be delisted
- ET-PC: No data found for this date range, symbol may be delisted
- ET-PE: No data found for this date range, symbol may be delisted
- WCC-PA: No data found for this date range, symbol may be delisted
- ET-PD: No data found for this date range, symbol may be delisted
- ALL-PB: No data found for this date range, symbol may be delisted
- RXN.VI: No data found, symbol may be delisted


In [11]:
help(yf.download)

Help on function download in module yfinance.multi:

download(tickers, start=None, end=None, actions=False, threads=True, group_by='column', auto_adjust=False, back_adjust=False, progress=True, period='max', show_errors=True, interval='1d', prepost=False, proxy=None, rounding=False, timeout=None, **kwargs)
    Download yahoo tickers
    :Parameters:
        tickers : str, list
            List of tickers to download
        period : str
            Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
            Either Use period parameter or use start and end
        interval : str
            Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
            Intraday data cannot extend last 60 days
        start: str
            Download start date string (YYYY-MM-DD) or _datetime.
            Default is 1900-01-01
        end: str
            Download end date string (YYYY-MM-DD) or _datetime.
            Default is now
        group_by : str
            Group by 'ticker' o

In [8]:
help(yf.Ticker)

Help on class Ticker in module yfinance.ticker:

class Ticker(yfinance.base.TickerBase)
 |  Ticker(ticker, session=None)
 |  
 |  Method resolution order:
 |      Ticker
 |      yfinance.base.TickerBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  option_chain(self, date=None, proxy=None, tz=None)
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties defined here:
 |  
 |  actions
 |  
 |  analysis
 |  
 |  balance_sheet
 |  
 |  balancesheet
 |  
 |  calendar
 |  
 |  cashflow
 |  
 |  dividends
 |  
 |  earnings
 |  
 |  financials
 |  
 |  info
 |  
 |  institutional_holders
 |  
 |  isin
 |  
 |  major_holders
 |  
 |  mutualfund_holders
 |  
 |  news
 |  
 |  options
 |  
 |  quarterly_balance_sheet
 |  
 |  quarterly_balancesheet
 |  
 |  quarterly_cashflow
 |  
 |  quarterly_earnings
 |  
 |  quarterly_financials
 |  
 |  recommendations
 |  
 |  shares
 |  
 |

## Backtesting

## Stock list preprocessing

In [9]:
def preprocess_stock_list(raw_data_path='data/raw_stock_list.xls', output_path='data/stonk_list.csv'):
    '''
    Parses a raw excel file from CapitalIQ containing ticker names and their subindustries, validates
    unusual ticker names with Yahoo Finance, saving the processed data in CSV format.

        Parameters:
            Required:
                raw_data_path (string):
                    Path to the raw excel file.
                output_path (string):
                    Path where to save the parsed data.
                
        Returns:
            Nothing
    '''
    
    df = pd.read_excel(io=raw_data_path)
    
    # Drop NA rows
    df.dropna(axis=0, inplace=True)
    
    # Reset index and drop the first row
    df.reset_index(inplace=True, drop=True)
    df.drop(index=0, axis=0, inplace=True)
    
    # Drop unwanted columns
    df.drop(columns=df.columns[[1, 2, 3, 4, 5, 7, 8, 9]], inplace=True)
    
    # Rename remaining columns
    df.columns = ['ticker', 'subindustry']
    
    # Remove the '(Primary)' tag from subindustries
    df['subindustry'] = df['subindustry'].str.replace(r' \(Primary\)', '')
    
    # Remove everything until (and including) the semicolon for tickers
    df['ticker'] = df['ticker'].str.replace(r'(.*:)', '')
    
    df['ticker'] = df['ticker'].str.replace(r' WI', '.VI')
    df['ticker'] = df['ticker'].str.replace(r'\.WI', '.VI')
    
    # Replace the ticker endings for a Yahoo finance supported format
    df['ticker'] = df['ticker'].str.replace(r'\.PR', '-P')
    
#     # Drop tickers with two letters after a dot, unavailable in Yahoo finance
#     df = df[~df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]{2}')]
    
    # Take all remaining tickers that have a dot
    dotted = df[df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]')]
    
    # Replace the dots with dashes
    dashed = dotted.copy()
    dashed['ticker'] = dashed['ticker'].str.replace(r'\.', '-')
    
    # Remove the dots
    undotted = dotted.copy()
    undotted['ticker'] = undotted['ticker'].str.replace(r'\.', '')

    # Combine all variantas together
    all_variants = pd.concat([dotted, dashed, undotted])
    
    # Run all of these through Yahoo finance, get last day's price
    stonks = yf.download(list(all_variants['ticker'].astype('string').values), period='1m', interval='1d', group_by='column')
    
    # Drop all NA tickers (that failed to download)
    valid_tickers = stonks['Adj Close'].iloc[-1].dropna(axis=0).to_frame().reset_index()
    
    # Rename columns
    valid_tickers.columns = ['ticker', 'price']
    
    # Add subindustries to the remaining valid tickers
    valid_tickers = valid_tickers.join(all_variants.set_index('ticker'), on='ticker')
    
    # Drop the price column
    valid_tickers.drop(columns=valid_tickers.columns[[1]], inplace=True)
    
    # Remove all tickers that have a dot from main dataframe
    df = df[~df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]')]
    
    # Add the validated tickers back
    df = pd.concat([df, valid_tickers], axis=0, ignore_index=True)
    
    # Make the subindustry strings more code friendly
    df['subindustry'] = df['subindustry'].str.replace(' ', '_')
    df['subindustry'] = df['subindustry'].str.lower()
    df['subindustry'] = df['subindustry'].str.replace(',', '')
    
    df.to_csv(path_or_buf=output_path, header=True, index=False)
    

In [9]:
pd.set_option('display.max_rows', None)

## Linear regression residuals

In [78]:
def get_residuals(X, Y):
    X = X.T
    W = Y.dot(np.linalg.inv(X.dot(X.T)).dot(X))
    return W

In [21]:
stonks = pd.read_csv('data/stonks_2022-02-15_to_2022-02-18.csv', index_col=0)

In [22]:
stonks

Unnamed: 0_level_0,A,AA,AAL,AAON,AAP,AAPL,AAWW,AAXJ,AB,ABBV,...,ZIONP,ZIP,ZM,ZNGA,ZNTL,ZS,ZTS,ZUMZ,ZUO,ZWS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-15,,,,,,,,,,,...,,,,,,,,,,
2022-02-16,135.15,77.84,18.82,59.69,222.39,172.55,79.07,82.76,46.46,145.87,...,22.56,21.49,138.51,8.9,51.57,280.87,195.09,45.89,16.34,34.69
2022-02-17,130.25,76.47,18.22,58.14,220.41,168.88,77.1,82.01,45.25,144.97,...,23.0,20.56,129.38,8.88,49.57,261.94,193.07,43.82,15.22,33.88
2022-02-18,132.05,78.2,17.87,58.22,218.8,167.3,78.47,81.33,44.99,144.03,...,23.0,19.77,126.96,8.84,47.66,255.22,191.32,43.98,14.62,33.37


In [23]:
stonks.dropna(axis=0, how='all', thresh=len(stonks.columns) // 2, inplace=True)
stonks.dropna(axis=1, how='any', inplace=True)

In [24]:
stonks

Unnamed: 0_level_0,A,AA,AAL,AAON,AAP,AAPL,AAWW,AAXJ,AB,ABBV,...,ZIONP,ZIP,ZM,ZNGA,ZNTL,ZS,ZTS,ZUMZ,ZUO,ZWS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-16,135.15,77.84,18.82,59.69,222.39,172.55,79.07,82.76,46.46,145.87,...,22.56,21.49,138.51,8.9,51.57,280.87,195.09,45.89,16.34,34.69
2022-02-17,130.25,76.47,18.22,58.14,220.41,168.88,77.1,82.01,45.25,144.97,...,23.0,20.56,129.38,8.88,49.57,261.94,193.07,43.82,15.22,33.88
2022-02-18,132.05,78.2,17.87,58.22,218.8,167.3,78.47,81.33,44.99,144.03,...,23.0,19.77,126.96,8.84,47.66,255.22,191.32,43.98,14.62,33.37


In [26]:
stonks = stonks.to_numpy()

In [70]:
stonks = stonks.T

In [72]:
stonks

array([[135.15, 130.25, 132.05],
       [ 77.84,  76.47,  78.2 ],
       [ 18.82,  18.22,  17.87],
       ...,
       [ 45.89,  43.82,  43.98],
       [ 16.34,  15.22,  14.62],
       [ 34.69,  33.88,  33.37]])

In [73]:
np.flipud(stonks)

array([[ 34.69,  33.88,  33.37],
       [ 16.34,  15.22,  14.62],
       [ 45.89,  43.82,  43.98],
       ...,
       [ 18.82,  18.22,  17.87],
       [ 77.84,  76.47,  78.2 ],
       [135.15, 130.25, 132.05]])

In [65]:
np.flipud(stonks.T)

array([[ 34.69,  33.88,  33.37],
       [ 16.34,  15.22,  14.62],
       [ 45.89,  43.82,  43.98],
       ...,
       [ 18.82,  18.22,  17.87],
       [ 77.84,  76.47,  78.2 ],
       [135.15, 130.25, 132.05]])

In [79]:
W = get_residuals(stonks, np.flipud(stonks))

In [80]:
W.shape

(2269, 2269)

In [77]:
W

array([[-460746.0820514 , -473761.10586784, -452707.62335391],
       [-455331.9259676 , -468277.52087801, -447492.109354  ],
       [-456253.5090017 , -469315.08765715, -448501.03163826]])

In [83]:
np.flipud(stonks)

array([[ 34.69,  33.88,  33.37],
       [ 16.34,  15.22,  14.62],
       [ 45.89,  43.82,  43.98],
       ...,
       [ 18.82,  18.22,  17.87],
       [ 77.84,  76.47,  78.2 ],
       [135.15, 130.25, 132.05]])

In [50]:
W.shape

(2269, 2269)

In [82]:
Y_hat = np.dot(W, stonks)

In [58]:
Y_hat.shape

(2269, 3)

In [85]:
np.flipud(stonks) - Y_hat

array([[2.42391067e-08, 2.51979415e-08, 2.33458053e-08],
       [7.94301513e-09, 8.58918803e-09, 7.98155853e-09],
       [2.24150369e-08, 2.02749888e-08, 2.58361013e-08],
       ...,
       [1.23763968e-08, 1.29252307e-08, 1.20064136e-08],
       [4.65374654e-08, 4.11888834e-08, 5.22607451e-08],
       [6.88462194e-08, 6.01554007e-08, 8.04257354e-08]])