# Imports

In [2]:
# Data management
import numpy as np
import pandas as pd

# Data fetching
import yfinance as yf

# Spread generation
from sklearn.linear_model import LinearRegression

# ML

# Utils
from datetime import datetime
from datetime import timedelta
import os
import time

# Utils

## Data fetch

In [90]:
def get_stonk_data(stonk_list, period_years=3, date_from=None, date_to=None, interval='1d', source='yfinance', data_dir='data', file_prefix='stonks', proxy=False):
    '''
    Downloads historical adjusted closing prices for the selected stonks, saves them
    as a CSV file inside data_dir and returns them.

    -Args:
        stonk_list (List(string)): List of stonk identifiers as strings, case unsensitive
        period_years (float): How many years of data to download until date_to, can be a floating point number
    -Optional:
        date_from (datetime): Start date for stonk data (use instead of period_years)
        date_to (datetime): End date for stonk data; None (default) means "now", evaluated at call time
        interval (string): Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
        source (string): Where to source data from. Valid sources: yfinance
        data_dir (string): Folder name where to output downloaded data (created if missing)
        file_prefix (string): Prefix of CSV file containing downloaded data inside data_dir
        proxy (boolean): Accepted for interface compatibility; not used by the current implementation

    -Returns:
        stonks (Pandas Dataframe): Pandas Dataframe containing requested ticker prices

    -Raises:
        ValueError: if source is not supported, or if no price data was downloaded
    '''

    # Validate the source first so that no work is done for an unsupported one
    if source.lower() != 'yfinance':
        raise ValueError('Unsupported data source')

    # A datetime.now() default in the signature would be frozen at definition time,
    # so the current time is resolved here, on every call
    if date_to is None:
        date_to = datetime.now()

    if date_from is None:
        date_from = date_to - timedelta(days=int(365 * period_years))

    # auto_adjust=False keeps the separate 'Adj Close' column regardless of the
    # default used by the installed yfinance version
    stonks = yf.download(
        list(stonk_list),
        start=date_from,
        end=date_to,
        interval=interval,
        group_by='column',
        threads=True,
        rounding=True,
        auto_adjust=False,
    )['Adj Close']

    # Drop dates on which no ticker has a price (re-assign instead of mutating a column slice in place)
    stonks = stonks.dropna(axis=0, how='all')

    if stonks.empty:
        raise ValueError('No price data was downloaded for the requested tickers and date range')

    # The file name carries the first and last dates actually present in the data
    from_date_string = stonks.index[0].strftime('%Y-%m-%d')
    to_date_string = stonks.index[-1].strftime('%Y-%m-%d')

    filename = '{prefix}_{from_date}_to_{to_date}.csv'.format(prefix=file_prefix, from_date=from_date_string, to_date=to_date_string)
    file_path = os.path.join(data_dir, filename)

    # Make sure the output folder exists before writing
    os.makedirs(data_dir, exist_ok=True)
    stonks.to_csv(path_or_buf=file_path, header=True, index=True, na_rep='NaN')

    return stonks

In [40]:
def get_tickers_by_industry(industries, data_dir='data', filename='stonk_list.csv'):
    '''
    Load the CSV file that maps tickers to subindustries and return the tickers
    whose subindustry is one of the requested ones.

    -Args:
        industries (List(string)): subindustries to select; the list can contain:
            'technology_hardware_and_equipment'
            'software_and_services'
            'media_and_entertainment'
            'retailing'
            'automobiles_and_components'
            'semiconductors_and_semiconductor_equipment'
            'health_care_equipment_and_services'
            'banks'
            'pharmaceuticals_biotechnology_and_life_sciences'
            'food_and_staples_retailing'
            'oil_gas_and_consumable_fuels'
            'food_beverage_and_tobacco'
            'telecommunication_services'
            'consumer_durables_and_apparel'
            'consumer_services'
            'transportation'
            'diversified_financials'
            'utilities'
            'capital_goods'
            'insurance'
            'chemicals'
            'metals_and_mining'
            'commercial_and_professional_services'
            'containers_and_packaging'
            'energy_equipment_and_services'
            'construction_materials'
            'paper_and_forest_products'
    -Optional:
        data_dir (string): Folder containing the ticker list CSV
        filename (string): Name of the CSV file; it must have 'ticker' and 'subindustry' columns

    -Returns:
        tickers (pandas Series): 'ticker' values of the matching rows, keeping their original row index
    '''
    listing = pd.read_csv(os.path.join(data_dir, filename))

    # Boolean mask marking the rows that belong to any requested subindustry
    wanted = listing['subindustry'].isin(industries)

    return listing.loc[wanted, 'ticker']

In [43]:
# Select the tickers that belong to the two technology-related subindustries
tickers = get_tickers_by_industry(['technology_hardware_and_equipment', 'software_and_services'])

In [44]:
# Display the selected tickers
tickers

0       AAPL
1       MSFT
7          V
28        MA
38      CSCO
        ... 
2139    AVPT
2166    YEXT
2176    ATEN
2184    ADTN
2199    BASE
Name: ticker, Length: 290, dtype: object

In [91]:
# Download three years of daily adjusted closing prices for the selected tickers;
# get_stonk_data also writes them to a CSV file inside its data_dir.
# NOTE(review): the captured output below reports 2283 tickers while `tickers` above
# has 290 entries, so that output appears to come from a different run -- confirm.
df = get_stonk_data(tickers, period_years=3)

[*********************100%***********************]  2283 of 2283 completed

12 Failed downloads:
- WFC PRN: No data found, symbol may be delisted
- SNX.VI: No data found, symbol may be delisted
- ET-PE: No data found for this date range, symbol may be delisted
- FTAI-PA: No data found for this date range, symbol may be delisted
- ET-PD: No data found for this date range, symbol may be delisted
- AZEK: Error occurred while retrieving timeseries from Redis, keys: [RedisKey [key=AZEK, cluster=finance]]
- ALL-PB: No data found for this date range, symbol may be delisted
- FHN PRA: No data found, symbol may be delisted
- ET-PC: No data found for this date range, symbol may be delisted
- WCC-PA: No data found for this date range, symbol may be delisted
- RXN.VI: No data found, symbol may be delisted
- NRZ-PD: No data found for this date range, symbol may be delisted


## Stock list preprocessing

In [10]:
def preprocess_stock_list(raw_data_path='data/raw_stonk_list.xls', output_path='data/stonk_list.csv'):
    '''
    Parses a raw excel file from CapitalIQ containing ticker names and their subindustries, validates
    unusual ticker names with Yahoo Finance, saving the processed data in CSV format.

        Parameters:
            Optional:
                raw_data_path (string):
                    Path to the raw excel file.
                output_path (string):
                    Path where to save the parsed data (columns: ticker, subindustry).

        Returns:
            Nothing
    '''

    df = pd.read_excel(io=raw_data_path)

    # Drop NA rows
    df.dropna(axis=0, inplace=True)

    # Reset index and drop the first row
    df.reset_index(inplace=True, drop=True)
    df.drop(index=0, inplace=True)

    # Drop unwanted columns
    df.drop(columns=df.columns[[1, 2, 3, 4, 5, 7, 8, 9]], inplace=True)

    # Rename remaining columns
    df.columns = ['ticker', 'subindustry']

    # NOTE: every pattern below is a regular expression, so regex=True is passed explicitly;
    # pandas >= 2.0 defaults to regex=False, which would treat the patterns as literal text
    # and silently replace nothing.

    # Remove the '(Primary)' tag from subindustries
    df['subindustry'] = df['subindustry'].str.replace(r' \(Primary\)', '', regex=True)

    # Remove everything until (and including) the last colon for tickers
    df['ticker'] = df['ticker'].str.replace(r'(.*:)', '', regex=True)

    # Rewrite the ' WI' / '.WI' ticker endings as '.VI'
    df['ticker'] = df['ticker'].str.replace(r' WI', '.VI', regex=True)
    df['ticker'] = df['ticker'].str.replace(r'\.WI', '.VI', regex=True)

    # Replace the ticker endings for a Yahoo finance supported format
    df['ticker'] = df['ticker'].str.replace(r'\.PR', '-P', regex=True)
    # df['ticker'] = df['ticker'].str.replace(r' PR', '-P')

    # Take all remaining tickers that have a dot followed by a single letter
    dotted_mask = df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]')
    dotted = df[dotted_mask]

    # Replace the dots with dashes
    dashed = dotted.copy()
    dashed['ticker'] = dashed['ticker'].str.replace(r'\.', '-', regex=True)

    # Remove the dots
    undotted = dotted.copy()
    undotted['ticker'] = undotted['ticker'].str.replace(r'\.', '', regex=True)

    # Combine all variants together
    all_variants = pd.concat([dotted, dashed, undotted])

    # Remove all tickers that have a dot from main dataframe
    df = df[~dotted_mask].copy()

    # Only query Yahoo finance when there is something to validate
    if not all_variants.empty:
        # Run all of these through Yahoo finance, get the most recent daily prices.
        # '1mo' is the one-month period; '1m' is a one-minute *interval* and not a valid period.
        # auto_adjust=False keeps the 'Adj Close' column regardless of the installed yfinance default.
        stonks = yf.download(
            list(all_variants['ticker'].astype('string').values),
            period='1mo',
            interval='1d',
            group_by='column',
            auto_adjust=False,
        )

        # Drop all NA tickers (that failed to download), based on the last available row
        valid_tickers = stonks['Adj Close'].iloc[-1].dropna(axis=0).to_frame().reset_index()

        # Rename columns
        valid_tickers.columns = ['ticker', 'price']

        # Add subindustries to the remaining valid tickers
        valid_tickers = valid_tickers.join(all_variants.set_index('ticker'), on='ticker')

        # Drop the price column
        valid_tickers = valid_tickers.drop(columns='price')

        # Add the validated tickers back
        df = pd.concat([df, valid_tickers], axis=0, ignore_index=True)

    # Make the subindustry strings more code friendly (these are literal replacements)
    df['subindustry'] = df['subindustry'].str.replace(' ', '_', regex=False)
    df['subindustry'] = df['subindustry'].str.lower()
    df['subindustry'] = df['subindustry'].str.replace(',', '', regex=False)

    # Make sure the output folder exists before writing
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(path_or_buf=output_path, header=True, index=False)

## Linear regression residuals

In [3]:
def get_residuals_many(X, Y):
    '''
    Vectorized calculation of residuals from many univariate linear regressions (with intercept).
    Row i of X is regressed against row i of Y, independently of all other rows.
        Args:
        - X (array-like of shape (n_pairs, d_time)): matrix of LR inputs X, each row represents a different regression, corresponding to the same rows in Y
        - Y (array-like of shape (n_pairs, d_time)): matrix of LR inputs Y, each row represents a different regression, corresponding to the same rows in X
        Returns:
        - residuals (numpy array of shape (n_pairs, d_time)): Y minus the rounded predictions Y_hat
        - betas (numpy array of shape (n_pairs,)): beta (slope) coefficient of each linear regression
        - Y_hat (numpy array of shape (n_pairs, d_time)): predictions using X, rounded to 2 decimals
        Raises:
        - ValueError: if X and Y have fewer than 2 dimensions or their first two dimensions differ
        - numpy.linalg.LinAlgError: if a regression is singular (e.g. a row of X is constant)
    '''
    # Accept any array-like input (lists, integer arrays, ...) and work in float64
    X = np.asarray(X, dtype=np.float64)
    Y = np.asarray(Y, dtype=np.float64)

    if X.ndim < 2 or Y.ndim < 2 or X.shape[:2] != Y.shape[:2]:
        raise ValueError('X and Y must both have shape (n_pairs, d_time)')

    # Stack 2D matrices into 3D matrices
    X = X.reshape(X.shape[0], X.shape[1], -1)
    Y = Y.reshape(Y.shape[0], Y.shape[1], -1)

    # Add bias/intercept in the form (Xi, 1)
    Z = np.concatenate([X, np.ones((X.shape[0], X.shape[1], 1))], axis=2)

    # Save the transpose as it's used a couple of times
    Z_t = Z.transpose(0, 2, 1)

    # Normal equations (Z^T Z) W = Z^T Y, solved for every regression at once.
    # np.linalg.solve avoids forming the explicit inverse, which is less accurate and slower.
    # W contains (beta_coef, a_intercept) for each regression
    W = np.linalg.solve(np.matmul(Z_t, Z), np.matmul(Z_t, Y))

    # Predictions and residuals
    # NOTE(review): predictions are rounded to 2 decimals before the residuals are taken,
    # so the residuals carry that rounding -- presumably to match 2-decimal prices; confirm.
    Y_hat = np.matmul(Z, W).round(2)
    residuals = (Y - Y_hat)

    # TODO: Y_hat returned for debugging purposes
    return (residuals[:, :, 0], W[:, 0, 0], Y_hat[:, :, 0])

In [70]:
def get_rolling_residuals(X, Y, l_reg, l_roll, dt):
    '''
    Calculates rolling window residuals in vectorized form. Returns the result as an array that repeats each ticker for the number of regressions calculated.
    For example, if the inputs are (Pair A, Pair B, Pair C) and there are 3 windows per pair, then the returned results will have the form as follows:
    (Pair A, Pair A, Pair A, Pair B, Pair B, Pair B, Pair C, Pair C, Pair C)
    Works best when l_reg and l_roll are integers.

    With days_in_year = 252 - (252 % dt), l_reg_days = int(days_in_year * l_reg) and
    l_roll_days = int(days_in_year * l_roll), each pair produces n_windows = l_roll_days / dt + 1
    windows of l_reg_days columns; window i starts i * dt columns into the last
    (l_reg_days + l_roll_days) columns of the input.
        Args:
        - X (numpy array of shape (n_pairs, >= l_reg_days + l_roll_days)): matrix of LR inputs X, each row representing not less than complete data period for rolling regressions (can be longer)
        - Y (numpy array of shape (n_pairs, >= l_reg_days + l_roll_days)): matrix of LR inputs Y, each row representing not less than complete data period for rolling regressions (can be longer)
        - l_reg (float): length of each LR to calculate residuals, in years; will be multiplied by the adjusted number of days in a trading year
        - l_roll (float): length of rolling window, in years; will be multiplied by the adjusted number of days in a trading year
        - dt (int): rolling window step size, in trading days; total trading year days will be reduced to be divisible by dt (by not more than the value of dt)
        Returns (as produced by get_residuals_many on the windowed data):
        - residuals (numpy array of shape (n_pairs * n_windows, l_reg_days)): matrix of resulting residuals between vectorized pairs of X and Y
        - betas (numpy array of shape (n_pairs * n_windows,)): beta coefficients for each linear regression
        - Y_hat (numpy array of shape (n_pairs * n_windows, l_reg_days)): predictions using X
        Raises:
        - ValueError: if dt is not a positive integer, the window lengths are invalid, the shapes of
          X and Y differ, or there are not enough columns of data
    '''
    _DAYS_IN_TRADING_YEAR = 252

    # Explicit exceptions instead of asserts: asserts are stripped when Python runs with -O
    if isinstance(dt, bool) or not isinstance(dt, (int, np.integer)) or dt <= 0:
        raise ValueError('dt must be a positive integer number of trading days')

    # Adjust days in a year so that the number is divisible by dt
    days_in_year = _DAYS_IN_TRADING_YEAR - (_DAYS_IN_TRADING_YEAR % dt)
    l_reg_days = int(days_in_year * l_reg)
    l_roll_days = int(days_in_year * l_roll)
    total_days = l_reg_days + l_roll_days

    if l_reg_days <= 0 or l_roll_days < 0:
        raise ValueError('l_reg must cover at least one trading day and l_roll must not be negative')

    # Rolling window length must be divisible by dt
    if (l_roll_days % dt) != 0:
        raise ValueError('The rolling window length in days must be divisible by dt')

    # There has to be enough days' worth of data in X (and Y) and their shapes must match
    if X.shape != Y.shape or X.ndim != 2:
        raise ValueError('X and Y must be 2-D arrays of identical shape (n_pairs, n_days)')
    if X.shape[1] < total_days:
        raise ValueError('X and Y need at least {} columns of data, got {}'.format(total_days, X.shape[1]))

    n_windows = (l_roll_days // dt) + 1
    n_x = X.shape[0]

    # Keep only the last total_days columns; any extra columns at the start are cut off
    X = X[:, -total_days:]
    Y = Y[:, -total_days:]

    # Column indices of every window: row i holds i*dt, i*dt + 1, ..., i*dt + l_reg_days - 1.
    # The last window therefore ends exactly on the last kept column (l_roll_days + l_reg_days == total_days).
    window_idx = (np.arange(n_windows) * dt)[:, None] + np.arange(l_reg_days)[None, :]

    # Fancy indexing yields (n_pairs, n_windows, l_reg_days); flattening the first two axes
    # puts window i of pair n at row n * n_windows + i, i.e. all windows of a pair are adjacent
    X_windows = np.asarray(X[:, window_idx], dtype=np.float64).reshape(n_x * n_windows, l_reg_days)
    Y_windows = np.asarray(Y[:, window_idx], dtype=np.float64).reshape(n_x * n_windows, l_reg_days)

    # Calculate and return the residuals
    return get_residuals_many(X_windows, Y_windows)

In [20]:
# def get_rolling_slow_residuals(X, Y, l_reg, l_roll, dt):
#     _DAYS_IN_TRADING_YEAR = (252) - (252 % dt)
#     l_reg_days = _DAYS_IN_TRADING_YEAR * l_reg
#     l_roll_days = _DAYS_IN_TRADING_YEAR * l_roll
#     total_days = l_reg_days + l_roll_days
#     n_windows = l_roll_days // dt
#     n_x = X.shape[0]
    
#     assert (l_roll_days % dt) == 0
#     assert X.shape[1] >= total_days and Y.shape[1] >= total_days
    
#     X = X[:, -total_days:]
#     Y = Y[:, -total_days:]
    
#     # First window
#     X_windows = np.empty(shape=(n_x*n_windows, l_reg_days))
#     Y_windows = np.empty(shape=(n_x*n_windows, l_reg_days))
    
#     for n in range(n_x):
#         for i in range(n_windows):
#             X_windows = np.concatenate(( X_windows, X[n, i*dt:l_reg_days+(i*dt)] ))
#             Y_windows = np.concatenate(( Y_windows, Y[n, i*dt:l_reg_days+(i*dt)] ))
    
#     assert X_windows.shape == (n_x*n_windows, l_reg_days) and Y_windows.shape == (n_x*n_windows, l_reg_days)
    
#     return get_slow_residuals_many(X_windows, Y_windows)

In [51]:
# def get_slow_residuals_many(X, Y, n_jobs=-1):
#     lr = LinearRegression(n_jobs=n_jobs, fit_intercept=True)
#     X = X.reshape((X.shape[0], X.shape[1], -1))
#     Y = Y.reshape((Y.shape[0], Y.shape[1], -1))
    
#     preds = []
#     res = []
#     betas = []
#     for i in range(X.shape[0]):
#         lr.fit(X[i], Y[i])
#         preds.append(lr.predict(X[i]).round(2))
#         res.append(Y[i]-preds[-1])
#         betas.append(lr.coef_[0][0])
#     return (np.asarray(res)[:,:,0], np.asarray(preds)[:,:,0], np.asarray(betas))

In [73]:
def read_stonk_data(date_from, date_to, data_dir='data', data_prefix='stonks'):
    '''
    Reads a price CSV written by get_stonk_data and cleans it so that it contains no missing values.

    Cleaning steps, in order:
        1. drop tickers (columns) that have prices on less than 95% of the dates
        2. drop dates (rows) on which less than 95% of the remaining tickers have a price
        3. forward-fill remaining gaps along the date axis (each ticker from its own previous price)
        4. drop tickers that still have missing values (gaps at the very start cannot be filled)

    -Args:
        date_from (string): first date in the file name, formatted as in the file name (e.g. '2019-02-25')
        date_to (string): last date in the file name, formatted as in the file name
    -Optional:
        data_dir (string): Folder containing the CSV file
        data_prefix (string): Prefix of the CSV file name

    -Returns:
        stonks (Pandas Dataframe): prices indexed by the first CSV column (dates), one column per ticker, no NaNs
    '''
    _MIN_FILLED_FRACTION = 0.95

    path = os.path.join(data_dir, '{}_{}_to_{}.csv'.format(data_prefix, date_from, date_to))
    stonks = pd.read_csv(path, index_col=0)

    # Only `thresh` is passed: pandas >= 2.0 raises TypeError when `how` and `thresh` are combined.
    # A column is judged against the number of rows (dates)...
    min_dates = int(np.ceil(_MIN_FILLED_FRACTION * stonks.shape[0]))
    stonks = stonks.dropna(axis=1, thresh=min_dates)

    # ...and a row against the number of remaining columns (tickers), not the number of rows
    min_tickers = int(np.ceil(_MIN_FILLED_FRACTION * stonks.shape[1]))
    stonks = stonks.dropna(axis=0, thresh=min_tickers)

    # Forward-fill through time (down the rows); filling along axis=1 would copy
    # the price of the neighbouring ticker instead
    stonks = stonks.ffill(axis=0)

    # Whatever is still missing sits at the start of a series and cannot be filled
    stonks = stonks.dropna(axis=1, how='any')

    assert stonks.isna().sum().sum() == 0

    # return stonks.to_numpy().T.astype(np.float64)
    return stonks

In [74]:
# Load and clean the previously downloaded prices from the CSV whose name spans 2019-02-25 to 2022-02-24
stonks = read_stonk_data('2019-02-25', '2022-02-24')

In [75]:
# Display the cleaned price table
stonks

Unnamed: 0_level_0,A,AA,AAL,AAON,AAP,AAPL,AAWW,AAXJ,AB,ABBV,...,ZGNX,ZION,ZIONO,ZIONP,ZNGA,ZS,ZTS,ZUMZ,ZUO,ZWS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-25,77.62,30.71,35.43,41.61,153.59,42.59,55.51,68.46,22.56,68.84,...,48.94,47.49,23.53,22.65,5.03,49.93,92.78,24.68,24.24,27.75
2019-02-26,76.84,30.78,35.77,41.09,158.66,42.62,54.94,68.14,22.27,69.03,...,49.52,46.96,23.51,22.65,5.03,50.00,91.90,24.48,23.64,27.29
2019-02-27,77.69,30.91,35.22,40.90,154.16,42.75,54.54,67.59,22.40,68.09,...,51.75,47.58,23.30,22.44,5.15,50.72,92.20,24.82,24.55,27.21
2019-02-28,77.71,29.44,35.16,39.23,157.55,42.33,53.74,66.95,22.45,67.94,...,52.73,47.59,23.15,22.54,5.22,49.68,92.62,24.70,23.76,26.67
2019-03-01,79.47,29.60,34.18,40.61,156.42,42.77,54.00,67.17,22.77,68.68,...,53.95,47.66,23.58,22.55,5.34,60.57,94.12,25.32,23.41,27.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-02-17,130.25,76.47,18.22,58.14,220.41,168.88,77.10,82.01,45.25,144.97,...,26.16,71.22,25.66,23.00,8.88,261.94,193.07,43.82,15.22,33.88
2022-02-18,132.05,78.20,17.87,58.22,218.80,167.30,78.47,81.33,44.99,144.03,...,26.12,70.72,26.04,23.00,8.84,255.22,191.32,43.98,14.62,33.37
2022-02-22,130.49,74.09,17.43,57.55,206.76,164.32,74.48,80.06,42.22,145.56,...,26.15,70.89,25.75,22.98,8.71,254.81,190.10,42.06,14.62,32.50
2022-02-23,126.00,75.32,16.64,56.63,196.80,160.07,75.35,79.19,42.24,146.76,...,26.11,69.71,25.75,23.29,8.72,239.40,187.06,41.74,13.85,31.52


In [8]:
# read_stonk_data returns a DataFrame shaped (dates, tickers), while get_rolling_residuals
# indexes its inputs as numpy arrays shaped (n_pairs, n_days): convert and transpose first.
X = stonks.to_numpy().T.astype(np.float64)
# Reverse the row (ticker) order so that row i of X is paired with row n-1-i of X
Y = np.flipud(X)

In [71]:
# get_rolling_residuals returns (residuals, betas, Y_hat) in that order, so the
# names are unpacked in the same order to keep betas and predictions from being swapped
res_roll, betas_roll, preds_roll = get_rolling_residuals(X, Y, l_reg=2, l_roll=1, dt=5)

In [65]:
# NOTE(review): get_rolling_residuals as defined in this file returns three values
# (residuals, betas, Y_hat), so this two-name unpacking would raise ValueError;
# presumably this cell was run against an earlier debugging version that returned
# the windowed X and Y arrays -- confirm before re-running.
X_roll, Y_roll = get_rolling_residuals(X, Y, l_reg=2, l_roll=1, dt=5)

In [48]:
# Display X as a table
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,748,749,750,751,752,753,754,755,756,757
0,77.62,76.84,77.69,77.71,79.47,78.39,78.01,77.04,77.02,76.29,...,141.66,137.20,135.21,135.91,135.15,130.25,132.05,130.49,126.00,127.69
1,30.71,30.78,30.91,29.44,29.60,29.10,29.14,28.19,27.30,26.76,...,71.17,73.52,73.86,74.04,77.84,76.47,78.20,74.09,75.32,72.64
2,35.43,35.77,35.22,35.16,34.18,33.27,32.79,32.33,31.79,31.49,...,18.71,17.61,17.43,18.84,18.82,18.22,17.87,17.43,16.64,16.81
3,41.61,41.09,40.90,39.23,40.61,41.12,41.71,41.75,41.28,40.93,...,58.42,58.76,58.77,60.23,59.69,58.14,58.22,57.55,56.63,56.96
4,153.59,158.66,154.16,157.55,156.42,154.26,153.20,153.84,150.10,147.80,...,222.73,222.94,224.28,222.60,222.39,220.41,218.80,206.76,196.80,199.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1823,49.93,50.00,50.72,49.68,60.57,58.03,57.53,58.27,59.18,60.54,...,284.10,273.00,274.43,285.82,280.87,261.94,255.22,254.81,239.40,262.10
1824,92.78,91.90,92.20,92.62,94.12,94.24,94.15,92.56,90.97,90.83,...,199.37,198.87,196.43,197.76,195.09,193.07,191.32,190.10,187.06,189.83
1825,24.68,24.48,24.82,24.70,25.32,24.55,24.73,24.97,25.31,24.61,...,45.53,44.10,43.57,45.48,45.89,43.82,43.98,42.06,41.74,43.65
1826,24.24,23.64,24.55,23.76,23.41,22.81,22.50,22.40,22.98,23.04,...,16.47,15.98,15.80,16.39,16.34,15.22,14.62,14.62,13.85,14.79


In [51]:
# Display the last 750 columns of X; 750 is the total_days that get_rolling_residuals
# keeps for l_reg=2, l_roll=1, dt=5 (2 * 250 regression days + 250 rolling days)
pd.DataFrame(X[:, -750:])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,740,741,742,743,744,745,746,747,748,749
0,77.02,76.29,77.58,78.19,78.39,78.86,79.34,79.21,79.44,79.16,...,141.66,137.20,135.21,135.91,135.15,130.25,132.05,130.49,126.00,127.69
1,27.30,26.76,27.46,28.38,28.92,27.83,27.78,28.58,28.56,28.73,...,71.17,73.52,73.86,74.04,77.84,76.47,78.20,74.09,75.32,72.64
2,31.79,31.49,31.63,30.51,31.42,31.70,31.01,30.96,31.17,30.55,...,18.71,17.61,17.43,18.84,18.82,18.22,17.87,17.43,16.64,16.81
3,41.28,40.93,40.72,40.91,41.28,40.54,41.27,42.12,42.19,42.14,...,58.42,58.76,58.77,60.23,59.69,58.14,58.22,57.55,56.63,56.96
4,150.10,147.80,151.39,150.82,150.65,149.97,150.79,157.34,159.15,159.86,...,222.73,222.94,224.28,222.60,222.39,220.41,218.80,206.76,196.80,199.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1823,59.18,60.54,63.34,63.65,64.80,66.94,67.19,67.28,67.91,67.49,...,284.10,273.00,274.43,285.82,280.87,261.94,255.22,254.81,239.40,262.10
1824,90.97,90.83,93.12,94.12,95.28,94.56,95.26,96.10,97.04,96.82,...,199.37,198.87,196.43,197.76,195.09,193.07,191.32,190.10,187.06,189.83
1825,25.31,24.61,24.61,24.58,25.33,23.68,24.00,24.54,24.29,23.40,...,45.53,44.10,43.57,45.48,45.89,43.82,43.98,42.06,41.74,43.65
1826,22.98,23.04,23.55,23.52,23.63,23.50,23.41,23.06,23.43,23.74,...,16.47,15.98,15.80,16.39,16.34,15.22,14.62,14.62,13.85,14.79


In [72]:
# Display X_roll as a table
pd.DataFrame(X_roll)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,77.02,76.29,77.58,78.19,78.39,78.86,79.34,79.21,79.44,79.16,...,127.26,129.42,126.99,126.19,122.84,121.76,124.35,121.44,121.41,123.81
1,78.86,79.34,79.21,79.44,79.16,80.22,77.27,77.79,78.90,77.28,...,121.76,124.35,121.44,121.41,123.81,122.97,118.66,114.37,115.52,115.46
2,80.22,77.27,77.79,78.90,77.28,77.97,78.63,79.95,79.54,80.32,...,122.97,118.66,114.37,115.52,115.46,119.20,119.34,121.17,120.79,123.37
3,77.97,78.63,79.95,79.54,80.32,79.23,79.86,80.08,79.81,80.07,...,119.20,119.34,121.17,120.79,123.37,122.34,121.86,121.93,122.03,122.94
4,79.23,79.86,80.08,79.81,80.07,79.48,79.38,78.81,76.02,73.94,...,122.34,121.86,121.93,122.03,122.94,121.32,120.19,121.24,124.96,124.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93223,33.77,34.82,34.39,33.76,33.91,34.60,35.37,35.47,35.29,35.11,...,34.30,33.59,32.70,32.47,32.11,31.98,31.51,32.28,31.08,30.16
93224,34.60,35.37,35.47,35.29,35.11,34.95,35.38,35.22,33.50,32.20,...,31.98,31.51,32.28,31.08,30.16,29.50,30.08,30.54,31.94,32.05
93225,34.95,35.38,35.22,33.50,32.20,31.53,30.07,29.16,30.04,29.22,...,29.50,30.08,30.54,31.94,32.05,30.80,30.76,29.94,31.38,31.74
93226,31.53,30.07,29.16,30.04,29.22,30.17,28.74,27.90,25.61,27.88,...,30.80,30.76,29.94,31.38,31.74,30.39,29.79,33.40,35.11,34.69


In [28]:
# t1_fast = time.time()
# res, betas, preds  = get_rolling_residuals(X, Y, l_reg=2, l_roll=1, dt=5)
# t2_fast = time.time()

# t1_slow = time.time()
# res_slow, preds_slow = get_rolling_slow_residuals(X, Y, l_reg=2, l_roll=1, dt=5)
# t2_slow = time.time()

# print("Time slow: " + str(t2_slow-t1_slow))
# print("Time fast: " + str(t2_fast-t1_fast))

In [29]:
# t1_fast = time.time()
# res, preds, betas = get_residuals_many(X, Y)
# t2_fast = time.time()

# t1_slow = time.time()
# res_slow, preds_slow, betas_slow = get_slow_residuals_many(X, Y)
# t2_slow = time.time()

# print("Time slow: " + str(t2_slow-t1_slow))
# print("Time fast: " + str(t2_fast-t1_fast))