# Imports

In [28]:
# Data management
import numpy as np
import pandas as pd

# Data fetching
import yfinance as yf

# Spread generation
from sklearn.linear_model import LinearRegression

# Backtesting

# ML

# Utils
from datetime import datetime
from datetime import timedelta
import os

# Utils

## Data fetch

In [61]:
def get_stonk_data(stonk_list, period_years=3, date_from=None, date_to=None, interval='1d', source='yfinance', data_dir='data', file_prefix='stonks', proxy=False):
    '''
    Downloads historical adjusted close prices for the selected stonks, saves them as a CSV
    file inside data_dir and returns them.

        Parameters:
            Required:
                stonk_list (iterable of strings):
                    Stonk identifiers (tickers), e.g. a list or a Pandas Series
            Optional:
                period_years (float):
                    How many years of data to download until date_to, can be a floating point number.
                    Only used when date_from is not given
                date_from (datetime):
                    Start date for stonk data (use instead of period_years)
                date_to (datetime):
                    End date for stonk data. Defaults to the moment of the call
                interval (string):
                    Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
                source (string):
                    Where to source data from, case insensitive. Valid sources: yfinance
                data_dir (string):
                    Folder name where to output downloaded data, created if it does not exist
                file_prefix (string):
                    Prefix of CSV file containing downloaded data inside data_dir
                proxy (boolean):
                    Currently unused, accepted for interface compatibility only

        Returns:
            stonk_data (Pandas Dataframe): Adjusted close prices, indexed by date

        Raises:
            ValueError: If source is not a supported data source
    '''

    # Resolve the end date at call time; a datetime.now() default in the signature
    # would be evaluated only once, when the function is defined
    if date_to is None:
        date_to = datetime.now()

    if date_from is None:
        date_from = date_to - timedelta(days=int(365 * period_years))

    if source.lower() != 'yfinance':
        raise ValueError('Unsupported data source')

    stonk_data = yf.download(list(stonk_list), start=date_from, end=date_to, interval=interval, group_by='column', threads=True, rounding=True)['Adj Close']

    filename = '{prefix}_{from_date}_to_{to_date}.csv'.format(prefix=file_prefix, from_date=date_from.strftime('%Y-%m-%d'), to_date=date_to.strftime('%Y-%m-%d'))

    # Make sure the output folder exists before writing into it
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    file_path = os.path.join(data_dir, filename)

    # Keep the index: it holds the dates, without it the price rows cannot be placed in time
    stonk_data.to_csv(path_or_buf=file_path, header=True, index=True)

    return stonk_data

In [59]:
# Quick check of yf.download on a small ticker list; "ffs" fails to download (see output)
stonks = yf.download(
    ["googl", "tsla", "ffs"],
    period='1d',
    interval='1d',
    group_by='column',
    threads=True,
    rounding=True,
)

[*********************100%***********************]  3 of 3 completed

1 Failed download:
- FFS: No data found for this date range, symbol may be delisted


In [45]:
# Load the ticker/subindustry list; this is the default output path of preprocess_stock_list
stonk_list = pd.read_csv('data/stonk_list.csv')

In [62]:
# Show every row whose ticker contains 'DELL'
stonk_list.loc[stonk_list['ticker'].str.contains('DELL')]

Unnamed: 0,ticker,subindustry
214,DELL,technology_hardware_and_equipment
215,DELL WI,technology_hardware_and_equipment


In [49]:
# Download three years of prices for every listed ticker and save them as CSV (network call)
get_stonk_data(stonk_list['ticker'], period_years=3)

[*********************100%***********************]  2277 of 2277 completed

17 Failed downloads:
- -PE: No data found, symbol may be delisted
- -PT: No data found, symbol may be delisted
- WCC-PA: No data found for this date range, symbol may be delisted
- RXN WI: No data found, symbol may be delisted
- -PO: No data found, symbol may be delisted
- P-P: No data found, symbol may be delisted
- ET-PE: No data found for this date range, symbol may be delisted
- NRZ-PD: No data found for this date range, symbol may be delisted
- WFC-PN: No data found, symbol may be delisted
- DELL WI: No data found, symbol may be delisted
- ALL-PB: No data found for this date range, symbol may be delisted
- ET-PD: No data found for this date range, symbol may be delisted
- FTAI-PA: No data found for this date range, symbol may be delisted
- J-P: No data found, symbol may be delisted
- FHN-PA: No data found, symbol may be delisted
- K-P: No data found, symbol may be delisted
- -P: No data found, symbol may b

In [11]:
# Exploratory: print the yf.download signature and parameter docs (can be removed from the final notebook)
help(yf.download)

Help on function download in module yfinance.multi:

download(tickers, start=None, end=None, actions=False, threads=True, group_by='column', auto_adjust=False, back_adjust=False, progress=True, period='max', show_errors=True, interval='1d', prepost=False, proxy=None, rounding=False, timeout=None, **kwargs)
    Download yahoo tickers
    :Parameters:
        tickers : str, list
            List of tickers to download
        period : str
            Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
            Either Use period parameter or use start and end
        interval : str
            Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
            Intraday data cannot extend last 60 days
        start: str
            Download start date string (YYYY-MM-DD) or _datetime.
            Default is 1900-01-01
        end: str
            Download end date string (YYYY-MM-DD) or _datetime.
            Default is now
        group_by : str
            Group by 'ticker' o

In [8]:
# Exploratory: list the methods and properties of yf.Ticker (can be removed from the final notebook)
help(yf.Ticker)

Help on class Ticker in module yfinance.ticker:

class Ticker(yfinance.base.TickerBase)
 |  Ticker(ticker, session=None)
 |  
 |  Method resolution order:
 |      Ticker
 |      yfinance.base.TickerBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  option_chain(self, date=None, proxy=None, tz=None)
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties defined here:
 |  
 |  actions
 |  
 |  analysis
 |  
 |  balance_sheet
 |  
 |  balancesheet
 |  
 |  calendar
 |  
 |  cashflow
 |  
 |  dividends
 |  
 |  earnings
 |  
 |  financials
 |  
 |  info
 |  
 |  institutional_holders
 |  
 |  isin
 |  
 |  major_holders
 |  
 |  mutualfund_holders
 |  
 |  news
 |  
 |  options
 |  
 |  quarterly_balance_sheet
 |  
 |  quarterly_balancesheet
 |  
 |  quarterly_cashflow
 |  
 |  quarterly_earnings
 |  
 |  quarterly_financials
 |  
 |  recommendations
 |  
 |  shares
 |  
 |

## Backtesting

## Stock list preprocessing

In [23]:
def _keep_yahoo_valid_tickers(candidates):
    '''
    Keeps only the candidate tickers for which Yahoo Finance returns a price.

        Parameters:
            Required:
                candidates (Pandas Dataframe):
                    Dataframe with 'ticker' and 'subindustry' columns

        Returns:
            valid_tickers (Pandas Dataframe): Rows ('ticker', 'subindustry') of the tickers
            that have a non-NA adjusted close in the last downloaded row
    '''

    # Nothing to validate, skip the network call
    if candidates.empty:
        return candidates.iloc[0:0]

    # Run all candidates through Yahoo finance, get last day's price
    names = list(candidates['ticker'].astype('string').values)
    stonks = yf.download(names, period='1d', interval='1d', group_by='column')
    adj_close = stonks['Adj Close']

    # No price rows at all means that no candidate is valid
    if adj_close.empty:
        return candidates.iloc[0:0]

    # Drop all NA tickers (that failed to download)
    valid_tickers = adj_close.iloc[-1].dropna().to_frame().reset_index()
    valid_tickers.columns = ['ticker', 'price']

    # Add subindustries to the remaining valid tickers, the price itself is not needed
    valid_tickers = valid_tickers.join(candidates.set_index('ticker'), on='ticker')
    return valid_tickers.drop(columns=['price'])


def preprocess_stock_list(raw_data_path='data/raw_stock_list.xls', output_path='data/stonk_list.csv'):
    '''
    Parses a raw excel file from CapitalIQ containing ticker names and their subindustries, validates
    unusual ticker names with Yahoo Finance, saving the processed data in CSV format.

        Parameters:
            Optional:
                raw_data_path (string):
                    Path to the raw excel file.
                output_path (string):
                    Path where to save the parsed data, missing folders are created.

        Returns:
            Nothing
    '''

    df = pd.read_excel(io=raw_data_path)

    # Drop NA rows, reset the index and drop the first remaining row
    df = df.dropna(axis=0).reset_index(drop=True).drop(index=0)

    # Keep only the ticker (position 0) and subindustry (position 6) columns
    df = df.drop(columns=df.columns[[1, 2, 3, 4, 5, 7, 8, 9]])
    df.columns = ['ticker', 'subindustry']

    # Remove the '(Primary)' tag from subindustries (regex, hence the explicit flag)
    df['subindustry'] = df['subindustry'].str.replace(r' \(Primary\)', '', regex=True)

    # Remove everything until (and including) the colon for tickers (exchange prefix)
    df['ticker'] = df['ticker'].str.replace(r'(.*:)', '', regex=True)

    # Replace the ticker endings for a Yahoo finance supported format.
    # Must be a literal match: as a regex the dot matches any character and
    # mangles ordinary tickers that merely contain 'PR'
    df['ticker'] = df['ticker'].str.replace('.PR', '-P', regex=False)

    # Drop tickers with two letters after a dot, unavailable in Yahoo finance
    df = df[~df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]{2}')]

    # Take all remaining tickers that have a dot
    is_dotted = df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]')
    dotted = df[is_dotted]

    # Replace the dots with dashes
    dashed = dotted.copy()
    dashed['ticker'] = dashed['ticker'].str.replace('.', '-', regex=False)

    # Remove the dots
    undotted = dotted.copy()
    undotted['ticker'] = undotted['ticker'].str.replace('.', '', regex=False)

    # Combine all variants together and keep the ones Yahoo finance knows
    all_variants = pd.concat([dotted, dashed, undotted])
    valid_tickers = _keep_yahoo_valid_tickers(all_variants)

    # Remove all tickers that have a dot from main dataframe and add the validated tickers back
    df = pd.concat([df[~is_dotted], valid_tickers], axis=0, ignore_index=True)

    # Make the subindustry strings more code friendly
    df['subindustry'] = df['subindustry'].str.replace(' ', '_', regex=False)
    df['subindustry'] = df['subindustry'].str.lower()
    df['subindustry'] = df['subindustry'].str.replace(',', '', regex=False)

    # Make sure the output folder exists before writing into it
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(path_or_buf=output_path, header=True, index=False)
    

In [19]:
# NOTE(review): pandas documents 0 here as "auto-detect terminal height", which it says is not
# possible inside a notebook; confirm this is intended (None means unlimited rows)
pd.set_option('display.max_rows', 0)