# Imports

In [2]:
# Data management
import numpy as np
import pandas as pd

# Data fetching
import yfinance as yf

# Backtesting

# ML

# Utils
from datetime import datetime
from datetime import timedelta

# Data fetch

In [12]:
def get_stonk_data(stonk_list, date_start, date_end=None, interval='1d', source='yfinance', proxy=False):
    '''
    Returns historical (adjusted) closing price data for the selected financial instruments.

        Parameters:
            Required:
                stonk_list (string, list):
                    List of stonk identifiers as strings
                date_start (datetime):
                    Start date for stonk data
            Optional:
                date_end (datetime):
                    End date for stonk data. Defaults to the moment of the call.
                interval (string)
                    Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
                source (string):
                    Where to source data from. Valid sources: yfinance
                proxy (boolean):
                    Whether to use a proxy connection to avoid API limits/blocks.
                    Currently accepted but not used by this function.

        Returns:
            stonk_data (Pandas Dataframe): Pandas Dataframe containing requested data

        Raises:
            ValueError: If source is not a supported data source.
    '''

    if source != 'yfinance':
        raise ValueError('Unsupported data source type')

    # Resolve the default at call time; a `datetime.now()` default in the
    # signature is evaluated only once, when the function is defined.
    if date_end is None:
        date_end = datetime.now()

    stonks = yf.download(
        stonk_list,
        start=date_start,
        end=date_end,
        interval=interval,
        group_by='column',
        auto_adjust=True,
        threads=True,
        rounding=True,
    )

    # With auto_adjust=True yfinance folds the adjustment into 'Close' and
    # drops the 'Adj Close' column, so the adjusted close lives under 'Close'.
    return stonks['Close']
    
    

In [110]:
# Scratch check: download one year of daily bars for a single ticker.
stonks = yf.download(["googl"], period='1y', interval='1d', group_by='column')

[*********************100%***********************]  1 of 1 completed

1 Failed download:
- GOOG-L: No data found, symbol may be delisted


In [111]:
# Display the 'Adj Close' column of the `stonks` frame.
stonks['Adj Close']

Series([], Name: Adj Close, dtype: float64)

In [11]:
# Print the signature and parameter documentation of yf.download.
help(yf.download)

Help on function download in module yfinance.multi:

download(tickers, start=None, end=None, actions=False, threads=True, group_by='column', auto_adjust=False, back_adjust=False, progress=True, period='max', show_errors=True, interval='1d', prepost=False, proxy=None, rounding=False, timeout=None, **kwargs)
    Download yahoo tickers
    :Parameters:
        tickers : str, list
            List of tickers to download
        period : str
            Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
            Either Use period parameter or use start and end
        interval : str
            Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
            Intraday data cannot extend last 60 days
        start: str
            Download start date string (YYYY-MM-DD) or _datetime.
            Default is 1900-01-01
        end: str
            Download end date string (YYYY-MM-DD) or _datetime.
            Default is now
        group_by : str
            Group by 'ticker' o

In [8]:
# Print the methods and properties exposed by yf.Ticker.
help(yf.Ticker)

Help on class Ticker in module yfinance.ticker:

class Ticker(yfinance.base.TickerBase)
 |  Ticker(ticker, session=None)
 |  
 |  Method resolution order:
 |      Ticker
 |      yfinance.base.TickerBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  option_chain(self, date=None, proxy=None, tz=None)
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties defined here:
 |  
 |  actions
 |  
 |  analysis
 |  
 |  balance_sheet
 |  
 |  balancesheet
 |  
 |  calendar
 |  
 |  cashflow
 |  
 |  dividends
 |  
 |  earnings
 |  
 |  financials
 |  
 |  info
 |  
 |  institutional_holders
 |  
 |  isin
 |  
 |  major_holders
 |  
 |  mutualfund_holders
 |  
 |  news
 |  
 |  options
 |  
 |  quarterly_balance_sheet
 |  
 |  quarterly_balancesheet
 |  
 |  quarterly_cashflow
 |  
 |  quarterly_earnings
 |  
 |  quarterly_financials
 |  
 |  recommendations
 |  
 |  shares
 |  
 |

# Backtesting

# Stock list preprocessing

In [None]:
def preprocess_stock_list(raw_data_path='data/raw_stock_list.xls', output_path='data/stonk_list.csv'):
    '''
    Cleans a raw stock-list spreadsheet and writes a ticker/subindustry CSV.

    Tickers with a single-letter dotted suffix (e.g. "ABC.A") are tried in
    three spellings ("ABC.A", "ABC-A", "ABCA"); only the spellings for which
    yfinance returns a price are kept.

        Parameters:
            Optional:
                raw_data_path (string):
                    Path of the Excel file to read
                output_path (string):
                    Path of the CSV file to write

        Returns:
            None. The cleaned table (columns: ticker, subindustry) is written to output_path.
    '''
    df = pd.read_excel(io=raw_data_path)

    # Drop rows with any missing value, then discard the first remaining row
    # (presumably a repeated header row -- TODO confirm against the raw export).
    df = df.dropna(axis=0).reset_index(drop=True)
    df = df.drop(index=0).reset_index(drop=True)

    # Of the first ten columns keep only positions 0 and 6.
    df = df.drop(columns=df.columns[[1, 2, 3, 4, 5, 7, 8, 9]])
    df.columns = ['ticker', 'subindustry']

    # `regex` is passed explicitly everywhere: its default changed between
    # pandas versions, which would otherwise silently change what is matched.
    df['subindustry'] = df['subindustry'].str.replace(r' \(Primary\)', '', regex=True)
    # Strip everything up to and including the last colon.
    df['ticker'] = df['ticker'].str.replace(r'(.*:)', '', regex=True)
    # Literal '.PR' -> '-P'; as a regex the dot would match any character.
    df['ticker'] = df['ticker'].str.replace('.PR', '-P', regex=False)

    # Discard tickers with a two-letter dotted suffix.
    df = df[~df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]{2}')]

    # Tickers with a one-letter dotted suffix: build the alternative spellings.
    single_suffix = df['ticker'].str.fullmatch(r'[A-Z]*\.[A-Z]')
    dotted = df[single_suffix]
    dashed = dotted.assign(ticker=dotted['ticker'].str.replace('.', '-', regex=False))
    undotted = dotted.assign(ticker=dotted['ticker'].str.replace('.', '', regex=False))
    all_variants = pd.concat([dotted, dashed, undotted])

    if all_variants.empty:
        # Nothing to validate; avoid calling the downloader with no tickers.
        valid_tickers = all_variants
    else:
        # auto_adjust=False keeps the 'Adj Close' column regardless of the
        # installed yfinance version's default.
        stonks = yf.download(
            all_variants['ticker'].astype('string').tolist(),
            period='1d',
            interval='1d',
            group_by='column',
            auto_adjust=False,
        )

        # A spelling is valid when the latest row has a price for it.
        valid_tickers = stonks['Adj Close'].iloc[-1].dropna(axis=0).to_frame().reset_index()
        valid_tickers.columns = ['ticker', 'price']
        valid_tickers = valid_tickers.join(all_variants.set_index('ticker'), on='ticker')
        valid_tickers = valid_tickers.drop(columns='price')

    # Replace the dotted originals with whichever spellings were validated.
    df = pd.concat([df[~single_suffix], valid_tickers], axis=0, ignore_index=True)

    # Normalise subindustry labels: spaces -> underscores, lower-case, no commas.
    df['subindustry'] = (
        df['subindustry']
        .str.replace(' ', '_', regex=False)
        .str.lower()
        .str.replace(',', '', regex=False)
    )

    df.to_csv(path_or_buf=output_path, header=True, index=False)
    

In [19]:
# NOTE(review): per the pandas options docs, 0 asks pandas to auto-detect the
# terminal height, which only works in a terminal, not in a notebook --
# confirm 0 is intended here rather than None (unlimited rows).
pd.set_option('display.max_rows', 0)