In [41]:
import pandas as pd
import pandas_datareader as pdr
#import yfinance as yf
import time as tm
import logging
import random
import os

logger = logging.getLogger(__name__)

In [16]:
# parameters:

equities = ['XLE', 'XLF']
modeling_start = '2019-01-01' # start date of the modeling period format: '2022-01-01'
modeling_end = '2022-08-31' # end date of the modeling period format: '2022-01-01'

sleep_min = 2 # minumum amount of sleep to take between pulls
sleep_max = 10 # maximum amount of sleep to take between pulls
expire_days = 3 # standard set of days until expiration of the cache
single_dataframe = True # whether to store all stocks into a single dataframe (recommended for Kedro functionality)




In [53]:
# function to pull stocks and optionally put into dataframe:

def pull_stock_data(stocks: list,
                    start_date: str,
                    end_date: str,
                    sleep_min: int,
                    sleep_max: int,
                    expire_days: int = 3,
                    single_dataframe: bool = True) -> pd.DataFrame:
    
    start = tm.time()
    
    if single_dataframe == True:
        pulls = 0

        for stock in stocks:
            print(f'retrieving: {stock.strip()}')

            if pulls == 0:
                df = pdr.get_data_yahoo(stock.strip(), start = start_date, end = end_date)
                pulls+=1

                df = df.reset_index()
                df['ticker'] = stock
                
                sleep_time = random.randint(2, 10)
                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

            else:
                temp = pdr.get_data_yahoo(stock.strip(), start = start_date, end = end_date)
                pulls+=1

                temp = temp.reset_index()
                temp['ticker'] = stock
    
                # union-all into the main dataframe:
                df = pd.concat([df, temp], ignore_index= True)

                del temp
                sleep_time = random.randint(2, 10)

                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

        del pulls, sleep_time

        

        return df

    else:

        for stock in stocks:

            df = pdr.get_data_yahoo(stock.strip(), start = start_date, end = end_date)
            df['ticker'] = stock
            df = df.reset_index()
            df.to_csv(os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'), index = False)
            sleep_time = random.randint(2, 10)
            tm.sleep(sleep_time)
            print(f"sleeping for: {sleep_time} seconds")
            print('saving: ', stock, ' data to: ', os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'))
    
    end = tm.time()
    total = (end - start) / 60
    print(f"completed retrieving: {stock.strip()} data in: {total}")
    
  
    
  



In [54]:
# proving grounds:

df = pull_stock_data(stocks = equities,
                    start_date = modeling_start,
                    end_date = modeling_end,
                    sleep_min = sleep_min,
                    sleep_max = sleep_max,
                    expire_days = 3,
                    single_dataframe = True)

retrieving: XLE
sleeping for: 3 seconds
retrieving: XLF
sleeping for: 8 seconds


In [56]:
# rewrite for Kedro:


def pull_stock_data(data_pull_parameters: dict) -> pd.DataFrame:
    
    start = tm.time()
    
    if data_pull_parameters['single_dataframe'] == True:
        pulls = 0

        for stock in data_pull_parameters['equities']:
            print(f'retrieving: {stock.strip()}')

            if pulls == 0:
                df = pdr.get_data_yahoo(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
                pulls+=1

                df = df.reset_index()
                df['ticker'] = stock
                
                sleep_time = random.randint(2, 10)
                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

            else:
                temp = pdr.get_data_yahoo(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
                pulls+=1

                temp = temp.reset_index()
                temp['ticker'] = stock
    
                # union-all into the main dataframe:
                df = pd.concat([df, temp], ignore_index= True)

                del temp
                sleep_time = random.randint(2, 10)

                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

        del pulls, sleep_time


        return df

    else:

        for stock in data_pull_parameters['equities']:

            df = pdr.get_data_yahoo(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
            df['ticker'] = stock
            df = df.reset_index()
            df.to_csv(os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'), index = False)
            sleep_time = random.randint(2, 10)
            tm.sleep(sleep_time)
            print(f"sleeping for: {sleep_time} seconds")
            print('saving: ', stock, ' data to: ', os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'))
        
        return df # only returns the last df to the catalogue ***
        