In [12]:
# import packages:
import pandas as pd
import yfinance as yf
import datetime as dt
import time as tm
import logging
import random
import os
from utils import fix_columns

logger = logging.getLogger(__name__)

In [3]:
# parameters:

equities = ['XLE', 'XLF']
modeling_start = '2019-01-01' # start date of the modeling period format: '2022-01-01'
modeling_end = '2022-08-31' # end date of the modeling period format: '2022-01-01'

sleep_min = 2 # minumum amount of sleep to take between pulls
sleep_max = 10 # maximum amount of sleep to take between pulls
expire_days = 3 # standard set of days until expiration of the cache
single_dataframe = True # whether to store all stocks into a single dataframe (recommended for Kedro functionality)


parameters = { 'equities' : 'XLE',
                'modeling_start' = '2019-01-01' # start date of the modeling period format: '2022-01-01'
modeling_end = '2022-08-31' # end date of the modeling period format: '2022-01-01'

sleep_min = 2 # minumum amount of sleep to take between pulls
sleep_max = 10 # maximum amount of sleep to take between pulls
expire_days = 3 # standard set of days until expiration of the cache
single_dataframe = True # whether to store all stocks into a single dataframe (recommended for Kedro functionality)
}





In [4]:
# function to pull stocks and optionally put into dataframe:

def pull_stock_data(stocks: list,
                    start_date: str,
                    end_date: str,
                    sleep_min: int,
                    sleep_max: int,
                    expire_days: int = 3,
                    single_dataframe: bool = True) -> pd.DataFrame:
    
    start = tm.time()
    
    if single_dataframe == True:
        pulls = 0

        for stock in stocks:
            print(f'retrieving: {stock.strip()}')

            if pulls == 0:
                df = pdr.get_data_yahoo(stock.strip(), start = start_date, end = end_date)
                pulls+=1

                df = df.reset_index()
                df['ticker'] = stock
                
                sleep_time = random.randint(2, 10)
                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

            else:
                temp = pdr.get_data_yahoo(stock.strip(), start = start_date, end = end_date)
                pulls+=1

                temp = temp.reset_index()
                temp['ticker'] = stock
    
                # union-all into the main dataframe:
                df = pd.concat([df, temp], ignore_index= True)

                del temp
                sleep_time = random.randint(2, 10)

                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

        del pulls, sleep_time

        

        return df

    else:

        for stock in stocks:

            df = pdr.get_data_yahoo(stock.strip(), start = start_date, end = end_date)
            df['ticker'] = stock
            df = df.reset_index()
            df.to_csv(os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'), index = False)
            sleep_time = random.randint(2, 10)
            tm.sleep(sleep_time)
            print(f"sleeping for: {sleep_time} seconds")
            print('saving: ', stock, ' data to: ', os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'))
    
    end = tm.time()
    total = (end - start) / 60
    print(f"completed retrieving: {stock.strip()} data in: {total}")
    
  
    
  



In [5]:
# proving grounds:

df = pull_stock_data(stocks = equities,
                    start_date = modeling_start,
                    end_date = modeling_end,
                    sleep_min = sleep_min,
                    sleep_max = sleep_max,
                    expire_days = 3,
                    single_dataframe = True)

retrieving: XLE


TypeError: string indices must be integers

In [56]:
# rewrite for Kedro:


def pull_stock_data(data_pull_parameters: dict) -> pd.DataFrame:
    
    start = tm.time()
    
    if data_pull_parameters['single_dataframe'] == True:
        pulls = 0

        for stock in data_pull_parameters['equities']:
            print(f'retrieving: {stock.strip()}')

            if pulls == 0:
                df = pdr.get_data_yahoo(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
                pulls+=1

                df = df.reset_index()
                df['ticker'] = stock
                
                sleep_time = random.randint(2, 10)
                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

            else:
                temp = pdr.get_data_yahoo(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
                pulls+=1

                temp = temp.reset_index()
                temp['ticker'] = stock
    
                # union-all into the main dataframe:
                df = pd.concat([df, temp], ignore_index= True)

                del temp
                sleep_time = random.randint(2, 10)

                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

        del pulls, sleep_time


        return df

    else:

        for stock in data_pull_parameters['equities']:

            df = pdr.get_data_yahoo(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
            df['ticker'] = stock
            df = df.reset_index()
            df.to_csv(os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'), index = False)
            sleep_time = random.randint(2, 10)
            tm.sleep(sleep_time)
            print(f"sleeping for: {sleep_time} seconds")
            print('saving: ', stock, ' data to: ', os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'))
        
        return df # only returns the last df to the catalogue ***
        

In [None]:
# paramters:

paramters = {'stocks' : ['XLE', 'XLF', 'APPL']}

In [24]:
start_date = dt.datetime(2024, 1, 1)
end_date = dt.datetime(2024, 12, 31)

df = yf.download('XLE', start = '2019-01-01', end = '2020-01-01')



[*********************100%%**********************]  1 of 1 completed


In [36]:
parameters = {'equities' : ['XLE', 'XLF', 'AAPL'],
             'start_date' : '2019-01-01',
             'end_date' : '2023-12-31',
             'single_dataframe' : True # set as the default
              
              }

In [37]:
# new version of the pull for Kedro:

def pull_stock_data(data_pull_parameters: dict) -> pd.DataFrame:
    
    start = tm.time()
    
    if data_pull_parameters['single_dataframe'] == True:
        pulls = 0

        for stock in data_pull_parameters['equities']:
            print(f'retrieving: {stock.strip()}')

            if pulls == 0:
                df = yf.download(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
                pulls+=1

                df = df.reset_index()
                df['ticker'] = stock
                
                sleep_time = random.randint(2, 10)
                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

            else:
                temp = yf.download(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
                pulls+=1

                temp = temp.reset_index()
                temp['ticker'] = stock
    
                # union-all into the main dataframe:
                df = pd.concat([df, temp], ignore_index= True)

                del temp
                sleep_time = random.randint(2, 10)

                print(f"sleeping for: {sleep_time} seconds")
                # sleep between pulls so to not arouse suspicion:
                tm.sleep(sleep_time)

        del pulls, sleep_time

        df.columns = fix_columns(df.columns)
        return df

    else:

        for stock in data_pull_parameters['equities']:

            df = yf.download(stock.strip(), start = data_pull_parameters['start_date'], end = data_pull_parameters['end_date'])
            df['ticker'] = stock
            df = df.reset_index()
            df.to_csv(os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'), index = False)
            sleep_time = random.randint(2, 10)
            tm.sleep(sleep_time)
            print(f"sleeping for: {sleep_time} seconds")
            print('saving: ', stock, ' data to: ', os.path.join('../data/01_raw/separate_stock_pulls/', stock.strip() +'.csv'))
        
        return df # only returns the last df to the catalogue ***

In [38]:
test = pull_stock_data(data_pull_parameters = parameters)

[*********************100%%**********************]  1 of 1 completed

retrieving: XLE
sleeping for: 8 seconds



[*********************100%%**********************]  1 of 1 completed

retrieving: XLF
sleeping for: 5 seconds





retrieving: AAPL


[*********************100%%**********************]  1 of 1 completed


sleeping for: 7 seconds


In [39]:
test

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,ticker
0,2019-01-02,56.439999,58.869999,56.150002,58.480000,45.037308,24892600,XLE
1,2019-01-03,58.650002,58.860001,57.240002,57.900002,44.590637,18024100,XLE
2,2019-01-04,58.900002,60.049999,58.560001,59.869999,46.107792,21351500,XLE
3,2019-01-07,60.320000,61.200001,59.520000,60.759998,46.793205,18056700,XLE
4,2019-01-08,61.610001,61.750000,60.900002,61.230000,47.155174,18692300,XLE
...,...,...,...,...,...,...,...,...
3769,2023-12-22,195.179993,195.410004,192.970001,193.600006,193.353287,37122800,AAPL
3770,2023-12-26,193.610001,193.889999,192.830002,193.050003,192.803986,28919300,AAPL
3771,2023-12-27,192.490005,193.500000,191.089996,193.149994,192.903839,48087700,AAPL
3772,2023-12-28,194.139999,194.660004,193.169998,193.580002,193.333298,34049900,AAPL
