## Import libraries

In [1]:
import yfinance as yf
import pandas as pd
import os

## Get S&P 500 tickers and metadata

In [2]:
def get_sp500_data() -> dict:
    """
    Fetch S&P 500 tickers and metadata from Wikipedia.

    Reads the first HTML table on the "List of S&P 500 companies" page and
    keeps its "Symbol", "Security", "GICS Sector" and "GICS Sub-Industry"
    columns.

    Returns:
        dict: Maps each ticker symbol (str, with "." replaced by "-") to a
            dict with the keys "Ticker", "Name", "Industry" and
            "Sub_Industry".
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    # read_html returns one DataFrame per <table>; this code relies on the
    # constituents table being the first one on the page.
    sp500 = pd.read_html(url)[0]

    ticker_data_dict = {}
    for symbol, name, sector, sub_industry in zip(
        sp500["Symbol"],
        sp500["Security"],
        sp500["GICS Sector"],
        sp500["GICS Sub-Industry"],
    ):
        # Normalise once and reuse for both the key and the "Ticker" field.
        # Presumably "-" is the separator Yahoo Finance expects
        # (e.g. "BRK.B" -> "BRK-B") -- TODO confirm.
        ticker = symbol.replace(".", "-")
        ticker_data_dict[ticker] = {
            "Ticker": ticker,
            "Name": name,
            "Industry": sector,
            "Sub_Industry": sub_industry,
        }

    return ticker_data_dict

## Fetch ticker data from Yahoo Finance

In [3]:
def fetch_ticker_data(ticker: str, period: str) -> pd.DataFrame:
    """
    Retrieve daily price history for a single ticker through yfinance.

    Args:
        ticker (str): Stock ticker symbol to download.
        period (str): Look-back window handed to ``yf.download``.
            Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max

    Returns:
        pd.DataFrame: The downloaded rows with the index turned back into a
            regular column and a "Ticker" column added. If anything raises
            along the way, the error is printed and an empty DataFrame
            (no columns) is returned instead.
    """
    request_options = dict(
        tickers=ticker,
        period=period,
        interval="1d",  # one row per day
        actions=True,
        threads=True,
        auto_adjust=False,
        multi_level_index=False,
    )

    try:
        history = yf.download(**request_options).reset_index()
        # tag every row with the symbol it belongs to
        history["Ticker"] = ticker
    except Exception as e:
        print(f"Error downloading {ticker}: {e}")
        return pd.DataFrame()

    return history

## Combine stock data and metadata

In [4]:
def combine_stock_data(ticker_data_dict: dict, ticker_df: pd.DataFrame) -> pd.DataFrame:
    """
    Combine historical ticker data with industry and sub-industry metadata.

    Args:
        ticker_data_dict (dict): Maps ticker symbols to per-ticker metadata
            dicts. Each inner dict must carry a "Ticker" entry; its other
            entries become columns of the result.
        ticker_df (pd.DataFrame): Historical ticker data with a "Ticker" column.

    Returns:
        pd.DataFrame: One row per row of ``ticker_df`` (right join on
            "Ticker"), metadata columns first. Rows whose ticker has no
            metadata get NaN in the metadata columns. When there is nothing
            to join (``ticker_df`` is empty and has no "Ticker" column, or
            ``ticker_data_dict`` is empty) a copy of ``ticker_df`` is
            returned unchanged.
    """
    # fetch_ticker_data returns a column-less empty frame when a download
    # fails; merging on "Ticker" would raise KeyError for it.
    if ticker_df.empty and "Ticker" not in ticker_df.columns:
        return ticker_df.copy()

    # The dict-of-dicts becomes a frame with one column per ticker, so
    # transpose to get one row per ticker, then drop the ticker index because
    # the same value is already in the "Ticker" column.
    metadata_df = pd.DataFrame(ticker_data_dict).T.reset_index(drop=True)

    # An empty metadata dict yields a frame without a "Ticker" column.
    if "Ticker" not in metadata_df.columns:
        return ticker_df.copy()

    # Right join: keep every price row even if its ticker has no metadata.
    return pd.merge(metadata_df, ticker_df, how="right", on="Ticker")

## Save stock data and metadata

In [5]:
def save_stock_data(ticker: str, full_df: pd.DataFrame, output_path: str) -> None:
    """
    Write a combined stock DataFrame to ``<output_path>/<ticker>.csv``.

    Args:
        ticker (str): Stock ticker symbol; used as the CSV file name.
        full_df (pd.DataFrame): DataFrame after being combined using combine_stock_data.
        output_path (str): Directory the CSV is written into. It is created
            (including parents) when it does not exist yet.
    """
    # exist_ok=True makes this a no-op for a directory that is already there
    os.makedirs(output_path, exist_ok=True)

    csv_name = f"{ticker}.csv"
    destination = os.path.join(output_path, csv_name)

    # index=False: the row index is not written to the file
    full_df.to_csv(destination, index=False)

## Run code on first available ticker

In [6]:
if __name__ == "__main__":
    # get S&P 500 data
    sp_500 = get_sp500_data()
    # dicts keep insertion order, so this is the first ticker that
    # get_sp500_data added; no need to materialise the full key list
    first_ticker = next(iter(sp_500))

    print(f"Downloading Ticker: {first_ticker}")
    df = fetch_ticker_data(ticker=first_ticker, period='2y')

    if df.empty:
        # fetch_ticker_data returns an empty frame when the download fails;
        # there is nothing to combine and reporting success would be misleading
        print(f"No data downloaded for {first_ticker}; nothing to combine.")
    else:
        combined_df = combine_stock_data(ticker_data_dict=sp_500, ticker_df=df)

        # Saving is disabled here; uncomment to write the CSV to disk.
        # save_stock_data(first_ticker, full_df=combined_df, output_path="data/input/")
        print("S&P 500 stocks download complete!")

        print(combined_df.head())

Downloading Ticker: MMM


[*********************100%***********************]  1 of 1 completed

S&P 500 stocks download complete!
  Ticker Name     Industry              Sub_Industry       Date  Adj Close  \
0    MMM   3M  Industrials  Industrial Conglomerates 2023-03-20  80.145538   
1    MMM   3M  Industrials  Industrial Conglomerates 2023-03-21  80.130173   
2    MMM   3M  Industrials  Industrial Conglomerates 2023-03-22  78.017235   
3    MMM   3M  Industrials  Industrial Conglomerates 2023-03-23  77.602333   
4    MMM   3M  Industrials  Industrial Conglomerates 2023-03-24  77.709892   

       Close  Dividends       High        Low       Open  Stock Splits  \
0  87.215721        0.0  87.332779  86.304352  86.463211           0.0   
1  87.198997        0.0  88.193977  86.429764  87.993309           0.0   
2  84.899666        0.0  87.416389  84.866219  87.224083           0.0   
3  84.448158        0.0  86.061874  84.247490  85.000000           0.0   
4  84.565216        0.0  84.732445  83.745819  83.921402           0.0   

    Volume  
0  3255392  
1  3137826  
2  3355378  



