In [71]:
import yfinance as yf
import pandas as pd
import os 

WORKSPACE_DIR = os.getenv('WORKSPACE_DIR')
if not os.getcwd().endswith('portfolio_py'):
    os.chdir(f'{WORKSPACE_DIR}/portfolio_py')

In [72]:
starting_stock_data = pd.read_csv('data/clean/master_stock_data.csv')
starting_stock_data

Unnamed: 0,cik_str,Tickers,title,Sector,Sector_Check
0,1090872,A,"AGILENT TECHNOLOGIES, INC.",Healthcare,True
1,1675149,AA,Alcoa Corp,Basic Materials,True
2,1708646,AAAU,Goldman Sachs Physical Gold ETF,,True
3,2034334,AACB,Artius II Acquisition Inc.,Financial Services,True
4,1420529,AACG,ATA Creativity Global,Consumer Defensive,True
...,...,...,...,...,...
10033,1439288,ZWS,Zurn Elkay Water Solutions Corp,,
10034,1975641,ZYBT,Zhengye Biotechnology Holding Ltd,,
10035,1937653,ZYME,Zymeworks Inc.,,
10036,846475,ZYXI,ZYNEX INC,,


In [73]:
stocks_need_sector = starting_stock_data[starting_stock_data['Sector_Check'].isna()]
stocks_need_sector = stocks_need_sector.drop(columns=['Sector', 'Sector_Check'])

In [74]:
STOCK_SECTORS_TO_GET = 670

In [75]:
from utils.helpers import divide_chunks
from concurrent.futures import ThreadPoolExecutor, as_completed
from tenacity import retry, stop_after_attempt, wait_exponential

import numpy as np


tickers = stocks_need_sector['Tickers'].tolist()[:STOCK_SECTORS_TO_GET]

num_in_chunks = 20
ticker_chunks = list(divide_chunks(tickers, num_in_chunks))

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=2, min=1, max=32))
def fetch_sector(ticker):
    return yf.Ticker(ticker).info.get('sector')

sector_dict = {}

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(fetch_sector, ticker): ticker for ticker in tickers}
    for future in as_completed(futures):
        ticker = futures[future]
        try:
            sector_dict[ticker] = future.result()
        except Exception as e:
            print(f"Failed to fetch {ticker} after retries: {e}")
            sector_dict[ticker] = np.nan  # or use None

sector_df = pd.DataFrame(list(sector_dict.items()), columns=['Tickers', 'Sector'])

sector_df['Sector_Check'] = True


Failed to fetch VCSA after retries: RetryError[<Future at 0x124077110 state=finished raised HTTPError>]
Failed to fetch VENG after retries: RetryError[<Future at 0x1273782d0 state=finished raised HTTPError>]
Failed to fetch VINE after retries: RetryError[<Future at 0x12742d3d0 state=finished raised HTTPError>]
Failed to fetch VISTA after retries: RetryError[<Future at 0x126087890 state=finished raised HTTPError>]
Failed to fetch VMCAW after retries: RetryError[<Future at 0x126021d50 state=finished raised HTTPError>]
Failed to fetch VOXX after retries: RetryError[<Future at 0x127a8e790 state=finished raised HTTPError>]
Failed to fetch VRMMQ after retries: RetryError[<Future at 0x126fd7110 state=finished raised HTTPError>]
Failed to fetch WAVS after retries: RetryError[<Future at 0x1276345d0 state=finished raised HTTPError>]
Failed to fetch WAVSU after retries: RetryError[<Future at 0x1254881d0 state=finished raised HTTPError>]
Failed to fetch WAVSW after retries: RetryError[<Future at 0

In [76]:
sector_df

Unnamed: 0,Tickers,Sector,Sector_Check
0,VCRRX,,True
1,VCV,Financial Services,True
2,VCYT,Healthcare,True
3,VDMCY,Communication Services,True
4,VECO,Technology,True
...,...,...,...
665,YY,,True
666,ZIONL,,True
667,ZOM,,True
668,,,True


In [77]:
master_stock_df = pd.merge(starting_stock_data, sector_df, on='Tickers', how='outer')

master_stock_df['Sector'] = master_stock_df['Sector_x'].combine_first(master_stock_df['Sector_y'])
master_stock_df.drop(columns=['Sector_x', 'Sector_y'], inplace=True)

master_stock_df['Sector_Check'] = master_stock_df['Sector_Check_x'].combine_first(master_stock_df['Sector_Check_y'])
master_stock_df.drop(columns=['Sector_Check_x', 'Sector_Check_y'], inplace=True)


print(master_stock_df['Sector_Check'].isna().value_counts())

master_stock_df.to_csv('data/clean/master_stock_data.csv', index=False)

Sector_Check
False    10038
Name: count, dtype: int64


In [78]:
master_stock_df

Unnamed: 0,cik_str,Tickers,title,Sector,Sector_Check
0,1090872,A,"AGILENT TECHNOLOGIES, INC.",Healthcare,True
1,1675149,AA,Alcoa Corp,Basic Materials,True
2,1708646,AAAU,Goldman Sachs Physical Gold ETF,,True
3,2034334,AACB,Artius II Acquisition Inc.,Financial Services,True
4,1420529,AACG,ATA Creativity Global,Consumer Defensive,True
...,...,...,...,...,...
10033,1439288,ZWS,Zurn Elkay Water Solutions Corp,Industrials,True
10034,1975641,ZYBT,Zhengye Biotechnology Holding Ltd,Healthcare,True
10035,1937653,ZYME,Zymeworks Inc.,Healthcare,True
10036,846475,ZYXI,ZYNEX INC,Healthcare,True


In [79]:
print(f'Sector Check: {master_stock_df["Sector_Check"].value_counts().values}')
print(f'How many left: {master_stock_df.shape[0]-master_stock_df["Sector_Check"].value_counts().values}')


Sector Check: [10038]
How many left: [0]
