In [35]:
import yfinance as yf
import pandas as pd
import os
import warnings


In [50]:
# Root folder for every CSV written by this notebook. The original location is
# kept as the default; set AI_TRADER_DATA_DIR to run on another machine without
# editing the notebook.
BASE_DIR = os.environ.get(
    "AI_TRADER_DATA_DIR", "/Users/dannyyu/Desktop/AI_Trader/data"
)
TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'NVDA', 'TSLA', 'INTC', 'AMD', 'IBM']

# Date window passed to the daily price download.
START_DATE = "2022-01-01"
END_DATE = "2025-5-18"

# Create the output folders (a no-op when they already exist).
for _subdir in ("prices", "fundamentals", "metadata"):
    os.makedirs(os.path.join(BASE_DIR, _subdir), exist_ok=True)


def get_closest_price(ticker_obj, target_date, lookback_days=3):
    """Return the most recent available closing price on or before target_date.

    Requests history from ``target_date - lookback_days`` up to
    ``target_date + 1 day`` and returns the last non-missing ``Close`` in
    that window.

    Parameters
    ----------
    ticker_obj : object
        Anything exposing ``history(start=..., end=...)`` that returns a
        DataFrame with a ``Close`` column, plus a ``ticker`` attribute used
        in log messages (e.g. a ``yfinance.Ticker``).
    target_date : datetime-like or str
        Reference date; coerced with ``pd.Timestamp``.
    lookback_days : int, default 3
        How many calendar days before ``target_date`` to search.

    Returns
    -------
    float or None
        The closing price, or ``None`` when no usable price exists in the
        window or the lookup fails. Failures are printed, never raised.
    """
    # Fallback label so the error message below can never raise on its own,
    # even when target_date is not a Timestamp.
    date_label = target_date
    try:
        target_ts = pd.Timestamp(target_date)
        date_label = target_ts.date()

        start_date = target_ts - pd.Timedelta(days=lookback_days)
        end_date = target_ts + pd.Timedelta(days=1)

        hist = ticker_obj.history(start=start_date, end=end_date)

        # Ignore rows whose Close is missing so a trailing NaN is not
        # returned as if it were a price.
        closes = None if hist.empty else hist['Close'].dropna()

        if closes is None or closes.empty:
            print(f"No price data found for {ticker_obj.ticker} between {start_date.date()} and {end_date.date()}")
            return None

        return closes.iloc[-1]

    except Exception as e:
        ticker_label = getattr(ticker_obj, "ticker", ticker_obj)
        print(f"Error retrieving price for {ticker_label} near {date_label}: {e}")
        return None

def _statement_value(statement, label, date):
    """Return ``statement.loc[label, date]``, or None when the row label is absent."""
    if label not in statement.index:
        return None
    return statement.loc[label, date]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or None if an operand is missing/NaN or the denominator is zero."""
    for operand in (numerator, denominator):
        if operand is None or pd.isna(operand):
            return None
    if denominator == 0:
        return None
    return numerator / denominator


def get_fundamentals_history(ticker, years=3):
    """Build a table of fundamental indicators for one ticker.

    The first row is a snapshot dated today, taken from ``Ticker.info``.
    The remaining rows are one per annual statement date that appears in
    the income statement, balance sheet and cash-flow statement alike,
    limited to dates whose calendar year is at most ``years`` before the
    current year.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.Ticker``.
    years : int, default 3
        Maximum difference between the current year and a statement's year.

    Returns
    -------
    pandas.DataFrame
        Columns: Date, DilutedEPS, PE, Revenue, CashFlow, EBITDA,
        GrossProfit, OperatingMargin, ROE, DebtToEquity. A value is None
        when its inputs are unavailable.

    Notes
    -----
    * OperatingMargin for historical rows is Operating Income / Total
      Revenue, so it means the same thing as the snapshot row's
      ``operatingMargins``.
    * Historical PE is closing price near the statement date divided by
      Diluted EPS. When either input is missing it is left as None rather
      than filled with today's trailing P/E, which would put present-day
      information into past rows.
    * A missing statement line item leaves that field empty instead of
      discarding the whole year.
    """
    ticker_obj = yf.Ticker(ticker)
    info = ticker_obj.info
    today = pd.Timestamp.today()

    # Annual financial statements (columns are statement dates).
    income_stmt = ticker_obj.financials
    balance_sheet = ticker_obj.balance_sheet
    cashflow_stmt = ticker_obj.cashflow

    # Only dates present in all three statements can yield a complete row.
    common_dates = (
        income_stmt.columns
        .intersection(balance_sheet.columns)
        .intersection(cashflow_stmt.columns)
    )

    # Snapshot of the most recent indicators, dated today.
    fundamentals = [{
        "Date": today.strftime('%Y-%m-%d'),
        "DilutedEPS": info.get("trailingEps", None),
        "PE": info.get("trailingPE", None),
        "Revenue": info.get("totalRevenue", None),
        "CashFlow": info.get("operatingCashflow", None),
        "EBITDA": info.get("ebitda", None),
        "GrossProfit": info.get("grossProfits", None),
        "OperatingMargin": info.get("operatingMargins", None),
        "ROE": info.get("returnOnEquity", None),
        "DebtToEquity": info.get("debtToEquity", None),
    }]

    # Historical indicators, one row per annual statement date.
    for date in common_dates:
        try:
            if today.year - date.year > years:
                continue

            revenue = _statement_value(income_stmt, "Total Revenue", date)
            gross_profit = _statement_value(income_stmt, "Gross Profit", date)
            ebitda = _statement_value(income_stmt, "EBITDA", date)
            net_income = _statement_value(income_stmt, "Net Income", date)
            operating_income = _statement_value(income_stmt, "Operating Income", date)
            diluted_eps = _statement_value(income_stmt, "Diluted EPS", date)

            operating_cashflow = _statement_value(cashflow_stmt, "Operating Cash Flow", date)
            total_equity = _statement_value(balance_sheet, "Stockholders Equity", date)
            total_debt = _statement_value(balance_sheet, "Total Debt", date)

            # Price as of the statement date (or the nearest prior session).
            hist_price = get_closest_price(ticker_obj, date)

            fundamentals.append({
                "Date": date.strftime('%Y-%m-%d'),
                "DilutedEPS": diluted_eps,
                "PE": _safe_ratio(hist_price, diluted_eps),
                "Revenue": revenue,
                "CashFlow": operating_cashflow,
                "EBITDA": ebitda,
                "GrossProfit": gross_profit,
                "OperatingMargin": _safe_ratio(operating_income, revenue),
                "ROE": _safe_ratio(net_income, total_equity),
                "DebtToEquity": _safe_ratio(total_debt, total_equity),
            })
        except Exception as e:
            # Best effort: report the problem and move on to the next date.
            print(f"Error processing {ticker} for {date.date()}: {e}")
            continue

    return pd.DataFrame(fundamentals)

In [48]:
# Download prices, fundamentals and descriptive metadata for every ticker and
# write each to its own CSV under BASE_DIR.
meta_data = []

for ticker in TICKERS:
    print(f"Fetching data for {ticker}...")
    stock = yf.Ticker(ticker)

    # --- Prices ---
    # reset_index() turns the date index into a regular column so it is
    # written to the CSV even with index=False.
    df_price = stock.history(start=START_DATE, end=END_DATE, interval="1d").reset_index()
    df_price.to_csv(os.path.join(BASE_DIR, "prices", f"{ticker}_prices.csv"), index=False)

    # --- Fundamentals ---
    # The frame must be built before it is saved; a stray, dedented to_csv
    # call that ran ahead of this assignment (and broke the loop body) was
    # removed.
    df_fund = get_fundamentals_history(ticker)
    df_fund.to_csv(os.path.join(BASE_DIR, "fundamentals", f"{ticker}_fundamentals.csv"), index=False)

    # --- Metadata for overview ---
    info = stock.info
    meta_data.append({
        "Ticker": ticker,
        "Name": info.get("shortName", ""),
        "Sector": info.get("sector", "")
    })

# === SAVE COMPANY LIST ===
df_meta = pd.DataFrame(meta_data)
df_meta.to_csv(os.path.join(BASE_DIR, "metadata", "company_list.csv"), index=False)

print("Data collection complete!")

Fetching data for AAPL...
Fetching data for MSFT...
Fetching data for GOOGL...
Fetching data for AMZN...
Fetching data for META...
Fetching data for NVDA...
Fetching data for TSLA...
Fetching data for INTC...
Fetching data for AMD...
Fetching data for IBM...
Data collection complete!
