In [479]:
import yfinance as yf
import pandas as pd
import os
import datetime
import warnings
# Root data folder. CSV files are read from BASE_DIR/<folder>/<ticker>_<folder>.csv.
# The AI_TRADER_DATA_DIR environment variable overrides the machine-specific
# default so the notebook can run from another checkout or machine.
BASE_DIR = os.environ.get("AI_TRADER_DATA_DIR", "/Users/dannyyu/Desktop/AI_Trader/data")
# Ticker symbols processed by the per-ticker loops in this notebook.
TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'NVDA', 'TSLA', 'INTC', 'AMD', 'IBM']


def read_CSV_File(ticker, folderName):
    """Load ``BASE_DIR/<folderName>/<ticker>_<folderName>.csv`` as a DataFrame.

    Args:
        ticker: File-name prefix (a ticker symbol, or e.g. "MASTER").
        folderName: Sub-folder of BASE_DIR; also used as the file-name suffix.

    Returns:
        The CSV contents, with the first CSV column used as the index.
    """
    # os.path.join matches how the rest of the notebook builds paths, and the
    # local name no longer shadows the builtin ``dir``.
    csv_path = os.path.join(BASE_DIR, folderName, f"{ticker}_{folderName}.csv")
    return pd.read_csv(csv_path, index_col=0)

def has_divident(metadata):
    """Add "Year" and "HasDividend" columns to ``metadata`` in place.

    "Year" is the calendar year of each index entry after parsing the index as
    UTC datetimes. "HasDividend" is 1 on every row whose year contains at least
    one positive value in "Dividends", otherwise 0.

    Args:
        metadata: Frame with a date-like index and a "Dividends" column.

    Returns:
        The same ``metadata`` object, with the two columns added.
    """
    index_as_dates = pd.to_datetime(metadata.index, utc=True)
    metadata["Year"] = index_as_dates.year

    def _paid_any(yearly_dividends):
        # 1 when any payout in the year is strictly positive, else 0.
        return int((yearly_dividends > 0).any())

    flag_by_year = metadata.groupby('Year')['Dividends'].apply(_paid_any)
    metadata['HasDividend'] = metadata['Year'].map(flag_by_year)
    return metadata

def missing_values(table):
    """Print a per-column report of missing values and say whether any exist.

    Args:
        table: DataFrame to inspect. Its "Company" column, when present,
            supplies the label used in the printed messages.

    Returns:
        True if at least one column contains a missing value, otherwise False.
    """
    total_rows = len(table)
    # Take the label from the first row so single-row tables work; fall back to
    # a placeholder when the table is empty or has no "Company" column.
    if total_rows and "Company" in table.columns:
        company = str(table["Company"].iloc[0])
    else:
        company = "<unknown company>"
    print("Analyzing " + company)

    hasMissing = False
    for column, num_missing in table.isna().sum().items():
        if num_missing == 0:
            continue  # nothing to report for complete columns
        hasMissing = True
        percent_missing = 100 * num_missing / total_rows
        print(f"📊 '{column}': {num_missing} missing ({percent_missing:.2f}%)")

    if not hasMissing:
        print(company + " has no missing values")
    return hasMissing

def get_closest_price(ticker_obj, target_date, lookback_days=3):
    """Return the latest non-missing close in a short window ending at ``target_date``.

    The window requested from ``ticker_obj.history`` runs from
    ``target_date - lookback_days`` to ``target_date + 1 day``.

    Args:
        ticker_obj: Object exposing ``history(start=..., end=...)`` that returns
            a frame with a "Close" column, and (for messages) a ``ticker``
            attribute.
        target_date: Anything ``pd.Timestamp`` accepts (Timestamp, datetime,
            ISO date string, ...).
        lookback_days: How many days before ``target_date`` the window starts.

    Returns:
        The last non-NaN "Close" value in the window, or None (after printing a
        message) when no price is available or an error occurs.
    """
    symbol = getattr(ticker_obj, "ticker", ticker_obj)
    when = target_date  # label for the error message until the date is parsed
    try:
        target = pd.Timestamp(target_date)
        if pd.isna(target):
            raise ValueError(f"invalid target date {target_date!r}")
        when = target.date()

        start_date = target - pd.Timedelta(days=lookback_days)
        # One extra day so the target itself falls inside the window
        # (assumes the `end` bound is exclusive — confirm against yfinance).
        end_date = target + pd.Timedelta(days=1)

        hist = ticker_obj.history(start=start_date, end=end_date)
        # Ignore rows without a close so a trailing NaN is never returned as the price.
        closes = None if hist.empty else hist["Close"].dropna()
        if closes is None or closes.empty:
            print(f"No price data found for {symbol} between {start_date.date()} and {end_date.date()}")
            return None
        return closes.iloc[-1]

    except Exception as e:
        # Best-effort lookup: report the problem and let the caller handle None.
        print(f"Error retrieving price for {symbol} near {when}: {e}")
        return None

    

In [395]:
### Estimate the missing Diluted EPS and PE ratio for IBM.
# EPS for fiscal 2024 is approximated as net income divided by the shares
# currently outstanding (the current share count is used as a proxy).
ibm = read_CSV_File("IBM", "metadata")
ibm_ticker = yf.Ticker("IBM")
ibm_fin = ibm_ticker.financials
net_income = ibm_fin["2024-12-31"]["Net Income"]
shares_outstanding = ibm_ticker.info.get("sharesOutstanding", None)
# .get() may return None (or 0); fail with a clear message instead of an
# opaque TypeError / division by zero.
if not shares_outstanding:
    raise ValueError("IBM: 'sharesOutstanding' is unavailable; cannot estimate EPS")
eps_estimate = net_income / shares_outstanding
if pd.isna(eps_estimate) or eps_estimate == 0:
    raise ValueError(f"IBM: unusable EPS estimate ({eps_estimate}); cannot estimate PE")

# Estimate the 2024 PE ratio from the EPS estimate above and the last close of 2024.
hist_price = get_closest_price(ibm_ticker, pd.to_datetime("2024-12-31"))
# get_closest_price returns None when no price could be retrieved.
if hist_price is None:
    raise ValueError("IBM: no closing price available near 2024-12-31; cannot estimate PE")
pe_ratio = hist_price / eps_estimate

# NOTE(review): fillna writes the single 2024 estimate into every missing
# entry of the column — confirm that only 2024 rows are missing.
ibm['DilutedEPS'] = ibm['DilutedEPS'].fillna(eps_estimate).round(2)
ibm['PE'] = ibm['PE'].fillna(pe_ratio).round(5)
ibm.to_csv(os.path.join(BASE_DIR, "metadata", "IBM_metadata.csv"), index=True)



In [413]:
## Estimate the missing PE ratio for INTC from the most recent close.
intc = read_CSV_File("INTC", "metadata")
intc_ticker = yf.Ticker("INTC")
latest_history = intc_ticker.history(period="1d")
# An empty history would otherwise fail on the .iloc[-1] lookup below.
if latest_history.empty:
    raise ValueError("INTC: no recent price history returned; cannot estimate PE")
price = latest_history["Close"].iloc[-1]
# EPS is taken from the last row of the metadata file.
eps = intc.iloc[-1]["DilutedEPS"]
# A zero EPS would write an infinite PE; a missing EPS would fill nothing.
if pd.isna(eps) or eps == 0:
    raise ValueError(f"INTC: unusable DilutedEPS in the last row ({eps}); cannot estimate PE")
pe = price / eps
# NOTE(review): every missing PE entry receives this one value — confirm that
# only the latest row is missing.
intc["PE"] = intc["PE"].fillna(pe).round(5)
intc.to_csv(os.path.join(BASE_DIR, "metadata", "INTC_metadata.csv"), index=True)


In [451]:
# For every ticker: flag dividend years, round the ratio columns, report
# missing values, then write the metadata file back.
for ticker in TICKERS:
    table = has_divident(read_CSV_File(ticker, "metadata"))
    for ratio_column, decimals in (("DebtToEquity", 3), ("PE", 6), ("ROE", 3)):
        table[ratio_column] = table[ratio_column].round(decimals)
    missing_value = missing_values(table)
    table.to_csv(os.path.join(BASE_DIR, "metadata", f"{ticker}_metadata.csv"), index=True)

Analyzing AAPL
AAPL has no missing values
Analyzing MSFT
MSFT has no missing values
Analyzing GOOGL
GOOGL has no missing values
Analyzing AMZN
AMZN has no missing values
Analyzing META
META has no missing values
Analyzing NVDA
NVDA has no missing values
Analyzing TSLA
TSLA has no missing values
Analyzing INTC
INTC has no missing values
Analyzing AMD
AMD has no missing values
Analyzing IBM
IBM has no missing values


In [489]:
def get_shares_outstanding(table, year):
    """Estimate the shares outstanding for the table's company in ``year``.

    For the current calendar year the ``sharesOutstanding`` entry of the
    yfinance ``info`` dict is used. For any other year the count is estimated
    as ``Net Income / DilutedEPS``: Net Income comes from the first column of
    ``ticker.financials`` dated in that year, DilutedEPS from the first table
    row whose "Year" equals ``year``.

    Args:
        table: Metadata frame with "Company", "Year" and "DilutedEPS" columns;
            the company symbol is read from the first row.
        year: Calendar year to estimate for.

    Returns:
        The share count rounded to zero decimals, or NaN (after printing a
        warning) when it cannot be determined.
    """
    ticker_symbol = table['Company'].iloc[0]
    ticker = yf.Ticker(ticker_symbol)
    unavailable = float("nan")

    if year == datetime.datetime.now().year:
        shares = ticker.info.get('sharesOutstanding', None)
        if shares is None:
            print(f"No sharesOutstanding reported for {ticker_symbol}; per-share values for {year} will be missing")
            return unavailable
        return round(shares, 0)

    income = ticker.financials
    # Convert the labels separately instead of reassigning income.columns, so
    # the frame handed back by ticker.financials is left untouched.
    statement_dates = pd.to_datetime(income.columns)
    positions = [pos for pos, stamp in enumerate(statement_dates) if stamp.year == year]
    if not positions or 'Net Income' not in income.index:
        print(f"No Net Income statement found for {ticker_symbol} in {year}; per-share values will be missing")
        return unavailable
    net_income = income.loc['Net Income'].iloc[positions[0]]

    eps_rows = table.loc[table["Year"] == year, 'DilutedEPS']
    eps = eps_rows.iloc[0] if len(eps_rows) else unavailable
    # A zero EPS would produce an infinite share count (and per-share values of 0).
    if pd.isna(net_income) or pd.isna(eps) or eps == 0:
        print(f"Cannot estimate shares for {ticker_symbol} in {year} (net income={net_income}, EPS={eps})")
        return unavailable

    return round(net_income / eps, 0)


def normalize_per_share(table):
    """Add per-share versions of Revenue, CashFlow, EBITDA and GrossProfit.

    For every distinct value in "Year", the rows of that year are divided by
    the share count returned by ``get_shares_outstanding(table, year)``. The
    results are stored in ``<name>_perShare`` float columns rounded to 5
    decimals.

    Args:
        table: Metadata frame with "Year" and the four source columns; it is
            modified in place.

    Returns:
        The same ``table`` object, with the four per-share columns added.
    """
    base_columns = ('Revenue', 'CashFlow', 'EBITDA', 'GrossProfit')
    per_share_cols = [f"{name}_perShare" for name in base_columns]

    # Start from float NaN so the new columns are numeric from the outset.
    for per_share_col in per_share_cols:
        table[per_share_col] = float("nan")

    for year in table["Year"].unique():
        mask = (table['Year'] == year)
        # One lookup per year; every row of that year shares the same divisor.
        shares = get_shares_outstanding(table, year)
        for name, per_share_col in zip(base_columns, per_share_cols):
            table.loc[mask, per_share_col] = table.loc[mask, name] / shares

    table[per_share_cols] = table[per_share_cols].astype(float).round(5)
    return table


# Recompute the per-share columns for every ticker and overwrite its metadata file.
for ticker in TICKERS:
    table = normalize_per_share(read_CSV_File(ticker, "metadata"))
    table.to_csv(os.path.join(BASE_DIR, "metadata", f"{ticker}_metadata.csv"), index=True)


    

NameError: name 'combine_metadata_files' is not defined

In [562]:
def combine_metadata_files(folderName="metadata", exclude=("MASTER",)):
    """Concatenate every ``<ticker>_<folderName>.csv`` in ``BASE_DIR/<folderName>``.

    Args:
        folderName: Sub-folder of BASE_DIR to scan; also the expected file-name
            suffix, matching how read_CSV_File builds its paths.
        exclude: File-name prefixes to skip. The default skips the combined
            "MASTER" file, which is written into the same folder and would
            otherwise be concatenated into itself on a re-run.

    Returns:
        One DataFrame with the rows of all matching files (original indexes
        kept), or None after printing a message when no file matched.
    """
    suffix = f"_{folderName}.csv"
    folder = os.path.join(BASE_DIR, folderName)
    combined = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for file in sorted(os.listdir(folder)):
        if not file.endswith(suffix):
            continue
        # Strip the suffix rather than splitting on "_", so prefixes that
        # themselves contain an underscore are kept whole.
        ticker = file[:-len(suffix)]
        if ticker in exclude:
            continue
        combined.append(read_CSV_File(ticker, folderName))

    if combined:
        return pd.concat(combined, ignore_index=False)

    print("error occured")
    return None

master_df = combine_metadata_files()
# combine_metadata_files() returns None when no metadata file was found; skip
# the write in that case instead of failing on None.to_csv.
if master_df is not None:
    master_df.to_csv(os.path.join(BASE_DIR, "metadata", "MASTER_metadata.csv"), index=True)

In [530]:
def compute_percentile(table):
    """Add a within-year percentile-rank column for each indicator.

    For every indicator a ``<indicator>_Pct`` column is written holding the
    percentile rank of the value among rows with the same "Year", rounded to
    5 decimals.

    Args:
        table: Frame with a "Year" column and all indicator columns; it is
            modified in place.

    Returns:
        The same ``table`` object, with the ``_Pct`` columns added.
    """
    indicators = (
        "OperatingMargin",
        "ROE",
        "DebtToEquity",
        "Revenue_perShare",
        "CashFlow_perShare",
        "EBITDA_perShare",
        "GrossProfit_perShare",
    )
    for name in indicators:
        yearly_rank = table.groupby('Year')[name].rank(pct=True)
        table[f"{name}_Pct"] = yearly_rank.round(5)
    return table

# Reload the combined table, append the percentile columns and overwrite the master file.
master = compute_percentile(read_CSV_File("MASTER", "metadata"))
master.to_csv(os.path.join(BASE_DIR, "metadata", "MASTER_metadata.csv"), index=True)
