In [12]:
from io import StringIO
from pathlib import Path

import pandas as pd
import pandas_market_calendars as mcal
import requests
import yfinance as yf

from edgar_functions_V2 import *
from edgar_functions import get_explored_facts_for_ticker

# Identifying User-Agent header for outbound HTTP requests.
# NOTE(review): presumably meant for the SEC EDGAR helpers imported above — confirm.
headers = {"User-Agent": "russ@sunriseanalysis.com"}

In [13]:
def create_ml_dataset_for_single_ticker(ticker, calender, early, late):
    """Build a per-filing feature table for one ticker with a price-move target.

    Company facts are restricted to rows that have a ``frame`` value and whose
    ``accn`` appears among the ticker's filtered 10-K / 10-Q filings, then
    pivoted so each ``account_key`` becomes a column (one row per
    ``filed``/``fp``/``fy`` combination). The fiscal period ``fp`` is one-hot
    encoded. For every filing two closing prices are looked up:

    * the last market day on or before the day preceding ``filed``;
    * the first market day on or after seven days following ``filed``.

    The target column ``pct_diff_before_after(target)`` is the percentage
    change between those two prices.

    Args:
        ticker: Symbol passed to the fact/filing helpers and to ``yf.Ticker``.
        calender: Calendar object exposing ``valid_days(start_date, end_date)``.
        early: Start of the window of market days considered.
        late: End of the window of market days considered.

    Returns:
        DataFrame sorted by ``filed`` containing ``filed``, the pivoted fact
        columns, the one-hot ``fp`` columns, ``day_before_close_price``,
        ``week_out_close_price``, ``pct_diff_before_after(target)`` and
        ``ticker``. Prices that cannot be retrieved are NaN.
    """
    # Loop-invariant work: the market-day index and the Ticker handle are built
    # once here instead of once per row inside the helpers below.
    market_days = calender.valid_days(start_date=early, end_date=late)
    yf_ticker = yf.Ticker(ticker)

    def adjust_for_market_open(date):
        """Return the first market day on or after ``date``."""
        return market_days[market_days >= date].min()

    def adjust_for_before(date):
        """Return the last market day on or before ``date``."""
        return market_days[market_days <= date].max()

    def get_close_price(date):
        """Return the close for ``date`` rounded to 2 decimals, or NaN if unavailable."""
        if pd.isna(date):
            # No market day was found inside [early, late] for this filing.
            return float("nan")
        try:
            # ``period`` is intentionally omitted: ``start``/``end`` already pin
            # the one-day window, and supplying all three is redundant.
            history = yf_ticker.history(
                interval="1d",
                start=date,
                end=date + pd.Timedelta(days=1),
            )
            if history.empty:
                return float("nan")
            # ``.iloc`` makes the positional lookup explicit; integer keys on a
            # date-indexed Series are deprecated.
            return history["Close"].iloc[0].round(2)
        except Exception:
            # Best-effort lookup: a failed download yields a missing price, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return float("nan")

    company_facts = get_explored_facts_for_ticker(ticker)
    ten_k = get_filtered_filings(ticker, ten_k=True)
    ten_q = get_filtered_filings(ticker, ten_k=False)
    filings = pd.concat([ten_k, ten_q])

    company_facts = company_facts.dropna(subset=["frame"], axis=0).reset_index(
        drop=True
    )
    company_facts = company_facts.drop(columns=["unit_key", "form"])
    # Keep only facts reported in one of the selected 10-K / 10-Q filings.
    company_facts = company_facts[
        company_facts["accn"].isin(filings["accessionNumber"])
    ].reset_index(drop=True)
    company_facts = company_facts.drop(columns=["start", "end", "accn", "frame"])

    pivot = (
        pd.pivot_table(
            company_facts,
            index=["filed", "fp", "fy"],
            columns="account_key",
            values="val",
        )
        .rename_axis(None, axis=1)
        .sort_index(ascending=False)
        .reset_index()
    )

    one_hot = pd.get_dummies(pivot["fp"], dtype=float)
    ticker_df = pd.concat([pivot, one_hot], axis=1).drop(columns=["fp"])
    ticker_df["filed"] = pd.to_datetime(ticker_df["filed"])

    # Snap the reference dates onto actual market days (UTC, like the calendar).
    week_out_days = pd.to_datetime(
        ticker_df["filed"] + pd.Timedelta(days=7), utc=True
    ).apply(adjust_for_market_open)
    day_before_days = pd.to_datetime(
        ticker_df["filed"] + pd.Timedelta(days=-1), utc=True
    ).apply(adjust_for_before)

    ticker_df["day_before_close_price"] = day_before_days.apply(get_close_price)
    ticker_df["week_out_close_price"] = week_out_days.apply(get_close_price)
    ticker_df = ticker_df.drop(columns=["fy"])

    # Target: percentage change from the pre-filing close to the week-out close.
    before = ticker_df["day_before_close_price"]
    after = ticker_df["week_out_close_price"]
    ticker_df["pct_diff_before_after(target)"] = (after - before) / before * 100

    ticker_df = ticker_df.sort_values(by="filed").reset_index(drop=True)
    ticker_df["ticker"] = ticker
    return ticker_df


def create_ml_dataset_for_multiple_tickers(
    tickers, early="2007-01-01", late="2025-01-01"
):
    """Build and stack the per-ticker datasets into one frame.

    Each ticker is processed with ``create_ml_dataset_for_single_ticker``
    against the NYSE calendar, the results are concatenated with an outer join
    on columns, and the columns are ordered as ``ticker``, ``filed``, the
    remaining columns alphabetically, then the two price columns and the
    target. Rows are sorted by ``filed``.

    Args:
        tickers: Iterable of ticker symbols.
        early: Start of the market-day window (parsed as UTC).
        late: End of the market-day window (parsed as UTC).

    Returns:
        Combined DataFrame; for an empty ``tickers`` an empty DataFrame that
        carries only the fixed leading and trailing columns.
    """
    leading_columns = ["ticker", "filed"]
    trailing_columns = [
        "day_before_close_price",
        "week_out_close_price",
        "pct_diff_before_after(target)",
    ]

    tickers = list(tickers)
    if not tickers:
        # Nothing to process: return a well-formed empty frame instead of
        # failing on the column reorder below.
        return pd.DataFrame(columns=leading_columns + trailing_columns)

    nyse = mcal.get_calendar("NYSE")
    early = pd.to_datetime(early, utc=True)
    late = pd.to_datetime(late, utc=True)

    # Collect the per-ticker frames and concatenate once; growing a DataFrame
    # inside the loop copies all previous rows on every iteration.
    frames = []
    for position, ticker in enumerate(tickers, start=1):
        frames.append(create_ml_dataset_for_single_ticker(ticker, nyse, early, late))
        print(f"Finished Processing Number {position} of {len(tickers)}: {ticker}")
    df = pd.concat(frames, join="outer", ignore_index=True)

    fixed_columns = set(leading_columns) | set(trailing_columns)
    middle_columns = sorted(col for col in df.columns if col not in fixed_columns)
    df = df[leading_columns + middle_columns + trailing_columns]
    df = df.sort_values(by="filed").reset_index(drop=True)
    return df

In [14]:
# Scrape the S&P 100 constituent symbols from Wikipedia.
url = "https://en.wikipedia.org/wiki/S%26P_100"
# The timeout keeps a stalled connection from hanging the kernel; an identifying
# User-Agent is sent because the library default may be rejected by the server.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an error page
# read_html expects a file-like object; passing literal HTML text is deprecated.
tables = pd.read_html(StringIO(response.text))
# Pick the table by its "Symbol" column rather than by a fixed position, which
# breaks whenever the page layout changes.
constituents = next((table for table in tables if "Symbol" in table.columns), None)
if constituents is None:
    raise ValueError(f"No table with a 'Symbol' column found at {url}")
sp100_tickers = constituents["Symbol"].tolist()
# Dotted share-class symbols are rewritten with a dash (e.g. "BRK.B" -> "BRK-B").
sp100_tickers = [ticker.replace(".", "-") for ticker in sp100_tickers]

In [15]:
# Build the combined dataset for every scraped ticker. Each ticker is processed
# by create_ml_dataset_for_single_ticker and one progress line is printed per ticker.
multiple_df = create_ml_dataset_for_multiple_tickers(sp100_tickers)

Finished Processing Number 1 of 101: AAPL
Finished Processing Number 2 of 101: ABBV
Finished Processing Number 3 of 101: ABT
Finished Processing Number 4 of 101: ACN
Finished Processing Number 5 of 101: ADBE
Finished Processing Number 6 of 101: AIG
Finished Processing Number 7 of 101: AMD
Finished Processing Number 8 of 101: AMGN
Finished Processing Number 9 of 101: AMT
Finished Processing Number 10 of 101: AMZN
Finished Processing Number 11 of 101: AVGO
Finished Processing Number 12 of 101: AXP
Finished Processing Number 13 of 101: BA
Finished Processing Number 14 of 101: BAC
Finished Processing Number 15 of 101: BK
Finished Processing Number 16 of 101: BKNG
Finished Processing Number 17 of 101: BLK
Finished Processing Number 18 of 101: BMY
Finished Processing Number 19 of 101: BRK-B
Finished Processing Number 20 of 101: C
Finished Processing Number 21 of 101: CAT
Finished Processing Number 22 of 101: CHTR
Finished Processing Number 23 of 101: CL
Finished Processing Number 24 of 101: 

NKE: Data doesn't exist for startDate = 1697155200, endDate = 1697241600


Finished Processing Number 73 of 101: NKE
Finished Processing Number 74 of 101: NVDA
Finished Processing Number 75 of 101: ORCL


PEP: Data doesn't exist for startDate = 1697500800, endDate = 1697587200


Finished Processing Number 76 of 101: PEP
Finished Processing Number 77 of 101: PFE
Finished Processing Number 78 of 101: PG
Finished Processing Number 79 of 101: PM
Finished Processing Number 80 of 101: PYPL
Finished Processing Number 81 of 101: QCOM
Finished Processing Number 82 of 101: RTX
Finished Processing Number 83 of 101: SBUX
Finished Processing Number 84 of 101: SCHW
Finished Processing Number 85 of 101: SO
Finished Processing Number 86 of 101: SPG
Finished Processing Number 87 of 101: T
Finished Processing Number 88 of 101: TGT
Finished Processing Number 89 of 101: TMO
Finished Processing Number 90 of 101: TMUS
Finished Processing Number 91 of 101: TSLA
Finished Processing Number 92 of 101: TXN
Finished Processing Number 93 of 101: UNH
Finished Processing Number 94 of 101: UNP
Finished Processing Number 95 of 101: UPS
Finished Processing Number 96 of 101: USB
Finished Processing Number 97 of 101: V
Finished Processing Number 98 of 101: VZ
Finished Processing Number 99 of 101

In [16]:
# Persist the combined dataset. The output directory is created first so the
# export does not fail when "ml_datasets/" is missing.
output_path = Path("ml_datasets/multiple_df.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
multiple_df.to_csv(output_path, index=False)
multiple_df

Unnamed: 0,ticker,filed,AcceleratedShareRepurchaseProgramAdjustment,AcceleratedShareRepurchasesFinalPricePaidPerShare,AcceleratedShareRepurchasesInitialPricePaidPerShare,AcceleratedShareRepurchasesSettlementPaymentOrReceipt,AccountsAndFinancingReceivableAllowanceForCreditLoss,AccountsAndNotesReceivableNet,AccountsAndOtherReceivablesNetCurrent,AccountsNotesAndLoansReceivableNetCurrent,...,WeightedAverageNumberOfSharesRestrictedStock,WeightedAverageRateDomesticDepositCertificatesOfDeposit,WithdrawalFromContractHoldersFunds,WorkersCompensationDiscountAmount,WorkersCompensationLiabilityCurrent,WorkersCompensationLiabilityCurrentAndNoncurrent,WriteOffOfDeferredDebtIssuanceCost,day_before_close_price,week_out_close_price,pct_diff_before_after(target)
0,SPG,2010-05-10,,,,,,355469000.0,,,...,,,,,,,,45.76,46.60,1.835664
1,EMR,2010-08-04,,,,,,,,,...,,,,,,,,34.72,33.23,-4.291475
2,SPG,2010-08-06,,,,,,343588000.0,,,...,,,,,,,,50.07,48.31,-3.515079
3,SPG,2010-11-05,,,,,,383168000.0,,,...,,,,,,,,56.20,54.77,-2.544484
4,EMR,2010-11-23,,,,,,,,,...,,,,,,,,38.53,38.09,-1.141967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2937,ORCL,2023-09-12,,,,,,,,,...,,,,,,,,126.71,112.77,-11.001499
2938,FDX,2023-09-20,,,,,,,,,...,,,,,,,,249.99,262.71,5.088204
2939,ADBE,2023-09-27,,,,,,,,,...,,,,,,,,506.30,518.42,2.393838
2940,NKE,2023-10-06,,,,,,,,,...,,,,,,,,95.79,,


In [17]:
# Summary statistics of the target: the percentage change from the pre-filing
# close to the week-out close.
multiple_df["pct_diff_before_after(target)"].describe()

count    2940.000000
mean        0.543920
std         5.266139
min       -31.512864
25%        -2.084009
50%         0.394037
75%         2.955388
max        33.685559
Name: pct_diff_before_after(target), dtype: float64

In [18]:
# Snapshot of the dataset's column labels as a plain Python list.
columns = list(multiple_df.columns)

In [20]:
# Print the frame's row/column counts, dtype breakdown and memory footprint.
multiple_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2942 entries, 0 to 2941
Columns: 4757 entries, ticker to pct_diff_before_after(target)
dtypes: datetime64[ns](1), float64(4755), object(1)
memory usage: 106.8+ MB


In [30]:
# Share of missing values per column, in percent of the total row count.
percentage_missing = multiple_df.isnull().sum().mul(100).div(len(multiple_df))
# One row per column, ordered from most complete to least complete.
percentage_missing_df = (
    percentage_missing.to_frame(name="% Missing")
    .reset_index()
    .sort_values(by="% Missing", ascending=True)
)
# Show the 40 most complete columns.
percentage_missing_df.iloc[:40]

Unnamed: 0,index,% Missing
0,ticker,0.0
1,filed,0.0
4754,day_before_close_price,0.0
3888,Q3,0.0
3887,Q2,0.0
3886,Q1,0.0
1743,FY,0.0
4755,week_out_close_price,0.067981
4756,pct_diff_before_after(target),0.067981
2661,LiabilitiesAndStockholdersEquity,1.121686
