In [5]:
from datasets import load_dataset
from itertools import islice
import pandas as pd
from collections import Counter

def companies_with_min_k_occurrences(tickers_list, k):
    """
    Select the tickers that show up often enough in a list of occurrences.

    tickers_list: tickers, one entry per occurrence (duplicates expected)
    k: inclusive lower bound on how many times a ticker must occur

    Returns:
        set of the tickers whose occurrence count is k or more
    """
    frequent = set()
    # Counter tallies every distinct ticker in a single pass over the input.
    for symbol, n_seen in Counter(tickers_list).items():
        if n_seen >= k:
            frequent.add(symbol)
    return frequent

In [6]:
# ------------------------------------------------------------
# FNSPID: slice configuration and streaming load
# ------------------------------------------------------------

N = 1_200_000   # how many leading records to take from the stream (tunable)
K = 30          # a ticker must occur at least this many times to be kept

print(f"FNSPID slice size: {N}")
print(f"Minimum frequency threshold k: {K}")

# Open the train split as a stream; records are then pulled by iterating over it.
ds_fnspid = load_dataset("Zihan1004/FNSPID", split="train", streaming=True)

# Materialise only the first N records and tabulate them.
sample_fnspid = list(islice(ds_fnspid, N))
df_fnspid = pd.DataFrame(sample_fnspid)

print(f"Rows loaded from FNSPID: {len(df_fnspid)}")

FNSPID slice size: 1200000
Minimum frequency threshold k: 30


Repo card metadata block was not found. Setting CardData to empty.


Rows loaded from FNSPID: 1200000


In [7]:
# Normalise the ticker column once so that the unique list and the occurrence
# list are derived from exactly the same cleaned values.
# Symbols are stripped as well as upper-cased: the S&P 500 tickers they are
# compared against are stripped when parsed, so a padded symbol such as
# " AAPL" would otherwise be counted as its own company and never match.
# Blank symbols identify no company and are dropped.
tickers_fnspid = (
    df_fnspid["Stock_symbol"]
    .dropna()
    .astype(str)
    .str.strip()
    .str.upper()
    .loc[lambda symbols: symbols != ""]
)

# Unique companies (sorted list, one entry per distinct ticker)
unique_companies_fnspid = sorted(tickers_fnspid.unique())

# All ticker occurrences (one entry per row, duplicates kept for counting)
all_tickers_fnspid = tickers_fnspid.tolist()

print(f"Unique FNSPID companies (slice): {len(unique_companies_fnspid)}")
print("Sample companies:", unique_companies_fnspid[:50])


# Companies appearing at least K times
fnspid_companies_min_k = companies_with_min_k_occurrences(
    all_tickers_fnspid, K
)

print(f"FNSPID companies appearing ≥ {K} times: {len(fnspid_companies_min_k)}")

Unique FNSPID companies (slice): 5333
Sample companies: ['A', 'AA', 'AAC', 'AADR', 'AAL', 'AAMC', 'AAME', 'AAN', 'AAOI', 'AAON', 'AAP', 'AAPL', 'AAU', 'AAV', 'AAVL', 'AAWW', 'AAXJ', 'AB', 'ABAC', 'ABAX', 'ABB', 'ABBV', 'ABC', 'ABCB', 'ABCD', 'ABCO', 'ABCW', 'ABDC', 'ABEV', 'ABG', 'ABGB', 'ABIO', 'ABM', 'ABMD', 'ABR', 'ABTL', 'ABX', 'ABY', 'ACAD', 'ACAS', 'ACAT', 'ACC', 'ACCO', 'ACCU', 'ACE', 'ACET', 'ACFC', 'ACFN', 'ACG', 'ACGL']
FNSPID companies appearing ≥ 30 times: 3896


In [17]:
ratings_df = pd.read_csv("raw_analyst_ratings.csv")

print(f"Rows in analyst ratings dataset: {len(ratings_df)}")

# Normalise the ticker column once so that the unique set and the occurrence
# list are derived from exactly the same cleaned values.
# Symbols are stripped as well as upper-cased so that a padded symbol such as
# " AAPL" is not counted as a separate company; blank symbols identify no
# company and are dropped.
tickers_analyst = (
    ratings_df["stock"]
    .dropna()
    .astype(str)
    .str.strip()
    .str.upper()
    .loc[lambda symbols: symbols != ""]
)

# Unique analyst companies
analyst_companies = set(tickers_analyst.unique())

# All ticker occurrences (one entry per row, duplicates kept for counting)
all_tickers_analyst = tickers_analyst.tolist()

print(f"Unique companies in analyst dataset: {len(analyst_companies)}")

Rows in analyst ratings dataset: 1407328
Unique companies in analyst dataset: 6204


In [18]:
# Keep only the analyst-dataset tickers that occur at least K times.
analyst_companies_min_k = companies_with_min_k_occurrences(all_tickers_analyst, K)

print(f"Analyst companies appearing ≥ {K} times: {len(analyst_companies_min_k)}")

Analyst companies appearing ≥ 30 times: 4562


In [19]:
sp500_hist = pd.read_csv(
    "sp_500_historical_components.csv",
    parse_dates=["date"]
)

print(f"S&P 500 history rows: {len(sp500_hist)}")


def _sp500_rows_between(history, first_year, last_year):
    """Return the rows of `history` whose `date` year is in [first_year, last_year].

    Both bounds are inclusive.
    """
    return history[history["date"].dt.year.between(first_year, last_year)]


def _ticker_union(rows):
    """Return the set of distinct tickers listed in the `tickers` column of `rows`.

    Each non-null cell is split on commas; every piece is stripped and
    upper-cased. Pieces that are blank after stripping (for example from a
    trailing or doubled comma) are skipped so that "" never appears as a ticker.
    """
    return {
        ticker.strip().upper()
        for tickers_str in rows["tickers"].dropna()
        for ticker in tickers_str.split(",")
        if ticker.strip()
    }


# Year ranges
sp500_1999_2023 = _sp500_rows_between(sp500_hist, 1999, 2023)
sp500_2009_2020 = _sp500_rows_between(sp500_hist, 2009, 2020)

# Union of every ticker listed at any point within each range
sp500_tickers_99to23 = _ticker_union(sp500_1999_2023)
sp500_tickers_09to20 = _ticker_union(sp500_2009_2020)

print(f"S&P 500 companies (union, 1999–2023): {len(sp500_tickers_99to23)}")
print(f"S&P 500 companies (union, 2009–2020): {len(sp500_tickers_09to20)}")

S&P 500 history rows: 3482
S&P 500 companies (union, 1999–2023): 1022
S&P 500 companies (union, 2009–2020): 692


In [20]:
# NOTE(review): both comparisons below use every ticker seen in each dataset,
# not the K-filtered sets — confirm that is intended.

# FNSPID tickers split by membership in the 1999–2023 S&P 500 union.
# unique_companies_fnspid is a list, so it is converted to a set once here.
fnspid_company_set = set(unique_companies_fnspid)
fnspid_in_sp500_99to23 = fnspid_company_set.intersection(sp500_tickers_99to23)
fnspid_not_in_sp500_99to23 = fnspid_company_set.difference(sp500_tickers_99to23)

print("\n--- S&P 500 coverage (FNSPID, 1999–2023) ---")
print(f"Companies in S&P 500: {len(fnspid_in_sp500_99to23)}")
print(f"Companies NOT in S&P 500: {len(fnspid_not_in_sp500_99to23)}")

# Analyst-dataset tickers split by membership in the 2009–2020 S&P 500 union.
analyst_in_sp500_09to20 = analyst_companies.intersection(sp500_tickers_09to20)
analyst_not_in_sp500_09to20 = analyst_companies.difference(sp500_tickers_09to20)

print("\n--- S&P 500 coverage (Analyst dataset, 2009–2020) ---")
print(f"Companies in S&P 500: {len(analyst_in_sp500_09to20)}")
print(f"Companies NOT in S&P 500: {len(analyst_not_in_sp500_09to20)}")


--- S&P 500 coverage (FNSPID, 1999–2023) ---
Companies in S&P 500: 545
Companies NOT in S&P 500: 4788

--- S&P 500 coverage (Analyst dataset, 2009–2020) ---
Companies in S&P 500: 533
Companies NOT in S&P 500: 5671
