In [47]:
import pandas as pd
from datetime import datetime

# Load the data
# Assuming sp500_constituents.csv has columns: 'Firm', 'EntryDate', 'ExitDate'
df = pd.read_csv('./sp500_constituents.csv')

# Convert date columns to datetime format
df['start'] = pd.to_datetime(df['start'], errors='coerce')
df['ending'] = pd.to_datetime(df['ending'], errors='coerce')

# Define the range of years we are interested in
years = range(2006, 2025)

# Dictionary to hold the yearly snapshots
sp500_by_year = {}
permno_to_ticker = {}
for year in years:
    # Define the start and end of each year
    start_of_year = datetime(year, 1, 1)
    end_of_year = datetime(year, 12, 31)
    
    # Filter firms active during the year, ensuring they only appear once per year by permno
    active_firms = df[
        (df['start'] <= end_of_year) & 
        ((df['ending'].isna()) | (df['ending'] >= start_of_year))
    ].drop_duplicates(subset=['permno'])
    
    # Exclude entries where a firm was in and out within a partial year
    # This step further removes any firms that had re-entries or overlaps in date ranges
    active_firms = active_firms[
        (active_firms['start'] <= start_of_year) | (active_firms['ending'] >= end_of_year)
    ]

    # Convert the resulting DataFrame of firms to a list of unique permnos
    permno_to_ticker = dict(zip(active_firms['permno'], active_firms['ticker']))

    # Store the list of active firms for the year
    sp500_by_year[year] = permno_to_ticker
    

print(sp500_by_year)
# Display the results for each year
for year, firms in sp500_by_year.items():
    print(f"Year {year}: {len(firms)} firms")
    # Uncomment below to see the list of firms per year
    print(firms.values())


{2006: {77178: 'QCOM', 48485: 'JP', 19502: 'WAG', 56573: 'ITW', 64936: 'D', 43123: 'ATI', 52978: 'HAS', 38156: 'WMB', 85631: 'ADSK', 22103: 'EMR', 70092: 'KBH', 87055: 'COST', 17806: 'PD', 75912: 'PMTC', 19561: 'BA', 51377: 'NSM', 56274: 'CAG', 82686: 'CTXS', 14277: 'SLB', 15069: 'MRO', 77063: 'HCR', 28484: 'AHC', 11607: 'DCN', 62770: 'ASO', 13928: 'P', 44601: 'AVY', 50032: 'ABS', 76887: 'AW', 58683: 'LUV', 60206: 'SNA', 26518: 'BOL', 10147: 'EMC', 53613: 'MU', 24272: 'ASH', 40272: 'IFF', 15579: 'TXN', 13821: 'PGL', 60599: 'CTL', 12060: 'GE', 25778: 'CA', 37584: 'BEN', 14008: 'AMGN', 14541: 'CHV', 75154: 'CCL', 59440: 'SAFC', 77606: 'KSS', 85926: 'SEE', 67467: 'CNS', 44644: 'AUD', 27828: 'HWP', 76201: 'XLNX', 60986: 'NWL', 76171: 'COL', 27633: 'R', 40125: 'CSC', 45751: 'MMC', 75257: 'TLAB', 65787: 'TRB', 68144: 'STI', 82775: 'HIG', 27991: 'PNW', 91380: 'RBK', 61399: 'LOW', 46886: 'KLAC', 25785: 'F', 48960: 'KRI', 81593: 'WM', 56223: 'LPX', 10145: 'HON', 38682: 'DNY', 15720: 'EIX', 4220

In [48]:
import requests

def get_cik(ticker):
    """
    Retrieves the CIK number for a given ticker symbol from the SEC's EDGAR system.
    
    Parameters:
        ticker (str): The stock ticker symbol.
    
    Returns:
        str: The CIK number for the given ticker symbol, or a message if not found.
    """
    url = 'https://www.sec.gov/files/company_tickers.json'
    headers = {
        'User-Agent': 'University of Edinburgh s2101369@ed.ac.uk',
        'Accept-Encoding': 'gzip, deflate'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        data = response.json()
        
        for item in data.values():
            if item['ticker'].lower() == ticker.lower():
                cik = item['cik_str']
                return str(cik).zfill(10)  # Pad CIK to 10 digits
        
        return "CIK not found for the given ticker."
    
    except requests.exceptions.RequestException as e:
        return f"Error accessing SEC data: {e}"

# Example usage
ticker = 'QCOM'  # Replace with the desired ticker
cik_number = get_cik(ticker)
print(f"CIK for {ticker}: {cik_number}")


CIK for QCOM: 0000804328


In [38]:

total_sp500_cik_to_ticker = {}
for year, firms in sp500_by_year.items():
    print(f"Year {year}: {len(firms)} firms")
    for ticker in firms.values():
        cik_number = get_cik(ticker)
        total_sp500_cik_to_ticker[cik_number] = ticker
        sp500_by_year[year] = {cik_number : ticker}
    print()

Year 2006: 531 firms

Year 2007: 537 firms

Year 2008: 534 firms

Year 2009: 526 firms

Year 2010: 516 firms

Year 2011: 520 firms

Year 2012: 517 firms

Year 2013: 518 firms

Year 2014: 516 firms

Year 2015: 529 firms

Year 2016: 533 firms

Year 2017: 533 firms

Year 2018: 529 firms

Year 2019: 526 firms

Year 2020: 522 firms

Year 2021: 524 firms

Year 2022: 521 firms

Year 2023: 503 firms

Year 2024: 0 firms



In [46]:
total_sp500_cik_to_ticker

df = pd.DataFrame(list(total_sp500_cik_to_ticker.items()), columns=['CIK', 'Symbol' ])
df['CIK'] = pd.to_numeric(df['CIK'], errors='coerce')
df = df.dropna(subset=['CIK'])
# convert the 'key' column back to integers if needed
df['CIK'] = df['CIK'].astype(int)
df.to_csv('sp500_total_constituents.csv', index=False)


In [41]:
sp500_by_year

{2006: {'0000097216': 'TEX'},
 2007: {'0001326380': 'GME'},
 2008: {'0001060391': 'RSG'},
 2009: {'0000882835': 'ROP'},
 2010: {'0001048695': 'FFIV'},
 2011: {'0000908255': 'BWA'},
 2012: {'0001121788': 'GRMN'},
 2013: {'0000851968': 'MHK'},
 2014: {'0000884887': 'RCL'},
 2015: {'0000313927': 'CHD'},
 2016: {'0000912595': 'MAA'},
 2017: {'0001513761': 'NCLH'},
 2018: {'0001539838': 'FANG'},
 2019: {'0001335258': 'LYV'},
 2020: {'0001318605': 'TSLA'},
 2021: {'CIK not found for the given ticker.': 'SBNY'},
 2022: {'0001274494': 'FSLR'},
 2023: {'0001274494': 'FSLR'},
 2024: {}}

In [49]:
seen = {}
total_sp500_cik_to_ticker = {}

year, firms = next(iter(sp500_by_year.items()))

for ticker in firms.values():
    cik_number = get_cik(ticker)
    seen[ticker] = cik_number                    

for year, firms in sp500_by_year.items():
    print(f"Year {year}: {len(firms)} firms")
    for ticker in firms.values():
        if ticker in seen:
            total_sp500_cik_to_ticker[seen[ticker]] = ticker
            sp500_by_year[year] = {seen[ticker] : ticker}
        else:
            cik_number = get_cik(ticker)
            total_sp500_cik_to_ticker[cik_number] = ticker
            sp500_by_year[year] = {cik_number : ticker}
    print()

Year 2006: 531 firms

Year 2007: 537 firms

Year 2008: 534 firms

Year 2009: 526 firms

Year 2010: 516 firms

Year 2011: 520 firms

Year 2012: 517 firms

Year 2013: 518 firms

Year 2014: 516 firms

Year 2015: 529 firms

Year 2016: 533 firms

Year 2017: 533 firms

Year 2018: 529 firms

Year 2019: 526 firms

Year 2020: 522 firms

Year 2021: 524 firms

Year 2022: 521 firms

Year 2023: 503 firms

Year 2024: 0 firms



In [50]:
sp500_by_year

{2006: {'0000097216': 'TEX'},
 2007: {'0001326380': 'GME'},
 2008: {'0001060391': 'RSG'},
 2009: {'0000882835': 'ROP'},
 2010: {'0001048695': 'FFIV'},
 2011: {'0000908255': 'BWA'},
 2012: {'0001121788': 'GRMN'},
 2013: {'0000851968': 'MHK'},
 2014: {'0000884887': 'RCL'},
 2015: {'0000313927': 'CHD'},
 2016: {'0000912595': 'MAA'},
 2017: {'0001513761': 'NCLH'},
 2018: {'0001539838': 'FANG'},
 2019: {'0001335258': 'LYV'},
 2020: {'0001318605': 'TSLA'},
 2021: {'CIK not found for the given ticker.': 'SBNY'},
 2022: {'0001274494': 'FSLR'},
 2023: {'0001274494': 'FSLR'},
 2024: {}}