In [1]:
import pandas as pd
from datetime import datetime

# Load the data
# Assuming sp500_constituents.csv has columns: 'Firm', 'EntryDate', 'ExitDate'
df = pd.read_csv('./sp500_constituents.csv')

# Convert date columns to datetime format
df['start'] = pd.to_datetime(df['start'], errors='coerce')
df['ending'] = pd.to_datetime(df['ending'], errors='coerce')
df['nameendt'] = pd.to_datetime(df['nameendt'], errors='coerce')  # Ensure nameendt is datetime
# Define the range of years we are interested in
years = range(2006, 2025)

# Dictionary to hold the yearly snapshots
sp500_by_year = {}
permno_to_ticker = {}
for year in years:
    # Define the start and end of each year
    start_of_year = datetime(year, 1, 1) # 2006-01-01
    end_of_year = datetime(year, 12, 31) # 2006-12-31
    
    # Filter firms active during the year, ensuring they only appear once per year by permno
    active_firms = df[
        (df['start'] <= end_of_year) & 
        ((df['ending'].isna()) | (df['ending'] >= start_of_year))
    ]
    # Get the last entry(nameendt) for each permno, and remove permno duplicate except the last entry of active_firms
    active_firms = active_firms.sort_values(by=['permno', 'nameendt']).groupby('permno').last().reset_index()
    # Convert the resulting DataFrame of firms to a list of unique permnos
    permno_to_ticker = dict(zip(active_firms['permno'], active_firms['ticker']))

    # Store the list of active firms for the year
    sp500_by_year[year] = permno_to_ticker
    

# print(sp500_by_year)
# # Display the results for each year
# for year, firms in sp500_by_year.items():
#     print(f"Year {year}: {len(firms)} firms")
#     # Uncomment below to see the list of firms per year
#     print(firms.values())


In [2]:
reversed_dict = {}
for year, firms in sp500_by_year.items():
    reversed_dict[year] = {ticker : permno for permno, ticker in sp500_by_year[year].items()}
    



In [3]:
data = []
for year, firms in sp500_by_year.items():
    for permno, ticker in firms.items():
        data.append({'year': year, 'permno': permno, 'ticker': ticker})
        
# sp500_by_year = pd.DataFrame(data)
# sp500_by_year.to_parquet('sp500_by_year2.parquet', index=False)


In [4]:
sp500_by_year[sp500_by_year['ticker'] == "UBER"]

Unnamed: 0,year,permno,ticker
9038,2023,18576,UBER


In [2]:
import requests

def get_cik(ticker):
    """
    Retrieves the CIK number for a given ticker symbol from the SEC's EDGAR system.
    
    Parameters:
        ticker (str): The stock ticker symbol.
    
    Returns:
        str: The CIK number for the given ticker symbol, or a message if not found.
    """
    url = 'https://www.sec.gov/files/company_tickers.json'
    headers = {
        'User-Agent': 'University of Edinburgh s2101369@ed.ac.uk',
        'Accept-Encoding': 'gzip, deflate'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        data = response.json()
        
        for item in data.values():
            if item['ticker'].lower() == ticker.lower():
                cik = item['cik_str']
                return str(cik).zfill(10)  # Pad CIK to 10 digits
        
        return "CIK not found for the given ticker."
    
    except requests.exceptions.RequestException as e:
        return f"Error accessing SEC data: {e}"

# Example usage
ticker = 'ELV'  # Replace with the desired ticker
cik_number = get_cik(ticker)
print(f"CIK for {ticker}: {cik_number}")


CIK for ELV: 0001156039


In [3]:

total_sp500_cik_to_ticker = {}
for year, firms in sp500_by_year.items():
    print(f"Year {year}: {len(firms)} firms")
    for ticker in firms.values():
        cik_number = get_cik(ticker)
        total_sp500_cik_to_ticker[cik_number] = ticker
        sp500_by_year[year] = {cik_number : ticker}
    print()

Year 2006: 531 firms

Year 2007: 537 firms

Year 2008: 534 firms

Year 2009: 526 firms

Year 2010: 516 firms

Year 2011: 520 firms

Year 2012: 517 firms

Year 2013: 518 firms

Year 2014: 516 firms

Year 2015: 530 firms

Year 2016: 534 firms

Year 2017: 533 firms

Year 2018: 529 firms

Year 2019: 526 firms

Year 2020: 522 firms

Year 2021: 524 firms

Year 2022: 521 firms

Year 2023: 518 firms

Year 2024: 0 firms



In [4]:
total_sp500_cik_to_ticker

df = pd.DataFrame(list(total_sp500_cik_to_ticker.items()), columns=['CIK', 'Symbol' ])
df['CIK'] = pd.to_numeric(df['CIK'], errors='coerce')
df = df.dropna(subset=['CIK'])
# convert the 'key' column back to integers if needed
df['CIK'] = df['CIK'].astype(int)
df.to_csv('sp500_total_constituents2.csv', index=False)


In [1]:
data = []
for year, firms in sp500_by_year.items():
    for cik_number, ticker in firms.items():
        data.append({'year': year, 'cik_number': cik_number, 'ticker': ticker})
        
sp500_by_year = pd.DataFrame(data)
sp500_by_year.to_parquet('sp500_by_year2.parquet', index=False)


NameError: name 'sp500_by_year' is not defined

In [5]:
seen = {}
total_sp500_cik_to_ticker = {}

year, firms = next(iter(sp500_by_year.items()))

for ticker in firms.values():
    cik_number = get_cik(ticker)
    seen[ticker] = cik_number                    

for year, firms in sp500_by_year.items():
    print(f"Year {year}: {len(firms)} firms")
    for ticker in firms.values():
        if ticker in seen:
            total_sp500_cik_to_ticker[seen[ticker]] = ticker
            sp500_by_year[year] = {seen[ticker] : ticker}
        else:
            cik_number = get_cik(ticker)
            total_sp500_cik_to_ticker[cik_number] = ticker
            sp500_by_year[year] = {cik_number : ticker}
    print()

Year 2006: 1 firms

Year 2007: 1 firms

Year 2008: 1 firms

Year 2009: 1 firms

Year 2010: 1 firms

Year 2011: 1 firms

Year 2012: 1 firms

Year 2013: 1 firms

Year 2014: 1 firms

Year 2015: 1 firms

Year 2016: 1 firms

Year 2017: 1 firms

Year 2018: 1 firms

Year 2019: 1 firms

Year 2020: 1 firms

Year 2021: 1 firms

Year 2022: 1 firms

Year 2023: 1 firms

Year 2024: 0 firms

