In [32]:
import pandas as pd
from datetime import datetime

In [75]:


# Load the data
# Assuming sp500_constituents.csv has columns: 'Firm', 'EntryDate', 'ExitDate'
df = pd.read_csv('../Code_4_SECfilings/sp500_constituents.csv')

# Convert date columns to datetime format
df['start'] = pd.to_datetime(df['start'], errors='coerce')
df['ending'] = pd.to_datetime(df['ending'], errors='coerce')

# Define the range of years we are interested in
years = range(2006, 2025)

# Dictionary to hold the yearly snapshots
sp500_by_year = {}
permno_to_ticker = {}
for year in years:
    # Define the start and end of each year
    start_of_year = datetime(year, 1, 1)
    end_of_year = datetime(year, 12, 31)
    
    # Filter firms active during the year, ensuring they only appear once per year by permno
    active_firms = df[
        (df['start'] <= end_of_year) & 
        ((df['ending'].isna()) | (df['ending'] >= start_of_year))
    ].drop_duplicates(subset=['permno'])
    
    # Exclude entries where a firm was in and out within a partial year
    # This step further removes any firms that had re-entries or overlaps in date ranges
    active_firms = active_firms[
        (active_firms['start'] <= start_of_year) | (active_firms['ending'] >= end_of_year)
    ]

    # Convert the resulting DataFrame of firms to a list of unique permnos
    permno_to_ticker = dict(zip(active_firms['permno'], active_firms['ticker']))

    # Store the list of active firms for the year
    sp500_by_year[year] = permno_to_ticker
    

# print(sp500_by_year)
# # Display the results for each year
# for year, firms in sp500_by_year.items():
#     print(f"Year {year}: {len(firms)} firms")
#     # Uncomment below to see the list of firms per year
#     print(firms.values())


In [76]:
path = '../Code_4_SECfilings/sp500_total_constituents.csv'
df = pd.read_csv(path)
reversed_dict = {}
for year, firms in sp500_by_year.items():
    reversed_dict[year] = {ticker: pernmo for pernmo, ticker in sp500_by_year[year].items()}
    for permno, ticker in sp500_by_year[year].items():
        if ticker in df["Symbol"].tolist():
            cik = df[df["Symbol"] == ticker]["CIK"].values[0]
            reversed_dict[year][ticker] = cik


In [77]:
sp500_by_year = reversed_dict.copy()


In [78]:
path = "/Users/apple/PROJECT/hons_project/data/SP500/analysis_reports/intermediate/batch_10.parquet"
batch_number = 10
df = pd.read_parquet(path)
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df.shape

(6825, 14213)

In [79]:
df['Year'] = df["Date"].dt.year

vaild_pairs = set()
for year, firms in sp500_by_year.items():
    for cik in firms.values():
        cik = str(cik).zfill(10)
        vaild_pairs.add((year, cik))
df_filtered = df[df.apply(lambda row: (row["Year"], row["_cik"]) in vaild_pairs, axis=1)]
df_filtered = df_filtered.drop(columns=["Year"])
df_filtered.shape

(5897, 14213)

In [80]:
df_filtered.to_parquet(f"/Users/apple/PROJECT/hons_project/data/SP500/analysis_reports/filtered/batch_filtered_{batch_number}.parquet")