## Import libraries and file paths


In [1]:
import pandas as pd
import re
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import os
import sys
import time

parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from path_location import folder_location

ABNORMAL_RETURNS_FOLDER = folder_location.ABNORMAL_RETURNS_FOLDER
PROCESSED_DATA_FOLDER = folder_location.PROCESSED_DATA_FOLDER
NETWORK_RAW_FOLDERS = folder_location.PROFILE_DATA_FOLDERS
TRANSACTIONS_LABELLED_FILE = folder_location.TRANSACTIONS_LABELLED_FILE

## Importing relevant files

In [2]:
df_name_match = pd.read_csv(f"{PROCESSED_DATA_FOLDER}/final_final_name_match.csv")
print(df_name_match.columns)
# Create a dictionary that maps SEC_RPTOWNERCIK to NODEID
mapping_dict = df_name_match.set_index("SEC_RPTOWNERCIK")["NODEID"].to_dict()

Index(['SEC_RPTOWNERCIK', 'NODEID'], dtype='object')


In [None]:
df_txns = pd.read_csv(f"{PROCESSED_DATA_FOLDER}/{TRANSACTIONS_LABELLED_FILE}",
                      usecols=["TRANS_SK", "ACCESSION_NUMBER", "TRANS_DATE", "RPTOWNERCIK_;", "ISSUERTRADINGSYMBOL"],
                      parse_dates=["TRANS_DATE"])
df_txns["id"] = df_txns["RPTOWNERCIK_;"].map(mapping_dict)

print(df_txns.columns)
print("First 5 rows of the updated DataFrame:")
display(df_txns.head())

First 5 rows of the updated DataFrame:


Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,TRANS_DATE,ISSUERTRADINGSYMBOL,RPTOWNERCIK,id
0,3794004,0001181431-09-023155,2009-05-04,EXAC,1000032,249380.0
1,2834113,0001181431-12-005367,2012-01-31,EXAC,1000032,249380.0
2,3043733,0001181431-12-047732,2012-08-03,EXAC,1000032,249380.0
3,3043734,0001181431-12-047732,2012-08-07,EXAC,1000032,249380.0
4,3043284,0001181431-12-049839,2012-09-07,EXAC,1000032,249380.0


In [79]:
print(df_txns.shape)
# Optionally, save the updated DataFrame.
df_txns.to_csv("txns_for_features.csv", index=False)
print("Updated 'txns_for_features.csv' with new 'id' column.")

(3171001, 6)
Updated 'txns_for_features.csv' with new 'id' column.


In [None]:
# We save the dictionary to pickle in case we need it.

import pickle

# Load the dictionary from the pickle file
with open(f"{NETWORK_RAW_FOLDERS}/mapping_CIK2Node_txn_dict.pkl", "wb") as f:
    pickle.dump(mapping_dict, f)

print("Dictionary saved to mapping_dict.pkl")

Dictionary saved to mapping_dict.pkl



---

## In this notebook, we create dictionaries that parse congress committee membership by date ranges, and in turn split them by committee.

### We also construct a dictionary for mapping congressmen to their nodeids for later use. (from ```"congress_matches_7apr_603pm.csv"```)

1) We import ```"congress_matches_7apr_603pm.csv"``` to generate a ```congress_nodeid_mapper``` dictionary.
2) We import ```"house.csv"``` to generate our dictionary, ```congress_date_subcomm_mapper``` dictionary.
3) We import ```"TIC to SIC.csv"``` to generate a dictionary that stores the relevant subcommittees for each company.
4) We export all three to pickle files.

---




**Step 1: Load Your Data**

- **house.csv** contains the following columns (among others):
  - **"ID #"** – The congress member’s internal ID.
  - **"Name"** – The member's name.
  
- **congress_matches_7apr_603pm.csv** contains:
  - **"name"** – The member's name.
  - **"id"** – The Littlesis nodeid.
  
**Step 2: Merge on Name**

We merge the two DataFrames on the name fields (assuming the naming is consistent) so that each row now has both the internal congress ID (`"ID #"` from house.csv) and the Littlesis nodeid (`"id"` from congress_matches file).

**Step 3: Build the Dictionary**

After merging, we select the columns `"ID #"` and `"id"`, remove duplicates, and create a dictionary mapping `"ID #"` to `"id"`. For example, if after the merge we have:

| **ID #** | **id** |
|----------|--------|
| 101      | 5001   |
| 102      | 5002   |
| 103      | 5003   |

Then the resulting dictionary will be:

> **congress_mapper[101] = 5001**


In [None]:
import pandas as pd
import pickle

# =============================================================================
# STEP 1: LOAD AND MERGE CONGRESS MEMBER DATA
# =============================================================================
# Load house and senate data (for Congress ≥ 109), keep only "Name" and "ID #",
# drop duplicates, and combine them into a single DataFrame.
df_house = pd.read_csv(f"{NETWORK_RAW_FOLDERS}/house.csv")
df_house = df_house[df_house["Congress"] >= 109]
df_senate = pd.read_csv(f"{NETWORK_RAW_FOLDERS}/senate_ass.csv")
df_senate = df_senate[df_senate["Congress"] >= 109]

house_unique  = df_house[["Name", "ID #"]].drop_duplicates()
senate_unique = df_senate[["Name", "ID #"]].drop_duplicates()
unique_members = pd.concat([house_unique, senate_unique], ignore_index=True)\
                   .drop_duplicates().reset_index(drop=True)

# =============================================================================
# STEP 2: AUTOMATIC BIOGUIDE MATCHING
# =============================================================================
def match_bioguide_ids(unique_members: pd.DataFrame, mapper_path: str) -> pd.DataFrame:
    """
    Maps member names to bioguide IDs using multiple matching strategies:
      1) Full‑name exact match
      2) "Surname, First" prefix match (unambiguous)
      3) Surname‑only match (if one candidate exists)
      4) "Surname, First3" short‑prefix match (if unambiguous)
    Returns a DataFrame with a new column "bioguide".
    """
    # Parse the mapper file (each line: "Lastname, Firstname (Party - State)  BIOMAPID")
    name_to_biog = {}
    with open(mapper_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            name_party, bioguide = line.rsplit(maxsplit=1)
            name = name_party.split(" (")[0]
            name_to_biog[name] = bioguide

    df = unique_members.copy()
    # 2a. Full‑name exact match.
    df["bioguide"] = df["Name"].map(name_to_biog)

    # 2b. "Surname, First" prefix match.
    prefix_to_full = {}
    for full_name in name_to_biog:
        if ", " in full_name:
            surname, rest = full_name.split(", ", 1)
            first = rest.split()[0]
            prefix = f"{surname}, {first}"
            prefix_to_full.setdefault(prefix, []).append(full_name)
    prefix_to_biog = {p: name_to_biog[names[0]] 
                      for p, names in prefix_to_full.items() 
                      if len(names) == 1}
    for idx, row in df[df["bioguide"].isnull()].iterrows():
        parts = row["Name"].split(" (")[0].split(", ", 1)
        if len(parts) == 2:
            surname, rest = parts
            first = rest.split()[0]
            pref = f"{surname}, {first}"
            if pref in prefix_to_biog:
                df.at[idx, "bioguide"] = prefix_to_biog[pref]

    # 2c. Surname‑only match.
    surname_to_full = {}
    for full_name in name_to_biog:
        surname = full_name.split(",", 1)[0]
        surname_to_full.setdefault(surname, []).append(full_name)
    for idx, row in df[df["bioguide"].isnull()].iterrows():
        surname = row["Name"].split(",", 1)[0]
        candidates = surname_to_full.get(surname, [])
        if len(candidates) == 1:
            df.at[idx, "bioguide"] = name_to_biog[candidates[0]]

    # 2d. "Surname, First3" short‑prefix match.
    short_to_full = {}
    for full_name in name_to_biog:
        if ", " in full_name:
            surname, rest = full_name.split(", ",1)
            first = rest.split()[0]
            sp = f"{surname}, {first[:3]}"
            short_to_full.setdefault(sp, []).append(full_name)
    short_to_biog = {sp: name_to_biog[names[0]] 
                     for sp, names in short_to_full.items() 
                     if len(names) == 1}
    for idx, row in df[df["bioguide"].isnull()].iterrows():
        parts = row["Name"].split(" (")[0].split(", ", 1)
        if len(parts) == 2:
            surname, rest = parts
            first = rest.split()[0]
            sp = f"{surname}, {first[:3]}"
            if sp in short_to_biog:
                df.at[idx, "bioguide"] = short_to_biog[sp]
    return df

df_auto = match_bioguide_ids(unique_members, f"{NETWORK_RAW_FOLDERS}/name_bioguide_mapper.txt")

# =============================================================================
# STEP 3: MANUAL CORRECTIONS
# =============================================================================
# Load manual corrections from manual_fill.csv and merge them with automatic results.
df_manual = pd.read_csv(f"{NETWORK_RAW_FOLDERS}/manual_fill.csv", dtype={"ID #": str})[["Name", "ID #", "bioguide"]]
df_auto["ID #"] = df_auto["ID #"].astype(str)
df_manual["ID #"] = df_manual["ID #"].astype(str)
df_merged = pd.merge(df_auto, df_manual, on=["Name"], how="left", suffixes=("", "_manual"))
df_merged["bioguide"] = df_merged["bioguide"].combine_first(df_merged["bioguide_manual"])
df_merged.drop(columns=["bioguide_manual"], inplace=True)

# Build mapping_dict: key = "ID #", value = dict with { "Name", "ID #", "bioguide" }.
mapping_dict = {
    id_num: {"Name": row["Name"], "ID #": id_num, "bioguide": row["bioguide"]}
    for id_num, row in df_merged.set_index("ID #").iterrows()
}

# =============================================================================
# STEP 4: UPDATE WITH LITTLESIS NODEIDs
# =============================================================================
# Load Littlesis entities and create a mapping: bioguide -> list of node IDs.
df_entities = pd.read_csv(f"{NETWORK_RAW_FOLDERS}/entities_merged.csv", dtype=str)
df_entities = df_entities.dropna(subset=["ext_ElectedRepresentative_bioguide_id"])
df_entities["ext_ElectedRepresentative_bioguide_id"] = df_entities["ext_ElectedRepresentative_bioguide_id"].astype(str)
df_entities["id"] = df_entities["id"].astype(str)
littlesis_map = df_entities.groupby("ext_ElectedRepresentative_bioguide_id")["id"].apply(list).to_dict()

# For each member in mapping_dict, update with NODEID (list of node ids) if available.
for key, entry in mapping_dict.items():
    bg = entry.get("bioguide")
    if bg and bg in littlesis_map:
        mapping_dict[key]["NODEID"] = littlesis_map[bg]
    else:
        mapping_dict[key]["NODEID"] = []

# =============================================================================
# STEP 5: APPLY MANUAL UPDATES TO THE MAPPING DICTIONARY
# =============================================================================
manual_updates = {
    "29941.0": {"bioguide": "G000545", "NODEID": ["13706"]},
    "29729.0": {"bioguide": "M000452", "NODEID": ["75681"]},
    "21140.0": {"bioguide": "K000037", "NODEID": ["91599"]},
    "21101.0": {"bioguide": "R000134", "NODEID": ["96165"]},
    "21341.0": {"bioguide": "G000480", "NODEID": ["127277"]},
    "21533.0": {"bioguide": "A000373", "NODEID": ["68986"]},
    "21727.0": {"bioguide": "J000148", "NODEID": ["284895"]},
    "21909.0": {"bioguide": "C001124", "NODEID": ["284848"]},
    "21926.0": {"bioguide": "G000545", "NODEID": ["359084"]},
    "14651.0": {"bioguide": "N000146", "NODEID": ["13497"]},
    "40103.0": {"bioguide": "N000073", "NODEID": ["13498"]}
}

# Iterate over the manual_updates and update mapping_dict accordingly.
for member_id, manual_info in manual_updates.items():
    updated = False
    for key, entry in mapping_dict.items():
        if str(entry.get("ID #")).strip() == member_id:
            if entry.get("bioguide") == manual_info.get("bioguide") or not entry.get("bioguide"):
                mapping_dict[key]["NODEID"] = manual_info.get("NODEID")
                print(f"Manually updated NODEID for {entry.get('Name')} (ID #: {member_id}) with NODEID {manual_info.get('NODEID')}")
                updated = True
    if not updated:
        print(f"No mapping_dict entry found with ID # {member_id} for manual update.")

# =============================================================================
# STEP 6: EXPORT FINAL MAPPING
# =============================================================================
# Optionally build a DataFrame for CSV output.
df_mapping = pd.DataFrame(list(mapping_dict.values()))
df_mapping.set_index("ID #", inplace=True)
df_mapping.reset_index(inplace=True)
#df_mapping.to_csv("all_matched_bioguide.csv", index=False, encoding="utf-8")

# Export mapping_dict to a pickle file.
with open(f"{NETWORK_RAW_FOLDERS}/congress_nodeid_mapper.pkl", "wb") as f:
    pickle.dump(mapping_dict, f)
    
print("Exported final mapping to 'all_matched_bioguide.csv' and 'congress_nodeid_mapper.pkl'")


Manually updated NODEID for Green, Mark (ID #: 29941.0) with NODEID ['13706']
Manually updated NODEID for McGovern, Jim (ID #: 29729.0) with NODEID ['75681']
Manually updated NODEID for Keating, William (ID #: 21140.0) with NODEID ['91599']
Manually updated NODEID for Reed, Thomas W., II (ID #: 21101.0) with NODEID ['96165']
Manually updated NODEID for Grisham, Michelle Lujan (ID #: 21341.0) with NODEID ['127277']
Manually updated NODEID for Ashford, Brad (ID #: 21533.0) with NODEID ['68986']
Manually updated NODEID for Johnson, James M. (Mike) (ID #: 21727.0) with NODEID ['284895']
Manually updated NODEID for Cox, TJ (ID #: 21909.0) with NODEID ['284848']
Manually updated NODEID for Green, Mark (ID #: 21926.0) with NODEID ['359084']
Manually updated NODEID for Nelson, Clarence William (Bill) (ID #: 14651.0) with NODEID ['13497']
Manually updated NODEID for Nelson, Earl Benjamin (Ben) (ID #: 40103.0) with NODEID ['13498']
Exported final mapping to 'all_matched_bioguide.csv' and 'congre


## **2. Building the `congress_date_subcomm_mapper`**

Next, we process **`house.csv`**. This file has a row per congress member with their start (assignment) and end (termination) dates, plus a subcommittee name (in the column **"Committee Name"**).

### **Steps:**

1. **Preprocess `house.csv`:**  
   Convert the `"Date of Assignment"` and `"Date of Termination"` columns into datetime objects.

2. **For each subcommittee:**  
   - **Extract rows for that subcommittee.**  
   - Create two events per row:  
     - A **join** event on the assignment date.  
     - A **leave** event on the termination date **+ 1 day** (so the member is active through their final day).  
     
   **Example Events:**

   | **ID #** | **Event Date** | **Event** | **Subcommittee**   |
   |----------|----------------|-----------|--------------------|
   | 1        | 2020‑01‑01     | join      | *Subcommittee A*   |
   | 1        | 2020‑01‑04     | leave     | *Subcommittee A*   |
   | 2        | 2020‑01‑02     | join      | *Subcommittee A*   |
   | 2        | 2020‑01‑03     | leave     | *Subcommittee A*   |


3. **Sort events and “sweep” through them:**  
   As you iterate through the sorted events, maintain a set of active members. After processing each event, record a snapshot (as a sorted list) in a dictionary keyed by the event date.

4. **Store snapshots in a nested dictionary:**  
   The final **`congress_date_subcomm_mapper`** will map each subcommittee name to its membership timeline.  

   **Example:**

   ```python
   congress_date_subcomm_mapper = {
       "Subcommittee A": {
           Timestamp("2020-01-01 00:00:00"): [1],
           Timestamp("2020-01-02 00:00:00"): [1, 2],
           Timestamp("2020-01-03 00:00:00"): [1],    # After member 2 left.
           Timestamp("2020-01-04 00:00:00"): []        # After member 1 left.
       },
       "Subcommittee B": {
           ... # Similar structure for another subcommittee.
       }
   }
   ```

---



In [None]:

# -------- Helper Function --------
def ensure_flat_list(x):
    """Ensure the input is returned as a flat list."""
    if isinstance(x, list):
        flat = []
        for item in x:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        return flat
    else:
        return [x]

# -------- Membership Timeline by Subcommittee Function --------
def create_date_subcomm_mapper(df, mapping_dict):
    """
    Build a subcommittee membership timeline using the membership data in df.
    
    The DataFrame df must contain the columns:
      - "ID #" (the congress internal member id)
      - "Date of Assignment" (when the member joined the subcommittee)
      - "Date of Termination" (when the member left; if missing, assumed active through 31 Dec 2021)
      - "Committee Name" (the subcommittee)
      
    The mapping_dict is assumed to map a member’s "ID #" (as a string) 
    to a dictionary that has a key "NODEID" holding the Littlesis node id(s).
    
    For each subcommittee, the function creates a timeline dictionary where keys are event dates and values are 
    sorted lists of active Littlesis node IDs as of that date. Membership events are generated using:
      - A "join" event on the assignment date.
      - A "leave" event on the day after the termination date (or after Dec 31, 2021 if missing).
      
    Returns a dictionary mapping each subcommittee to its timeline.
    """
    # Convert date columns
    df["Date of Assignment"] = pd.to_datetime(df["Date of Assignment"], errors="coerce", dayfirst=True)
    df["Date of Termination"] = pd.to_datetime(df["Date of Termination"], errors="coerce", dayfirst=True)
    
    subcomm_mapper = {}
    subcommittees = df["Committee Name"].unique()
    
    for subcomm in tqdm(subcommittees, desc="Processing subcommittees"):
        df_sub = df[df["Committee Name"] == subcomm].copy()
        events = []  # Will store tuples of (event_date, node, event_type)
        
        for _, row in df_sub.iterrows():
            member = str(row["ID #"]).strip()
            # Look up the corresponding Littlesis node ids from mapping_dict;
            # if no mapping, skip this row.
            node_ids = mapping_dict.get(member, {}).get("NODEID")
            if node_ids is None:
                continue
            node_ids = ensure_flat_list(node_ids)
            
            assign_date = row["Date of Assignment"]
            term_date = row["Date of Termination"]
            # If termination is missing, use December 31, 2021
            effective_term = term_date if pd.notna(term_date) else pd.Timestamp("2021-12-31")
            
            if pd.notna(assign_date):
                for node in node_ids:
                    events.append((assign_date, node, "join"))
            # Create leave events one day after effective termination.
            for node in node_ids:
                events.append((effective_term + pd.Timedelta(days=1), node, "leave"))
        
        # Sort events by date
        events.sort(key=lambda x: x[0])
        timeline = {}
        active_nodes = set()
        for event_date, node, event_type in events:
            # Cast node to string (if not already) so that it is hashable.
            node = str(node)
            if event_type == "join":
                active_nodes.add(node)
            elif event_type == "leave":
                active_nodes.discard(node)
            # Snapshot the active nodes (sorted for consistency)
            timeline[event_date] = sorted(active_nodes)
        subcomm_mapper[subcomm] = timeline
    return subcomm_mapper

# -------- Main Code --------
# Assume df_house and df_senate have already been read from "house.csv" and "senate.csv" respectively.
# Restrict the columns to those needed.
df_senate = pd.read_csv(f"{NETWORK_RAW_FOLDERS}/senate_ass.csv")
print(df_house.columns)
print(df_senate.columns)

df_senate["Date of Assignment"] = df_senate['Date of Appointment']
# Restrict columns (and create a copy) for each chamber.
df_house = df_house[["ID #", 'Date of Assignment', "Date of Termination", "Committee Name"]].copy()
df_senate = df_senate[["ID #", 'Date of Assignment', 'Date of Termination', 'Committee Name']].copy()


# Create the subcommittee membership timelines using the mapping_dict,
# which should map internal "ID #" values (as strings) to a dictionary with key "NODEID".
house_date_subcomm_mapper = create_date_subcomm_mapper(df_house, mapping_dict)
senate_date_subcomm_mapper = create_date_subcomm_mapper(df_senate, mapping_dict)

# Export the dictionaries to pickle files.
with open(f"{NETWORK_RAW_FOLDERS}/house_date_subcomm_mapper.pkl", "wb") as f:
    pickle.dump(house_date_subcomm_mapper, f)
with open(f"{NETWORK_RAW_FOLDERS}/senate_date_subcomm_mapper.pkl", "wb") as f:
    pickle.dump(senate_date_subcomm_mapper, f)

print("Exported house_date_subcomm_mapper.pkl and senate_date_subcomm_mapper.pkl.")


Index(['Congress', 'Committee code', 'ID #', 'Name', 'Maj/Min',
       'Rank Within Party Status', 'Party', 'Date of Assignment',
       'Date of Termination', 'Senior Party Member', 'Committee Seniority',
       'Committee Period of Service',
       'Committee status at end of this Congress',
       'Committee continuity of assignment in next Congress',
       'Appointment Citation', 'Committee Name', 'State', 'CD', 'State Name',
       'Notes', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23'],
      dtype='object')
Index(['Congress', 'Committee Code', 'ID #', 'Name', 'Maj/Min',
       'Rank Within Party', 'Party Code', 'Date of Appointment',
       'Date of Termination', 'Senior Party Member', 'Committee Seniority',
       'Committee Period of Service',
       'Committee status at end of this Congress',
       'Committee continuity of assignment in next Congress',
       'Appointment Citation', 'Committee Name', 'State Code', 'District',
       'State Name', 'Notes', 'Unna

Processing subcommittees:   0%|          | 0/80 [00:00<?, ?it/s]

Processing subcommittees:   0%|          | 0/35 [00:00<?, ?it/s]

Exported house_date_subcomm_mapper.pkl and senate_date_subcomm_mapper.pkl.


In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import pickle
import numpy as np

# ----- Helper Function -----
def ensure_flat_list(x):
    """
    If x is a list-of-lists or a single value, return a flat list.
    """
    if isinstance(x, list):
        flat = []
        for item in x:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        return flat
    else:
        return [x]

# ----- Function to Create Membership Timeline -----
def create_membership_timeline(df, mapping_dict, output_pickle_filename):
    """
    Given a membership DataFrame (df) with at least the columns:
      - "ID #"
      - "Date of Assignment" and "Date of Termination"
    
    This function uses mapping_dict (which maps each congress member's internal 
    "ID #" to a dictionary that has a "NODEID" field containing Littlesis node id(s))
    to create a timeline dictionary where keys are event dates and values 
    are lists of active Littlesis node IDs as of that date.
    
    If "Date of Termination" is NaN, it is substituted with December 31, 2021.
    Then, the leave event is set to the following day (i.e. January 1, 2022).
    
    The resulting timeline is exported to a pickle file.
    """
    # Copy and ensure proper date conversion.
    df = df.copy()
    df["Date of Assignment"] = pd.to_datetime(df["Date of Assignment"], errors="coerce", dayfirst=True)
    df["Date of Termination"] = pd.to_datetime(df["Date of Termination"], errors="coerce", dayfirst=True)
    
    events = []  # Will store tuples: (event_date, node, event_type)
    
    # Iterate over each row.
    for _, row in df.iterrows():
        # Get the congress member's internal ID as string.
        member_id = str(row["ID #"]).strip()
        # Look up the Littlesis node id(s) from mapping_dict.
        node_ids = mapping_dict.get(member_id, {}).get("NODEID")
        if node_ids is None:
            continue  # Skip if no mapping found.
        # Ensure we have a flat list of node ids.
        node_ids = ensure_flat_list(node_ids)
        # Get the join (assignment) date.
        assign_date = row["Date of Assignment"]
        # Use termination date if present; otherwise substitute with Dec 31, 2021.
        term_date = row["Date of Termination"]
        effective_term_date = term_date if pd.notna(term_date) else pd.Timestamp("2021-12-31")
        
        # Create join events (if assignment date is valid).
        if pd.notna(assign_date):
            for node in node_ids:
                events.append((assign_date, node, "join"))
        
        # Create leave events one day after the effective termination date.
        for node in node_ids:
            events.append((effective_term_date + pd.Timedelta(days=1), node, "leave"))
    
    # Sort the events chronologically.
    events.sort(key=lambda x: x[0])
    
    # Sweep through events, accumulating the active node IDs.
    timeline = {}
    active_nodes = set()
    for event_date, node, event_type in events:
        # Since node IDs must be hashable (e.g. a string or number), if not already, cast node to string.
        node = str(node)
        if event_type == "join":
            active_nodes.add(node)
        elif event_type == "leave":
            active_nodes.discard(node)
        # Save a snapshot of the current active nodes at this event date.
        timeline[event_date] = sorted(active_nodes)
    
    # Export the timeline dictionary to a pickle file.
    with open(output_pickle_filename, "wb") as f:
        pickle.dump(timeline, f)
    
    return timeline

# ----- Main Code to Build Both House and Senate Membership Timelines -----

# (Assume that df_house and df_senate have been read already from their respective CSV files.)
# For example, you might have:
# df_house = pd.read_csv("house.csv")
df_senate = pd.read_csv(f"{NETWORK_RAW_FOLDERS}/senate_ass.csv")
print(df_house.columns)
print(df_senate.columns)

df_senate["Date of Assignment"] = df_senate["Date of Appointment"]
# Restrict columns (and create a copy) for each chamber.
df_house = df_house[["ID #", 'Date of Assignment', "Date of Termination", "Committee Name"]].copy()
df_senate = df_senate[["ID #", 'Date of Assignment', 'Date of Termination', 'Committee Name']].copy()

# Now create the membership timelines using the mapping_dict that maps internal IDs to Littlesis NODEIDs.
house_membership_timeline = create_membership_timeline(df_house, mapping_dict, f"{NETWORK_RAW_FOLDERS}/house_membership_by_date.pkl")
senate_membership_timeline = create_membership_timeline(df_senate, mapping_dict, f"{NETWORK_RAW_FOLDERS}/senate_membership_by_date.pkl")

print("House membership timeline and Senate membership timeline have been built and exported.")


Index(['ID #', 'Date of Assignment', 'Date of Termination', 'Committee Name'], dtype='object')
Index(['Congress', 'Committee Code', 'ID #', 'Name', 'Maj/Min',
       'Rank Within Party', 'Party Code', 'Date of Appointment',
       'Date of Termination', 'Senior Party Member', 'Committee Seniority',
       'Committee Period of Service',
       'Committee status at end of this Congress',
       'Committee continuity of assignment in next Congress',
       'Appointment Citation', 'Committee Name', 'State Code', 'District',
       'State Name', 'Notes', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')
House membership timeline and Senate membership timeline have been built and exported.



---

## **3. Building the TIC-to‑Subcommittee Dictionary**

We start by importing the TIC‑to‑SIC data (for example, from **`TIC to SIC.xlsx`**). This file contains mapping data with at least the following columns:

- **TIC**: the company’s TIC.
- **Committee 1**, **Committee 2**, **Committee 3**, **Committee 4**: the subcommittee names associated with the company.

**Example Table:**

| **TIC** | **Committee 1**    | **Committee 2**    | **Committee 3** | **Committee 4** |
|---------|--------------------|--------------------|-----------------|-----------------|
| XYZ     | Subcommittee A     | Subcommittee B     | NaN             | NaN             |
| ABC     | Subcommittee A     | NaN                | NaN             | NaN             |

After grouping by **TIC** and filtering out any missing values (i.e. NaNs), we obtain—for example:

> **tic_to_subcomm_mapper["XYZ"] = {"Subcommittee A", "Subcommittee B"}**

Next, we add a predefined set of **universal committees** to every company. For example, if:

> **UNIVERSAL_COMMITTEES = {'BUDGET','COMMERCE','ECONOMIC (JOINT)','Energy and Commerce','Small Business','TAXATION (JOINT)','Ways and Means'}**

then the final mapping for "XYZ" might be:

> **tic_to_subcomm_mapper["XYZ"] = {"Subcommittee A", "Subcommittee B", "BUDGET", "COMMERCE", "ECONOMIC (JOINT)", "Energy and Commerce", "Small Business", "TAXATION (JOINT)", "Ways and Means"}**

*<mark>Note:</mark>* We store the committees as a **set** for uniqueness and efficient membership updates. If ordering is required later, you can always convert the set to a sorted list.

---

## **4. Exporting the Dictionaries to Pickle Files**

Once you have built the mapping dictionaries (including **`tic_to_subcomm_mapper`** and the others you create later), you can export them to pickle files for later use.

---


In [5]:
import pandas as pd

# Read the TIC-to-SIC file (Excel version)
df_ticsic = pd.read_excel(f"{NETWORK_RAW_FOLDERS}/TIC to SIC.xlsx")
print("Original columns:", df_ticsic.columns.tolist())

# Keep only the relevant columns
df_ticsic = df_ticsic[['tic', 'Committee 1', 'Committee 2', 'Committee 3', 'Committee 4']]

# --- Step 3: Create the Mapping Dictionary ---
# Convert each row’s committee values to strings and filter out NaN.
tic_to_subcomm_mapper = (
    df_ticsic
    .set_index('tic')
    .apply(lambda row: set(filter(pd.notna, row)), axis=1)
    .to_dict()
)

UNIVERSAL_COMMITTEES = {'BUDGET','COMMERCE','ECONOMIC (JOINT)','Energy and Commerce','Small Business','TAXATION (JOINT)','Ways and Means'}

for tic, comm_set in tic_to_subcomm_mapper.items():
    # remove 0 or '0'
    comm_set.discard(0)
    comm_set.discard('0')
    # add universal
    comm_set.update(UNIVERSAL_COMMITTEES)


# --- Optional: Display some examples ---
print("\nExample mapping (first 3 entries):")
for tic, committees in list(tic_to_subcomm_mapper.items())[:3]:
    print(f"**{tic}** -> {committees}")


Original columns: ['Unnamed: 0', 'tic', 'gvkey', 'conm', 'cusip', 'cik', 'sic', 'naics', 'IDBFLAG', 'INCORP', 'GSECTOR', 'GGROUP', 'GIND', 'GSUBIND', 'Unnamed: 14', 'Committee 1', 'Committee 2', 'Committee 3', 'Committee 4', 'Unnamed: 19', 'Unnamed: 20', 'https://infotrie.com/docs/static-data/classifications/sd-1-2-gics-classifications/']

Example mapping (first 3 entries):
**AIR** -> {'TAXATION (JOINT)', 'Energy and Commerce', 'Ways and Means', 'ECONOMIC (JOINT)', 'Small Business', 'BUDGET', 'ARMED SERVICES', 'COMMERCE'}
**AAL** -> {'Small Business', 'Energy and Commerce', 'Ways and Means', 'BUDGET', 'TAXATION (JOINT)', 'COMMERCE', 'ECONOMIC (JOINT)'}
**CECO** -> {'Small Business', 'Energy and Commerce', 'Ways and Means', 'BUDGET', 'TAXATION (JOINT)', 'COMMERCE', 'ECONOMIC (JOINT)'}


In [None]:

# --- Step 4: Export the Dictionary to a Pickle File ---
with open(f"{NETWORK_RAW_FOLDERS}/tic_to_subcomm_mapper.pkl", "wb") as f:
    pickle.dump(tic_to_subcomm_mapper, f)
print("\nSaved tic_to_subcomm_mapper to 'tic_to_subcomm_mapper.pkl'")


Saved tic_to_subcomm_mapper to 'tic_to_subcomm_mapper.pkl'


In [6]:
## building House_membership_by_date

df_sen_ticsic = pd.read_csv(f"{NETWORK_RAW_FOLDERS}/Senate TIC to SIC.csv")
df_sen_ticsic = df_ticsic[['tic', 'Committee 1', 'Committee 2', 'Committee 3', 'Committee 4']]

sen_tic_to_subcomm_mapper = (
    df_sen_ticsic
    .set_index('tic')
    .apply(lambda row: set(filter(pd.notna, row)), axis=1)
    .to_dict()
)
UNIVERSAL_COMMITTEES = {
    "Appropriations",
    "Banking, Housing, and Urban Affairs",
    "Budget",
    "Commerce, Science, and Transportation",
    "Economic (Joint Committee)",
    "Finance",
    "Small Business",
    "Small Business and Entrepreneurship",
    "Taxation (Joint)"
}

for tic, comm_set in sen_tic_to_subcomm_mapper.items():
    # remove 0 or '0'
    comm_set.discard(0)
    comm_set.discard('0')
    # add universal
    comm_set.update(UNIVERSAL_COMMITTEES)


# --- Optional: Display some examples ---
print("\nExample mapping (first 3 entries):")
for tic, committees in list(sen_tic_to_subcomm_mapper.items())[:3]:
    print(f"**{tic}** -> {committees}")



Example mapping (first 3 entries):
**AIR** -> {'Commerce, Science, and Transportation', 'Taxation (Joint)', 'Finance', 'Small Business and Entrepreneurship', 'Appropriations', 'Small Business', 'Banking, Housing, and Urban Affairs', 'Economic (Joint Committee)', 'Budget', 'ARMED SERVICES'}
**AAL** -> {'Commerce, Science, and Transportation', 'Taxation (Joint)', 'Finance', 'Small Business and Entrepreneurship', 'Appropriations', 'Small Business', 'Banking, Housing, and Urban Affairs', 'Economic (Joint Committee)', 'Budget'}
**CECO** -> {'Commerce, Science, and Transportation', 'Taxation (Joint)', 'Finance', 'Small Business and Entrepreneurship', 'Appropriations', 'Small Business', 'Banking, Housing, and Urban Affairs', 'Economic (Joint Committee)', 'Budget'}


In [None]:
# --- Step 4: Export the Dictionary to a Pickle File ---
with open(f"{NETWORK_RAW_FOLDERS}/sen_tic_to_subcomm_mapper.pkl", "wb") as f:
    pickle.dump(sen_tic_to_subcomm_mapper, f)
print("\nSaved sen_tic_to_subcomm_mapper to 'sen_tic_to_subcomm_mapper.pkl'")


Saved sen_tic_to_subcomm_mapper to 'sen_tic_to_subcomm_mapper.pkl'
