In [1]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

In [2]:
# List of 97 suspicious members of congress
# https://www.nytimes.com/interactive/2022/09/13/us/politics/congress-members-stock-trading-list.html

members = [
    "Rick W. Allen", "Cindy Axne", "Donald S. Beyer Jr.", "Gus Bilirakis", "Earl Blumenauer", "Richard Blumenthal",
    "Roy Blunt", "John Boozman", "Mo Brooks", "Michael C. Burgess", "Richard M. Burr", "Cheri Bustos",
    "Shelley Moore Capito", "Thomas R. Carper", "Bill Cassidy", "Judy Chu", "Katherine M. Clark", "Steve Cohen",
    "Tom Cole", "James R. Comer", "Gerald E. Connolly", "Chris Coons", "Joe Courtney", "Angie Craig", "Daniel Crenshaw",
    "John Curtis", "Debbie Dingell", "Tammy Duckworth", "Dwight Evans", "Pat Fallon", "Dianne Feinstein",
    "Chuck Fleischmann", "Lizzie Fletcher", "Virginia Foxx", "Lois Frankel", "Scott Franklin", "Bob Gibbs",
    "Josh Gottheimer", "Mark E. Green", "Michael Guest", "Bill Hagerty", "Diana Harshbarger", "Kevin Hern",
    "John Hickenlooper", "John Hoeven", "James M. Inhofe", "William Keating", "Mike Kelly", "Ro Khanna",
    "Angus King", "Adam Kinzinger", "Jim Langevin", "John B. Larson", "Susie Lee", "Zoe Lofgren", "Billy Long",
    "Alan Lowenthal", "Cynthia Lummis", "Tom Malinowski", "Kathy Manning", "Roger Marshall", "Thomas Massie",
    "Brian Mast", "Michael McCaul", "A. Donald McEachin", "David B. McKinley", "Dan Meuser", "Carol Miller",
    "Blake D. Moore", "Jerry Moran", "Marie Newman", "Frank Pallone Jr.", "Rand Paul", "Ed Perlmutter",
    "Gary Peters", "Dean Phillips", "John W. Rose", "Deborah K. Ross", "John Rutherford", "Brad Schneider",
    "Kurt Schrader", "Austin Scott", "Robert C. Scott", "Pete Sessions", "Mikie Sherrill", "Mike Simpson",
    "Tina Smith", "Dan Sullivan", "Patrick J. Toomey", "Tommy Tuberville", "Fred Upton", "Mark Warner",
    "Peter Welch", "Sheldon Whitehouse", "Rob Wittman", "Ron Wyden", "John Yarmuth", "Nancy Pelosi", "Mitch McConnell"
]

In [3]:
# For transaction analysis
littlesis_df = pd.read_csv("entities_merged.csv")
name_match = pd.read_csv("final_final_name_match.csv")
sec_df = pd.read_csv("full_features_complete_light.csv")

  littlesis_df = pd.read_csv("entities_merged.csv")
  sec_df = pd.read_csv("full_features_complete_light.csv")


In [4]:
print(littlesis_df.columns)

Index(['aliases', 'end_date', 'ext_BusinessPerson_sec_cik',
       'ext_Business_annual_profit', 'ext_Business_assets', 'ext_Business_aum',
       'ext_Business_marketcap', 'ext_Business_net_income',
       'ext_ElectedRepresentative_bioguide_id',
       'ext_ElectedRepresentative_crp_id',
       'ext_ElectedRepresentative_govtrack_id',
       'ext_ElectedRepresentative_pvs_id',
       'ext_ElectedRepresentative_watchdog_id', 'ext_GovernmentBody_city',
       'ext_GovernmentBody_county', 'ext_GovernmentBody_is_federal',
       'ext_GovernmentBody_state_id', 'ext_Lobbyist_lda_registrant_id',
       'ext_Org_employees', 'ext_Org_fedspending_id',
       'ext_Org_lda_registrant_id', 'ext_Org_name', 'ext_Org_name_nick',
       'ext_Org_revenue', 'ext_Person_birthplace', 'ext_Person_gender_id',
       'ext_Person_is_independent', 'ext_Person_name_first',
       'ext_Person_name_last', 'ext_Person_name_maiden',
       'ext_Person_name_middle', 'ext_Person_name_nick',
       'ext_Person_name_p

In [5]:
import re

# Build a regex pattern from your members list
pattern = '|'.join(re.escape(name) for name in members)

# Filter the full littlesis_df for aliases matching any member name
filtered_littlesis_df = littlesis_df.loc[
    littlesis_df['aliases'].str.contains(pattern, na=False, case=False)
]
print(filtered_littlesis_df.head)

<bound method NDFrame.head of                                                   aliases    end_date  \
4050           Mr Patrick W. Allender; Patrick W Allender         NaN   
12791                     Gus Bilirakis; Gus M. Bilirakis         NaN   
12804                    Earl Blumenauer; Earl Blumenauer         NaN   
12806                                Roy Blunt; Roy Blunt         NaN   
12813                          John Boozman; John Boozman         NaN   
...                                                   ...         ...   
430485  M001198; Roger Marshall; Roger W Marshall; Rog...         NaN   
430492          G000583; Josh Gottheimer; Josh Gottheimer         NaN   
430501       A. Donald McEachin; Donald McEachin; M001200  2022-11-28   
430507  C001114; John Curtis; John R. Curtis; John Rea...         NaN   
430508                    S001203; Tina Smith; Tina Smith         NaN   

        ext_BusinessPerson_sec_cik  ext_Business_annual_profit  \
4050                     12

In [6]:
# For each member, find all aliases in littlesis_df that match (case-insensitive substring match)
from collections import defaultdict

# Create a dictionary to store matches
member_to_aliases = defaultdict(set)

for member in members:
    matched_aliases = littlesis_df[
        littlesis_df['aliases'].str.contains(member, case=False, na=False)
    ]['aliases'].unique()
    member_to_aliases[member].update(matched_aliases)

# Convert to a DataFrame for easy viewing
member_alias_rows = []
for member, aliases in member_to_aliases.items():
    for alias in aliases:
        member_alias_rows.append((member, alias))

member_alias_df = pd.DataFrame(member_alias_rows, columns=["Member", "Matched_Alias"])
member_alias_df.sort_values(by="Member", inplace=True)
member_alias_df.reset_index(drop=True, inplace=True)

print(member_alias_df)

                 Member                                      Matched_Alias
0    A. Donald McEachin       A. Donald McEachin; Donald McEachin; M001200
1        Adam Kinzinger                     Adam Kinzinger Future 1st Cmte
2        Adam Kinzinger            Adam Kinzinger; Adam Kinzinger; K000378
3        Alan Lowenthal                        Alan Lowenthal for Congress
4        Alan Lowenthal         Alan Lowenthal; Alan S. Lowenthal; L000579
..                  ...                                                ...
195    Tommy Tuberville  Thomas H Tuberville; Thomas Hawley Tuberville;...
196       Virginia Foxx          F000450; Virginia Ann Foxx; Virginia Foxx
197       Virginia Foxx                         Virginia Foxx for Congress
198         Zoe Lofgren                  L000397; Zoe Lofgren; Zoe Lofgren
199         Zoe Lofgren                         Sheila Zoe Lofgren Collins

[200 rows x 2 columns]


In [7]:
matched_members = [
    name for name in members
    if filtered_littlesis_df['aliases'].str.contains(re.escape(name), case=False, na=False).any()
]

print("Matched members:")
print(matched_members)
print("No. of matched members:")
print(len(matched_members))

Matched members:
['Rick W. Allen', 'Cindy Axne', 'Gus Bilirakis', 'Earl Blumenauer', 'Richard Blumenthal', 'Roy Blunt', 'John Boozman', 'Mo Brooks', 'Michael C. Burgess', 'Cheri Bustos', 'Shelley Moore Capito', 'Thomas R. Carper', 'Bill Cassidy', 'Judy Chu', 'Katherine M. Clark', 'Steve Cohen', 'Tom Cole', 'Gerald E. Connolly', 'Chris Coons', 'Joe Courtney', 'Angie Craig', 'John Curtis', 'Debbie Dingell', 'Tammy Duckworth', 'Dwight Evans', 'Pat Fallon', 'Dianne Feinstein', 'Chuck Fleischmann', 'Lizzie Fletcher', 'Virginia Foxx', 'Lois Frankel', 'Scott Franklin', 'Bob Gibbs', 'Josh Gottheimer', 'Mark E. Green', 'Michael Guest', 'Bill Hagerty', 'Diana Harshbarger', 'Kevin Hern', 'John Hickenlooper', 'John Hoeven', 'James M. Inhofe', 'Mike Kelly', 'Ro Khanna', 'Angus King', 'Adam Kinzinger', 'Jim Langevin', 'John B. Larson', 'Susie Lee', 'Zoe Lofgren', 'Billy Long', 'Alan Lowenthal', 'Cynthia Lummis', 'Tom Malinowski', 'Kathy Manning', 'Roger Marshall', 'Thomas Massie', 'Brian Mast', 'A. 

200 aliases among 90 members, some members have multiple aliases.

We need to ensure each member only has one match.

In [8]:
print(name_match.columns)

Index(['SEC_RPTOWNERCIK', 'NODEID'], dtype='object')


In [9]:
print(sec_df.columns)

Index(['TRANS_SK', 'ACCESSION_NUMBER', 'SECURITY_TITLE', 'TRANS_DATE',
       'DEEMED_EXECUTION_DATE', 'TRANS_CODE', 'EQUITY_SWAP_INVOLVED',
       'TRANS_TIMELINESS', 'TRANS_SHARES', 'TRANS_PRICEPERSHARE',
       'TRANS_ACQUIRED_DISP_CD', 'SHRS_OWND_FOLWNG_TRANS',
       'DIRECT_INDIRECT_OWNERSHIP', 'NATURE_OF_OWNERSHIP', 'trans_amt',
       'FILING_DATE', 'PERIOD_OF_REPORT', 'ISSUERCIK', 'ISSUERNAME',
       'ISSUERTRADINGSYMBOL', 'RPTOWNERCIK_;', 'NUM_RPTOWNERCIK',
       'RPTOWNERNAME_;', 'RPTOWNER_RELATIONSHIP_;', 'RPTOWNER_TITLE_#',
       'PERMNO', 'date_x', 'VOL', 'PRC', 'RET', 'TICKER',
       'external_validation', 'accession_insider_volume', 'volume_mkt_ratio',
       'volume_mkt_z', 'volume_mkt_sig', 'ticker_total_insider_shares',
       'volume_peer_ratio', 'volume_peer_z', 'volume_peer_sig',
       'turnover_ratio', 'turnover_z', 'turnover_sig', 'cumulative_turnover',
       'snorkel_prob', 'snorkel_pred', 'best_threshold', 'y_pred', 'js_bin',
       's_bin', 'b_bin', 'jb

In [10]:
# Filter for suspicious transactions only
sus_sec_df = sec_df[sec_df['y_pred'] == 1]

In [11]:
littlesis_merged = pd.merge(
    name_match,
    filtered_littlesis_df,
    left_on='NODEID',
    right_on='id',
    how='inner'
)

print(littlesis_merged.shape)
littlesis_merged.head(10)

(30, 67)


Unnamed: 0,SEC_RPTOWNERCIK,NODEID,aliases,end_date,ext_BusinessPerson_sec_cik,ext_Business_annual_profit,ext_Business_assets,ext_Business_aum,ext_Business_marketcap,ext_Business_net_income,...,link_self,name,parent_id,primary_ext,start_date,tags,type,types,updated_at,website
0,1055973,284924.0,C001114; John Curtis; John R. Curtis; John Rea...,,,,,,,,...,https://littlesis.org/entities/284924-John_Curtis,John Curtis,,Person,1960-05-10,,entities,Person; Elected Representative,2025-01-15T19:41:31Z,https://www.curtis.senate.gov
1,1629451,284924.0,C001114; John Curtis; John R. Curtis; John Rea...,,,,,,,,...,https://littlesis.org/entities/284924-John_Curtis,John Curtis,,Person,1960-05-10,,entities,Person; Elected Representative,2025-01-15T19:41:31Z,https://www.curtis.senate.gov
2,1059224,34457.0,Mark R Warner; Mark R. Warner; Mark Warner; W0...,,,,,,,,...,https://littlesis.org/entities/34457-Mark_Warner,Mark Warner,,Person,1954-12-15,,entities,Person; Political Candidate; Elected Represent...,2025-01-23T20:44:22Z,https://www.warner.senate.gov
3,936612,118766.0,Thomas Massie; Thomas Massie,,,,,,,,...,https://littlesis.org/entities/118766-Thomas_M...,Thomas Massie,,Person,1971-01-13,,entities,Person; Political Candidate; Elected Represent...,2025-02-07T20:45:43Z,https://massie.house.gov
4,1205268,4203.0,Mr Patrick W. Allender; Patrick W Allender,,1205268.0,,,,,,...,https://littlesis.org/entities/4203-Patrick_W_...,Patrick W Allender,,Person,,corporate-mapping-project,entities,Person; Business Person,2019-01-07T17:09:43Z,
5,1184782,13592.0,Michael K Simpson; Michael K. Simpson; Mike Si...,,,,,,,,...,https://littlesis.org/entities/13592-Michael_K...,Michael K Simpson,,Person,1950-09-08,,entities,Person; Political Candidate; Elected Represent...,2025-01-15T19:41:16Z,https://simpson.house.gov
6,1355422,14974.0,Stephen Cohen; Steve Cohen; Steven A Cohen; St...,,,,,,,,...,https://littlesis.org/entities/14974-Steven_A_...,Steven A Cohen,,Person,1956-00-00,,entities,Person; Business Person,2025-02-21T21:49:56Z,
7,1362199,58541.0,John Curtis Linscott,,1362199.0,,,,,,...,https://littlesis.org/entities/58541-John_Curt...,John Curtis Linscott,,Person,,,entities,Person; Business Person,2013-10-06T16:40:14Z,
8,1513076,13166.0,C000174; Thomas R. Carper; Thomas Richard Carp...,,,,,,,,...,https://littlesis.org/entities/13166-Tom_Carper,Tom Carper,,Person,1947-01-23,,entities,Person; Political Candidate; Elected Represent...,2024-10-15T20:53:22Z,https://www.carper.senate.gov/public
9,1091449,337080.0,John Rose; John W. Rose; John W. Rose,,,,,,,,...,https://littlesis.org/entities/337080-John_W._...,John W. Rose,,Person,1965-02-23,,entities,Person; Elected Representative,2025-01-15T19:41:37Z,https://johnrose.house.gov


In [12]:
merged_df = pd.merge(
    littlesis_merged,
    sus_sec_df,
    left_on='SEC_RPTOWNERCIK',
    right_on='RPTOWNERCIK_;',
    how='inner'
)

print(merged_df.shape)
merged_df.head(10)

(183, 156)


Unnamed: 0,SEC_RPTOWNERCIK,NODEID,aliases,end_date,ext_BusinessPerson_sec_cik,ext_Business_annual_profit,ext_Business_assets,ext_Business_aum,ext_Business_marketcap,ext_Business_net_income,...,sen_important_connections,sen_full_congress_connections,sen_t2_important_connections,sen_t2_full_congress_connections,sen_t1_important_connections,sen_t1_full_congress_connections,house_t2_important_connections,house_t2_full_congress_connections,house_t1_important_connections,house_t1_full_congress_connections
0,1055973,284924.0,C001114; John Curtis; John R. Curtis; John Rea...,,,,,,,,...,24,20,0,0,0,0,0,0,0,0
1,1055973,284924.0,C001114; John Curtis; John R. Curtis; John Rea...,,,,,,,,...,24,20,0,0,0,0,0,0,0,0
2,1055973,284924.0,C001114; John Curtis; John R. Curtis; John Rea...,,,,,,,,...,24,20,0,0,0,0,0,0,0,0
3,1629451,284924.0,C001114; John Curtis; John R. Curtis; John Rea...,,,,,,,,...,94,100,0,0,0,0,0,0,0,0
4,1629451,284924.0,C001114; John Curtis; John R. Curtis; John Rea...,,,,,,,,...,94,99,0,0,0,0,0,0,0,0
5,1629451,284924.0,C001114; John Curtis; John R. Curtis; John Rea...,,,,,,,,...,94,99,0,0,0,0,0,0,0,0
6,1059224,34457.0,Mark R Warner; Mark R. Warner; Mark Warner; W0...,,,,,,,,...,0,98,0,0,0,0,0,0,0,0
7,936612,118766.0,Thomas Massie; Thomas Massie,,,,,,,,...,89,98,0,0,0,0,0,0,0,0
8,936612,118766.0,Thomas Massie; Thomas Massie,,,,,,,,...,89,98,0,0,0,0,0,0,0,0
9,936612,118766.0,Thomas Massie; Thomas Massie,,,,,,,,...,89,98,0,0,0,0,0,0,0,0


In [13]:
# Standardize both member names and aliases for case-insensitive matching
matched_members = []

for member in members:
    for alias in merged_df['aliases'].dropna():
        if member.lower() in alias.lower():
            matched_members.append(member)
            break  # break to avoid duplicates if the member is found in multiple aliases

# Deduplicate in case some members matched multiple times
matched_members = list(set(matched_members))

print(f"{len(matched_members)} members found in aliases:")
print(matched_members)

15 members found in aliases:
['Ro Khanna', 'Thomas Massie', 'Mike Simpson', 'Bill Cassidy', 'John Rutherford', 'Dean Phillips', 'John Curtis', 'Thomas R. Carper', 'Steve Cohen', 'Bill Hagerty', 'Mark Warner', 'Dwight Evans', 'Mark E. Green', 'John W. Rose', 'Rick W. Allen']


In [14]:
matched_nodeids = []

for member in members:
    for idx, alias in merged_df[['aliases', 'NODEID']].dropna().iterrows():
        if member.lower() in alias['aliases'].lower():
            matched_nodeids.append(alias['NODEID'])
            break  # avoid duplicates for the same member

# Remove duplicates just in case
matched_nodeids = list(set(matched_nodeids))

print(f"Found {len(matched_nodeids)} NODEIDs:")
print(matched_nodeids)

Found 15 NODEIDs:
[13592.0, 31682.0, 284866.0, 337058.0, 284873.0, 13706.0, 4203.0, 415885.0, 13166.0, 13199.0, 255567.0, 118766.0, 337080.0, 34457.0, 284924.0]


I choose to focus on these 15 members.

---------------------------------------------------------------------------------------------------

# Graph Visualization

In [15]:
# For graph viz
full_node_list = pd.read_csv("fullnodelist.csv")
adj_list = pd.read_csv("adjacency_list.csv")

In [16]:
print(full_node_list.columns)

Index(['SEC_RPTOWNERNAME', 'SEC_RPTOWNERCIK', 'Unnamed: 0', 'BP_LITTLESIS_ID',
       'PC_LITTLESIS_ID', 'NODEID', 'MATCHED_TYPE', 'SCORE',
       'MATCHED_HEURISTIC', 'littlesis_name'],
      dtype='object')


In [17]:
print(adj_list.columns)

Index(['source', 'target', 'cat_is_board', 'description2', 'start_date',
       'relationship_id', 'cat_boss_id', 'entity1_id', 'cat_is_executive',
       'description1', 'category_id', 'entity2_id', 'description',
       'cat_is_employee', 'cat_compensation', 'end_date', 'is_current',
       'has_start_date', 'has_end_date', 'has_both_dates', 'orig_entity1',
       'orig_entity2', 'has_date', 'reverse'],
      dtype='object')


In [19]:
filtered_adj_list = adj_list[
    adj_list['source'].isin(matched_nodeids) |
    adj_list['target'].isin(matched_nodeids)
]

In [21]:
full_node_list = full_node_list.dropna(subset=['NODEID'])  # drop rows with missing IDs
full_node_list['NODEID'] = full_node_list['NODEID'].astype(int)  # convert to native int

In [22]:
# Constructing edgelist

# Display the first few rows to inspect the data.
print("Edges DataFrame (first 5 rows):")
display(filtered_adj_list.head())

# Reconstruct the adjacency list from the DataFrame.
# We group by the 'source' column, and for each source, we create a list of tuples (target, attr_dict).
def reconstruct_adj_list(df):
    # Group the DataFrame by 'source'
    grouped = list(df.groupby("source"))
    filtered_adj_list = {}
    # Wrap the outer loop with tqdm to monitor progress.
    for source, group in tqdm(grouped, desc="Reconstructing adjacency list", total=len(grouped)):
        edges = []
        for _, row in group.iterrows():
            target = row["target"]
            # Convert row to dictionary and drop 'source' and 'target'
            attr = row.to_dict()
            attr.pop("source", None)
            attr.pop("target", None)
            edges.append((target, attr))
        filtered_adj_list[source] = edges
    return filtered_adj_list

# Reconstruct the adjacency list.
adj_list_reconstructed = reconstruct_adj_list(filtered_adj_list)

# Print a sample of the reconstructed adjacency list.
print("\nSample of the reconstructed adjacency list:")
for node, edges in list(adj_list_reconstructed.items())[:5]:
    print(f"Node {node}:")
    for neighbor, attr in edges:
        print(f"  connects to {neighbor} with attributes {attr}")

Edges DataFrame (first 5 rows):


Unnamed: 0,source,target,cat_is_board,description2,start_date,relationship_id,cat_boss_id,entity1_id,cat_is_executive,description1,...,cat_compensation,end_date,is_current,has_start_date,has_end_date,has_both_dates,orig_entity1,orig_entity2,has_date,reverse
766,1009,13166,,Campaign Contribution,,159201,,1009,,Campaign Contribution,...,,,,False,False,False,1009,13166,False,
1595,1029,31682,,,2021-07-15,1881446,,1029,,Campaign Contribution,...,,2021-07-15,False,True,True,True,1029,31682,True,
2558,15289,337058,,Campaign Contribution,2021-04-26,1656806,,15289,,Campaign Contribution,...,,2021-08-18,False,True,True,True,15289,337058,True,
3650,1051,13166,,Campaign Contribution,,160228,,1051,,Campaign Contribution,...,,,,False,False,False,1051,13166,False,
3685,1052,34457,,Campaign Contribution,,161499,,1052,,Campaign Contribution,...,,,,False,False,False,1052,34457,False,


Reconstructing adjacency list: 100%|██████████| 4481/4481 [00:00<00:00, 12861.28it/s]


Sample of the reconstructed adjacency list:
Node 240:
  connects to 4203 with attributes {'cat_is_board': False, 'description2': 'Executive Vice President', 'start_date': nan, 'relationship_id': 4849, 'cat_boss_id': nan, 'entity1_id': 4203, 'cat_is_executive': True, 'description1': 'Executive Vice President', 'category_id': 1, 'entity2_id': 240, 'description': 'Patrick W Allender  has a position (Executive Vice President) at  Danaher Corporation ', 'cat_is_employee': nan, 'cat_compensation': nan, 'end_date': nan, 'is_current': True, 'has_start_date': False, 'has_end_date': False, 'has_both_dates': False, 'orig_entity1': 4203, 'orig_entity2': 240, 'has_date': False, 'reverse': True}
Node 1009:
  connects to 13166 with attributes {'cat_is_board': nan, 'description2': 'Campaign Contribution', 'start_date': nan, 'relationship_id': 159201, 'cat_boss_id': nan, 'entity1_id': 1009, 'cat_is_executive': nan, 'description1': 'Campaign Contribution', 'category_id': 5, 'entity2_id': 13166, 'descri




In [23]:
import pickle

# Load the pickle file
with open("congress_nodeid_mapper.pkl", "rb") as f:
    congress_nodeid_mapper = pickle.load(f)

# Convert nested dict to DataFrame
congress_nodeid_mapper = pd.DataFrame.from_dict(congress_nodeid_mapper, orient='index').reset_index()
congress_nodeid_mapper = congress_nodeid_mapper.rename(columns={'index': 'Congress_ID'})

# Convert NODEID list to string
congress_nodeid_mapper['NODEID'] = congress_nodeid_mapper['NODEID'].apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)

# Display the DataFrame
print(congress_nodeid_mapper.head())

  Congress_ID                  Name     ID # bioguide NODEID
0     39308.0  Goodlatte, Robert W.  39308.0  G000289  13308
1     29137.0      Boehner, John A.  29137.0  B000589  13083
2     29311.0     Pombo, Richard W.  29311.0  P000419  13730
3     29300.0        Everett, Terry  29300.0  E000268  13276
4     29393.0       Lucas, Frank D.  29393.0  L000491  13427


In [24]:
congress_nodeids = congress_nodeid_mapper['NODEID']

In [88]:
import networkx as nx
from pyvis.network import Network
from IPython.display import IFrame, display

# Step 0: Build a NetworkX DiGraph from adj_list_reconstructed
G = nx.DiGraph()
for source, edges in adj_list_reconstructed.items():
    for target, attrs in edges:
        G.add_edge(int(source), int(target), **attrs)

# Step 1: Get all nodes within path length ≤ 4 from any red node (matched_nodeids)
max_depth = 4
reachable_nodes = set()

for node in matched_nodeids:
    if node in G:
        reachable = nx.single_source_shortest_path_length(G, node, cutoff=max_depth)
        reachable_nodes.update(reachable.keys())

# Step 2: Filter adj_list_reconstructed to include only edges between reachable nodes
filtered_adj_list = {}

for source, edges in adj_list_reconstructed.items():
    source = int(source)
    if source in reachable_nodes:
        filtered_edges = []
        for target, attrs in edges:
            target = int(target)
            if target in reachable_nodes:
                filtered_edges.append((target, attrs))
        if filtered_edges:
            filtered_adj_list[source] = filtered_edges

# Step 3: Get nodes actually involved in the filtered edges
filtered_node_ids = set()
for source, edges in filtered_adj_list.items():
    filtered_node_ids.add(source)
    for target, _ in edges:
        filtered_node_ids.add(target)

# Step 4: Filter full_node_list
connected_node_df = full_node_list[full_node_list['NODEID'].isin(filtered_node_ids)].copy()

# Step 5: Build Pyvis graph
net = Network(height="800px", width="100%", directed=True, notebook=True, cdn_resources="remote")
net.barnes_hut()

# Add filtered nodes
for _, row in connected_node_df.iterrows():
    try:
        node_id = int(row['NODEID'])
        label = row['SEC_RPTOWNERNAME']

        if node_id in matched_nodeids:
            color = "red"
            size = 300
            font_size = 300
        elif node_id in congress_nodeids:
            color = "orange"
            size = 100
            font_size = 100
        else:
            color = "lightgreen"
            size = 50
            font_size = 50

        net.add_node(
            node_id,
            label=label,
            color=color,
            size=size,
            font={"size": font_size}
        )
    except Exception as e:
        print(f"⚠️ Skipped node due to error: {e}")


# To store added edges (to prevent duplicates)
added_edges = set()

# Add edges with coloring and deduplication
for source, edges in filtered_adj_list.items():
    for target, attrs in edges:
        edge_key = (min(source, target), max(source, target))

        if edge_key not in added_edges and source in net.get_nodes() and target in net.get_nodes():
            category_id = attrs.get("category_id", None)
            title = attrs.get("description", "Relationship")

            if source in congress_nodeids and target in matched_nodeids:
                edge_color = "red"
                edge_width = 30
            else:
                edge_color = "gray"
                edge_width = 1

            net.add_edge(source, target, title=title, color=edge_color, width=edge_width)
            added_edges.add(edge_key)

In [89]:
# Add dummy nodes for legend
legend_items = [
    {"label": "Suspicious Individual", "color": "red", "y": -10000},
    {"label": "Congress Member", "color": "orange", "y": -12000},
    {"label": "Other Entity", "color": "lightgreen", "y": -14000},
]

x_pos = 15000  # keep it far to the right
for i, item in enumerate(legend_items):
    net.add_node(
        f"legend_{i}",
        label=item["label"],
        color=item["color"],
        x=x_pos,
        y=item["y"],
        physics=False,
        fixed=True,
        shape="dot",
        size=300,
        font={"size": 300}
    )

In [53]:
print("Number of nodes:", len(net.nodes))
print("Number of edges:", len(net.edges))

Number of nodes: 1230
Number of edges: 1421


In [90]:
net.show("littlesis_congress_network.html")
display(IFrame("littlesis_congress_network.html",width="100%", height="800px"))

littlesis_congress_network.html


# Transaction Analysis

In [29]:
# Need to rejoin to include all entities
all_littlesis_merged = pd.merge(
    name_match,
    littlesis_df,
    left_on='NODEID',
    right_on='id',
    how='inner'
)

all_merged_df = pd.merge(
    all_littlesis_merged,
    sec_df,
    left_on='SEC_RPTOWNERCIK',
    right_on='RPTOWNERCIK_;',
    how='inner'
)

print(all_merged_df.head)

<bound method NDFrame.head of         SEC_RPTOWNERCIK   NODEID                                  aliases  \
0               1142633   1315.0  David M Lawrence; Dr. David M. Lawrence   
1               1142633   1315.0  David M Lawrence; Dr. David M. Lawrence   
2               1142633   1315.0  David M Lawrence; Dr. David M. Lawrence   
3               1142633   1315.0  David M Lawrence; Dr. David M. Lawrence   
4               1142633   1315.0  David M Lawrence; Dr. David M. Lawrence   
...                 ...      ...                                      ...   
1794098         1223277  10155.0                         Bernard Mariette   
1794099         1223277  10155.0                         Bernard Mariette   
1794100         1223277  10155.0                         Bernard Mariette   
1794101         1223286  10156.0                           Steven L Brink   
1794102         1223286  10156.0                           Steven L Brink   

        end_date  ext_BusinessPerson_sec_cik 

In [30]:
# Make sure your aliases column is lowercase for matching
merged_df['aliases_lower'] = merged_df['aliases'].str.lower()

# Define the members you want to check
target_members = [
    'Ro Khanna', 'Thomas Massie', 'John W. Rose', 'Steve Cohen', 'Dwight Evans',
    'Bill Hagerty', 'Thomas R. Carper', 'Rick W. Allen', 'Mark E. Green',
    'Dean Phillips', 'John Rutherford', 'Mark Warner', 'Bill Cassidy',
    'Mike Simpson', 'John Curtis'
]

# Count suspicious trades (y_pred == 1) for each member
suspicious_counts = {}
for member in target_members:
    member_lower = member.lower()
    count = merged_df[
        merged_df['aliases_lower'].str.contains(member_lower, na=False) &
        (merged_df['y_pred'] == 1)
    ].shape[0]
    suspicious_counts[member] = count

# Convert to a DataFrame
suspicious_df = pd.DataFrame(list(suspicious_counts.items()), columns=["Member", "Suspicious_Trades"])
suspicious_df.sort_values(by="Suspicious_Trades", ascending=False, inplace=True)
suspicious_df.reset_index(drop=True, inplace=True)

# Combine suspicious trade counts with their corresponding NODEIDs

# Prepare a list with members that have suspicious trades
suspicious_members = suspicious_df[suspicious_df["Suspicious_Trades"] > 0]

# Match each member to a NODEID
nodeids = []
for member in suspicious_members["Member"]:
    member_lower = member.lower()
    match = merged_df[
        merged_df['aliases_lower'].str.contains(member_lower, na=False) &
        (merged_df['y_pred'] == 1) &
        (merged_df['NODEID'].notna())
    ]
    if not match.empty:
        nodeids.append(int(match.iloc[0]['NODEID']))
    else:
        nodeids.append(None)

# Add NODEID column to the suspicious_members DataFrame
suspicious_members = suspicious_members.copy()
suspicious_members["NODEID"] = nodeids

# Reorder columns and sort by suspicious trades
final_df = suspicious_members[["Member", "NODEID", "Suspicious_Trades"]]
final_df.sort_values(by="Suspicious_Trades", ascending=False, inplace=True)
final_df.reset_index(drop=True, inplace=True)

print(final_df)

              Member  NODEID  Suspicious_Trades
0       Mike Simpson   13592                 56
1       John W. Rose  337080                 44
2      Dean Phillips  337058                 15
3      Mark E. Green   13706                 10
4        John Curtis  284924                 10
5   Thomas R. Carper   13166                  9
6          Ro Khanna  284873                  8
7        Mark Warner   34457                  7
8        Steve Cohen   13199                  5
9      Rick W. Allen    4203                  5
10   John Rutherford  415885                  5
11     Thomas Massie  118766                  4
12      Bill Hagerty  255567                  2
13      Bill Cassidy   31682                  2
14      Dwight Evans  284866                  1


In [31]:
# Initialize lists
nodeids = []
rptownernames = []
aliasnames = []

for member in suspicious_members["Member"]:
    member_lower = member.lower()
    match = merged_df[
        merged_df['aliases_lower'].str.contains(member_lower, na=False) &
        (merged_df['y_pred'] == 1) &
        (merged_df['NODEID'].notna())
    ]
    if not match.empty:
        nodeids.append(int(match.iloc[0]['NODEID']))
        rptownernames.append(match.iloc[0]['RPTOWNERNAME_;'])
        aliasnames.append(match.iloc[0]['aliases'])
    else:
        nodeids.append(None)
        rptownernames.append(None)

# Add NODEID and SEC_RPTOWNERNAME columns
suspicious_members = suspicious_members.copy()
suspicious_members["NODEID"] = nodeids
suspicious_members["RPTOWNERNAME_;"] = rptownernames
suspicious_members["aliases"] = aliasnames

# Reorder and sort
final_df = suspicious_members[["Member", "NODEID", "RPTOWNERNAME_;", "aliases", "Suspicious_Trades"]]
final_df.sort_values(by="Suspicious_Trades", ascending=False, inplace=True)
final_df.reset_index(drop=True, inplace=True)

# Display
print(final_df)


              Member  NODEID           RPTOWNERNAME_;  \
0       Mike Simpson   13592          SIMPSON MICHAEL   
1       John W. Rose  337080              ROSE JOHN W   
2      Dean Phillips  337058          Phillips Dean B   
3      Mark E. Green   13706            Green Mark A.   
4        John Curtis  284924            CURTIS JOHN D   
5   Thomas R. Carper   13166         Thomas Richard C   
6          Ro Khanna  284873             Khanna Rohit   
7        Mark Warner   34457            Warner Mark S   
8        Steve Cohen   13199          Cohen Stephen B   
9      Rick W. Allen    4203       ALLENDER PATRICK W   
10   John Rutherford  415885        RUTHERFORD JOHN S   
11     Thomas Massie  118766          MASSIE THOMAS L   
12      Bill Hagerty  255567  Hagerty William Francis   
13      Bill Cassidy   31682       Cassidy William J.   
14      Dwight Evans  284866           EVANS DWIGHT H   

                                              aliases  Suspicious_Trades  
0   Michael 

Double checking because some names on my graph viz look weird. Some RPTOWNER name very weird. But should be ok coz it was matched on alias.

Hypothesis: If someone is connected to any one of these 15 member, they are more likely to have suspicious trades than someone who is not connected to them.

In [32]:
# Initialize results list
results = []

# Loop over each node and compute suspicion rates
for _, row in final_df.iterrows():
    node_id = row["NODEID"]
    member = row["Member"]

    connected_nodeids = set()

    for source, edges in adj_list_reconstructed.items():
        for target, _ in edges:
            if source == node_id:
                connected_nodeids.add(target)
            elif target == node_id:
                connected_nodeids.add(source)

    connected_group = all_merged_df[all_merged_df['NODEID'].isin(connected_nodeids)]
    non_connected_group = all_merged_df[~all_merged_df['NODEID'].isin(connected_nodeids)]

    connected_rate = (connected_group['y_pred'] == 1).mean()
    non_connected_rate = (non_connected_group['y_pred'] == 1).mean()

    results.append((member, node_id, connected_rate, non_connected_rate))

# Create result DataFrame
comparison_df = pd.DataFrame(results, columns=["Member", "NODEID", "Connected_Suspicion_Rate", "NonConnected_Suspicion_Rate"])

print(comparison_df)

              Member  NODEID  Connected_Suspicion_Rate  \
0       Mike Simpson   13592                  0.467682   
1       John W. Rose  337080                  0.000000   
2      Dean Phillips  337058                  0.215184   
3      Mark E. Green   13706                  0.320197   
4        John Curtis  284924                  0.543755   
5   Thomas R. Carper   13166                  0.343270   
6          Ro Khanna  284873                  0.161286   
7        Mark Warner   34457                  0.329949   
8        Steve Cohen   13199                  0.259753   
9      Rick W. Allen    4203                  0.666667   
10   John Rutherford  415885                       NaN   
11     Thomas Massie  118766                  0.385106   
12      Bill Hagerty  255567                  0.600000   
13      Bill Cassidy   31682                  0.224166   
14      Dwight Evans  284866                  0.329897   

    NonConnected_Suspicion_Rate  
0                      0.333546  
1  

In [33]:
# Compute overall average suspicion rates across all targets
average_connected_rate = comparison_df["Connected_Suspicion_Rate"].mean()
average_nonconnected_rate = comparison_df["NonConnected_Suspicion_Rate"].mean()

average_connected_rate, average_nonconnected_rate

(0.34620803351632484, 0.3343954026066095)

Around the same, hypothesis not proved...
Only John Curtis, Rick W. Allen, Bill Hagerty show signs of the hypothesis being true.

In [34]:
# Is this difference statistically significant?

from scipy.stats import ttest_ind

# Get the actual y_pred lists
connected_y = connected_group['y_pred']
nonconnected_y = non_connected_group['y_pred']

t_stat, p_val = ttest_ind(connected_y, nonconnected_y, equal_var=False)
print("p-value:", p_val)

p-value: 0.8290181069269092


Not statistically significant

# Feature Analysis

In [35]:
# Initialize results list
results = []

# Loop over each member node
for _, row in final_df.iterrows():
    node_id = row["NODEID"]
    member = row["Member"]

    # Get direct neighbors of this node
    connected_nodeids = set()
    for source, edges in adj_list_reconstructed.items():
        for target, _ in edges:
            if source == node_id:
                connected_nodeids.add(target)
            elif target == node_id:
                connected_nodeids.add(source)

    # Split full merged data into connected and non-connected groups
    connected_group = all_merged_df[all_merged_df['NODEID'].isin(connected_nodeids)]
    non_connected_group = all_merged_df[~all_merged_df['NODEID'].isin(connected_nodeids)]

    # Compute mean of 'important_connections' and 'full_congress_connections'
    connected_important = connected_group['important_connections'].mean()
    non_connected_important = non_connected_group['important_connections'].mean()

    connected_congress = connected_group['full_congress_connections'].mean()
    non_connected_congress = non_connected_group['full_congress_connections'].mean()

    results.append((
        member, node_id,
        connected_important, non_connected_important,
        connected_congress, non_connected_congress
    ))

# Create DataFrame for result
connection_comparison_df = pd.DataFrame(
    results,
    columns=[
        "Member", "NODEID",
        "Connected_Important_Connections", "NonConnected_Important_Connections",
        "Connected_Congress_Connections", "NonConnected_Congress_Connections"
    ]
)

print(connection_comparison_df)

              Member  NODEID  Connected_Important_Connections  \
0       Mike Simpson   13592                       109.681192   
1       John W. Rose  337080                         0.000000   
2      Dean Phillips  337058                       120.378308   
3      Mark E. Green   13706                       120.742200   
4        John Curtis  284924                       132.817502   
5   Thomas R. Carper   13166                        81.322465   
6          Ro Khanna  284873                        48.497899   
7        Mark Warner   34457                        84.741301   
8        Steve Cohen   13199                        82.277037   
9      Rick W. Allen    4203                        79.333333   
10   John Rutherford  415885                              NaN   
11     Thomas Massie  118766                        99.889362   
12      Bill Hagerty  255567                        72.400000   
13      Bill Cassidy   31682                       110.660448   
14      Dwight Evans  284

In [36]:
average_connected_impt_connections = connection_comparison_df["Connected_Important_Connections"].mean()
average_nonconnected_impt_connections = connection_comparison_df["NonConnected_Important_Connections"].mean()

average_connected_impt_connections, average_nonconnected_impt_connections

(88.1640021880547, 52.40636069397074)

Those connected to the 15 members tend to have more important connections

In [37]:
average_connected_congress_connections = connection_comparison_df["Connected_Congress_Connections"].mean()
average_nonconnected_congress_connections = connection_comparison_df["NonConnected_Congress_Connections"].mean()

average_connected_congress_connections, average_nonconnected_congress_connections

(375.71734233697174, 229.92145688773599)

Those connected to the 15 members tend to have more congress connections