In [1]:
# Degree thresholds used when pruning the bipartite following network:
#   congressperson_kout : minimum out-degree (distinct followees) a congressperson needs to be kept
#   followee_kin        : minimum in-degree (distinct congresspersons) a followee needs to be kept
# Both names are reassigned by the later `# Parameters` cell.
congressperson_kout, followee_kin = 1, 1

In [105]:
# Parameters
# Overrides the thresholds assigned in the first cell; these are the values
# used by the filtering and the output filenames below.
# NOTE(review): looks like an injected-parameters cell (e.g. papermill) — confirm.
congressperson_kout, followee_kin = 1, 10


In [106]:
import os
import datetime
import pandas as pd
from networkx.algorithms import bipartite

from utils.network_analysis import *

## **1. Read following dataframes for House, Senate, and all of Congress**

In [107]:
# Read House, Senate and combined (all Congress) following dataframes
master_following_house = pd.read_csv('data/interim/house_master_following_table.csv')
master_following_senate = pd.read_csv('data/interim/senate_master_following_table.csv')
master_following_all = pd.read_csv('data/interim/congress_master_following_table.csv')

# Print summary statistics: one section per table, separated by a blank line.
# The f-strings are double-quoted because reusing the enclosing quote inside a
# replacement field (f'{df['col']}') is a SyntaxError before Python 3.12.
# A single print over a generator keeps the output identical to the original
# while avoiding three copy-pasted sections and new notebook-level names.
print('\n\n'.join(
    f"{label} FOLLOWING DATAFRAME:\n"
    f"{table['congressperson'].nunique()} unique congressmembers\n"
    f"{table['followee'].nunique()} unique followees\n"
    f"{len(table)} links"
    for label, table in (
        ('HOUSE', master_following_house),
        ('SENATE', master_following_senate),
        ('ALL CONGRESS', master_following_all),
    )
))

HOUSE FOLLOWING DATAFRAME:
109 unique congressmembers
116719 unique followees
205737 links

SENATE FOLLOWING DATAFRAME:
55 unique congressmembers
58729 unique followees
79837 links

ALL CONGRESS FOLLOWING DATAFRAME:
164 unique congressmembers
156200 unique followees
285574 links


## **2. Remove nodes based on Out/In-degree**

In [108]:
def removeNodes(df, congressperson_kout=1, followee_kin=1):
    """Prune low-degree nodes from a congressperson -> followee following table (bipartite network).

    A followee's in-degree is the number of distinct congresspersons following it; a
    congressperson's out-degree is the number of distinct followees it follows. Both
    degrees are computed once, on the input table, and a row is kept only when both of
    its endpoints meet their threshold. Degrees are not recomputed after rows are dropped.

    Args:
        - df (pd.DataFrame) : following links with 'congressperson' and 'followee' columns
        - congressperson_kout (int) : minimum out-degree (to followees) a congressperson must have to be retained in the network
        - followee_kin (int) : minimum in-degree (from congressmembers) a followee must have to be retained in the network

    Returns:
        pd.DataFrame : the retained rows, sorted by 'congressperson' then 'followee', both descending
    """
    # Degree of every node, indexed by node name.
    in_degree = df.groupby('followee')['congressperson'].nunique()
    out_degree = df.groupby('congressperson')['followee'].nunique()

    # Node names that satisfy their respective threshold.
    kept_followees = in_degree.index[in_degree >= followee_kin]
    kept_congresspersons = out_degree.index[out_degree >= congressperson_kout]

    # A link survives only if both of its endpoints survive.
    keep_row = df['congressperson'].isin(kept_congresspersons) & df['followee'].isin(kept_followees)

    return df[keep_row].sort_values(by=['congressperson', 'followee'], ascending=False)


#### **2.1 Remove nodes**

In [109]:
# Remove low-degree congressperson and followee nodes from each following table,
# applying the same thresholds to House, Senate and the combined table.
master_following_house, master_following_senate, master_following_all = (
    removeNodes(following_table, congressperson_kout=congressperson_kout, followee_kin=followee_kin)
    for following_table in (master_following_house, master_following_senate, master_following_all)
)

In [7]:
def saveEdgelist(df, filename = 'saveedgelist_output.csv', folder_path = 'data/processed/edgelists/'):
    """Save the edgelist of a processed network (from a following dataframe) as a CSV file.

    Only the 'congressperson' and 'followee' columns are written, one link per row,
    without a header row and without the index.

    Args:
        - df (pd.DataFrame) : following links with 'congressperson' and 'followee' columns
        - filename (str) : name of the CSV file to create inside `folder_path`
        - folder_path (str) : output folder; created (including parents) if it does not exist

    Returns:
        str : path of the written file (`folder_path` joined with `filename`)
    """
    # Create the output folder up front so a fresh checkout does not fail on write.
    # An empty folder_path means "current directory", which needs no creation.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    # With the default folder (trailing '/'), this yields the same string as plain concatenation.
    file_path = os.path.join(folder_path, filename)
    df[['congressperson', 'followee']].to_csv(file_path, header=False, index=False)

    return file_path

#### **2.2 Save edgelists / training data**

In [8]:
# Write one edgelist per filtered table; the thresholds are encoded in each filename.
house_path, senate_path, all_path = (
    saveEdgelist(following_table, f'{prefix}_edgelist_kout{congressperson_kout}_kin{followee_kin}.csv')
    for prefix, following_table in (
        ('house', master_following_house),
        ('senate', master_following_senate),
        ('combined', master_following_all),
    )
)