In [8]:
import os
import pandas as pd
from networkx.algorithms import bipartite

from utils.network_analysis import *

## **1. Read / process data**

#### **1.1 Read summary CSV of all 183 congresspeople**

In [9]:
# Load the summary table of congressmembers (183 per the section heading) and
# discard the 'Unnamed: 0' column that comes back from the CSV.
filepath = 'data/raw/congressmembers_characteristics.csv'
congress_members_raw = pd.read_csv(filepath)
all_congress_members = congress_members_raw.drop(columns=['Unnamed: 0'])

#### **1.2 Read in the following-list CSV for each congressperson, compile into a dataframe**

In [10]:
# Folder holding one raw "following" CSV per congressperson
following_data_folder = 'data/raw/following_data_raw/'

# os.listdir() returns entries in arbitrary order and also returns non-data entries
# (e.g. .DS_Store, .ipynb_checkpoints). Keep only CSV files and sort them so the
# compiled table is built in the same order on every run / machine.
arr = sorted(
    entry for entry in os.listdir(following_data_folder)
    if entry.lower().endswith('.csv')
)

# Iterate through CSVs, compile into dataframe
df_list = []
for file in arr:
    filepath = os.path.join(following_data_folder, file)
    # Username of the congressperson = filename with its first and last
    # '_'-separated tokens removed (usernames may themselves contain '_')
    congressperson = '_'.join(file.split('_')[1:-1])

    # Read following data, prefix the followee-level columns so they cannot be
    # confused with congressperson-level columns joined in later
    following_table = pd.read_csv(filepath).rename(columns={
        'user_name': 'followee',
        'followers_count': 'followee_followers_count',
        'following_count': 'followee_following_count',
    })
    following_table['congressperson'] = congressperson
    df_list.append(following_table)

# Concat into master following dataframe; ignore_index avoids carrying the
# repeated per-file row labels into the combined table
full_following_table = pd.concat(df_list, ignore_index=True)

# Data value formatting
full_following_table['followee'] = full_following_table['followee'].str.lower()
full_following_table['congressperson'] = full_following_table['congressperson'].str.lower()
# Extract year that followee account was created. expand=False yields a Series
# (not a one-column DataFrame). The regex takes the FIRST run of four digits.
# NOTE(review): assumes created_at starts with the year - confirm against the raw data.
full_following_table['created_at'] = (
    full_following_table['created_at'].str.extract(r'(\d{4})', expand=False).astype(int)
)
full_following_table = full_following_table[
    ['congressperson', 'followee', 'created_at', 'followee_followers_count', 'followee_following_count']
]

# Filter out followee accounts if created > 2015
full_following_table = full_following_table[full_following_table['created_at'] <= 2015]

# Filter out follows between congresspeople (retain only Congressperson -> noncongress follows)
# NOTE(review): the list is built from congresspeople that have rows in this table, not
# from all_congress_members, so follows to members without following data are kept -
# confirm this is intended.
congressperson_list = list(full_following_table['congressperson'].unique())
full_following_table = full_following_table[~full_following_table['followee'].isin(congressperson_list)]

# Print summary counts (outer double quotes keep the f-string valid on Python < 3.12)
print(f"Full following table produced: {full_following_table['congressperson'].nunique()} congresspeople, {full_following_table['followee'].nunique()} unique followees")
full_following_table.head(5)

Full following table produced: 164 congresspeople, 156200 unique followees


Unnamed: 0,congressperson,followee,created_at,followee_followers_count,followee_following_count
1,chuckgrassley,housegop,2008,1644576,852
2,chuckgrassley,senategop,2008,1566688,94
3,chuckgrassley,ronnamcdaniel,2014,1253363,2629
4,chuckgrassley,govmikehuckabee,2008,1914352,580
5,chuckgrassley,newtgingrich,2009,2365337,1006


#### **1.3 Join info on congresspersons, save master table to CSV**

In [11]:
# Join info on congressperson to following list.
# validate='many_to_one' makes pandas raise a MergeError if all_congress_members has
# more than one row for a congressperson, which would otherwise silently duplicate links.
master_following = full_following_table.merge(
    all_congress_members, on='congressperson', how='left', validate='many_to_one'
)

# Rename columns so every column states whether it describes the followee or the congressperson
master_following = master_following.rename(columns={
    'created_at': 'followee_created_at',
    'party': 'congressperson_party',
    'dw.nom.1': 'congressperson_DW',
    'chamber': 'congressperson_chamber',
    'gender': 'congressperson_gender',
    'stdis': 'congressperson_state',
    'following_count': 'congressperson_following_count',
})

# Remove multiple follower/following counts for the same followee node -> retain maximum of possible values for each node
for count_col in ('followee_followers_count', 'followee_following_count'):
    master_following[count_col] = master_following.groupby('followee')[count_col].transform('max')

# Save final master following table to csv (create the output folder if it is missing)
os.makedirs('data/interim', exist_ok=True)
master_following.to_csv('data/interim/congress_master_following_table.csv', header=True, index=False)

# Summary counts (outer double quotes keep the f-strings valid on Python < 3.12)
print('TOTAL FOLLOWING DATAFRAME:')
print(f"{master_following['congressperson'].nunique()} unique congressmembers")
print(f"{master_following['followee'].nunique()} unique followees")
print(f"{len(master_following)} links")

# Preview must be the last expression of the cell to be rendered by the notebook
master_following.head(5)

TOTAL FOLLOWING DATAFRAME:
164 unique congressmembers
156200 unique followees
285574 links


#### **1.4 Split into House and Senate dataframes, save to CSVs**

In [12]:
def _print_following_summary(title, following_df):
    """Print `title` followed by the unique congressmember, unique followee and link (row) counts of `following_df`."""
    print(title)
    print(f"{following_df['congressperson'].nunique()} unique congressmembers")
    print(f"{following_df['followee'].nunique()} unique followees")
    print(f"{len(following_df)} links")


# Separate into House and Senate.
# Rows whose congressperson_chamber is neither 'House' nor 'Senate' (including NaN)
# end up in neither dataframe.
master_following_house = master_following[master_following['congressperson_chamber'] == 'House']
master_following_senate = master_following[master_following['congressperson_chamber'] == 'Senate']

_print_following_summary('HOUSE FOLLOWING DATAFRAME:', master_following_house)
print()  # blank line between the two summaries
_print_following_summary('SENATE FOLLOWING DATAFRAME:', master_following_senate)

# Save House and Senate following tables to csvs (create the output folder if it is missing)
os.makedirs('data/interim', exist_ok=True)
master_following_house.to_csv('data/interim/house_master_following_table.csv', header=True, index=False)
master_following_senate.to_csv('data/interim/senate_master_following_table.csv', header=True, index=False)

HOUSE FOLLOWING DATAFRAME:
109 unique congressmembers
116719 unique followees
205737 links

SENATE FOLLOWING DATAFRAME:
55 unique congressmembers
58729 unique followees
79837 links
