In [101]:
# Compute HITS authority scores for the lobbying network, per member and meeting date (hub scores are computed but not used)
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

In [2]:
# Load the meetings/lobbyists table from the silver data directory.
# Later cells read its meeting_date, lobbyist_id, member_id and l_* topic columns.
meetings_csv = "./data/silver/df_meetings_lobbyists.csv"
df = pd.read_csv(meetings_csv)

In [None]:
# Topic columns of the meetings table that are summed per member below.
# NOTE(review): presumably per-meeting topic flags/counts on the lobbyist side —
# confirm against the silver dataset.
lobbyist_topics_columns = [
    "l_agriculture",
    "l_economics_and_trade",
    "l_education",
    "l_environment_and_climate",
    "l_foreign_and_security_affairs",
    "l_health",
    "l_human_rights",
    "l_infrastructure_and_industry",
    "l_technology",
]

# Build a cumulative panel with one row per (member, meeting date) holding:
#   * the member's topic totals over all meetings up to and including that date,
#   * each topic's share of that running total,
#   * the member's HITS authority in the lobbyist -> member graph of those meetings,
#   * share * authority for every topic.
#
# The per-date frames are collected in a list and concatenated once after the
# loop: growing a DataFrame with pd.concat inside the loop re-copies every
# earlier row on each iteration.
authority_frames = []

# Rows without a meeting_date cannot serve as a cut-off for the `<=` filter
# below, so missing dates are skipped.
meeting_dates = df["meeting_date"].dropna().sort_values().unique()

for date in tqdm(meeting_dates):
    # All meetings up to and including the current date.
    df_date = df[df["meeting_date"] <= date]

    # Directed graph lobbyist -> member. A DiGraph keeps a single edge per
    # pair, so repeated meetings between the same two parties add no weight.
    G = nx.from_pandas_edgelist(
        df_date, source="lobbyist_id", target="member_id", create_using=nx.DiGraph()
    )

    # Use NetworkX's HITS implementation (hub scores are not used below).
    hubs, authorities = nx.hits(G)

    new_df_authorities = pd.DataFrame(
        [{"id": k, "date": date, "authority": v} for k, v in authorities.items()]
    )

    # Cumulative topic totals per member.
    df_cum_meetings_topics = (
        df_date[["member_id", *lobbyist_topics_columns]].groupby("member_id").sum()
    )
    df_cum_meetings_topics["total_topics_cum"] = df_cum_meetings_topics.sum(axis=1)
    df_cum_meetings_topics = df_cum_meetings_topics.reset_index()

    # Share of each topic in the member's running total.
    # A member whose total is 0 gets NaN shares (0 / 0).
    for col in lobbyist_topics_columns:
        df_cum_meetings_topics[f"{col}_percentage"] = (
            df_cum_meetings_topics[col] / df_cum_meetings_topics["total_topics_cum"]
        )

    # Attach the authority score (and the snapshot date) to every member row;
    # the redundant join key "id" is dropped right away.
    df_cum_meetings_topics = df_cum_meetings_topics.merge(
        new_df_authorities,
        left_on="member_id",
        right_on="id",
        how="left",
    ).drop(columns="id")

    # Authority-weighted topic share.
    for col in lobbyist_topics_columns:
        df_cum_meetings_topics[f"{col}_authority_percentage"] = (
            df_cum_meetings_topics[f"{col}_percentage"]
            * df_cum_meetings_topics["authority"]
        )

    authority_frames.append(df_cum_meetings_topics)

# Single concatenation; fall back to an empty frame when there were no dates,
# because pd.concat raises on an empty list.
df_authorities = pd.concat(authority_frames) if authority_frames else pd.DataFrame()


100%|██████████| 1396/1396 [01:57<00:00, 11.84it/s]


In [293]:
from datetime import datetime  # no longer needed in this cell; kept in case later cells rely on the name

# Derive calendar-period labels from the "%Y-%m-%d" date strings.
# The column is parsed once and formatted through the vectorised .dt accessor
# instead of calling strptime twice per row inside apply().
authority_dates = pd.to_datetime(df_authorities["date"], format="%Y-%m-%d")
df_authorities["Y-m"] = authority_dates.dt.strftime("%Y-%m")
# %W is the week of the year with Monday as the first weekday; days before the
# year's first Monday fall into week 00, so a calendar week that straddles
# New Year is split into two labels.
df_authorities["Y-w"] = authority_dates.dt.strftime("%Y-W%W")

# Monthly panel: median of every remaining column per (member, month).
df_ym = (
    df_authorities.drop(columns=["Y-w", "date"])
    .groupby(["member_id", "Y-m"])
    .median()
    .reset_index()
)

# Weekly panel: median of every remaining column per (member, week).
df_yw = (
    df_authorities.drop(columns=["date", "Y-m"])
    .groupby(["member_id", "Y-w"])
    .median()
    .reset_index()
)

In [296]:
# Summary statistics of the monthly panel, one row per column for readability.
df_ym.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
l_agriculture,28082.0,13.526351,24.245832,0.0,1.0,5.0,15.0,260.0
l_economics_and_trade,28082.0,29.735115,48.322843,0.0,4.0,12.0,35.0,612.0
l_education,28082.0,16.343601,26.481876,0.0,2.0,6.0,20.0,395.0
l_environment_and_climate,28082.0,27.415088,46.468246,0.0,3.0,10.0,32.0,582.0
l_foreign_and_security_affairs,28082.0,24.115181,38.71592,0.0,4.0,10.0,29.0,527.0
l_health,28082.0,20.157645,32.7135,0.0,3.0,8.0,24.0,433.0
l_human_rights,28082.0,15.201036,26.58519,0.0,2.0,6.0,17.0,416.0
l_infrastructure_and_industry,28082.0,28.024375,47.923003,0.0,3.0,11.0,32.0,586.0
l_technology,28082.0,27.267841,45.219566,0.0,4.0,11.0,32.0,577.0
total_topics_cum,28082.0,201.786233,325.372926,1.0,28.0,81.0,239.0,4274.0


In [297]:
# Write the daily, monthly and weekly panels to the gold data directory
# (same files and same write order as before, without the index column).
gold_outputs = {
    "./data/gold/panel_data_graph_v020704.csv": df_authorities,
    "./data/gold/panel_data_graph_Ym_v020704.csv": df_ym,
    "./data/gold/panel_data_graph_Yw_v020704.csv": df_yw,
}
for csv_path, panel in gold_outputs.items():
    panel.to_csv(csv_path, index=False)