In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

In [2]:
# Read the panel CSV and convert its "date" column to datetimes in one chain.
df = pd.read_csv("./data/gold/panel_data_3.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [3]:
# Per-MEP totals of questions and meetings (result is indexed by mep_id).
df_questions_and_meetings = df.groupby("mep_id")[["questions", "meetings"]].sum()

# Per-MEP maximum of every column from the fifth one onwards; the columns are
# picked by position, so this depends on the column order of the input file.
df_mep_data = df.groupby("mep_id")[list(df.columns[4:])].max()

# Combine both per-MEP tables on the mep_id index and add log(questions + 1).
df_cros = df_questions_and_meetings.join(df_mep_data).assign(
    questions_log=lambda frame: np.log(frame["questions"] + 1)
)

# Aggregate membership variables

In [4]:
from collections import defaultdict


# Membership dummy columns are recognised by the " - " separator in their name;
# the stripped text before the separator is taken as the membership type.
# dict.fromkeys removes duplicates while keeping first-seen column order.
unique_memberships = list(
    dict.fromkeys(
        c.split(" - ")[0].strip() for c in df_cros.columns if len(c.split(" - ")) > 1
    )
)

# Source columns for each membership type. Matching is by substring (`m in c`),
# the same rule the panel-level aggregation later in the notebook uses.
# NOTE(review): a type name that is a substring of another column name would
# absorb that column as well — confirm the column naming rules this out.
membership_source_columns = {
    m: [c for c in df_cros.columns if m in c] for m in unique_memberships
}

# One column per membership type holding the row-wise maximum of its source
# columns. DataFrame.max(axis=1) skips NaN, whereas the built-in max() over the
# row values returns an order-dependent result when a NaN is present. It also
# replaces a Python-level loop over every row, type and column.
df_m_summary = pd.DataFrame(
    {m: df_cros[cols].max(axis=1) for m, cols in membership_source_columns.items()},
    index=df_cros.index,
)

# Columns that did not feed any membership type are carried over unchanged.
new_columns = [
    c for c in df_cros.columns if not any(m in c for m in unique_memberships)
]

# Cross-section with the detailed membership columns replaced by one column per type.
df_cros_summ = df_cros[new_columns].join(df_m_summary)

In [5]:
# Remove three aggregated membership columns from the cross-section.
# Like the original `del` statements, this raises KeyError if a column is
# missing (for example when the cell is run a second time).
df_cros_summ = df_cros_summ.drop(
    columns=["EU_INSTITUTION", "EU_POLITICAL_GROUP", "NATIONAL_CHAMBER"]
)

In [6]:
# Write the cross-section to disk; the index is kept, so it becomes the first CSV column.
df_cros_summ.to_csv(path_or_buf="./data/gold/cross_section.csv", index=True)

# Treatment data

In [7]:
# Panel rows ordered by MEP and date, with a 0/1 flag for rows that have at
# least one meeting.
df_treatment_period = (
    df[["mep_id", "date", "meetings"]]
    .sort_values(["mep_id", "date"])
    .assign(is_treatment=lambda frame: (frame["meetings"] > 0).astype(int))
)

# Earliest date with a meeting for each MEP; MEPs without any meeting are absent.
df_initial_treatment_dates = (
    df_treatment_period.loc[df_treatment_period["is_treatment"] == 1]
    .groupby("mep_id")[["date"]]
    .min()
)

# One row per MEP: total meetings, a 0/1 "treatment" flag (any meeting at all)
# and the first meeting date (missing for MEPs that never had a meeting).
df_treatment = (
    df.groupby("mep_id")[["meetings"]]
    .sum()
    .assign(treatment=lambda frame: (frame["meetings"] > 0).astype(int))
    .join(df_initial_treatment_dates)
    .rename(columns={"date": "initial_treatment_date", "meetings": "total_meetings"})
)

# Aggregate membership columns in panel data

In [8]:
# For every membership type, take the row-wise maximum over all panel columns
# whose name contains that type, and collect the results as one frame that
# shares the row index of `df`.
df_memeberships = pd.DataFrame(
    {
        m: df.loc[:, [c for c in df.columns if m in c]].max(axis=1)
        for m in unique_memberships
    }
)

# Add treatment data

In [9]:
# Attach the per-MEP treatment table to every panel row. The panel's "mep_id"
# column is matched against the index of df_treatment; the join type is the
# default inner join, written out explicitly here.
df_final = pd.merge(
    left=df,
    right=df_treatment,
    how="inner",
    left_on="mep_id",
    right_index=True,
)

In [10]:
# True on rows dated on or after the MEP's first meeting date.
df_final["treatment_started"] = df_final["date"].ge(
    df_final["initial_treatment_date"]
)

# 0/1 indicator: the MEP is treated AND the row is at or after the first meeting.
df_final["received_treatment_and_started"] = (
    df_final["treatment"].mul(df_final["treatment_started"]).astype(int)
)

In [11]:
# log(questions + 1) for every panel row.
# NOTE(review): the column name is spelled "quetions_log" (the cross-section
# uses "questions_log"); it is kept as is because the column selection further
# down and the exported CSV use this exact spelling.
df_final["quetions_log"] = df_final["questions"].add(1).pipe(np.log)

## Add memberships

In [12]:
# Append the aggregated membership columns, aligned on the row index (left join,
# which is the default of DataFrame.join).
# NOTE(review): this relies on df_final still carrying the row labels of `df`
# after the merge — confirm. Re-running this cell raises because the columns
# would then already exist.
df_final = df_final.join(df_memeberships, how="left")

In [13]:
def _columns_containing(frame, token):
    """Return the column names of `frame` that contain `token`, in column order."""
    return [name for name in frame.columns if token in name]


# Identifier, outcome and treatment columns that are always exported.
important_columns = [
    "mep_id",
    "date",
    "questions",
    "quetions_log",
    "meetings",
    "total_meetings",
    "treatment",
    "initial_treatment_date",
    "treatment_started",
    "received_treatment_and_started",
]

# Columns selected by a substring of their name.
country_columns = _columns_containing(df_final, "country")
p_groups_columns = _columns_containing(df_final, "political_group")

# The aggregated membership columns are named after the membership types.
membership_columns = unique_memberships

# Final export order: fixed columns, then countries, political groups, memberships.
columns_to_keep = [
    *important_columns,
    *country_columns,
    *p_groups_columns,
    *membership_columns,
]


In [14]:
# Restrict the panel to the export columns, in the order given by columns_to_keep.
df_final = df_final.loc[:, columns_to_keep]


In [16]:
# Write the treated panel to disk without the row index.
df_final.to_csv(path_or_buf="./data/gold/panel_data_treated.csv", index=False)