In [5]:
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Wide activity tables: the first CSV column becomes the row index and the
# remaining columns are treated downstream as period labels.
df_questions_ymd = pd.read_csv(
    './data/silver/df_questions_by_period_YYYY-MM-DD.csv',
    index_col=0,
)
df_meetings_ymd = pd.read_csv(
    './data/silver/df_meetings_by_period_YYYY-MM-DD.csv',
    index_col=0,
)

# MEP membership timeline, read with the default integer index.
df_meps = pd.read_csv('./data/silver/mep_membership_timeline.csv')

In [7]:
# Union of the period labels found in either table, in sorted order.
all_periods = sorted(set(df_questions_ymd.columns) | set(df_meetings_ymd.columns))

# Give both tables the same column layout; a period that is absent from one
# table is added to it as a column of zeros.
df_questions = df_questions_ymd.reindex(columns=all_periods, fill_value=0)
df_meetings = df_meetings_ymd.reindex(columns=all_periods, fill_value=0)

# Combine dataframes



In [8]:
# Wide -> long: one row per (index value, period). The original index is kept
# during the melt and then turned into a regular column by reset_index(), so
# the repeated index values produced by the melt do not get in the way of the join.
df_questions_melted = (
    df_questions
    .melt(ignore_index=False, var_name='date', value_name='questions')
    .reset_index()
)
df_meetings_melted = (
    df_meetings
    .melt(ignore_index=False, var_name='date', value_name='meetings')
    .reset_index()
)

# Inner join of the two long tables on member and period.
df_combined = df_questions_melted.merge(
    df_meetings_melted,
    on=['member_id', 'date'],
)

# Aggregate the data by week
# df_combined['date'] = pd.to_datetime(df_combined['date']).dt.strftime('%Y-%U')
# df_combined = df_combined.groupby(['member_id', 'date']).sum().reset_index()


# Add MEP data

In [9]:
# Make the member key an integer. The cast on df_combined is kept for any code
# that still reads that frame, but df_combined is NOT what gets joined below.
df_combined["member_id"] = df_combined["member_id"].astype(int)

# The frames actually joined to df_meps are the two melted tables, so the key
# is cast on them as well. astype() returns a copy, leaving the melted frames
# themselves unchanged.
# NOTE(review): assumes df_meps["ID"] is numeric so the keys are comparable - confirm.
#
# Left joins keep every row of df_meps; rows with no matching (member, date)
# get NaN in `questions` / `meetings`. Because both right-hand frames carry a
# `member_id` column, the result ends up with `member_id_x` and `member_id_y`
# next to `ID` (the names excluded later via cols_to_ignore).
df = df_meps.merge(
    df_questions_melted.astype({"member_id": int}),
    left_on=["ID", "date"],
    right_on=["member_id", "date"],
    how="left",
).merge(
    df_meetings_melted.astype({"member_id": int}),
    left_on=["ID", "date"],
    right_on=["member_id", "date"],
    how="left",
)

# Fill NA

In [13]:
# Replace every missing value with 0. This applies to all columns, not only
# the `questions` / `meetings` counts, so a missing POLITICAL_GROUP or COUNTRY
# also becomes 0.
df = df.fillna(value=0)

# Add dummies

In [14]:
# Names of the indicator columns created in this cell, in creation order.
new_columns = []

# One 0/1 column per political group. Group 0 is skipped so that it acts as
# the reference category.
political_groups = df.loc[df["POLITICAL_GROUP"] != 0, "POLITICAL_GROUP"].unique()
for group_code in political_groups:
    indicator = f"political_group_{int(group_code)}"
    df[indicator] = df["POLITICAL_GROUP"].eq(group_code).astype(int)
    new_columns.append(indicator)

# One 0/1 column per country. "FRA" is skipped so that it acts as the
# reference category.
countries = df.loc[df["COUNTRY"] != "FRA", "COUNTRY"].unique()
for country_code in countries:
    indicator = f"country_{country_code}"
    df[indicator] = df["COUNTRY"].eq(country_code).astype(int)
    new_columns.append(indicator)

# Clean columns

In [22]:
# Columns left out of the automatic (alphabetical) column selection.
cols_to_ignore = (
    # Placed explicitly, in a fixed order, when the output frame is assembled.
    ['date', 'meetings', 'questions']
    # Identifier columns: the MEP key and the duplicated join keys from the merges.
    + ["ID", "member_id_x", "member_id_y"]
    # Raw categoricals, already encoded as indicator columns.
    + ["POLITICAL_GROUP", "COUNTRY"]
    # Role columns whose names start with " - ".
    # NOTE(review): presumably produced upstream with an empty prefix - verify.
    + [
        " - CHAIR",
        " - CHAIR_VICE",
        " - MEMBER",
        " - MEMBER_PARLIAMENT",
        " - PRESIDENT",
        " - PRESIDENT_PARLIAMENT_STOA",
        " - PRESIDENT_VICE",
        " - QUAESTOR",
    ]
)

In [28]:
# Every column not listed in cols_to_ignore, in alphabetical order.
cols_to_select = sorted(c for c in df.columns if c not in cols_to_ignore)

# Output layout: identifier, period and the two counts first, followed by the
# alphabetically sorted remainder. "ID" is exposed as "mep_id".
df_gold = df[["ID", "date", "questions", "meetings"] + cols_to_select].rename(
    columns={"ID": "mep_id"}
)

In [29]:
# Persist the panel to the gold layer; the row index is not written.
df_gold.to_csv(
    './data/gold/panel_data_3.csv',
    index=False,
)