In [1]:
import pandas as pd
import json

In [2]:
# Load the raw MEP detail export and decode the 'hasMembership' column, which is
# stored as text using single-quoted strings: swapping the quotes lets each cell
# be parsed as JSON.
# NOTE(review): the quote swap would corrupt any value that itself contains an
# apostrophe or a double quote — confirm the column never does.
df_meps_raw = pd.read_csv('data/mep_detail.csv').assign(
    hasMembership=lambda frame: (
        frame['hasMembership']
        .str.replace("'", '"')
        .apply(json.loads)
    )
)

In [3]:
from datetime import datetime, timedelta
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

def generate_dates(start_date: datetime, end_date: datetime):
    """Return the daily sequence from ``start_date`` up to ``end_date``.

    Values are spaced exactly one day apart starting at ``start_date``; the
    last value is the latest one that is still ``<= end_date``. The result is
    an empty list when ``start_date`` is after ``end_date``.
    """
    # timedelta.days is floored, so a negative span yields a non-positive count
    # and therefore an empty range.
    whole_days = (end_date - start_date).days
    return [start_date + timedelta(days=offset) for offset in range(whole_days + 1)]

def create_membership_timeline(mep_id, memberships_data, dates: list[datetime] | None):
    if not dates:
        # Get all unique dates where changes happen
        dates = set()
        for membership in memberships_data:
            member_during = membership.get('memberDuring', {})
            start_date = member_during.get('startDate')
            end_date = member_during.get('endDate')
            
            if start_date:
                dates.add(start_date)
            if end_date:
                end_dt = datetime.strptime(end_date, '%Y-%m-%d')
                dates.add(end_dt.strftime('%Y-%m-%d'))
    
    dates = sorted(dates)
    
    # Initialize results structure
    timeline = []
    
    # For each date, check active memberships
    for date in dates:
        date_data = defaultdict(int)
        if isinstance(date, str):
            current_date = datetime.strptime(date, '%Y-%m-%d')
        else:
            current_date = date
        
        # Add the MEP ID
        date_data['ID'] = mep_id
        date_data['date'] = date
        
        # Check each membership
        for membership in memberships_data:
            member_during = membership.get('memberDuring', {})
            start_date = member_during.get('startDate')
            end_date = member_during.get('endDate')
            
            # Check if membership is active on this date
            is_active = False
            if start_date:
                start_dt = datetime.strptime(start_date, '%Y-%m-%d')
                if end_date:
                    end_dt = datetime.strptime(end_date, '%Y-%m-%d')
                    is_active = start_dt <= current_date <= end_dt
                else:
                    is_active = start_dt <= current_date
            
            if is_active:
                # Handle country representation
                if 'represents' in membership:
                    date_data['COUNTRY'] = membership['represents'].split('/')[-1]
                
                # Handle organization (like political group)
                org = membership.get('organization', '')
                if org:
                    org_type = membership.get('membershipClassification', '').split('/')[-1]
                    if org_type == 'EU_POLITICAL_GROUP':
                        date_data['POLITICAL_GROUP'] = org.split('/')[-1]
                
                # Handle role
                entity = membership.get('membershipClassification', '').split('/')[-1]
                role = membership.get('role', '').split('/')[-1]
                date_data[f"{entity} - {role}"] = 1
                
        
        timeline.append(dict(date_data))
    
    # Convert to DataFrame
    df = pd.DataFrame(timeline)
    
    return df

# Selecionar datas dos outros dois dataframes

Utilizar a base inteira resultaria em um dataframe desnecessariamente pesado; por isso, usamos apenas as datas e os MEPs presentes nos dataframes de perguntas e de reuniões.


In [4]:
# Period tables from the silver layer; the first CSV column becomes the index.
df_questions_ymd = pd.read_csv(
    './data/silver/df_questions_by_period_YYYY-MM-DD.csv',
    index_col=0,
)
df_meetings_ymd = pd.read_csv(
    './data/silver/df_meetings_by_period_YYYY-MM-DD.csv',
    index_col=0,
)

# Distinct column labels across both tables, sorted as text and then parsed
# into datetimes.
dates = [
    datetime.strptime(label, "%Y-%m-%d")
    for label in sorted({*df_questions_ymd.columns, *df_meetings_ymd.columns})
]

# Distinct index values across both tables, converted to plain ints (used as MEP ids).
mep_ids = [
    int(raw_id)
    for raw_id in set([*df_questions_ymd.index, *df_meetings_ymd.index])
]

# Run

In [17]:
# Keep only the MEPs whose person id appears in `mep_ids`.
df_to_run = df_meps_raw[df_meps_raw['notation_codictPersonId'].isin(mep_ids)].reset_index(drop=True)

# Build one timeline per MEP and collect them in a list. Concatenating once at
# the end avoids re-copying the ever-growing result on every iteration, which
# is what made the previous concat-inside-the-loop version quadratic.
timelines = []
for mep_uri, memberships in tqdm(
    zip(df_to_run['id'], df_to_run['hasMembership']),
    total=len(df_to_run),
):
    mep_id = mep_uri.split('/')[-1]  # last path segment of the 'id' value
    timelines.append(create_membership_timeline(mep_id, memberships, dates))

# pd.concat raises ValueError if the filter above matched no MEP at all.
df_final = pd.concat(timelines).sort_values(by=['ID', 'date'])

df_final.to_csv('./data/silver/mep_membership_timeline.csv', index=False)


100%|██████████| 1353/1353 [33:19<00:00,  1.48s/it]
