In [1]:
import requests
import urllib.parse
from src.ParliamentDataHandler import ParliamentDataHandler
import pandas as pd
import json

In [2]:
# Shared handler instance; later cells call its scrape_meetings_in_parallel
# and get_questions_details methods.
handler = ParliamentDataHandler()

# MEPs

In [None]:
# URL template for a single MEP record; `{mep_id}` is substituted below.
api_endpoint = "https://api.europarl.europa.eu/documents/mep/{mep_id}"

# Replace with the MEP's identifier
mep_id = "person/1"

# Make the API request. An explicit timeout is passed because `requests`
# otherwise waits indefinitely on a stalled connection, which would hang the
# cell (and any "Run All").
# NOTE(review): `response` is not inspected anywhere in this cell - check
# `response.status_code` before relying on the payload.
response = requests.get(api_endpoint.format(mep_id=mep_id), timeout=30)


# Meetings

In [4]:
# Scraping window. NOTE(review): the values look like DD/MM/YYYY
# ('14/10/2024' cannot be MM/DD) - confirm against the handler.
init_date = '01/01/2024'
end_date = '14/10/2024'

# CSV that accumulates meetings across runs of this cell.
MEETINGS_CSV = './data/meetings.csv'

# Execute the batch scraping with concurrency
meetings = handler.scrape_meetings_in_parallel(init_date, end_date)

df_new = pd.DataFrame(meetings)

# Merge with the previous export when there is one. On a first run the file
# is missing (or empty), and failing here would throw away the scrape that
# just finished, so fall back to writing only the new rows.
try:
    df_meetings = pd.read_csv(MEETINGS_CSV)
except (FileNotFoundError, pd.errors.EmptyDataError):
    df_meetings = pd.DataFrame()
    df_combined = df_new
else:
    df_combined = pd.concat([df_meetings, df_new])

# Re-running the cell over an overlapping window must not duplicate rows.
df_combined.drop_duplicates().to_csv(MEETINGS_CSV, index=False)

10it [00:04,  2.38it/s]


# Questions

In [3]:
# Disabled refresh path, kept for reference: uncomment to rebuild
# ./data/questions.csv from the handler instead of reading the saved copy.
# questions = handler.get_questions()
# print(len(questions))
# pd.DataFrame(questions).to_csv('./data/questions.csv', index=False)

# Default path: load the saved CSV and expose it as a list with one dict per
# row, which is the shape the batching cell below iterates over.
questions_index = pd.read_csv('./data/questions.csv')
questions = questions_index.to_dict(orient='records')

In [None]:
from tqdm import tqdm
import concurrent.futures
import os

from src.ParliamentDataHandler import RateLimiter

# Manual cool-down, currently disabled.
# NOTE(review): presumably used to wait out a full rate-limit window before
# re-running this cell - confirm before re-enabling.
# time.sleep(300)

# Single limiter instance used by every call to fetch_question_details below,
# including the calls that run concurrently in the thread pool.
# NOTE(review): assumes RateLimiter.wait_if_needed() is safe to call from
# multiple threads - confirm in src.ParliamentDataHandler.
rate_limiter = RateLimiter(max_requests=500, time_window=300)  # 500 requests per 5 minutes

def fetch_question_details(identifiers, cache_dir='./data/questions'):
    """Return the details for each question identifier, using an on-disk cache.

    For every truthy identifier the function first looks for
    ``<cache_dir>/<identifier>.json``. A readable cache file is returned as-is;
    otherwise the details are requested through ``handler`` (after waiting on
    the shared ``rate_limiter``) and, when non-empty, written to the cache.

    Args:
        identifiers: Iterable of question identifiers. Falsy entries
            (``None``, ``''``) are skipped and produce no result.
        cache_dir: Directory holding one JSON file per identifier. It is
            created if it does not exist yet.

    Returns:
        A list with one entry per truthy identifier, in input order. Each
        entry is whatever the cache file or
        ``handler.get_questions_details`` yielded.
    """
    # Create the cache directory up front: without it the first write fails
    # only after an API request has already been spent.
    os.makedirs(cache_dir, exist_ok=True)

    results = []
    for identifier in identifiers:
        if not identifier:
            continue

        cache_path = f'{cache_dir}/{identifier}.json'

        if os.path.exists(cache_path):
            try:
                with open(cache_path, 'r', encoding='utf-8') as f:
                    results.append(json.load(f))
                continue
            except ValueError:
                # The file is not valid JSON (e.g. truncated by an interrupted
                # run). Treat it as a cache miss and fetch again; a successful
                # fetch below overwrites the broken file.
                pass

        rate_limiter.wait_if_needed()
        new_data = handler.get_questions_details(identifier)
        results.append(new_data)

        # Only persist non-empty payloads. NOTE(review): this assumes the
        # handler returns an empty/falsy value when a request fails - confirm.
        # Caching such a value would stop the identifier from ever being
        # retried on later runs. Empty results cached by earlier runs are
        # still served from disk; delete those files to force a retry.
        if new_data:
            with open(cache_path, 'w', encoding='utf-8') as f:
                json.dump(new_data, f)
    return results

# Chunk the question identifiers into consecutive groups of `batch_size`;
# the final group holds whatever is left over. Missing identifiers are kept
# here as None and are skipped later by fetch_question_details.
batch_size = 10
all_identifiers = [question.get("identifier") for question in questions]
question_batches = [
    all_identifiers[start:start + batch_size]
    for start in range(0, len(all_identifiers), batch_size)
]

# Cap the pool at 10 threads, and never more than 500 // batch_size, so the
# number of concurrent workers stays small relative to the rate limit.
max_workers = min(10, 500 // batch_size)

# Flat list of every per-question result, in batch completion order.
final = []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Queue one task per batch.
    pending = []
    for batch in question_batches:
        pending.append(executor.submit(fetch_question_details, batch))

    # Collect results as batches finish; the progress bar counts batches.
    finished = concurrent.futures.as_completed(pending)
    for done in tqdm(finished, total=len(question_batches)):
        final.extend(done.result())


 90%|████████▉ | 6555/7322 [10:27:57<1:41:07,  7.91s/it] 

An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2014-006388?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)
An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2015-000938?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)
An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2015-001909?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)
An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2014-010514?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)
An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2015-004492?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)
An error occurred: U

 92%|█████████▏| 6738/7322 [10:45:15<34:30,  3.55s/it]  

An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2016-003349?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)
An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2016-002003?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)


 92%|█████████▏| 6739/7322 [10:46:07<2:55:02, 18.01s/it]

An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2016-002202?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)
An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2016-002179?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)


 92%|█████████▏| 6740/7322 [10:46:12<2:16:09, 14.04s/it]

An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2016-005566?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)
An error occurred: URL - https://data.europarl.europa.eu/api/v2/parliamentary-questions/P-8-2016-003971?format=application%2Fld%2Bjson&language=en ERROR: Expecting value: line 1 column 1 (char 0)


100%|██████████| 7322/7322 [11:43:40<00:00,  5.77s/it]  


In [32]:
# Keep the first element of every non-empty result and drop empty ones.
# NOTE(review): assumes each entry of `final` is a sequence whose first
# element is the question record - confirm against get_questions_details.
question_rows = []
for details in final:
    if len(details) > 0:
        question_rows.append(details[0])

df = pd.DataFrame(question_rows)
df.to_csv('./data/questions_details.csv', index=False)