# Eurostat Web-Crawling & Extraction Tool

In [None]:
import os, json
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI
from extraction_utils import ask_chatgpt
from navigator_utils import search_google, split_markdown, clean_content
from save_utils import convert_for_saving, save_json
import pandas as pd

# Set your OpenAI API key here: replace the placeholder string with your own key.
# The value is stored in this process's environment under OPENAI_API_KEY.
os.environ.update({"OPENAI_API_KEY": "YOUR_API_KEY_HERE"})

***

This part of the notebook is the main entry point for running the crawling and LLM-based
information extraction pipeline for the Eurostat statistical competition.

- It reads the list of companies from the input CSV file set in the `CSV` variable below (e.g. `data/extraction_empty.csv`)
- It crawls the web for each company (annual reports and related pages)
- It uses an LLM to extract the target fields (ACTIVITY, COUNTRY, EMPLOYEES, TURNOVER, ASSETS, WEBSITE)
- It saves one JSON file per company inside the output folder set in the `PATH` variable below (e.g. `save/`).

In [None]:
# Input CSV file with company names
CSV = "./extraction.csv"

# JSON file containing system and user prompts
PROMPT_FILE = "./data/prompt.json"

# Directory to save each company results
PATH = "./data/save_data/"

In [None]:
# Load the input table (semicolon-separated, UTF-8 encoded).
df = pd.read_csv(CSV, sep=";", encoding="utf-8")

# Distinct company names, requested variables and IDs, in order of first appearance.
ent_list_ID = df["NAME"].unique()
field_key = df['VARIABLE'].unique()
id_list = [str(i) for i in df["ID"].unique()]

# One (ID, NAME) pair per company ID, with both values read from the same row.
# Zipping two independently de-duplicated columns would misalign (and truncate)
# the pairs as soon as two IDs share a name or a name is repeated.
_id_name_rows = df.drop_duplicates(subset="ID")[["ID", "NAME"]]
ent_list_ID_tuple = list(_id_name_rows.itertuples(index=False, name=None))

# Search wording associated with each target variable; passed to search_google as `field`.
field = {
    'ACTIVITY': '',
    'COUNTRY': 'Headquarter country',
    'EMPLOYEES': 'number of employees',
    'TURNOVER': 'turnover or  net revenue',
    'ASSETS': 'total assets',
    'WEBSITE': 'official website',
}

In [None]:
## loading prompt for extraction
# Parse the whole prompt file, then keep the 'PROMPT' section, which is
# indexed by field name when the extraction prompts are built.
diz_prompt = json.loads(Path(PROMPT_FILE).read_text(encoding='utf-8'))
prompt_diz = diz_prompt['PROMPT']

In [None]:
YEAR = 2024
results = {}

for id, company_name in tqdm(ent_list_ID_tuple):
    results[company_name]= {}
    links_ricerca = search_google(company_name, YEAR, field=field, n_pdf=3, n_nopdf=5)
    for k, v in links_ricerca.items():
        print(f"Processing {company_name} - {k}")
        if len(v) == 0:
            continue
        for link in v:
            url = link['url']
            print(url)
            year = link['year']
            try:
                extract = await clean_content(link['url'])
                print('extract!')
                if extract.markdown:
                    try:
                        markdown = extract.markdown.fit_markdown
                        if len(markdown) <= 1:
                            continue
                        elif len(markdown) < 10000:
                            chat_extract = ask_chatgpt(link, markdown, user_prompt=prompt_diz[k].replace("{T}", str(link['year'])))
                            if 'null' not in chat_extract:
                                results[company_name][k] = [str(id), url, chat_extract, str(year)]
                                print(results[company_name][k])
                                break
                        else:
                            list_markdown=split_markdown(markdown)
                            for l in list_markdown:
                                chat_extract = ask_chatgpt(link, l, user_prompt=prompt_diz[k].replace("{T}", str(link['year'])))
                                if 'null' not in chat_extract:
                                    results[company_name][k] = [str(id), url, chat_extract, str(year)]
                                    print(results[company_name][k])
                                    break
                        if 'null' not in chat_extract:
                            break
                    except:
                        continue
                else:
                    continue
            except:
                continue
    with open(PATH+f"ris_{id}_{company_name.replace('/','-')}.json", "w", encoding='utf-8') as f:
        json.dump(results[company_name], f, indent=4, ensure_ascii=False)
            

***

This part of the notebook aggregates the results saved by the previous part into a single, complete CSV file containing all the extracted information.

In [None]:
# loading all data saved into a single dictionary
# The file list is kept in `json_files` rather than `list`: rebinding the builtin
# would break any later `list(...)` call when cells are re-run.
json_files = sorted(f for f in os.listdir(PATH) if f.endswith('.json'))
# Result files are named ris_<id>_<company>.json, so the id is the second '_' token.
list_id = [f.split('_')[1] for f in json_files]

diz = {}
for file_id, file_name in zip(list_id, json_files):
    # Files are written as UTF-8 with ensure_ascii=False, so read them back as UTF-8
    # instead of relying on the platform default encoding.
    with open(os.path.join(PATH, file_name), "r", encoding="utf-8") as fi:
        try:
            diz[file_id] = json.load(fi)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            # Best effort: skip unreadable files, but say which one was dropped.
            print(f"Skipping {file_name}: {exc!r}")

In [None]:
# Combine the per-company results (`diz`) with the input table (`df`).
# NOTE(review): the structure returned by convert_for_saving is not visible here — confirm.
TOT = convert_for_saving(diz, df)
# Persist the aggregated results.
# NOTE(review): CSV is the *input* file path; confirm whether save_json overwrites it
# or derives a different output name from it.
save_json(CSV, TOT, df)