# 0) Import

In [10]:
import pandas as pd
import httpx
from selectolax.parser import HTMLParser
from bs4 import BeautifulSoup
import json
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

In [11]:
def save_dict_to_json(dictionary, path):
    """Serialize ``dictionary`` as JSON and write it to the file at ``path``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(path, 'w') as handle:
        handle.write(json.dumps(dictionary))

# 1) Creazione risorsa Uffici Finanziari

In [None]:
# Download step 0 of the registration form and collect every <option> element
# of the page as a mapping "value attribute -> stripped label".
url = 'https://www1.agenziaentrate.gov.it/servizi/tassazioneattigiudiziari/registrazione.htm?passo=0'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
}

resp = httpx.get(url, headers=headers)

dict_uffici = {}
errors = []
if resp.status_code != 200:
    # Any non-200 answer leaves dict_uffici empty.
    print('Not entered')
else:
    html = HTMLParser(resp.text)
    uffici = html.css("option")
    for ufficio in uffici:
        # Options sharing the same value overwrite each other; an option
        # without a value attribute is stored under the key None.
        value = ufficio.attributes.get('value')
        dict_uffici[value] = ufficio.text().strip()

In [12]:
# One record per scraped option: the option value goes in "code" and its
# label in "office".
data = [
    {"code": code, "office": office}
    for code, office in dict_uffici.items()
]
uffici_finanziari = pd.DataFrame(data=data)

# Persist the table as a spreadsheet, leaving out the index column.
uffici_finanziari.to_excel(
    excel_writer='Resources/uffici_finanziari.xlsx',
    index=False,
)

In [None]:
# Reload the saved office table and skip its first row
# (presumably the placeholder entry of the form's select - TODO confirm).
uffici = pd.read_excel(io='Resources/uffici_finanziari.xlsx').iloc[1:]

# Lower-cased office labels, in the order they appear in the spreadsheet.
lista_uffici = uffici.loc[:, 'office'].str.lower().tolist()

# 2) Creazione risorsa Enti

In [None]:
def click_button(driver, button):
    """Click ``button``, retrying through JavaScript when the click raises.

    Any exception raised by ``button.click()`` is swallowed and the click is
    issued again with ``driver.execute_script``.
    """
    js_click = "arguments[0].click();"
    try:
        button.click()
    except Exception:
        driver.execute_script(js_click, button)



 
def fetch_uffici(driver, comune):
    """Return the option labels of the "ente" select for one office.

    On the page currently loaded in ``driver`` the function types ``comune``
    into ``select#ufficio``, confirms with ENTER, clicks ``input#avanti`` and
    then reads every ``<option>`` of ``select#ente``.

    Args:
        driver: Selenium WebDriver with the form page already loaded.
        comune: Text typed into the office select to pick an entry.

    Returns:
        A list with the text of each option of ``select#ente`` (no filtering
        is applied, so blank labels are included). An empty list when that
        select is not present on the page.
    """
    # Local import: the file's import block does not provide this name.
    from selenium.common.exceptions import TimeoutException

    ufficio_locator = (By.CSS_SELECTOR, "select#ufficio.form-control")
    avanti_locator = (By.CSS_SELECTOR, "input#avanti.btn.btn-primary")
    ente_locator = (By.CSS_SELECTOR, "select#ente.form-control")

    submitted = False
    try:
        select_ufficio = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(ufficio_locator)
        )
        select_ufficio.send_keys(comune)
        select_ufficio.send_keys(Keys.ENTER)

        avanti = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(avanti_locator)
        )
        click_button(driver, avanti)
        submitted = True
    except Exception as exc:
        # Best effort: report the failure and fall through to the lookup
        # below. Unlike a bare ``except:``, Ctrl-C still interrupts the run.
        print('Non riuscito', type(exc).__name__)

    if submitted:
        # click_button may fall back to a JavaScript click, so give the next
        # page a chance to render before looking for the select.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(ente_locator)
            )
        except TimeoutException:
            pass

    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises NoSuchElementException and would abort the caller.
    matches = driver.find_elements(*ente_locator)
    if not matches:
        print('Not entered in if.')
        return []

    select_ente = matches[0]
    click_button(driver, select_ente)
    return [
        option.text
        for option in select_ente.find_elements(By.CSS_SELECTOR, 'option')
    ]
    

 
def main(chunk_size=150):
    """Scrape the "ente" options for every office in ``lista_uffici``.

    The office list is processed in consecutive chunks of ``chunk_size``
    entries. After each chunk the collected options are written to
    ``Resources/enti_<N>.json`` and the offices that returned no options to
    ``Errors/errors_enti_<N>.txt``, where ``N`` is the chunk's upper bound
    (150, 300, ...). The error list is not reset between chunks, so each
    error file contains all failures recorded so far.

    Args:
        chunk_size: Number of offices handled per output file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    errors = []
    comuni_list = lista_uffici.copy()
    url = 'https://www1.agenziaentrate.gov.it/servizi/tassazioneattigiudiziari/registrazione.htm?passo=0'

    # Make sure both output folders exist before the long scraping run.
    os.makedirs('Resources', exist_ok=True)
    os.makedirs('Errors', exist_ok=True)

    options = webdriver.EdgeOptions()
    driver = webdriver.Edge(options=options)
    try:
        driver.maximize_window()

        # Walk the whole list; fixed slices would drop offices past the
        # last hard-coded boundary.
        for start in range(0, len(comuni_list), chunk_size):
            chunk = comuni_list[start:start + chunk_size]
            label = start + chunk_size
            dict_option = {}

            for j, comune in enumerate(chunk):
                print(j,' Processing comune:', comune)
                driver.get(url)
                tribunali = fetch_uffici(driver, comune)
                if tribunali:
                    dict_option[comune] = tribunali
                else:
                    errors.append(comune)

            save_dict_to_json(dict_option, f'Resources/enti_{label}.json')
            with open(f"Errors/errors_enti_{label}.txt", "w", encoding="utf-8") as file:
                # Formatting instead of ``+`` keeps non-string entries
                # (e.g. NaN from an empty cell) from raising TypeError.
                file.writelines(f"{item}\n" for item in errors)
    finally:
        # Always release the browser, even when a page interaction raises.
        driver.quit()

 
# Start the scraping run only when this file is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
def merge_json_files(folder_path):
    """Merge every ``*.json`` file of a folder into a single dictionary.

    Files are read in alphabetical order of their names. When the same key
    appears in several files, the value from the file that sorts last wins.

    Args:
        folder_path: Folder scanned (non-recursively) for ``.json`` files.

    Returns:
        A new dictionary with the entries of all files; empty when the folder
        holds no JSON file.
    """
    merged_dict = {}
    # os.listdir returns names in arbitrary order; sorting makes the result
    # reproducible when files share keys.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            merged_dict.update(json.load(file))
    return merged_dict

# Merge all JSON files stored in 'Resources' into one dictionary.
# Point 'folder_path' at another folder to merge a different set of files.
folder_path = 'Resources'
merged_dictionary = merge_json_files(folder_path)


In [None]:
def filter_empty_strings(dictionary):
    """Return a copy of ``dictionary`` with blank strings removed from lists.

    For every list value, items that are empty or whitespace-only are
    dropped. Values that are not lists are carried over untouched.
    """
    def _without_blanks(value):
        if not isinstance(value, list):
            return value
        return [entry for entry in value if entry.strip()]

    return {key: _without_blanks(value) for key, value in dictionary.items()}

# Drop empty / whitespace-only labels from every option list of the merged
# dictionary (merged_dictionary comes from merge_json_files).
merged_dictionary_filtered = filter_empty_strings(merged_dictionary)

In [None]:
def filter_keys_with_word(dictionary, word):
    """Return the entries of ``dictionary`` whose key contains ``word``.

    The check is a plain, case-sensitive substring test on each key.
    """
    matching = {}
    for key, value in dictionary.items():
        if word not in key:
            continue
        matching[key] = value
    return matching

# Keep only the entries whose key contains the substring 'ut'. This is a
# plain substring match, so keys where "ut" appears inside a longer word
# are kept as well.
filtered_dictionary = filter_keys_with_word(merged_dictionary_filtered, 'ut')

In [12]:
# Write the 'ut' subset next to the per-chunk files.
save_dict_to_json(filtered_dictionary, 'Resources/enti_completi_ut.json')