# Come installare la cartella con tutti i documenti

1.   Vai su Google Drive.
2.   Trova la cartella condivisa "progettoIngDati".
3.   Fai clic con il pulsante destro del mouse sulla cartella e   
     seleziona :
     - Organizza.
     - Aggiungi scorciatoia
     - Tutte le posizioni
4.   Scegli "My Drive" come destinazione per il collegamento.

In [None]:
# This mounts your own Google Drive in the Colab runtime (we will not be able to see it).
# Before running it, make sure you have followed the instructions above!
import os

from google.colab import drive

drive.mount('/content/drive')

# Shared project folders, reached through the shortcut added to the Drive.
articoli_path = '/content/drive/MyDrive/progettoIngDati/sources'
extraction_path = '/content/drive/MyDrive/progettoIngDati/extraction'

# Sanity check: report whether the articles folder is visible through the mount.
print(
    "La cartella articoli esiste!"
    if os.path.exists(articoli_path)
    else "Impossibile trovare la cartella articoli."
)

Mounted at /content/drive
La cartella articoli esiste!


# Ogni volta che ci serve tutta la cartella basta copiare questa cella di codice e inserirla all'inizio del colab dove ci servono gli articoli! Successivamente avremo a disposizione i path `articoli_path` ed `extraction_path`.

In [None]:
import os
import json
from lxml import etree
import pandas as pd

In [None]:
def _normalizza_spazi(testo):
    """Collapse every run of whitespace (spaces, tabs, newlines) into a single space."""
    return ' '.join(testo.split())


def estrai_dati_da_file(file_path):
    """Extract every table wrapped in a <figure> from one HTML article.

    Args:
        file_path: Path of the HTML file to parse (read as UTF-8).

    Returns:
        A dict mapping "id_table_<n>" (n counts the tables successfully
        extracted, starting at 1) to a dict with the keys:
          - "caption": text of the figure's <figcaption>, whitespace-normalised
            ("" when there is none);
          - "table": list of rows, each a list with the serialised HTML of the
            row's <td> cells (the first <tr> of each row group is skipped);
          - "footnotes": list of footnote texts found in the document <footer>;
          - "references": whitespace-normalised text of every <p> that has a
            direct <a> child linking to the figure's id.

    A figure that fails to process is reported on stdout and skipped; the
    remaining figures are still extracted.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    parser = etree.HTMLParser()
    tree = etree.fromstring(content, parser)

    tabelle = {}
    table_counter = 0

    # The footnote lookup is document-wide, so it is done once instead of once
    # per figure. string(.) is used instead of `.text`, which is None (and made
    # `.strip()` raise) whenever an <li> starts with a child element.
    # NOTE(review): the original author marked this extraction as "not working";
    # the selector may not match how the source articles mark up footnotes — confirm.
    note_a_pie_di_pagina = []
    for footnote in tree.xpath('//footer//ol//li | //footer//ul//li'):
        testo_nota = _normalizza_spazi(footnote.xpath('string(.)'))
        if testo_nota:
            note_a_pie_di_pagina.append(testo_nota)

    # Known limitation: for article 2277 this does not match, because the table
    # data live in several <span> elements rather than in a <table>
    # (the <figcaption> is present nonetheless).
    # NOTE(review): a figure nested inside another figure makes both match, so
    # the same table may be emitted twice — confirm whether that is intended.
    figure_with_tables = tree.xpath('//figure[.//table]')

    for figure in figure_with_tables:
        try:
            # A figure without an id is still extracted (it used to raise
            # IndexError and be dropped); it just cannot have references.
            table_id = figure.get('id')
            table = figure.xpath('.//table')[0]

            caption_text = _normalizza_spazi(
                ' '.join(figure.xpath('.//figcaption//text()'))
            )

            dati_tabella = [
                [
                    etree.tostring(col, encoding='unicode', method='html')
                    for col in row.xpath('.//td')
                ]
                for row in table.xpath('.//tr[position()>1]')
            ]

            if table_id:
                # The id is passed as an XPath variable so that quotes or other
                # special characters in it cannot break the expression.
                references = tree.xpath(
                    '//p[a/@href = $href]', href=f'#{table_id}'
                )
            else:
                references = []
            # Whitespace is normalised so newlines neither glue words together
            # nor leave long runs of blanks behind.
            references_text = [
                _normalizza_spazi(ref.xpath('string(.)')) for ref in references
            ]

            # Count the table only once everything above succeeded, so the
            # keys stay consecutive.
            table_counter += 1
            tabelle[f"id_table_{table_counter}"] = {
                "caption": caption_text,
                "table": dati_tabella,
                # Copy, so the per-table entries do not share one list object.
                "footnotes": list(note_a_pie_di_pagina),
                "references": references_text,
            }
        except Exception as e:
            # Best effort: report the offending figure and continue with the next.
            print(f"Error processing figure in {file_path}: {e}")
            print(f"Figure content: {etree.tostring(figure, encoding='unicode', pretty_print=True)}")  # Print figure content for debugging

    return tabelle

In [None]:
# Persist the data extracted from one article as a JSON file
def salva_dati_in_json(dati_estratti, article_id, extraction_path):
    """Write `dati_estratti` to `<extraction_path>/<article_id>.json`.

    Args:
        dati_estratti: JSON-serialisable data to store.
        article_id: Name (without extension) of the output file.
        extraction_path: Output folder; created if it does not exist yet.

    The file is UTF-8 encoded, indented by 4 spaces and keeps non-ASCII
    characters unescaped. The destination path is printed once written.
    """
    # Make sure the destination folder is there; an existing one is fine.
    os.makedirs(extraction_path, exist_ok=True)

    destinazione = f"{extraction_path}/{article_id}.json"
    with open(destinazione, 'w', encoding='utf-8') as json_file:
        json.dump(dati_estratti, json_file, ensure_ascii=False, indent=4)

    print(f"Dati salvati in {destinazione}")

In [None]:
# Smoke test: run the extraction on a single article file.

file_path = os.path.join(articoli_path, 'ar5iv_article_2310.02277.html')

dati_estratti = estrai_dati_da_file(file_path)
# Save the data as JSON; with article id '0' the output goes to <extraction_path>/0.json.
salva_dati_in_json(dati_estratti, '0', extraction_path)

table_id: S4.T1
['Rationale and Method:        We propose a method to gauge complexity by juxtaposing the performance of deep learning models with that of human counterparts. Specifically, we define task difficulty as the disparity in performance between humans and models, normalized by human performance. A more pronounced positive performance gap (for instance, where humans outperform the machine to a greater extent) would signify a higher level of difficulty for the model in handling the given task. Conversely, in cases where the machine outperforms humans, a larger gap indicates an easier task. The resulting assessment of across-task difficulty is outlined in Table               1             .']
Dati salvati in /content/drive/MyDrive/progettoIngDati/extraction/0.json


In [None]:
# MAIN
# Walk through the HTML files in `articoli_path`, extract tables, captions,
# footnotes and references from each one, and save one JSON file per article.
total_tables = 0
total_captions = 0
total_footnotes = 0
total_references = 0

articolo_counter = 0

# os.listdir() returns entries in arbitrary order; sorting keeps the progress
# numbering and the log output reproducible from one run to the next.
for filename in sorted(os.listdir(articoli_path)):
    if not filename.endswith('.html'):
        continue

    file_path = os.path.join(articoli_path, filename)

    # The ID is the SECOND dot-separated field of the file name, e.g.
    # 'ar5iv_article_2310.02255.html' -> '02255'.
    # NOTE(review): the part before the first dot is dropped, so two files that
    # differ only in that part would write to the same JSON file — confirm this
    # cannot happen with the current sources before relying on it.
    article_id = filename.split('.')[1]

    articolo_counter += 1
    print(f"Estraendo dati dall'articolo {articolo_counter}: {filename}...")

    # Extract the data from the HTML file
    dati_estratti = estrai_dati_da_file(file_path)

    # Update the statistics counters
    total_tables += len(dati_estratti)
    for table_data in dati_estratti.values():
        if table_data["caption"]:
            total_captions += 1
        total_footnotes += len(table_data["footnotes"])
        total_references += len(table_data["references"])

    # Save the data as JSON
    salva_dati_in_json(dati_estratti, article_id, extraction_path)

# Print statistics
print("\n--- Extraction Statistics ---")
print(f"Total Articles Processed: {articolo_counter}")
print(f"Total Tables Found: {total_tables}")
print(f"Total Captions Found: {total_captions}")
print(f"Total Footnotes Found: {total_footnotes}")
print(f"Total References Found: {total_references}")

Estraendo dati dall'articolo 1: ar5iv_article_2310.02255.html...
table_id: S2.F3
table_id: S2.F3.fig1
table_id: S3.T2
table_id: A3.T3
table_id: A3.T4
table_id: A3.T5
table_id: A6.T8
table_id: A6.T9
table_id: A6.T11
table_id: A6.T12
table_id: A7.T13
Dati salvati in /content/drive/MyDrive/progettoIngDati/extraction/02255.json
Estraendo dati dall'articolo 2: ar5iv_article_2310.02256.html...
Error processing figure in /content/drive/MyDrive/progettoIngDati/sources/ar5iv_article_2310.02256.html: list index out of range
Figure content: <figure>
      <figcaption class="ltx_caption ltx_centering" style="font-size:50%;">
       <span class="ltx_tag ltx_tag_table">
        Table 1:
       </span>
       <span class="ltx_text" id="S3.T1.72.7" style="color:#000000;">
        All
       </span>
       Gaia eDR3
       <span class="ltx_text ltx_font_italic" id="S3.T1.73.8">
        red
       </span>
       candidate binaries within 200 pc. In the below columns, the subscript 1 identifies the prima