In [1]:
import pandas as pd
import os
from lxml import etree
import re
from thefuzz import fuzz
from pathlib import Path
import random
import httpx
import json

In [3]:
df_clef_overview = pd.read_parquet("../../../data/CLEF_overview_paper_with_ids_of_citing_papers.parquet")

In [6]:
# Define path to CLEF XML-tranformed documents

directory = '../../../data/citing_overview_paper_paper_pdfs/XML_files'

In [7]:
# Get all paths to the corresponding documents
files = os.listdir(directory)

In [10]:

def list_all_paths(directory):
    paths = []
    for root, dirs, files in os.walk(directory):
        for name in files:
            paths.append(os.path.join(root, name))
        for name in dirs:
            paths.append(os.path.join(root, name))
    return paths

directory = '../../../data/citing_overview_paper_paper_pdfs/XML_files_new'

all_paths = list_all_paths(directory)
print(len(all_paths))

1796


In [11]:

def find_closest_head_tag(element):
    """
    Finds the closest <head> tag before the given element and extracts its text content.

    Args:
        element (etree.Element): The XML element from which to start searching for the closest <head> tag.

    Returns:
        str: The text content of the closest <head> tag without HTML tags. 
             Returns "No text found in the <head> tag." if the <head> tag is empty.
             Returns "No <head> tag found." if no <head> tag is found before the element.
    """

    # Search for the closest <head> before the current element
    for sibling in element.itersiblings(preceding=True):
        if sibling.tag == '{http://www.tei-c.org/ns/1.0}head':
            # Extract the text content of the <head> tag without HTML tags
            return sibling.text.strip() if sibling.text else "No text found in the <head> tag."

    return "No <head> tag found."

In [23]:
def find_references(xml_file, search_string):
    """
    Finds references within an XML file that match a given search string, and retrieves the surrounding text.

    Args:
        xml_file (str): Path to the XML file to search within.
        search_string (str): The string to search for within <title> elements.

    Returns:
        list: A list of sentences containing the found references and their corresponding <head> tags. 
              Each item in the list is a tuple containing the sentence and the closest <head> tag text. 
              Returns an empty list if no references are found.
    """
    
    tree = etree.parse(xml_file)
    root = tree.getroot()
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    listbibl_elements = root.findall('.//tei:listBibl', namespaces=ns)
    results = []

    # Iterate through all <listBibl> elements
    for listbibl in listbibl_elements:
        # Search for <title> elements within <listBibl>
        titles = listbibl.findall('.//tei:title', namespaces=ns)
        for title in titles:
            # Check if the title contains the search string
            if title is not None and title.text is not None:
                if fuzz.ratio(search_string.lower(), title.text.lower()) > 90:
                    bibl_struct = title.getparent().getparent()
                    xml_id = bibl_struct.get('{http://www.w3.org/XML/1998/namespace}id')
                    results.append((title.text, xml_id))  
    if len(results) == 0:
        return []

    else:
        ref_elements = root.findall('.//tei:ref', namespaces=ns)
        sentences = []
    
        for ref in ref_elements:
            # Retrieve the target attribute
            target = ref.get('target')
            
            if target:
                # Clean the target attribute by removing the "#" symbol
                target_id = target.lstrip('#')
                
                # Compare the target ID with the searched ID
                if target_id == xml_id:
                    # Find the parent <p> tag
                    parent = ref.getparent()
                    
                    # Search for the nearest <p> element
                    while parent is not None and parent.tag != '{http://www.tei-c.org/ns/1.0}p':
                        parent = parent.getparent()
    
                    if parent is not None:
                        # Retrieve the entire text of the parent <p> tag
                        text = parent.text if parent.text is not None else ''
                        tail = parent.tail if parent.tail is not None else ''
                        full_text = text + ''.join(parent.itertext()) + tail
                        
                        # Find the text around the <ref> tag
                        ref_text = ref.text.strip() if ref.text is not None else ''
                        
                        # Search for the sentence containing the <ref> tag
                        pattern = rf'[^.!?]*{re.escape(ref_text)}[^.!?]*[.!?]'
                        match = re.search(pattern, full_text)
                        head_tag = find_closest_head_tag(parent)
                        if match:
                            sentence = match.group().strip()
                            sentences.append([sentence])
                       
                       
        return sentences

In [24]:
path = "../../../data/citing_overview_paper_paper_pdfs/XML_files_new"
columns = []

# Parse through the XML-transformed citing documents and extract the context text for the underlying reference

for i, j in df_clef_overview.iterrows():
    snippet_extraction_list = []

    for k in j["filtered_citing_paper_id_lists"]:
        filename = k.replace("https://openalex.org/", "") + ".tei.xml"
        file_path = Path(path + "/" + filename)

        if file_path.is_file():
            extractions = find_references(file_path, j["Title"])
            if len(extractions) > 0:
                snippet_extraction_list.append(extractions)

    columns.append(snippet_extraction_list)

In [25]:
columns

[[[['One of the proposed tasks asked to the participants on this data was to automatically pre-populate handover forms with relevant text-snippets (slot filling) [16].']],
  [['The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers [51].'],
   ['The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers [51].'],
   ['Researchers worldwide have contributed to achieve a significant improvement on the clinical handover task because of a shared computational task organized in 2016 

In [26]:
def flatten(nested_list):
    flat_list = []
    for item in nested_list:
        if isinstance(item, list):
            flat_list.extend(flatten(item))
        else:
            flat_list.append(item)
    return flat_list

In [27]:
flat_list = flatten(columns)


In [28]:
flat_list

['One of the proposed tasks asked to the participants on this data was to automatically pre-populate handover forms with relevant text-snippets (slot filling) [16].',
 'The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers [51].',
 'The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers The CLEF eHealth 2016 Task 1 required the participants to implement systems that are able to identify relevant text snippets from free-text nursing handovers [51].',
 'Researchers worldwide have contributed to achieve a significant improvement on the clinical handover task because of a shared computational task organized in 2016 Researchers worl

In [35]:
filtered_list = [s for s in flat_list if isinstance(s, str)]
filtered_list = list(set(filtered_list))

In [36]:
len(filtered_list)

2232

In [37]:
# Select a sample of 300 random snippets
random_selection = random.sample(filtered_list, 300)

In [38]:
random_selection

['A detailed description of the protocol used to build the GeoLifeCLEF 2018 dataset is provided in A detailed description of the protocol used to build the GeoLifeCLEF 2018 dataset is provided in [1].',
 'They also implemented training on artificially constructed datasets and reported superior performances on ImageCLEF dataset (García Seco de Herrera et al., 2016).',
 'One of the means to improve the access to the Linked Data Web lies in the development of natural-language interfaces that can transform human languages or even controlled languages into a representation suitable for querying large knowledge bases [10].',
 'Organizers of this challenge provided a largescale question answering competition, in which the systems are required to cope with all stages of a question answering task, including the retrieval of relevant articles and snippets as well as the provision of natural language answers [30,31].',
 'Previous research has shown that di↵erent users tend to issue di↵erent queri

In [39]:
# Define the localhost url for the Llama3 instance

url = "http://localhost:11434/api/generate"

In [40]:
# Define the prompt for the classification with Llama3

prompt = """
	Can you tell me from the context of the sentence at the end of the prompt if the underlying paper used a dataset related to CLEF? 
    If there is one or more datasets from CLEF used, please return all names separated by a semicolon. If there are none, please just respond with None. 
    Please provide no other output sentences except for the list or None.
    """

In [44]:
list_of_used_datasets = []

# Classify the sentence with Llama3 and count how many of the citations are referring to the usage of the underlying dataset 
counter = 1
for i in random_selection:
    print(counter, "/", len(random_selection)) 
    prompt = """
            Can you tell me from the context of the sentence on the end of the prompt, if the underlying paper used a dataset related to CLEF? If there is one or more datasets from CLEF used please return all names separated by a semicolon, if there are none please just responde with None. Please no other 
            output sentences except for the list or the None.
            """
    prompt = prompt + "\nSentence:\n" + i 
    data = {"model":"llama3", "prompt" : f"{prompt}"}
    response = httpx.post(url, data=json.dumps(data), headers={"Content-Type": "application/json"}, timeout=15)
    response_lines = [line for line in response.text.strip().split("\n") if line]
    response_dicts = [json.loads(line) for line in response_lines]
    answer = "".join(response_dict.get("response", "") for response_dict in response_dicts)
    
    list_of_used_datasets.append([i, answer])
            

1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300
1 / 300


In [56]:
list_of_used_datasets[11]

['In recent years there has been an increasing interest in the problem of plant species classification in images In recent years there has been an increasing interest in the problem of plant species classification in images [3,7,12,18].',
 'None']

In [59]:
# Calculate the amount of random samples that were considered as referecing the reusage of the underlying dataset
counter = 0
for i in list_of_used_datasets:
    if i[1] != "None":
        print(i[0] ,"\n")
        counter +=1

A detailed description of the protocol used to build the GeoLifeCLEF 2018 dataset is provided in A detailed description of the protocol used to build the GeoLifeCLEF 2018 dataset is provided in [1]. 

They also implemented training on artificially constructed datasets and reported superior performances on ImageCLEF dataset (García Seco de Herrera et al., 2016). 

• ImageCLEF • ImageCLEF [71]: it consists of more than 250k images belonging to 95 concepts and is split into training, dev and test data; we only consider the dev set, which includes 1,000 images equally split between training and testing, as the ground-truth is released on this dev set only. 

PlantCLEF PlantCLEF [32]: The PlantCLEF dataset is a large-scale dataset for plant identification, comprising millions of images covering thousands of plant species, including trees, flowers, fruits, and leaves. 

Closely related to our initiative is the ImageCLEF benchmarking and in particular the 2009 Photo Retrieval task Closely rel

In [60]:
# Amount of positive labels is 118 out of 300 for the 300 random samples
counter

118

In [63]:
# Writing snippets and labels into text file to manually evaluate the correctness of the labels
with open('testing_quality_of_model.txt', 'w', encoding="utf-8") as f:
    for sublist in list_of_used_datasets:
        for item in sublist:
            f.write(item + '\n')  