In [1]:
import pandas as pd
import json
import httpx
import json

# Analysis for CEUR & LNCS

In [2]:
df_ceur = pd.read_parquet("../../../data/metadata_CEUR.parquet")
df_lncs = pd.read_parquet("../../../data/metadata_LNCS.parquet")
df_lncs.loc[df_lncs['ID'] == "lncs_649", 'Section'] = "CLEF at SemEval 2007"

In [3]:
# Read the manually curated list of Overview paper title

file_name = "../../../data/CEUR_Overview_titles.txt"

overview_paper_ceur = []

with open(file_name, 'r', encoding='utf-8') as file:
    overview_paper_ceur = [line.strip() for line in file]

In [4]:
# Apply the Overview classification based on the Title matching

overview_participant_classification = []
for i , j in df_ceur.iterrows():
    if j["Title"] in overview_paper_ceur:
        overview_participant_classification.append("Overview")
    else:
        overview_participant_classification.append("Participant")

In [5]:
df_ceur["Classification"] = overview_participant_classification

In [6]:
# Define concordance dict for Overview paper

concordance_dict_lncs_overview = {}
concordance_dict_lncs_overview["Overview"] = ["Overview", "Overviews", "CLEF Lab Overviews", "Lab Overviews", "Labs Overviews" ,"CLEF 2018 Lab Overviews", "Overviews  of 2022 Labs", "CLEF 2020 Lab Overviews", "Overviews 2021 Labs", "Lab Overviews","CLEF 2019 Lab Overviews","CLEF 2016 Labs Overviews"]

In [7]:
# Apply LNCS classification for Overview papers with concordance dict

def categorize(row):
    for key, values in concordance_dict_lncs_overview.items():
        if row['Section'] in values or row['Subsection'] in values:
            return key
    return 'Participant'

# Neue Spalte 'Category' basierend auf der Kategorisierung
df_lncs['Classification'] = df_lncs.apply(categorize, axis=1)

In [8]:
# Read the manually curated list of Overview paper title for LNCS


file_name = "../../../data/LNCS_Overview_titles.txt"

overview_paper_lncs = []

with open(file_name, 'r', encoding='utf-8') as file:
    overview_paper_lncs = [line.strip() for line in file]

In [9]:
# Apply the Overview classification based on the Title matching

for i, j in df_lncs.iterrows():
    if j["Title"] in overview_paper_lncs:
        df_lncs.loc[i, 'Classification'] = 'Overview'

In [10]:
df_lncs_overview = df_lncs[df_lncs["Classification"] == "Overview"]

In [11]:
df_ceur_overview = df_ceur[df_ceur["Classification"] == "Overview"]

In [16]:
df_clef_overview = pd.concat([df_ceur_overview, df_lncs_overview], axis=0)

In [17]:
# Drop unnecessary columns

columns_to_keep = ['PubYear', 'Title', "ID", "filename"]
df_clef_overview = df_clef_overview[columns_to_keep]

In [21]:
# Read metadata for CEUR (Source: OpenALex)

input_file_path = '../../../data/OpenAlex_CEUR.json'  
with open(input_file_path, 'r', encoding="utf-8") as file:
    OpenAlex_CEUR = json.load(file)

In [22]:
# Create a list of OpenAlex IDs to filter those citations out

CLEF_openalex_ids = []
for key, value in OpenAlex_CEUR.items():
    CLEF_openalex_ids.append(value["ids"]["openalex"])
                             

In [23]:
len(CLEF_openalex_ids)

2858

In [24]:
# Read metadata for LNCS (Source: OpenALex)

input_file_path = '../../../data/OpenAlex_LNCS.json'  
with open(input_file_path, 'r', encoding="utf-8") as file:
    OpenAlex_LNCS = json.load(file)

In [25]:
# Add additional OpenAlex IDs for LNCS documents to filter those citations out

for key, value in OpenAlex_LNCS.items():
    CLEF_openalex_ids.append(value["ids"]["openalex"])
                     

In [26]:
len(CLEF_openalex_ids)

4114

In [27]:
# Read in the metadata of the citing papers of LNCS (Source: OpenAlex)

input_file_path = '../../../data/OpenAlex_LNCS_citing_doc.json' 
with open(input_file_path, 'r', encoding="utf-8") as file:
    OpenAlex_citing_LNCS = json.load(file)

In [28]:
# Read in the metadata of the citing papers of CEUR (Source: OpenAlex)

input_file_path = '../../../data/OpenAlex_CEUR_citing_doc.json'  
with open(input_file_path, 'r', encoding="utf-8") as file:
    OpenAlex_citing_CEUR = json.load(file)

In [29]:
# Read in the metadata of the citing papers of LNCS (Source: Semantic Scholar)

input_file_path = '../../../data/SemanticScholar_LNCS_additional_metadata.json'  
with open(input_file_path, 'r', encoding="utf-8") as file:
    Semantic_citing_LNCS = json.load(file)

In [30]:
# Read in the metadata of the citing papers of CEUR (Source: Semantic Scholar)

input_file_path = '../../../data/SemanticScholar_CEUR_additional_metadata.json'  
with open(input_file_path, 'r', encoding="utf-8") as file:
    Semantic_citing_CEUR = json.load(file)

In [116]:
counter = 0 
citing_papers_of_dataset = []
titles = []
for i, j in df_clef_overview.iterrows():
    OpenAlex_ID_list = []
    title_list = []

    # Extract all the citing papers and filter those out that are not corresponding to a CEUR paper

    if j["ID"].startswith("ceur") and j["ID"] in OpenAlex_citing_CEUR:
        
        for k in OpenAlex_citing_CEUR[j["ID"]]:
            if k["ids"]["openalex"] not in CLEF_openalex_ids: #and k.get("primary_location", {}).get("source", {}).get("display_name") != "CLEF (Working Notes)":
                #try:
                    #if k["primary_location"]["source"]["display_name"] != "CLEF (Working Notes)":
                OpenAlex_ID_list.append(k["ids"]["openalex"])
                    #print(k["primary_location"]["source"]["display_name"])
                #except:
                #    OpenAlex_ID_list.append(k["ids"]["openalex"])
                    #print(k["title"])
        #citing_papers_of_dataset.append(OpenAlex_ID_list)

    # Extract all the citing papers and filter those out that are not corresponding to a LNCS paper

    if j["ID"].startswith("lncs") and j["ID"] in OpenAlex_citing_LNCS:
        for k in OpenAlex_citing_LNCS[j["ID"]]:
            if k["ids"]["openalex"] not in CLEF_openalex_ids: #and k.get("primary_location", {}).get("source", {}).get("display_name") != "CLEF (Working Notes)":
                #try:
                    #if k["primary_location"]["source"]["display_name"] != "CLEF (Working Notes)":
                OpenAlex_ID_list.append(k["ids"]["openalex"])
                    #print(k["primary_location"]["source"]["display_name"])
                #except:
                #    OpenAlex_ID_list.append(k["ids"]["openalex"])
                    #print(k["title"])
        #citing_papers_of_dataset.append(OpenAlex_ID_list)

    # Extract all the citing papers, if the OpenAlex/MAG ID is given use those to exclude CEUR papers, else safe the title of the paper
    if j["ID"].startswith("ceur") and j["ID"] in Semantic_citing_CEUR:
        for k in Semantic_citing_CEUR[j["ID"]]["citations"]:
            #if k["externalIds"]["MAG"] is not None:
            #print(k)
            if k and "externalIds" in k and k["externalIds"] and "MAG" in k["externalIds"]:
                if "https://openalex.org/W" + str(k["externalIds"]["MAG"]) not in CLEF_openalex_ids:
                    #try:
                    OpenAlex_ID_list.append("https://openalex.org/W" + str(k["externalIds"]["MAG"]))
            else:
                title_list.append(k["title"])
        #titles.append(title_list)
        #citing_papers_of_dataset.append(liste)

    # Extract all the citing papers, if the OpenAlex/MAG ID is given use those to exclude LNCS papers, else safe the title of the paper
    if j["ID"].startswith("lncs") and j["ID"] in Semantic_citing_LNCS:
        for k in Semantic_citing_LNCS[j["ID"]]["citations"]:
            #if k["externalIds"]["MAG"] is not None:
            #print(k["externalIds"])
            if k and "externalIds" in k and k["externalIds"] and "MAG" in k["externalIds"]:

                if "https://openalex.org/W" + str(k["externalIds"]["MAG"]) not in CLEF_openalex_ids:
                    #try:
                    OpenAlex_ID_list.append("https://openalex.org/W" + str(k["externalIds"]["MAG"]))
            else:
                title_list.append(k["title"])
    titles.append(title_list)
    citing_papers_of_dataset.append(OpenAlex_ID_list)

                                  

In [117]:
len(citing_papers_of_dataset)

507

In [33]:
flattened_citing_papers = []
for sublist in citing_papers_of_dataset:
    flattened_citing_papers.extend(sublist)

In [34]:
len(list(set(flattened_citing_papers)))

5949

In [35]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders

In [118]:
len(titles)


507

In [119]:
for i in titles:
    print(len(i))

9
10
22
23
6
2
29
18
0
12
23
59
3
1
2
1
7
0
2
9
1
0
19
0
0
11
8
8
6
3
3
13
54
26
49
9
14
6
0
13
14
19
0
1
16
35
10
26
5
39
18
83
4
0
53
17
2
1
14
0
31
19
3
0
11
3
0
2
6
12
13
5
57
52
21
45
18
2
16
25
11
41
4
92
79
25
0
33
18
0
3
1
3
3
9
0
0
14
3
0
0
0
0
49
1
8
16
5
6
2
6
12
4
16
6
1
4
0
0
11
6
11
16
0
0
0
41
12
6
8
6
22
40
15
5
51
0
35
18
56
0
21
11
15
14
23
36
8
8
26
8
10
124
73
8
10
2
6
1
2
2
5
11
2
1
4
83
0
10
0
0
8
1
4
6
31
6
54
45
21
44
10
1
1
0
16
5
13
1
10
1
0
5
13
12
8
2
1
0
8
0
3
2
0
4
13
5
9
15
0
2
0
4
2
15
0
1
6
6
5
5
2
2
1
0
5
35
0
15
0
1
40
18
41
6
27
22
25
9
18
11
22
18
2
3
29
82
29
30
15
6
3
21
6
3
1
0
1
3
7
2
13
0
0
4
15
8
5
2
11
45
0
5
1
0
13
0
3
0
4
11
1
6
3
0
0
71
8
2
0
3
6
0
4
4
2
2
4
0
1
4
20
3
27
44
17
27
1
0
21
10
25
7
4
2
6
1
0
4
1
2
24
19
17
11
15
32
7
15
5
3
4
44
2
3
0
0
31
19
3
0
28
29
78
19
0
0
25
14
47
8
40
73
1
15
3
21
6
6
3
1
2
2
2
14
0
2
0
29
20
44
12
3
45
9
0
24
20
45
0
42
11
5
3
22
6
15
2
11
13
3
13
6
4
4
4
13
1
0
3
6
0
2
8
2
4
6
65
40
0
25
16
25
5
30


In [120]:
titles[506]

['Spoken Dialogue System for Medical Prescription Acquisition on Smartphone: Development, Corpus and Evaluation',
 'Evaluating GPT-4 and ChatGPT on Japanese Medical Licensing Examinations',
 'Enhancing Cross-lingual Semantic Annotations using Deep Network Sentence Embeddings',
 'What Happened in CLEF . . . For a While?',
 'The MeSpEN Resource for English-Spanish Medical Machine Translation and Terminologies : Census of Parallel Corpora , Glossaries and Term Translations',
 'BioTxtM 2016 Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining Proceedings of the Workshop']

In [38]:
flattened_titles = []
for sublist in titles:
    flattened_titles.extend(sublist)

In [39]:
flattened_titles = list(set(flattened_titles))

In [46]:
dict_title_id = {}

In [47]:
import urllib.parse
from thefuzz import fuzz

OpenAlex_IDs_over_API =  []
for i in flattened_titles:
    title = i.replace(",", "")
    title_encoded = urllib.parse.quote(title)
    work = Works().search_filter(title=title_encoded).get()
    for j in work:
        if fuzz.ratio(i, j["title"]) > 95:
            dict_title_id[i] = j["ids"]["openalex"]
            OpenAlex_IDs_over_API.append(j)
            break
    

1 / 3181
2 / 3181
3 / 3181
4 / 3181
5 / 3181
6 / 3181
7 / 3181
8 / 3181
9 / 3181
10 / 3181
11 / 3181
12 / 3181
13 / 3181
14 / 3181
15 / 3181
16 / 3181
17 / 3181
18 / 3181
19 / 3181
20 / 3181
21 / 3181
22 / 3181
23 / 3181
24 / 3181
25 / 3181
26 / 3181
27 / 3181
28 / 3181
29 / 3181
30 / 3181
31 / 3181
32 / 3181
33 / 3181
34 / 3181
35 / 3181
36 / 3181
37 / 3181
38 / 3181
39 / 3181
40 / 3181
41 / 3181
42 / 3181
43 / 3181
44 / 3181
45 / 3181
46 / 3181
47 / 3181
48 / 3181
49 / 3181
50 / 3181
51 / 3181
52 / 3181
53 / 3181
54 / 3181
55 / 3181
56 / 3181
57 / 3181
58 / 3181
59 / 3181
60 / 3181
61 / 3181
62 / 3181
63 / 3181
64 / 3181
65 / 3181
66 / 3181
67 / 3181
68 / 3181
69 / 3181
70 / 3181
71 / 3181
72 / 3181
73 / 3181
74 / 3181
75 / 3181
76 / 3181
77 / 3181
78 / 3181
79 / 3181
80 / 3181
81 / 3181
82 / 3181
83 / 3181
84 / 3181
85 / 3181
86 / 3181
87 / 3181
88 / 3181
89 / 3181
90 / 3181
91 / 3181
92 / 3181
93 / 3181
94 / 3181
95 / 3181
96 / 3181
97 / 3181
98 / 3181
99 / 3181
100 / 3181
101 / 31

In [62]:
dicti_title_id

{'Plant taxonomy-guided path-based tree classifier for large-scale plant species identification': 'https://openalex.org/W3152574551',
 'On the State of the Art in Authorship Attribution and Authorship Verification': 'https://openalex.org/W4296154797',
 'Detecting Hate Speech Spreaders on Twitter using LSTM and BERT in English and Spanish': 'https://openalex.org/W3196853212',
 'Span-based Named Entity Recognition by Generating and Compressing Information': 'https://openalex.org/W4320814283',
 'Evaluating and improving lexical resources for detecting signs of depression in text': 'https://openalex.org/W2886674741',
 'Computational bioacoustics with deep learning: a review and roadmap': 'https://openalex.org/W4226051760',
 'Lifelog Image Retrieval Based on Semantic Relevance Mapping': 'https://openalex.org/W3186150303',
 'DeriNet 2.0: Towards an All-in-One Word-Formation Resource': 'https://openalex.org/W3088530711',
 'BERT-Based Transformers for Early Detection of Mental Health Illnesses

In [121]:
# Replace the titles by OpenAlex IDs if provided

def replace_strings(nested_list, replacements):
    for i, sublist in enumerate(nested_list):
        nested_list[i] = [replacements.get(item, item) for item in sublist if item in replacements]
    return nested_list

In [None]:
converted_titles = replace_strings(titles, dict_title_id)

In [123]:
result[506]

['https://openalex.org/W4388514305',
 'https://openalex.org/W4362511131',
 'https://openalex.org/W3130491114']

In [124]:
# Aggregate the two nested list with OpenAlex IDs

def merge_nested_lists(list1, list2):
    merged_list = []
    for sublist1, sublist2 in zip(list1, list2):
        merged_list.append(sublist1 + sublist2)
    return merged_list

In [125]:
combined_citing_papers = merge_nested_lists(citing_papers_of_dataset, converted_titles)


In [126]:
combined_citing_papers

[['https://openalex.org/W2979250794',
  'https://openalex.org/W2887256936',
  'https://openalex.org/W2895486813',
  'https://openalex.org/W2571940045',
  'https://openalex.org/W3200774639',
  'https://openalex.org/W2895486813',
  'https://openalex.org/W2940730617',
  'https://openalex.org/W2889376139',
  'https://openalex.org/W2887256936',
  'https://openalex.org/W2900704633',
  'https://openalex.org/W2572526973',
  'https://openalex.org/W3096863249',
  'https://openalex.org/W2979250794',
  'https://openalex.org/W2967875042',
  'https://openalex.org/W4287009075',
  'https://openalex.org/W3200774639'],
 ['https://openalex.org/W3016110759',
  'https://openalex.org/W3000462442',
  'https://openalex.org/W2954264361',
  'https://openalex.org/W2933726975',
  'https://openalex.org/W2911238041',
  'https://openalex.org/W2908594866',
  'https://openalex.org/W2896515586',
  'https://openalex.org/W2904609058',
  'https://openalex.org/W2909295065',
  'https://openalex.org/W2899739158',
  'https://

In [127]:
df_clef_overview["citing_paper_id_lists"] = combined_citing_papers

In [80]:
df_clef_overview

Unnamed: 0,PubYear,Title,ID,filename,citing_paper_id_lists
0,2016,Task 1 of the CLEF eHealth Evaluation Lab 2016...,ceur_1019,16090001.pdf,"[https://openalex.org/W2979250794, https://ope..."
1,2016,The IR Task at the CLEF eHealth Evaluation Lab...,ceur_1020,16090015.pdf,"[https://openalex.org/W2979250794, https://ope..."
2,2016,Clinical Information Extraction at the CLEF eH...,ceur_1021,16090028.pdf,"[https://openalex.org/W2805211535, https://ope..."
23,2016,Overview of the ImageCLEF 2016 Medical Task,ceur_1042,16090219.pdf,"[https://openalex.org/W2559785631, https://ope..."
24,2016,Overview of the ImageCLEF 2016 Handwritten Sca...,ceur_1043,16090233.pdf,"[https://openalex.org/W4312475622, https://ope..."
...,...,...,...,...,...
1338,2013,Recent Trends in Digital Text Forensics and It...,lncs_1339,,"[https://openalex.org/W4392612071, https://ope..."
1339,2013,QA4MRE 2011-2013: Overview of Question Answeri...,lncs_1340,,"[https://openalex.org/W4392612071, https://ope..."
1340,2013,Multilingual Question Answering over Linked Da...,lncs_1341,,"[https://openalex.org/W4392612071, https://ope..."
1341,2013,Overview of RepLab 2013: Evaluating Online Rep...,lncs_1342,,"[https://openalex.org/W4392612071, https://ope..."


In [128]:
df_clef_overview = df_clef_overview.to_parquet("../../../data/CLEF_overview_paper_with_ids_of_citing_papers.parquet")

In [102]:
liste[0]

{'id': 'https://openalex.org/W4392489694',
 'doi': 'https://doi.org/10.48550/arxiv.2403.02281',
 'title': 'Emotion Granularity from Text: An Aggregate-Level Indicator of Mental\n  Health',
 'display_name': 'Emotion Granularity from Text: An Aggregate-Level Indicator of Mental\n  Health',
 'relevance_score': 80.81363,
 'publication_year': 2024,
 'publication_date': '2024-03-04',
 'ids': {'openalex': 'https://openalex.org/W4392489694',
  'doi': 'https://doi.org/10.48550/arxiv.2403.02281'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://arxiv.org/abs/2403.02281',
  'pdf_url': 'https://arxiv.org/pdf/2403.02281',
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn_l': None,
   'issn': None,
   'is_oa': True,
   'is_in_doaj': False,
   'is_core': False,
   'host_organization': 'https://openalex.org/I205783295',
   'host_organization_name': 'Cornell University',
   'host_organization_lineage':

In [107]:
len(liste)

2122

In [103]:
ids_not_needed = flattened_citing_papers + CLEF_openalex_ids

In [105]:
cit_papers_found_by_title = []
for i in OpenAlex_IDs_over_API:
    if i["id"] in ids_not_needed:
        continue
    else:
        cit_papers_found_by_title.append(i)

In [106]:
len(cit_papers_found_by_title)

1392

In [112]:
retrieved_by_title = []
for i in OpenAlex_IDs_over_API:
    retrieved_by_title.append(i["id"])

In [113]:
len(retrieved_by_title)

2122

In [119]:
flattened_citing_papers_unique = list(set(flattened_citing_papers))
# Neue Liste erstellen, die nur die Elemente aus list1 enthält, die nicht in list2 sind
filtered_flattened_citing_papers_unique = [item for item in flattened_citing_papers_unique if item not in retrieved_by_title]

In [120]:
len(filtered_flattened_citing_papers_unique)

5949

In [123]:
additional_documents_by_ID = []
counter_not_found = 0
for i in filtered_flattened_citing_papers_unique:
    OpenAlex_ID  = i.replace("https://openalex.org/","")
    try:
        work = Works()[OpenAlex_ID]
        additional_documents_by_ID.append(work)
    except:
        counter_not_found += 1

1 / 5488
2 / 5488
3 / 5488
4 / 5488
5 / 5488
6 / 5488
7 / 5488
8 / 5488
9 / 5488
10 / 5488
11 / 5488
12 / 5488
13 / 5488
14 / 5488
15 / 5488
16 / 5488
17 / 5488
18 / 5488
19 / 5488
20 / 5488
21 / 5488
22 / 5488
23 / 5488
24 / 5488
25 / 5488
26 / 5488
27 / 5488
28 / 5488
29 / 5488
30 / 5488
31 / 5488
32 / 5488
33 / 5488
34 / 5488
35 / 5488
36 / 5488
37 / 5488
38 / 5488
39 / 5488
40 / 5488
41 / 5488
42 / 5488
43 / 5488
44 / 5488
45 / 5488
46 / 5488
47 / 5488
48 / 5488
49 / 5488
50 / 5488
51 / 5488
52 / 5488
53 / 5488
54 / 5488
55 / 5488
56 / 5488
57 / 5488
58 / 5488
59 / 5488
60 / 5488
61 / 5488
62 / 5488
63 / 5488
64 / 5488
65 / 5488
66 / 5488
67 / 5488
68 / 5488
69 / 5488
70 / 5488
71 / 5488
72 / 5488
73 / 5488
74 / 5488
75 / 5488
76 / 5488
77 / 5488
78 / 5488
79 / 5488
80 / 5488
81 / 5488
82 / 5488
83 / 5488
84 / 5488
85 / 5488
86 / 5488
87 / 5488
88 / 5488
89 / 5488
90 / 5488
91 / 5488
92 / 5488
93 / 5488
94 / 5488
95 / 5488
96 / 5488
97 / 5488
98 / 5488
99 / 5488
100 / 5488
101 / 54

In [124]:
cit_papers = cit_papers_found_by_title + additional_documents_by_ID

In [125]:
len(final_list)


6737

In [155]:
# Searching for urls to potentially open avaible PDFs from the citiation papers

dict_of_urls_of_citing_docs = {}
urls = []
for i in cit_papers:
    temp_list_urls = []
    if "primary_location" in i and i["primary_location"] and "pdf_url" in i["primary_location"] and i["primary_location"]["pdf_url"]:
        temp_list_urls.append(i["primary_location"]["pdf_url"])
    if "open_access" in i and i["open_access"] and "oa_url" in i["open_access"] and i["open_access"]["oa_url"]:
        if i["open_access"]["oa_url"] not in liste_temp:
            temp_list_urls.append(i["open_access"]["oa_url"])
    if "best_oa_location" in i and i["best_oa_location"] and "pdf_url" in i["best_oa_location"] and i["best_oa_location"]["pdf_url"]:
        if i["best_oa_location"]["pdf_url"] not in liste_temp:
            temp_list_urls.append(i["best_oa_location"]["pdf_url"])
    else:
        continue
    dict_of_urls_of_citing_docs[i["id"]] = temp_list_urls

In [157]:
dict_of_urls_of_citing_docs

{'https://openalex.org/W4392489694': ['https://arxiv.org/pdf/2403.02281'],
 'https://openalex.org/W3211805281': ['https://aclanthology.org/2021.emnlp-main.40.pdf'],
 'https://openalex.org/W4285106275': ['https://aclanthology.org/2022.ltedi-1.56.pdf'],
 'https://openalex.org/W4205159731': ['https://arxiv.org/pdf/2109.10992'],
 'https://openalex.org/W3185295414': ['https://academic.oup.com/aobpla/article-pdf/13/4/plab050/39937228/plab050.pdf'],
 'https://openalex.org/W4385571781': ['https://aclanthology.org/2023.repl4nlp-1.13.pdf'],
 'https://openalex.org/W4394972183': ['https://arxiv.org/pdf/2404.12342'],
 'https://openalex.org/W4385570033': ['https://aclanthology.org/2023.bea-1.56.pdf'],
 'https://openalex.org/W4225104598': ['https://aclanthology.org/2022.findings-acl.281.pdf'],
 'https://openalex.org/W4392170772': ['http://www.cell.com/article/S0960982223017384/pdf'],
 'https://openalex.org/W4384345675': ['https://ieeexplore.ieee.org/ielx7/10172484/10172342/10172764.pdf'],
 'https://o

In [158]:
import os
import requests

In [159]:
# Create new directory for the PDF files

output_directory = "../../../data/citing_overview_paper_paper_pdfs"
os.makedirs(output_directory, exist_ok=True)

In [160]:
def download_pdf(url, filepath):
    try:
        response = requests.get(url)
        response.raise_for_status()  
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f"Download successfull: {filepath}")
        return True
    except requests.RequestException as e:
        print(f"Error for {url}: {e}")
        return False

In [161]:
len(dicti)

2252

In [163]:
# Try to download all available PDF files

for key, links in dict_of_urls_of_citing_docs.items():
    for link in links:
        name = key.replace("https://openalex.org/", "")
        filepath = os.path.join(output_directory, f"{name}.pdf")
        
        if download_pdf(link, filepath):
            break

1 / 2252
Download erfolgreich: ../data/citing_overview_paper_paper_pdfs\W4392489694.pdf
2 / 2252
Download erfolgreich: ../data/citing_overview_paper_paper_pdfs\W3211805281.pdf
3 / 2252
Download erfolgreich: ../data/citing_overview_paper_paper_pdfs\W4285106275.pdf
4 / 2252
Download erfolgreich: ../data/citing_overview_paper_paper_pdfs\W4205159731.pdf
5 / 2252
Fehler beim Herunterladen von https://academic.oup.com/aobpla/article-pdf/13/4/plab050/39937228/plab050.pdf: 403 Client Error: Forbidden for url: https://academic.oup.com/aobpla/article-pdf/13/4/plab050/39937228/plab050.pdf
6 / 2252
Download erfolgreich: ../data/citing_overview_paper_paper_pdfs\W4385571781.pdf
7 / 2252
Download erfolgreich: ../data/citing_overview_paper_paper_pdfs\W4394972183.pdf
8 / 2252
Download erfolgreich: ../data/citing_overview_paper_paper_pdfs\W4385570033.pdf
9 / 2252
Download erfolgreich: ../data/citing_overview_paper_paper_pdfs\W4225104598.pdf
10 / 2252
Fehler beim Herunterladen von http://www.cell.com/art

In [1]:
# Create output path for the XML-transformed file

path = "../../../data/citing_overview_paper_paper_pdfs"
output="../../../data/citing_overview_paper_paper_pdfs/XML_files_new/"

In [1]:
# Create output path for the XML-transformed file

path = "../../../data/citing_overview_paper_paper_pdfs/pdfs_not_converted"
output="../../../data/citing_overview_paper_paper_pdfs/XML_files_new/"

In Windows PowerShell Docker Container startne mit PS <> docker run --rm --init --ulimit core=0 -p 8070:8070 grobid/grobid:0.8.0

In [8]:
from grobid_client.grobid_client import GrobidClient

# Apply GROBID on the gathered PDF files and transform the PDF files into XML

client = GrobidClient(config_path="../../config.json")
client.process("processFulltextDocument", path, output=output  ,consolidate_citations=False, tei_coordinates=True, force=True ,  n=5)

GROBID server is up and running
Processing of D:/Studium/Masterarbeit/PythonProject/data/citing_overview_paper_paper_pdfs/pdfs_not_converted\W2964020128.pdf failed with error 500 , [BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1
Processing of D:/Studium/Masterarbeit/PythonProject/data/citing_overview_paper_paper_pdfs/pdfs_not_converted\W2966735522.pdf failed with error 500 , [BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1
Processing of D:/Studium/Masterarbeit/PythonProject/data/citing_overview_paper_paper_pdfs/pdfs_not_converted\W3013983242.pdf failed with error 408 , None
Processing of D:/Studium/Masterarbeit/PythonProject/data/citing_overview_paper_paper_pdfs/pdfs_not_converted\W3043386385.pdf failed with error 500 , [BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1
Processing of D:/Studium/Masterarbeit/PythonProject/data/citing_overview_paper_paper_pdfs/pdfs_not_converted\W3081783897.pdf failed with error 500 , [BAD_INPUT_DATA] PDF to

In [2]:
import os
import shutil

# Verzeichnisse definieren
dir1 = '../../../data/citing_overview_paper_paper_pdfs'
dir2 = '../../../data/citing_overview_paper_paper_pdfs/XML_files_new'
target_dir = '../../../data/citing_overview_paper_paper_pdfs/pdfs_not_converted'

In [3]:
files_dir1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))]
files_dir2 = [f.replace(".tei.xml", ".pdf").replace(".txt", ".pdf").replace("_500", "").replace("_408", "") for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))]

In [9]:
len(files_dir1)

1794

In [5]:
files_to_move = [f for f in files_dir1 if f not in files_dir2]


In [6]:
len(files_to_move)

1012

In [7]:
for file in files_to_move:
    shutil.copy2(os.path.join(dir1, file), os.path.join(target_dir, file))
    

In [40]:
df_clef_overview["ID_citing_paper"] = citing_papers_of_dataset

In [41]:
df_clef_overview

Unnamed: 0,PubYear,Title,ID,ID_citing_paper
0,2016,Task 1 of the CLEF eHealth Evaluation Lab 2016...,ceur_1019,"[https://openalex.org/W2979250794, https://ope..."
1,2016,The IR Task at the CLEF eHealth Evaluation Lab...,ceur_1020,"[https://openalex.org/W2979250794, https://ope..."
2,2016,Clinical Information Extraction at the CLEF eH...,ceur_1021,"[https://openalex.org/W2805211535, https://ope..."
23,2016,Overview of the ImageCLEF 2016 Medical Task,ceur_1042,"[https://openalex.org/W2559785631, https://ope..."
24,2016,Overview of the ImageCLEF 2016 Handwritten Sca...,ceur_1043,"[https://openalex.org/W4312475622, https://ope..."
...,...,...,...,...
1338,2013,Recent Trends in Digital Text Forensics and It...,lncs_1339,"[https://openalex.org/W4392612071, https://ope..."
1339,2013,QA4MRE 2011-2013: Overview of Question Answeri...,lncs_1340,"[https://openalex.org/W4392612071, https://ope..."
1340,2013,Multilingual Question Answering over Linked Da...,lncs_1341,"[https://openalex.org/W4392612071, https://ope..."
1341,2013,Overview of RepLab 2013: Evaluating Online Rep...,lncs_1342,"[https://openalex.org/W4392612071, https://ope..."


## Applying Llama 3 model to evaluate the citation context of Overview paper

In [44]:

directory = '../../../data/citing_overview_paper_paper_pdfs/XML_files'


In [47]:
import os
files = os.listdir(directory)

In [22]:
import os

def list_all_paths(directory):
    paths = []
    for root, dirs, files in os.walk(directory):
        for name in files:
            paths.append(os.path.join(root, name))
        for name in dirs:
            paths.append(os.path.join(root, name))
    return paths

# Beispielverzeichnis
directory = '../../../data/citing_overview_paper_paper_pdfs/XML_files_new'

all_paths = list_all_paths(directory)
print(len(all_paths))

1796


In [154]:
df_clef_overview = pd.read_parquet("../../../data/CLEF_overview_paper_with_ids_of_citing_papers.parquet")

In [156]:
import numpy as np

# Ensure that 'citing_paper_id_lists' contains lists, not other types (like None), or an empty list if not.
df_clef_overview['citing_paper_id_lists'] = df_clef_overview['citing_paper_id_lists'].apply(lambda x: x if (x, np.ndarray) else [])

# Filter the 'citing_paper_id_lists' by removing any OpenAlex IDs that are present in 'TREC_openalex_ids'.
df_clef_overview['filtered_citing_paper_id_lists'] = df_clef_overview['citing_paper_id_lists'].apply(lambda arr: np.setdiff1d(arr, CLEF_openalex_ids))

# Calculate the count of citing papers after filtering and store it in a new column 'count'.
df_clef_overview['count'] = df_clef_overview['filtered_citing_paper_id_lists'].apply(len)


In [158]:
df_clef_overview

Unnamed: 0,PubYear,Title,ID,filename,citing_paper_id_lists,citing_papers_not_clef_unique,citing_papers_not_clef,filtered_citing_paper_id_lists,count
0,2016,Task 1 of the CLEF eHealth Evaluation Lab 2016...,ceur_1019,16090001.pdf,"[https://openalex.org/W2979250794, https://ope...","[https://openalex.org/W2954264361, https://ope...",42,"[https://openalex.org/W2571940045, https://ope...",12
1,2016,The IR Task at the CLEF eHealth Evaluation Lab...,ceur_1020,16090015.pdf,"[https://openalex.org/W3016110759, https://ope...","[https://openalex.org/W2954264361, https://ope...",42,"[https://openalex.org/W2058057669, https://ope...",30
2,2016,Clinical Information Extraction at the CLEF eH...,ceur_1021,16090028.pdf,"[https://openalex.org/W2805211535, https://ope...","[https://openalex.org/W2740140770, https://ope...",43,"[https://openalex.org/W2562200621, https://ope...",41
23,2016,Overview of the ImageCLEF 2016 Medical Task,ceur_1042,16090219.pdf,"[https://openalex.org/W2559785631, https://ope...","[https://openalex.org/W3198119851, https://ope...",52,"[https://openalex.org/W2522765047, https://ope...",47
24,2016,Overview of the ImageCLEF 2016 Handwritten Sca...,ceur_1043,16090233.pdf,"[https://openalex.org/W4312475622, https://ope...","[https://openalex.org/W2752672158, https://ope...",9,"[https://openalex.org/W2752672158, https://ope...",8
...,...,...,...,...,...,...,...,...,...
1338,2013,Recent Trends in Digital Text Forensics and It...,lncs_1339,,"[https://openalex.org/W4327498231, https://ope...","[https://openalex.org/W3098893029, https://ope...",4210,"[https://openalex.org/W1536949787, https://ope...",38
1339,2013,QA4MRE 2011-2013: Overview of Question Answeri...,lncs_1340,,"[https://openalex.org/W2970482702, https://ope...","[https://openalex.org/W3098893029, https://ope...",4210,"[https://openalex.org/W128392744, https://open...",38
1340,2013,Multilingual Question Answering over Linked Da...,lncs_1341,,"[https://openalex.org/W2300469216, https://ope...","[https://openalex.org/W3098893029, https://ope...",4210,"[https://openalex.org/W1028111216, https://ope...",74
1341,2013,Overview of RepLab 2013: Evaluating Online Rep...,lncs_1342,,"[https://openalex.org/W2220660951, https://ope...","[https://openalex.org/W3098893029, https://ope...",4210,"[https://openalex.org/W1812489009, https://ope...",120


In [147]:
df_sorted = df_clef_overview.sort_values(by='count', ascending=False)

df_top10 = df_sorted.head(10)

latex_table = df_top10[['Title', 'count']].to_latex(index=False, column_format='|l|l|', caption='Top 10 Papers by Citing Paper ID Lists', label='tab:top10papers')


In [148]:
print(latex_table)

\begin{table}
\caption{Top 10 Papers by Citing Paper ID Lists}
\label{tab:top10papers}
\begin{tabular}{|l|l|}
\toprule
Title & count \\
\midrule
Overview of the ShARe/CLEF eHealth Evaluation Lab 2014 & 238 \\
Overview of the Author Profiling Task at PAN 2013 & 213 \\
Overview of the 3rd Author Profiling Task at PAN 2015 & 188 \\
Question Answering over Linked Data (QALD-4) & 155 \\
CLEF 2005: Ad Hoc Track Overview & 147 \\
Overview of the CLEF 2009 Medical Image Retrieval Track & 142 \\
Overview of the CLEF 2009 Medical Image Retrieval Track & 142 \\
Overview of the 5th Author Profiling Task at PAN 2017: Gender and Language Variety Identification in Twitter & 139 \\
LifeCLEF 2015: Multimedia Life Species Identification Challenges & 136 \\
Overview of the 4th Author Profiling Task at PAN 2016: Cross-Genre Evaluations & 133 \\
\bottomrule
\end{tabular}
\end{table}



In [171]:
from lxml import etree

def find_closest_head_tag(element):
    """
    Finds the closest <head> tag before the given element and extracts its text content.

    Args:
        element (etree.Element): The XML element from which to start searching for the closest <head> tag.

    Returns:
        str: The text content of the closest <head> tag without HTML tags. 
             Returns "No text found in the <head> tag." if the <head> tag is empty.
             Returns "No <head> tag found." if no <head> tag is found before the element.
    """

    # Search for the closest <head> before the current element
    for sibling in element.itersiblings(preceding=True):
        if sibling.tag == '{http://www.tei-c.org/ns/1.0}head':
            # Extract the text content of the <head> tag without HTML tags
            return sibling.text.strip() if sibling.text else "No text found in the <head> tag."

    return "No <head> tag found."

In [172]:
import re
from thefuzz import fuzz
from lxml import etree

def find_references(xml_file, search_string):
    """
    Finds references within an XML file that match a given search string, and retrieves the surrounding text.

    Args:
        xml_file (str): Path to the XML file to search within.
        search_string (str): The string to search for within <title> elements.

    Returns:
        list: A list of sentences containing the found references and their corresponding <head> tags. 
              Each item in the list is a tuple containing the sentence and the closest <head> tag text. 
              Returns an empty list if no references are found.
    """
    
    tree = etree.parse(xml_file)
    root = tree.getroot()
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    listbibl_elements = root.findall('.//tei:listBibl', namespaces=ns)
    results = []

    # Iterate through all <listBibl> elements
    for listbibl in listbibl_elements:
        # Search for <title> elements within <listBibl>
        titles = listbibl.findall('.//tei:title', namespaces=ns)
        for title in titles:
            # Check if the title contains the search string
            #if search_string.lower() in title.text.lower():
            if title is not None and title.text is not None:
                if fuzz.ratio(search_string.lower(), title.text.lower()) > 90:
                    bibl_struct = title.getparent().getparent()
                    xml_id = bibl_struct.get('{http://www.w3.org/XML/1998/namespace}id')
                    results.append((title.text, xml_id))  
    if len(results) == 0:
        return []

    else:
        ref_elements = root.findall('.//tei:ref', namespaces=ns)
        sentences = []
    
        for ref in ref_elements:
            # Retrieve the target attribute
            target = ref.get('target')
            
            if target:
                # Clean the target attribute by removing the "#" symbol
                target_id = target.lstrip('#')
                
                # Compare the target ID with the searched ID
                if target_id == xml_id:
                    # Finde das übergeordnete <p>-Tag
                    parent = ref.getparent()
                    
                    # Search for the nearest <p> element
                    while parent is not None and parent.tag != '{http://www.tei-c.org/ns/1.0}p':
                        parent = parent.getparent()
    
                    if parent is not None:
                        # Retrieve the entire text of the parent <p> tag
                        text = parent.text if parent.text is not None else ''
                        tail = parent.tail if parent.tail is not None else ''
                        full_text = text + ''.join(parent.itertext()) + tail
                        
                        # Finde den Text um das <ref>-Tag
                        ref_text = ref.text.strip() if ref.text is not None else ''
                        
                        # Search for the sentence containing the <ref> tag
                        pattern = rf'[^.!?]*{re.escape(ref_text)}[^.!?]*[.!?]'
                        match = re.search(pattern, full_text)
                        head_tag = find_closest_head_tag(parent)
                        if match:
                            sentence = match.group().strip()
                            sentences.append([sentence, head_tag])
                       
                       
        return sentences

In [173]:
from pathlib import Path
path = "../../../data/citing_overview_paper_paper_pdfs/XML_files_new"
columns = []

# Parse through the XML-transformed citing documents and extract the context text for the underlying reference

for i, j in df_clef_overview.iterrows():
    snippet_extraction_list = []

    for k in j["filtered_citing_paper_id_lists"]:
        filename = k.replace("https://openalex.org/", "") + ".tei.xml"
        file_path = Path(path + "/" + filename)

        if file_path.is_file():
            extractions = find_references(file_path, j["Title"])
            if len(extrations) > 0:
                snippet_extraction_list.append(extractions)

    columns.append(snippet_extraction_list)

In [169]:
#def find_closest_head_tag(element):
 
    # Suche nach nächstgelegenen <head> nach dem aktuellen Element
    #for sibling in element.itersiblings(preceding=False):
    #    if sibling.tag == '{http://www.tei-c.org/ns/1.0}head':
    #        return etree.tostring(sibling.text, pretty_print=True, encoding='unicode')

    # Suche nach nächstgelegenen <head> vor dem aktuellen Element
#    for sibling in element.itersiblings(preceding=True):
  #      if sibling.tag == '{http://www.tei-c.org/ns/1.0}head':
 #           return etree.tostring(sibling.text, pretty_print=True, encoding='unicode')

   # return "Kein <head>-Tag gefunden."

In [216]:
# Define the localhost url for the Llama3 instance

url = "http://localhost:11434/api/generate"

In [217]:
# Define the prompt for the classification with Llama3

prompt = """
Can you tell me from the context of the sentence on the end of the prompt, if the underlying paper used a dataset related to CLEF? If there is one or more datasets from CLEF used please return all names separated by a semicolon, if there are none please just responde with None. Please no other 
output sentences except for the list or the None.
"""

In [1]:
list_of_used_datasets = []

# Classify the sentence with Llama3 and count how many of the citations are referring to the usage of the underlying dataset 

for i in columns:
    used = 0
    for j in i:
        for k in j:
            
            prompt = """
                    Can you tell me from the context of the sentence on the end of the prompt, if the underlying paper used a dataset related to CLEF? If there is one or more datasets from CLEF used please return all names separated by a semicolon, if there are none please just responde with None. Please no other 
                    output sentences except for the list or the None.
                    """
            prompt = prompt + "\nSentence:\n" + k[0] 
            data = {"model":"llama3", "prompt" : f"{prompt}"}
            response = httpx.post(url, data=json.dumps(data), headers={"Content-Type": "application/json"}, timeout=15)
            response_lines = [line for line in response.text.strip().split("\n") if line]
            response_dicts = [json.loads(line) for line in response_lines]
            answer = "".join(response_dict.get("response", "") for response_dict in response_dicts)
            if answer == "None":
                continue
            else:
                used +=1
                break
    list_of_used_datasets.append(used)
            

NameError: name 'columns' is not defined

In [220]:
df_clef_overview["Counts_cited_dataset"] = liste_used_dataset

In [222]:
df_clef_overview = df_clef_overview.to_parquet("../../../data/CLEF_overview_paper_with_ids_of_citing_papers.parquet")