In [1]:
import json
import pandas as pd
from thefuzz import fuzz
from thefuzz import process
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders
import numpy as np

In [2]:
def get_self_citing_paperIDs_from_OpenAlex(path_openalex, list_of_names, source_type):
    """
    Extract, per cited paper, the citing documents published in one of the given sources.

    The JSON file is expected to hold a mapping from a cited-paper key to a list
    of citing-document records. A citing document is kept when its
    ``primary_location.source`` has ``type == source_type`` and a
    ``display_name`` contained in ``list_of_names``.

    Args:
        path_openalex (str): Path to the JSON file including the citing documents
        list_of_names (list): Source display names (journal/conference names) that
            are related to the observed proceedings
        source_type (str): Source type to keep, e.g. "journal" or "conference"

    Returns:
        pandas.DataFrame: One row per key of the JSON file with the columns
        ``"ID"`` (the key) and ``f"{source_type}s"`` (list of ``[id, title]``
        pairs of the matching citing documents; empty list if none match).
    """
    # Read in the JSON file with the citing documents
    with open(path_openalex, 'r', encoding="utf-8") as file:
        citing_docs_by_paper = json.load(file)

    rows = []
    for paper_id, citing_docs in citing_docs_by_paper.items():
        matches = []
        for doc in citing_docs:
            # primary_location and source may be missing or null in a record
            source = (doc.get("primary_location") or {}).get("source")
            if not source or source.get("type") != source_type:
                continue
            try:
                if source["display_name"] in list_of_names:
                    matches.append([doc["id"], doc["title"]])
            except KeyError:
                # Records lacking display_name, id or title are skipped;
                # any other error is a real problem and must surface.
                continue
        rows.append([paper_id, matches])

    # One row per cited paper, including papers without any matching citing document
    df_OA = pd.DataFrame(rows, columns=["ID", f"{source_type}s"])

    return df_OA

In [3]:
# Creating the dataframes with the citing documents for all three proceedings.
# The result column is named f"{source_type}s", i.e. "journals" or "conferences".
list_of_names = ["CLEF (Working Notes)", "CLEF (Online Working Notes/Labs/Workshop)"]

df_ceur_journals = get_self_citing_paperIDs_from_OpenAlex("../../../data/OpenAlex_CEUR_citing_doc.json", list_of_names, source_type="journal")
df_lncs_journals = get_self_citing_paperIDs_from_OpenAlex("../../../data/OpenAlex_LNCS_citing_doc.json", ["Cross-Language Evaluation Forum"], source_type="conference")
df_trec_journals = get_self_citing_paperIDs_from_OpenAlex("../../../data/OpenAlex_TREC_citing_doc.json", ["Text REtrieval Conference"], source_type="conference")

## Analyzing self-citation by Overview Papers (CEUR)

In [4]:
# Flatten the per-paper lists into a single list of [id, title] pairs
ceur_citing_docs = [doc for docs in df_ceur_journals["journals"] for doc in docs]

In [5]:
# Inspect the collected [id, title] pairs of the citing CEUR documents
ceur_citing_docs

[['https://openalex.org/W2750981090',
  'Image-based Plant Species Identification with Deep Convolutional Neural Networks.'],
 ['https://openalex.org/W2403531298',
  'Applying LDA in contextual image retrieval ReDCAD participation at ImageCLEF Flickr Photo Retrieval 2012'],
 ['https://openalex.org/W2294894353',
  'The participation of IntermidiaLab at the ImageCLEF 2012 Photo Annotation Task'],
 ['https://openalex.org/W2401982302',
  'A Hybrid Tweet Contextualization System using IR and Summarization.'],
 ['https://openalex.org/W2406927329',
  'An Automatic System for Modality and Negation Detection.'],
 ['https://openalex.org/W2399497040',
  'Tweet Contextualization (Answering Tweet Question) - the Role of Multi-document Summarization.'],
 ['https://openalex.org/W2185044903', 'FBM-Yahoo! at RepLab 2012'],
 ['https://openalex.org/W2397760338',
  'Learning to Analyze Relevancy and Polarity of Tweets.'],
 ['https://openalex.org/W2182297626',
  'UTACLIR @ CLEF 2002: Towards a unified tran

In [6]:
# Load the manually curated list of Overview paper titles (one entry per line)
file_name = "../../../data/CEUR_Overview_titles.txt"

with open(file_name, encoding='utf-8') as titles_file:
    overview_paper_ceur = list(map(str.strip, titles_file))

In [7]:
# Load the OpenAlex JSON export for CEUR into memory
with open("../../../data/OpenAlex_CEUR.json", encoding="utf-8") as json_file:
    OpenAlexCitations_CEUR = json.loads(json_file.read())

In [8]:
# Minimum fuzz.ratio score for a title to count as a match with a curated Overview title
threshold = 70

# Collect each distinct [id, title] pair whose title matches at least one curated Overview title
comparison_list = []
for record in OpenAlexCitations_CEUR.values():
    matches_overview = any(
        fuzz.ratio(record["title"], overview_title) >= threshold
        for overview_title in overview_paper_ceur
    )
    if not matches_overview:
        continue
    candidate = [record["id"], record["title"]]
    if candidate not in comparison_list:
        comparison_list.append(candidate)

In [9]:
# Number of distinct [id, title] pairs that matched a curated Overview title
len(comparison_list)

400

In [10]:
# Minimum fuzz.ratio score for two titles to be treated as the same paper
threshold = 90

# A citing document is counted once if any Overview paper matches it by ID or by fuzzy title
count_matches = sum(
    any(
        citing_id == overview_id or fuzz.ratio(citing_title, overview_title) >= threshold
        for overview_id, overview_title in comparison_list
    )
    for citing_id, citing_title in ceur_citing_docs
)

print(f"Amount of Overlap: {count_matches}")

Amount of Overlap: 277


In [11]:
# Count citing documents whose ID is exactly the ID of an Overview paper.
# No fuzzy title matching is involved here, so no threshold is needed.
overview_ids = {overview_id for overview_id, _ in comparison_list}
count_matches = sum(citing_id in overview_ids for citing_id, _ in ceur_citing_docs)

print(f"Amount of Overlap: {count_matches}")

Amount of Overlap: 271


In [12]:
# Show the most recently computed overlap count
count_matches

271

## Analyzing self-citation by Overview Papers (LNCS)

In [13]:
# Flatten the per-paper lists into a single list of [id, title] pairs
lncs_citing_docs = [doc for docs in df_lncs_journals["conferences"] for doc in docs]

In [14]:
# Load the manually curated list of Overview paper titles (one entry per line)
file_name = "../../../data/LNCS_Overview_titles.txt"

with open(file_name, encoding='utf-8') as titles_file:
    overview_paper_lncs = list(map(str.strip, titles_file))

In [15]:
# Load the OpenAlex JSON export for LNCS into memory
with open("../../../data/OpenAlex_LNCS.json", encoding="utf-8") as json_file:
    OpenAlexCitations_LNCS = json.loads(json_file.read())

In [16]:
# Minimum fuzz.ratio score for a title to count as a match with a curated Overview title
threshold = 70

# Collect each distinct [id, title] pair whose title matches at least one curated Overview title
comparison_list = []
for record in OpenAlexCitations_LNCS.values():
    matches_overview = any(
        fuzz.ratio(record["title"], overview_title) >= threshold
        for overview_title in overview_paper_lncs
    )
    if not matches_overview:
        continue
    candidate = [record["id"], record["title"]]
    if candidate not in comparison_list:
        comparison_list.append(candidate)

In [17]:
# Minimum fuzz.ratio score for two titles to be treated as the same paper
threshold = 70

# A citing document is counted once if any Overview paper matches it by ID or by fuzzy title
count_matches = sum(
    any(
        citing_id == overview_id or fuzz.ratio(citing_title, overview_title) >= threshold
        for overview_id, overview_title in comparison_list
    )
    for citing_id, citing_title in lncs_citing_docs
)

print(f"Amount of Overlap: {count_matches}")

Amount of Overlap: 2


In [18]:
# Count citing documents whose ID is exactly the ID of an Overview paper.
# No fuzzy title matching is involved here, so no threshold is needed.
overview_ids = {overview_id for overview_id, _ in comparison_list}
count_matches = sum(citing_id in overview_ids for citing_id, _ in lncs_citing_docs)

print(f"Amount of Overlap: {count_matches}")

Amount of Overlap: 0


## Analyzing self-citation by Overview Papers (TREC)

In [19]:
# Flatten the per-paper lists into a single list of [id, title] pairs
trec_citing_docs = [doc for docs in df_trec_journals["conferences"] for doc in docs]

In [20]:
# Load the manually curated list of Overview paper titles (one entry per line)
file_name = "../../../data/TREC_Overview_titles.txt"

with open(file_name, encoding='utf-8') as titles_file:
    overview_paper_trec = list(map(str.strip, titles_file))

In [21]:
# Load the OpenAlex JSON export for TREC into memory
with open("../../../data/OpenAlex_TREC.json", encoding="utf-8") as json_file:
    OpenAlexCitations_TREC = json.loads(json_file.read())

In [22]:
# Minimum fuzz.ratio score for a title to count as a match with a curated Overview title
threshold = 70

# Collect each distinct [id, title] pair whose title matches at least one curated Overview title
comparison_list = []
for record in OpenAlexCitations_TREC.values():
    matches_overview = any(
        fuzz.ratio(record["title"], overview_title) >= threshold
        for overview_title in overview_paper_trec
    )
    if not matches_overview:
        continue
    candidate = [record["id"], record["title"]]
    if candidate not in comparison_list:
        comparison_list.append(candidate)

In [23]:
# Minimum fuzz.ratio score for two titles to be treated as the same paper
threshold = 70

# A citing document is counted once if any Overview paper matches it by ID or by fuzzy title
count_matches = sum(
    any(
        citing_id == overview_id or fuzz.ratio(citing_title, overview_title) >= threshold
        for overview_id, overview_title in comparison_list
    )
    for citing_id, citing_title in trec_citing_docs
)

print(f"Amount of Overlap: {count_matches}")

Amount of Overlap: 447


In [24]:
# Count citing documents whose ID is exactly the ID of an Overview paper.
# No fuzzy title matching is involved here, so no threshold is needed.
overview_ids = {overview_id for overview_id, _ in comparison_list}
count_matches = sum(citing_id in overview_ids for citing_id, _ in trec_citing_docs)

print(f"Amount of Overlap: {count_matches}")

Amount of Overlap: 268
