In [1]:
import json
import pandas as pd
from thefuzz import fuzz
from thefuzz import process
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders
import numpy as np
import matplotlib.pyplot as plt

## Self-citation analysis TREC

In [2]:
# Load the OpenAlex dump for TREC; later cells iterate it as {document ID: work record}.
with open('../../../data/OpenAlex_TREC.json', mode='r', encoding="utf-8") as trec_works_fh:
    OpenAlexTrec = json.load(trec_works_fh)

In [3]:
# Load the citing-document dump for TREC; later cells index it by document ID
# and iterate over the citing works stored under each ID.
with open('../../../data/OpenAlex_TREC_citing_doc.json', mode='r', encoding="utf-8") as trec_citing_fh:
    OpenAlex_citing_Trec = json.load(trec_citing_fh)

In [8]:
# Load the TREC metadata table (columns used below: 'Section', 'Title', 'ID').
df_trec = pd.read_parquet("../../../data/metadata_TREC.parquet")

In [9]:
# Concordance dict for classifying TREC documents: maps each unified label
# ("Participant" or "Overview") to the section headers given in DBLP that belong to it.
concordance_dict_trec_overview = {
    "Participant": ["Papers", "Participant", "Other Papers", "Participant Papers", "Uncategorized"],
    "Overview": ["Track Overviews", "Overview Papers", "Overview"],
}

In [10]:
def replace_values(val, concordance=None):
    """Map a raw section header to its unified classification label.

    Args:
        val: Section header to look up.
        concordance: Optional mapping ``label -> list of raw headers``. When
            omitted, the module-level ``concordance_dict_trec_overview`` is
            used, so ``Series.apply(replace_values)`` keeps working unchanged.

    Returns:
        The first label whose header list contains ``val``, or ``val`` itself
        (unchanged) when no header list contains it.
    """
    if concordance is None:
        # Resolved at call time so the function always sees the current global dict.
        concordance = concordance_dict_trec_overview
    for label, headers in concordance.items():
        if val in headers:
            return label
    return val

# Apply the concordance dict on TREC: derive 'Classification' from each row's 'Section'
# header; headers not listed in the concordance are carried over unchanged.
df_trec['Classification'] = df_trec['Section'].apply(replace_values)

In [4]:
# Read the manually curated list of Overview paper titles (one entry per line of the file).

file_name = "../../../data/TREC_Overview_titles.txt"

overview_paper_trec = []

with open(file_name, mode='r', encoding='utf-8') as titles_fh:
    # Strip surrounding whitespace (including the trailing newline) from every line.
    # NOTE(review): a blank line in the file ends up as an empty-string entry.
    overview_paper_trec = list(map(str.strip, titles_fh))

In [11]:
# Force the label 'Overview' for every TREC document whose title is in the curated
# list; all other rows keep the classification they already have.

is_curated_overview = df_trec['Title'].isin(overview_paper_trec)
df_trec.loc[is_curated_overview, 'Classification'] = 'Overview'

In [12]:
# Display the TREC metadata including the derived 'Classification' column.
df_trec

Unnamed: 0,PubYear,url,Authors,Title,Section,filename,filepath,ID,Classification
0,2000,http://trec.nist.gov/pubs/trec9/papers/overvie...,"[Ellen M. Voorhees, Donna Harman]",Overview of the Ninth Text REtrieval Conferenc...,Uncategorized,overview_9.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1,Overview
1,2000,http://trec.nist.gov/pubs/trec9/papers/trec9-c...,"[Fredric C. Gey, Aitao Chen]",TREC-9 Cross-Language Information Retrieval (E...,Uncategorized,trec9-clir-overview.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_2,Overview
2,2000,http://trec.nist.gov/pubs/trec9/papers/filteri...,"[Stephen E. Robertson, David A. Hull]",The TREC-9 Filtering Track Final Report.,Uncategorized,filtering_new.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_3,Overview
3,2000,http://trec.nist.gov/pubs/trec9/papers/t9irep.pdf,"[William R. Hersh, Paul Over]",The TREC-9 Interactive Track Report.,Uncategorized,t9irep.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_4,Overview
4,2000,http://trec.nist.gov/pubs/trec9/papers/liggett...,"[Walter Liggett, Chris Buckley]",Query Expansion Seen Through Return Order of R...,Uncategorized,liggett.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_5,Participant
...,...,...,...,...,...,...,...,...,...
25,2019,https://trec.nist.gov/pubs/trec28/papers/OVERV...,"[Laura Dietz, John Foley]",TREC CAR Y3: Complex Answer Retrieval Overview,Overview,OVERVIEW.CAR.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1968,Overview
26,2007,https://trec.nist.gov/pubs/trec16/papers/umelb...,"[William Webber, Vo Ngoc Anh, Alistair Moffat]",The University of Melbourne in the Million Que...,Participant,umelbourne.ngoc-ahn.MQ.final.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1969,Participant
27,2020,https://trec.nist.gov/pubs/trec29/papers/OVERV...,"[Asia J. Biega, Fernando Diaz, Michael D. Ekst...",Overview of the TREC 2020 Fair Ranking Track∗,Overview,OVERVIEW.FR.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1970,Overview
28,2017,https://trec.nist.gov/pubs/trec26/papers/NOVAS...,"[Gonçalo Araújo, André Mourão, João Magalhães]",NOVASearch at Precision Medicine 2017,Participant,NOVASearch-PM.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1971,Participant


In [13]:
# Extract the IDs referring to Overview Papers within TREC
# (row order of df_trec is preserved in the resulting list).

is_overview = df_trec["Classification"] == "Overview"
overview_paper_id_list = df_trec.loc[is_overview, "ID"].tolist()

In [15]:
# Number of TREC documents classified as Overview.
len(overview_paper_id_list)

161

In [17]:
# NOTE(review): `author_dict` is first assigned in a later cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError; the output shown here
# stems from an out-of-order execution. Move this cell below the one that builds it.
author_dict

{'https://openalex.org/A5011542448': ['trec_521', 'trec_773', 'trec_665'],
 'https://openalex.org/A5100442592': ['trec_521'],
 'https://openalex.org/A5026261763': ['trec_521', 'trec_773', 'trec_665'],
 'https://openalex.org/A5044025684': ['trec_521'],
 'https://openalex.org/A5067106906': ['trec_652',
  'trec_1192',
  'trec_722',
  'trec_762',
  'trec_953',
  'trec_1249',
  'trec_1283',
  'trec_1408',
  'trec_825',
  'trec_918',
  'trec_1322'],
 'https://openalex.org/A5027541399': ['trec_652',
  'trec_762',
  'trec_953',
  'trec_887',
  'trec_918'],
 'https://openalex.org/A5036777186': ['trec_652'],
 'https://openalex.org/A5049302282': ['trec_770'],
 'https://openalex.org/A5035835846': ['trec_770', 'trec_413'],
 'https://openalex.org/A5060107942': ['trec_770'],
 'https://openalex.org/A5043487256': ['trec_849'],
 'https://openalex.org/A5085925326': ['trec_849',
  'trec_470',
  'trec_540',
  'trec_662',
  'trec_768',
  'trec_769',
  'trec_848',
  'trec_850',
  'trec_1000',
  'trec_1014',


In [20]:
# Assign all works of an author (that are not Overview papers) to the corresponding author.
# Result: author_dict maps an author ID to the list of document IDs that author (co-)authored.

# A set gives constant-time membership tests instead of scanning the ID list
# once for every authorship entry.
overview_paper_ids = set(overview_paper_id_list)

author_dict = {}
for doc_id, work in OpenAlexTrec.items():
    if doc_id in overview_paper_ids:
        # Overview papers are excluded from the self-citation analysis.
        continue
    for authorship in work["authorships"]:
        # setdefault creates the author's list on first sight, then appends.
        author_dict.setdefault(authorship["author"]["id"], []).append(doc_id)


# Assign all authors of works citing author X to author X.
# Only citing works whose primary source is named "Text REtrieval Conference" are
# considered. An author only gets an entry once at least one such citing author is found.
author_citing_author_dict = {}
for cited_author_id, authored_doc_ids in author_dict.items():
    for doc_id in authored_doc_ids:
        for citing_doc in OpenAlex_citing_Trec[doc_id]:
            try:
                if citing_doc["primary_location"]["source"]["display_name"] != "Text REtrieval Conference":
                    continue
                for authorship in citing_doc["authorships"]:
                    # Resolve the ID first so a malformed authorship never creates an empty entry.
                    citing_author_id = authorship["author"]["id"]
                    author_citing_author_dict.setdefault(cited_author_id, []).append(citing_author_id)
            except (KeyError, TypeError):
                # Incomplete record (missing key or a None where a dict is expected):
                # skip the rest of this citing work. Other exceptions are real errors
                # and are deliberately no longer swallowed.
                continue


In [21]:
# Inspect the mapping: cited author ID -> author IDs of the citing TREC works
# (one list entry per citing authorship, so IDs can repeat).
author_citing_author_dict

{'https://openalex.org/A5011542448': ['https://openalex.org/A5000825345',
  'https://openalex.org/A5057643560',
  'https://openalex.org/A5103134427',
  'https://openalex.org/A5090941074',
  'https://openalex.org/A5079046603',
  'https://openalex.org/A5102059411',
  'https://openalex.org/A5050532724',
  'https://openalex.org/A5055679306',
  'https://openalex.org/A5059003753',
  'https://openalex.org/A5068408717',
  'https://openalex.org/A5026261763',
  'https://openalex.org/A5011086547',
  'https://openalex.org/A5101631796',
  'https://openalex.org/A5009336048',
  'https://openalex.org/A5001571390',
  'https://openalex.org/A5011542448',
  'https://openalex.org/A5102150023',
  'https://openalex.org/A5047897879',
  'https://openalex.org/A5100831210',
  'https://openalex.org/A5100511387',
  'https://openalex.org/A5029998682',
  'https://openalex.org/A5090941074',
  'https://openalex.org/A5057643560',
  'https://openalex.org/A5079046603',
  'https://openalex.org/A5103134427',
  'https://ope

In [31]:
# Count how often each author appears among the authors of the works citing them;
# every entry is a [author ID, self-citation count] pair.
self_citation_list = [
    [author_id, citing_author_ids.count(author_id)]
    for author_id, citing_author_ids in author_citing_author_dict.items()
]

In [32]:
# Sort in place by the self-citation count (second element), highest first.
self_citation_list.sort(key=lambda x: (x[1]), reverse=True)


In [33]:
# Show the [author ID, self-citation count] pairs for TREC, highest count first.
self_citation_list

[['https://openalex.org/A5079046603', 29],
 ['https://openalex.org/A5063647013', 28],
 ['https://openalex.org/A5057643560', 23],
 ['https://openalex.org/A5042384742', 19],
 ['https://openalex.org/A5044511901', 16],
 ['https://openalex.org/A5031439294', 15],
 ['https://openalex.org/A5057657785', 14],
 ['https://openalex.org/A5091440467', 13],
 ['https://openalex.org/A5029998682', 10],
 ['https://openalex.org/A5083507657', 10],
 ['https://openalex.org/A5081925956', 9],
 ['https://openalex.org/A5059926999', 8],
 ['https://openalex.org/A5046292035', 7],
 ['https://openalex.org/A5002303994', 7],
 ['https://openalex.org/A5014322854', 7],
 ['https://openalex.org/A5027915931', 7],
 ['https://openalex.org/A5011782863', 7],
 ['https://openalex.org/A5074408622', 7],
 ['https://openalex.org/A5073522595', 7],
 ['https://openalex.org/A5036270568', 6],
 ['https://openalex.org/A5008248743', 6],
 ['https://openalex.org/A5086495684', 6],
 ['https://openalex.org/A5084091807', 5],
 ['https://openalex.org/

## Self-citation analysis CEUR

In [36]:
# Load the OpenAlex dump for CEUR; later cells iterate it as {document ID: work record}.
with open("../../../data/OpenAlex_CEUR.json", mode='r', encoding="utf-8") as ceur_works_fh:
    OpenAlexCitations_CEUR = json.load(ceur_works_fh)

In [37]:
# Load the citing-document dump for CEUR; later cells index it by document ID
# and iterate over the citing works stored under each ID.
with open('../../../data/OpenAlex_CEUR_citing_doc.json', mode='r', encoding="utf-8") as ceur_citing_fh:
    OpenAlex_citing_CEUR = json.load(ceur_citing_fh)

In [38]:
# Load the CEUR metadata table (columns used below: 'Title', 'ID').
df_ceur = pd.read_parquet("../../../data/metadata_CEUR.parquet")

In [39]:
file_name = "../../../data/CEUR_Overview_titles.txt"

overview_paper_ceur = []

# Read the file with the titles of the CEUR Overview papers (one entry per line).
with open(file_name, mode='r', encoding='utf-8') as titles_fh:
    # Strip surrounding whitespace (including the trailing newline) from every line.
    # NOTE(review): a blank line in the file ends up as an empty-string entry.
    overview_paper_ceur = list(map(str.strip, titles_fh))

In [40]:
# Label every CEUR document: 'Overview' when its title is in the curated list,
# otherwise 'Participant'. The list is aligned with the row order of df_ceur.

overview_participant_classification = [
    "Overview" if title in overview_paper_ceur else "Participant"
    for title in df_ceur["Title"]
]

In [41]:
# Attach the per-row labels computed above as the 'Classification' column.
df_ceur["Classification"] = overview_participant_classification

In [42]:
# Extract the IDs referring to Overview Papers within CEUR
# (this rebinds overview_paper_id_list, replacing the TREC IDs collected earlier).

is_overview = df_ceur["Classification"] == "Overview"
overview_paper_id_list = df_ceur.loc[is_overview, "ID"].tolist()

In [43]:
# Assign all works of an author (that are not Overview papers) to the corresponding author.
# Result: author_dict maps an author ID to the list of document IDs that author (co-)authored.

# A set gives constant-time membership tests instead of scanning the ID list
# once for every authorship entry.
overview_paper_ids = set(overview_paper_id_list)

author_dict = {}
for doc_id, work in OpenAlexCitations_CEUR.items():
    if doc_id in overview_paper_ids:
        # Overview papers are excluded from the self-citation analysis.
        continue
    for authorship in work["authorships"]:
        # setdefault creates the author's list on first sight, then appends.
        author_dict.setdefault(authorship["author"]["id"], []).append(doc_id)

# Assign all authors of works citing author X to author X.
# Only citing works whose primary source carries one of the two CLEF working-notes
# names are considered. An author only gets an entry once at least one such citing
# author is found.

author_citing_author_dict = {}
for cited_author_id, authored_doc_ids in author_dict.items():
    for doc_id in authored_doc_ids:
        for citing_doc in OpenAlex_citing_CEUR[doc_id]:
            try:
                if citing_doc["primary_location"]["source"]["display_name"] not in ["CLEF (Working Notes)", "CLEF (Online Working Notes/Labs/Workshop)"]:
                    continue
                for authorship in citing_doc["authorships"]:
                    # Resolve the ID first so a malformed authorship never creates an empty entry.
                    citing_author_id = authorship["author"]["id"]
                    author_citing_author_dict.setdefault(cited_author_id, []).append(citing_author_id)
            except (KeyError, TypeError):
                # Incomplete record (missing key or a None where a dict is expected):
                # skip the rest of this citing work. Other exceptions are real errors
                # and are deliberately no longer swallowed.
                continue


In [44]:
# Count how often each author appears among the authors of the works citing them;
# every entry is a [author ID, self-citation count] pair.

self_citation_list = [
    [author_id, citing_author_ids.count(author_id)]
    for author_id, citing_author_ids in author_citing_author_dict.items()
]

In [45]:
# Sort in place by the self-citation count (second element), highest first.
self_citation_list.sort(key=lambda x: (x[1]), reverse=True)


In [46]:
# Show the [author ID, self-citation count] pairs for CEUR, highest count first.
self_citation_list

[['https://openalex.org/A5071056491', 19],
 ['https://openalex.org/A5013027317', 13],
 ['https://openalex.org/A5035307573', 12],
 ['https://openalex.org/A5047951827', 12],
 ['https://openalex.org/A5032988611', 12],
 ['https://openalex.org/A5046327361', 12],
 ['https://openalex.org/A5067971909', 11],
 ['https://openalex.org/A5016928206', 11],
 ['https://openalex.org/A5022095958', 11],
 ['https://openalex.org/A5063643522', 10],
 ['https://openalex.org/A5002392851', 10],
 ['https://openalex.org/A5015449918', 10],
 ['https://openalex.org/A5063045515', 9],
 ['https://openalex.org/A5042596878', 9],
 ['https://openalex.org/A5006218020', 9],
 ['https://openalex.org/A5063647013', 9],
 ['https://openalex.org/A5016894977', 8],
 ['https://openalex.org/A5091440467', 8],
 ['https://openalex.org/A5050019723', 8],
 ['https://openalex.org/A5019395301', 8],
 ['https://openalex.org/A5012010936', 8],
 ['https://openalex.org/A5086045338', 8],
 ['https://openalex.org/A5043424064', 8],
 ['https://openalex.or