In [29]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [30]:
# Load the per-venue metadata tables (one row per document) for TREC and CEUR.
df_trec = pd.read_parquet("../../../data/metadata_TREC.parquet")
df_ceur = pd.read_parquet("../../../data/metadata_CEUR.parquet")
# NOTE(review): a `df_lncs.loc[...]` correction for document "lncs_649" used to follow here.
# `df_lncs` is neither loaded nor used anywhere in the visible notebook, so that statement
# raised NameError on a fresh kernel; it presumably belongs in the LNCS notebook -- confirm.

## Assign corresponding Labs to the documents of CEUR

In [31]:
# Load the lab concordance: each key is a lab name, each value the collection of
# abbreviations / spellings that identify this lab inside a "Section" string.
with open('../../../data/abbreviations_CLEF_CEUR.json', 'r', encoding="utf-8") as file:
    matching_labs = json.load(file)

# For every CEUR document collect all labs that have at least one abbreviation contained in
# the document's "Section" (plain, case-sensitive substring test).
# Walking the concordance once per document yields each lab at most once and always in the
# key order of the JSON file, so the lists are reproducible across runs -- unlike
# list(set(...)), whose element order depends on string hashing.
labs = [
    [
        lab
        for lab, abbreviations in matching_labs.items()
        if any(abbreviation in section for abbreviation in abbreviations)
    ]
    for section in df_ceur["Section"]
]

# Store the unified lab names as a list-valued column (documents without a match get []).
df_ceur["Labs"] = labs

## Assign corresponding Tracks to the documents of TREC

In [32]:
# Track assignment per TREC document; it is joined to df_trec via the shared "ID" column.
track_assignments = pd.read_parquet(
    "../../../data/Assign_tracks_by_ID_for_TREC.parquet"
)

In [33]:
# Attach the track assignment to every TREC document. The left join keeps documents that
# have no entry in track_assignments (their "Tracks" value is then missing).
# A "Tracks" column left over from an earlier execution of this cell is dropped first:
# without that, re-running the cell would produce "Tracks_x"/"Tracks_y" and break the
# later explode("Tracks") step.
df_trec = (
    df_trec
    .drop(columns="Tracks", errors="ignore")
    .merge(track_assignments, how="left", on="ID")
)

## Determine the runtime (number of active years) of the Tracks and Labs

### Starting with TREC

In [34]:
# One row per (document, track) combination: list-like "Tracks" entries are expanded into
# separate rows, scalar entries are kept unchanged.
df_trec_explode = df_trec.explode(column="Tracks")

In [35]:
# Display the exploded TREC frame to check the "Tracks" column.
df_trec_explode

Unnamed: 0,PubYear,url,Authors,Title,Section,filename,filepath,ID,Tracks
0,2000,http://trec.nist.gov/pubs/trec9/papers/overvie...,"[Ellen M. Voorhees, Donna Harman]",Overview of the Ninth Text REtrieval Conferenc...,Uncategorized,overview_9.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1,Overview
1,2000,http://trec.nist.gov/pubs/trec9/papers/trec9-c...,"[Fredric C. Gey, Aitao Chen]",TREC-9 Cross-Language Information Retrieval (E...,Uncategorized,trec9-clir-overview.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_2,Cross-Language
2,2000,http://trec.nist.gov/pubs/trec9/papers/filteri...,"[Stephen E. Robertson, David A. Hull]",The TREC-9 Filtering Track Final Report.,Uncategorized,filtering_new.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_3,Filtering
3,2000,http://trec.nist.gov/pubs/trec9/papers/t9irep.pdf,"[William R. Hersh, Paul Over]",The TREC-9 Interactive Track Report.,Uncategorized,t9irep.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_4,Interactive
4,2000,http://trec.nist.gov/pubs/trec9/papers/liggett...,"[Walter Liggett, Chris Buckley]",Query Expansion Seen Through Return Order of R...,Uncategorized,liggett.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_5,Evaluation
...,...,...,...,...,...,...,...,...,...
1967,2019,https://trec.nist.gov/pubs/trec28/papers/OVERV...,"[Laura Dietz, John Foley]",TREC CAR Y3: Complex Answer Retrieval Overview,Overview,OVERVIEW.CAR.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1968,Complex Answer Retrieval
1968,2007,https://trec.nist.gov/pubs/trec16/papers/umelb...,"[William Webber, Vo Ngoc Anh, Alistair Moffat]",The University of Melbourne in the Million Que...,Participant,umelbourne.ngoc-ahn.MQ.final.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1969,Million Query
1969,2020,https://trec.nist.gov/pubs/trec29/papers/OVERV...,"[Asia J. Biega, Fernando Diaz, Michael D. Ekst...",Overview of the TREC 2020 Fair Ranking Track∗,Overview,OVERVIEW.FR.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1970,Fair Ranking
1970,2017,https://trec.nist.gov/pubs/trec26/papers/NOVAS...,"[Gonçalo Araújo, André Mourão, João Magalhães]",NOVASearch at Precision Medicine 2017,Participant,NOVASearch-PM.pdf,D:/Studium/Masterarbeit/PythonProject/data/PDF...,trec_1971,Precision Medicine


In [36]:
# Runtime of a track = number of distinct publication years in which the track occurs.
track_years = (
    df_trec_explode
    .groupby('Tracks')['PubYear']
    .nunique()
    .reset_index()
)

In [37]:
# "Overview" and "Others" are catch-all classifications that do not refer to a specific
# track, so they are excluded from the runtime statistics.
track_years = track_years[~track_years['Tracks'].isin(['Overview', 'Others'])]

In [38]:
# Display the number of distinct publication years per TREC track.
track_years

Unnamed: 0,Tracks,PubYear
0,Blog,5
1,CENTRE,1
2,Chemical,3
3,Clinical Decision Support,3
4,Clinical Trials,2
5,Common Core,2
6,Complex Answer Retrieval,3
7,Contextual Suggestion,5
8,Conversational Assistance,4
9,CrisisFACTs,1


In [39]:
# Average runtime of the TREC tracks in the period 2000-2022.
# track_years['PubYear'] holds the number of distinct publication years per track, so the
# mean is a runtime in years -- not an average publication year, as the old label claimed.
average_pub_year_trec = track_years['PubYear'].mean()

print(f"The average runtime of a TREC track (in years) is: {average_pub_year_trec}")


The average of the publication year is: 3.2115384615384617


In [41]:
# Sample standard deviation (ddof=1, the pandas default) of the TREC track runtimes.
std_pub_year_trec = track_years['PubYear'].std(ddof=1)

In [60]:
# Outlier cut-off for TREC: mean runtime plus two standard deviations.
threshold = average_pub_year_trec + std_pub_year_trec * 2

In [61]:
# Tracks whose runtime lies strictly above the cut-off.
outliers = track_years.loc[track_years['PubYear'].gt(threshold)]

In [62]:
# Display the TREC tracks whose runtime exceeds the threshold.
outliers

Unnamed: 0,Tracks,PubYear
41,Question Answering,8
53,Web,11


### Analysis of CEUR Labs

In [16]:
# One row per (document, lab) combination: the list-valued "Labs" column is expanded into
# separate rows; documents with an empty list keep a single row with a missing lab.
df_ceur_explode = df_ceur.explode(column="Labs")

In [18]:
# Runtime of a lab = number of distinct publication years in which the lab occurs.
lab_years = (
    df_ceur_explode
    .groupby('Labs')['PubYear']
    .nunique()
    .reset_index()
)

In [19]:
# Display the number of distinct publication years per CEUR lab.
lab_years

Unnamed: 0,Labs,PubYear
0,ARQMath,3
1,Adhoc IR Track,4
2,BioASQ,3
3,CENTRE@CLEF,2
4,CHiC,3
5,CL-SDR,2
6,CL-SR,3
7,CLEF-ER,1
8,CLEF-IP,5
9,CLIR,6


In [20]:
# Average runtime of the CEUR labs in the period 2000-2022.
# lab_years['PubYear'] holds the number of distinct publication years per lab, so the
# mean is a runtime in years -- not an average publication year, as the old label claimed.
average_pub_year_ceur = lab_years['PubYear'].mean()

print(f"The average runtime of a CEUR lab (in years) is: {average_pub_year_ceur}")

The average of the publication year is: 3.673076923076923


In [47]:
# Sample standard deviation (ddof=1, the pandas default) of the CEUR lab runtimes.
std_pub_year_ceur = lab_years['PubYear'].std(ddof=1)

In [57]:
# Outlier cut-off for CEUR: mean runtime plus two standard deviations.
# Note: this rebinds the name `threshold` that the TREC section used before.
threshold = average_pub_year_ceur + std_pub_year_ceur * 2

In [58]:
# Labs whose runtime lies strictly above the cut-off (rebinds `outliers` from the TREC section).
outliers = lab_years.loc[lab_years['PubYear'].gt(threshold)]

In [59]:
# Display the CEUR labs whose runtime exceeds the threshold.
outliers

Unnamed: 0,Labs,PubYear
21,ImageCLEF,20
35,PAN,13
38,Question Answering,13


## Testing for significant differences between the runtimes of TREC tracks and CEUR labs

In [28]:
# Compare the runtimes (number of distinct publication years) of TREC tracks and CEUR labs.
# `stats` (scipy) and `np` (numpy) are imported in the import cell at the top of the notebook.
ALPHA = 0.05  # significance level used for the assumption checks below

# Extract the runtimes for the two venues
runtime_trec = track_years['PubYear']
runtime_ceur = lab_years['PubYear']

# Shapiro-Wilk test for normality (H0: the sample is drawn from a normal distribution)
shapiro_trec_stat, shapiro_trec_p = stats.shapiro(runtime_trec)
shapiro_ceur_stat, shapiro_ceur_p = stats.shapiro(runtime_ceur)

print(f'Shapiro-Wilk Test for TREC track runtimes: Statistic={shapiro_trec_stat}, P-Value={shapiro_trec_p}')
print(f'Shapiro-Wilk Test for CEUR lab runtimes: Statistic={shapiro_ceur_stat}, P-Value={shapiro_ceur_p}')

# Levene test for equal variances (H0: both samples have the same variance)
levene_stat, levene_p = stats.levene(runtime_trec, runtime_ceur)

print(f'Levene Test: Statistic={levene_stat}, P-Value={levene_p}')

# Report whether the assumptions of a standard T-Test (normality, equal variances) hold
if shapiro_trec_p > ALPHA and shapiro_ceur_p > ALPHA:
    print("Data is normally distributed.")
else:
    print("Data is not normally distributed. Consider using the Welch test or the Mann-Whitney U test.")

if levene_p > ALPHA:
    print("Variances are equal.")
else:
    print("Variances are unequal. Consider using the Welch test instead of the standard T-Test.")

# Perform the two-sided Mann-Whitney U test. It is rank-based and therefore does not depend
# on the normality / equal-variance assumptions checked above, so it is run in every case.
# The reported statistic is the U statistic, not a t statistic.
mann_whitney_stat, mann_whitney_p = stats.mannwhitneyu(runtime_trec, runtime_ceur, alternative='two-sided')

print(f'U-Statistic: {mann_whitney_stat}')
print(f'P-Value: {mann_whitney_p}')

# Medians as the location measure that matches the rank-based test
median_A = np.median(runtime_trec)  # TREC tracks
median_B = np.median(runtime_ceur)  # CEUR labs

print(f'Median runtime of TREC tracks (years): {median_A}')
print(f'Median runtime of CEUR labs (years): {median_B}')

Shapiro-Wilk Test for OpenAlex LNCS: Statistic=0.804054055720903, P-Value=7.421018270049134e-07
Shapiro-Wilk Test for OpenAlex CEUR: Statistic=0.6986854546656287, P-Value=4.9283481080108245e-09
Levene Test: Statistic=5.765306808253242, P-Value=0.018156720354996127
Data is not normally distributed. Consider using the Welch test or the Mann-Whitney U test.
Variances are unequal. Consider using the Welch test instead of the standard T-Test.
T-Statistic: 1487.5
P-Value: 0.36773136910860604
Median Citations for OpenAlex LNCS: 3.0
Median Citations for OpenAlex CEUR: 3.0
