In [1]:
import json
import pandas as pd
from scipy.stats import shapiro
from thefuzz import fuzz
from thefuzz import process
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders
from scipy.stats import levene
from scipy.stats import mannwhitneyu
import numpy as np

In [2]:
def get_journals_from_OpenAlex(path_openalex, source_type):
    """ Extracting journals that are citing a proceeding document 

    Extracts the corresponding journals names and their OpenAlex IDs, that are citing document with ID.
    
    Args:
        path_openalex (str): Path to the Json File including the citing documents
        source_type (str): Allows to select between Journal and Conference sources

    Returns:
        df_OA: Dataframe including the document ID and the corresponding journals 

    """
    # Readin the json file with the citing documents
    with open(path_openalex, 'r', encoding="utf-8") as file:
        OpenAlexCitations = json.load(file)

    journals_open_alex = []
    for i in OpenAlexCitations:
        list_journals = []
        for j in OpenAlexCitations[i]:
            if j.get("primary_location") and j["primary_location"].get("source") and j["primary_location"]["source"].get("type") == source_type:
                try:
                    # Adding the journal names and ids 
                    list_journals.append([j["primary_location"]["source"]["display_name"], j["primary_location"]["source"]["id"]])
                except:
                    continue
        journals_open_alex.append([i, list_journals])

    # Creating the dataframe including the ID and the corresponding journals
    df_OA = pd.DataFrame(journals_open_alex, columns=["ID", f"{source_type}s"])
    
    return df_OA

In [5]:
# Creating the dataframes with the citing journals for all three proceedings

df_ceur_journals = get_journals_from_OpenAlex("../../../data/OpenAlex_CEUR_citing_doc.json", source_type="journal")
df_lncs_journals = get_journals_from_OpenAlex("../../../data/OpenAlex_LNCS_citing_doc.json", source_type="journal")
df_trec_journals = get_journals_from_OpenAlex("../../../data/OpenAlex_TREC_citing_doc.json", source_type="journal")

In [7]:
def group_similar_strings(journal_name_list, threshold):
    """ Group strings by fuzzy-matching 

    Match up and aggregate journal names, that are similar to a certain degree depending on the choosen treshhold
        
    Args:
        strings (list): List of strings containing the found journal names
        treshold (int): Defines the percentage of overlap in order to aggregate two strings

    Returns:
        aggregated_journal_names: Nested list including the journal names

    """
    
    aggregated_journal_names = []

    # Create nested list for all journal names and set all on false to keep track which where already been tried to be aggregated
    visited = [False] * len(journal_name_list)

    for i in range(len(journal_name_list)):

        # If visited[i] was set to true, do not try to aggregate this journal name again and skip
        if visited[i]:
            continue  

        # Creating a new aggregation, starting with the actual journal name
        group = [journal_name_list[i]]  
        
        # Set the index of visited to true, in order to not aggregate these in the future again
        visited[i] = True  

        # Search for additional journal names that match with the one that is currently observed
        for j in range(i + 1, len(journal_name_list)):
            # Don't try to aggregate journal name again
            if visited[j]:
                continue  

            # Calculate the fuzzy match similarity between the current journal name and the following ones
            similarity = fuzz.ratio(journal_name_list[i][0], journal_name_list[j][0])

            # Aggregate the journal names if similarity is higher than the treshold
            if similarity >= threshold:
                group.append(journal_name_list[j])  
                visited[j] = True  

        aggregated_journal_names.append(group)  

    return aggregated_journal_names

In [8]:
def aggregate_unique_journals(df, threshold):
    """ Extract journal impact from OpenAlex 

    Extract the journal impact from all identified unique aggregated journal names by their IDs.
            
    Args:
        df (DataFrame): DataFrame created in get_journals_from_OpenAlex
        treshold (int): Defines the percentage of overlap in order to aggregate two strings

    Returns:
        journal_dict: Dictionary containing the an assigned journal ID as key and journal impact, name and OpenAlex ID as nested list 

    """
    # Identify unique journals by name
    unique_journals_open_alex = []
    for i in df["Journals"]:
        if type(i) == list:
            for j in i:
                if j not in unique_journals_open_alex:
                    unique_journals_open_alex.append(j)

    # Aggregate journal names by fuzzy matching the journal names 
    similar_groups = group_similar_strings(unique_journals_open_alex, threshold)
    for group in similar_groups:
        if len(group) > 1:
            print(group)

    # Creating dictionary containing the aggregated journal names under a single ID and additionally extract the journal impact from OpenAlex for all journals
    journal_dict = {}
    for i in range(len(similar_groups)):
        journal_dict[f"journal_{i}"] = similar_groups[i]
    for i,j in journal_dict.items():
        for k in j:
            impact_factor = Sources()[k[1].replace("https://openalex.org/", "")]["summary_stats"]["2yr_mean_citedness"]
            k.append(impact_factor)
    return journal_dict

In [9]:
# Aggregate the journal names and extract the journal impacts for all three proceedings

threshold = 95
journal_dict_ceur = aggregate_unique_journals(df_ceur_journals, threshold)

threshold = 95
journal_dict_lncs = aggregate_unique_journals(df_lncs_journals, threshold)

threshold = 95
journal_dict_trec = aggregate_unique_journals(df_trec_journals, threshold)

[['International journal of computer applications', 'https://openalex.org/S4210206007'], ['International journal of computers & applications', 'https://openalex.org/S87797848']]
[['Informatica', 'https://openalex.org/S57195646'], ['Informatica', 'https://openalex.org/S4210173311']]
[['IX Workshop de Investigadores en Ciencias de la Computación', 'https://openalex.org/S4306514727'], ['XII Workshop de Investigadores en Ciencias de la Computación', 'https://openalex.org/S4306535119']]
[['International journal of computer applications', 'https://openalex.org/S4210206007'], ['International journal of computers & applications', 'https://openalex.org/S87797848']]
[['XII Workshop de Investigadores en Ciencias de la Computación', 'https://openalex.org/S4306535119'], ['XIII Workshop de Investigadores en Ciencias de la Computación', 'https://openalex.org/S4306535111'], ['IX Workshop de Investigadores en Ciencias de la Computación', 'https://openalex.org/S4306514727']]
[['Information Research', 'h

In [63]:
def replace_journal_names_by_id(nested_lst):
    """ Replace journal names by journal ID

    Replacing the journal names assigned to a doc by the corresponding journal IDs-
    
    Args:
        nested_lst (lst): Nested list containing the journal name and the journal OpenAlex ID

    Returns:
        replaced_list: Contains a list of Journal IDs

    """
    replaced_list = []

    # Iterate through the assigned nested list of journals
    for sublist in nested_lst:
        # Replace journal name through Journal ID based on journal_dict_openalex_inverse
        replaced_sublist = journal_dict_openalex_inverse.get(sublist[0], sublist[0]) 
        replaced_list.append(replaced_sublist)

    return replaced_list

# Replace the journal names by the aggregated journal IDs
journal_dict_openalex_inverse = {v[0]: k for k, values in journal_dict_ceur.items() for v in values}
df_ceur_journals['journals'] = df_ceur_journals['journals'].apply(lambda x: replace_journal_names_by_id(x) if isinstance(x, list) else x)

In [64]:
journal_dict_openalex_inverse = {v[0]: k for k, values in journal_dict_lncs.items() for v in values}
df_lncs_journals['journals'] = df_lncs_journals['journals'].apply(lambda x: replace_journal_names_by_id(x) if isinstance(x, list) else x)

In [65]:
journal_dict_openalex_inverse = {v[0]: k for k, values in journal_dict_trec.items() for v in values}
df_trec_journals['journals'] = df_trec_journals['journals'].apply(lambda x: replace_journal_names_by_id(x) if isinstance(x, list) else x)

In [66]:
exploded_df_open_alex_ceur = df_ceur_journals.explode('journals')

# Explode the lists of citing journals to count total appearance in the context of CEUR
value_counts_open_alex_ceur = exploded_df_open_alex_ceur['journals'].value_counts()
value_counts_open_alex_ceur = value_counts_open_alex_ceur.reset_index()
value_counts_open_alex_ceur.columns = ['id', 'frequency']


In [67]:
value_counts_open_alex_ceur

Unnamed: 0,id,frequency
0,j,851


In [68]:
exploded_df_open_alex_lncs = df_lncs_journals.explode('journals')

# Häufigkeiten zählen
value_counts_open_alex_lncs = exploded_df_open_alex_lncs['journals'].value_counts()
value_counts_open_alex_lncs = value_counts_open_alex_lncs.reset_index()
value_counts_open_alex_lncs.columns = ['id', 'frequency']


In [69]:
exploded_df_open_alex_trec = df_trec_journals.explode('journals')

# Häufigkeiten zählen
value_counts_open_alex_trec = exploded_df_open_alex_trec['journals'].value_counts()
value_counts_open_alex_trec = value_counts_open_alex_trec.reset_index()
value_counts_open_alex_trec.columns = ['id', 'frequency']


In [70]:
def calculate_average_journal_impact(id, journal_dict):
    """ Calculate the average journal impact for aggregated journal names

        Calculate the average journal impact for aggregated journal names
        
    id:
        id (string): Journal ID representing one or more journal names

    Returns:
       third_values (float) : Average journal impact for aggregated journal names

    """
    nested_list = journal_dict.get(id, [])

    # Iterate through the nested list to gather all corresponding journal impacts
    journal_impact_list = [item[2] for item in nested_list if len(item) > 2]
    return np.mean(journal_impact_list) if journal_impact_list else np.nan

# Funktion zum Hinzufügen der Durchschnittswerte
def add_average_journal_impact(df, journal_dict ,source_type):
    """ Add average journal impact

        Assign and apply calculate_average_journal_impact to DataFrame
        
    Args:
        df (DataFrame): DataFrame containing the assigned journals for the proceeding documents
        journal_dict (Dict): Dictionary containing the journal impacts for every journal name
        source_type (str): Define the observation object for correct column naming
        
        
    Returns:
       df (DataFrame) : df with additional column for average journal impact 

    """

    # Assign a new column to the DataFrame containing the average journal impact for aggregated journal names
    df[f'average_{source_type}_impact'] = df['id'].apply(lambda x: calculate_average_journal_impact(x, journal_dict))
    return df

value_counts_open_alex_ceur = add_average_journal_impact(value_counts_open_alex_ceur, journal_dict_ceur, "journal")

In [71]:
value_counts_open_alex_lncs = add_average_journal_impact(value_counts_open_alex_lncs, journal_dict_lncs, "journal")

In [72]:
value_counts_open_alex_trec = add_average_journal_impact(value_counts_open_alex_trec, journal_dict_trec, "journal")

In [73]:
def add_journal_ids(df, journal_dict, source_type):
    """ Assign the corresponding journal names

        Assign the corresponding journal names in addition to the corresponding journal IDs
        
    Args:
        df (DataFrame): DataFrame containing the assigned journals for the proceeding documents
        journal_dict (Dict): Dictionary containing the journal names for every journal ID
        source_type (str): Define the observation object for correct column naming

    Returns:
       df (DataFrame) : df with additional column for corresponding journal names

    """
    df[f'{source_type}_names'] = df['id'].apply(lambda x: journal_dict.get(x, []))
    return df

# Anwenden der Funktion
value_counts_open_alex_ceur = add_journal_ids(value_counts_open_alex_ceur, journal_dict_ceur, "journal")

In [74]:
top_10 = value_counts_open_alex_ceur.head(11)


In [75]:
top_10

Unnamed: 0,id,frequency,average_journal_impact,journal_names
0,j,851,,[]


In [76]:
value_counts_open_alex_lncs = add_journal_ids(value_counts_open_alex_lncs, journal_dict_lncs, "journal")
top_10 = value_counts_open_alex_lncs.head(11)

In [77]:
top_10

Unnamed: 0,id,frequency,average_journal_impact,journal_names
0,j,443,,[]


In [78]:
value_counts_open_alex_ceur = add_journal_ids(value_counts_open_alex_trec, journal_dict_trec, "journal")
top_10 = value_counts_open_alex_trec.head(11)

In [79]:
top_10

Unnamed: 0,id,frequency,average_journal_impact,journal_names
0,j,2151,,[]


# Significance Tests

In [118]:
expanded_data_lncs = []

for _, row in value_counts_open_alex_lncs.iterrows():
    expanded_data_lncs.extend([row['average_journal_impact']] * int(row['frequency']))

# Umwandeln der replizierten Daten in ein DataFrame
expanded_df_lncs = pd.DataFrame(expanded_data_lncs, columns=['average_journal_impact'])

In [119]:
expanded_data_ceur = []

for _, row in value_counts_open_alex_ceur.iterrows():
    expanded_data_ceur.extend([row['average_journal_impact']] * int(row['frequency']))

# Umwandeln der replizierten Daten in ein DataFrame
expanded_df_ceur = pd.DataFrame(expanded_data_ceur, columns=['average_journal_impact'])

In [121]:
expanded_data_trec = []

for _, row in value_counts_open_alex_trec.iterrows():
    expanded_data_trec.extend([row['average_journal_impact']] * int(row['frequency']))

# Umwandeln der replizierten Daten in ein DataFrame
expanded_df_trec = pd.DataFrame(expanded_data_trec, columns=['average_journal_impact'])

In [122]:
expanded_df_trec

Unnamed: 0,average_third_value
0,0.000000
1,0.000000
2,0.000000
3,0.000000
4,0.000000
...,...
4919,0.000000
4920,3.788506
4921,1.019748
4922,0.620253


In [123]:
stat_a, p_value_a = shapiro(expanded_df_ceur['average_journal_impact'])
print(f"Shapiro-Wilk Test CEUR: Statistic = {stat_a}, p-Wert = {p_value_a}")

# Für Gruppe B
stat_b, p_value_b = shapiro(expanded_df_lncs['average_journal_impact'])
print(f"Shapiro-Wilk Test LNCS: Statistic = {stat_b}, p-Wert = {p_value_b}")

# Für Gruppe C
stat_c, p_value_c = shapiro(expanded_df_trec['average_journal_impact'])
print(f"Shapiro-Wilk Test TREC: Statistic = {stat_c}, p-Wert = {p_value_c}")

Shapiro-Wilk Test Gruppe A: Statistik = 0.6176913219079239, p-Wert = 1.316779083938014e-84
Shapiro-Wilk Test Gruppe B: Statistik = 0.5837190337939409, p-Wert = 3.381969043517878e-86
Shapiro-Wilk Test Gruppe B: Statistik = 0.5579554903282727, p-Wert = 3.798901261914976e-77


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [124]:
stat, p_value = levene(expanded_df_trec['average_journal_impact'], expanded_df_ceur['average_journal_impact'])
print(f"Levene Test: Statistic = {stat}, p-value = {p_value}")

Levene Test: Statistik = 0.5412804416251655, p-Wert = 0.4619162574620923


In [125]:
stat, p_value = levene(expanded_df_trec['average_journal_impact'], expanded_df_lncs['average_journal_impact'])
print(f"Levene Test: Statistic = {stat}, p-value = {p_value}")

Levene Test: Statistik = 1.9234570486653797, p-Wert = 0.16550081170942765


In [126]:
# Testing signififance between TREC and CEUR
u_stat_ab, p_value_ab = mannwhitneyu(expanded_df_trec['average_journal_impact'], expanded_df_ceur['average_journal_impact'])
print(f"Mann-Whitney U-Test TREC vs CEUR: U-Statistic = {u_stat_ab}, p-Wert = {p_value_ab}")

# Testing signififance between TREC and LNCS
u_stat_ac, p_value_ac = mannwhitneyu(expanded_df_trec['average_journal_impact'], expanded_df_lncs['average_journal_impact'])
print(f"Mann-Whitney U-Test TREC vs LNCS: U-Statistic = {u_stat_ac}, p-Wert = {p_value_ac}")

Mann-Whitney U-Test A vs B: U-Statistik = 13021792.5, p-Wert = 6.982984468971997e-194
Mann-Whitney U-Test A vs C: U-Statistik = 10471295.5, p-Wert = 0.0


In [127]:
median_a = expanded_df_trec['average_journal_impact'].median()
median_b = expanded_df_ceur['average_journal_impact'].median()

print(f"Median TREC: {median_a}")
print(f"Median CEUR: {median_b}")

if median_a > median_b:
    print("TREC tends to have higher values than CEUR")
else:
    print("CEUR tends to have higher values than TREC")

Median A: 0.1330355926480954
Median B: 0.5340314136125655
Gruppe B hat tendenziell höhere Werte als Gruppe A.


In [128]:
median_a = expanded_df_trec['average_journal_impact'].median()
median_b = expanded_df_lncs['average_journal_impact'].median()

print(f"Median TREC: {median_a}")
print(f"Median LNCS: {median_b}")

if median_a > median_b:
    print("TREC tends to have higher values than LNCS")
else:
    print("LNCS tends to have higher values than TREC")

Median A: 0.1330355926480954
Median B: 1.3917236767655985
Gruppe B hat tendenziell höhere Werte als Gruppe A.


## Analysis of top citing conferences


In [105]:
# Creating the dataframes with the citing journals for all three proceedings

df_ceur_journals = get_journals_from_OpenAlex("../../../data/OpenAlex_CEUR_citing_doc.json", source_type="conference")
df_lncs_journals = get_journals_from_OpenAlex("../../../data/OpenAlex_LNCS_citing_doc.json", source_type="conference")
df_trec_journals = get_journals_from_OpenAlex("../../../data/OpenAlex_TREC_citing_doc.json", source_type="conference")

In [106]:
# Aggregate the journal names and extract the journal impacts for all three proceedings

threshold = 95
journal_dict_ceur = aggregate_unique_journals(df_ceur_journals, threshold)

threshold = 95
journal_dict_lncs = aggregate_unique_journals(df_lncs_journals, threshold)

threshold = 95
journal_dict_trec = aggregate_unique_journals(df_trec_journals, threshold)

[['2021 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)', 'https://openalex.org/S4363607735'], ['2022 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)', 'https://openalex.org/S4363607730']]
[['2022 International Arab Conference on Information Technology (ACIT)', 'https://openalex.org/S4363608405'], ['2021 22nd International Arab Conference on Information Technology (ACIT)', 'https://openalex.org/S4363608487']]
[['2014 IEEE/WIC/ACM International Joint Conferences on Web Intelligence (WI) and Intelligent Agent Technologies (IAT)', 'https://openalex.org/S4363606372'], ['2013 IEEE/WIC/ACM International Joint Conferences on Web Intelligence (WI) and Intelligent Agent Technologies (IAT)', 'https://openalex.org/S4363606669']]
[['2013 Fourth International Conference on Computing, Communications and Networking Technologies (ICCCNT)', 'https://openalex.org/S4306497833'], ['2022 13th International Conference on Computing Communication and Networking Te

In [107]:
# Replace the journal names by the aggregated conference IDs
journal_dict_openalex_inverse = {v[0]: k for k, values in journal_dict_ceur.items() for v in values}
df_ceur_journals['conferences'] = df_ceur_journals['conferences'].apply(lambda x: replace_journal_names_by_id(x) if isinstance(x, list) else x)

journal_dict_openalex_inverse = {v[0]: k for k, values in journal_dict_lncs.items() for v in values}
df_lncs_journals['conferences'] = df_lncs_journals['conferences'].apply(lambda x: replace_journal_names_by_id(x) if isinstance(x, list) else x)

journal_dict_openalex_inverse = {v[0]: k for k, values in journal_dict_trec.items() for v in values}
df_trec_journals['conferences'] = df_trec_journals['conferences'].apply(lambda x: replace_journal_names_by_id(x) if isinstance(x, list) else x)

In [108]:
exploded_df_open_alex_ceur = df_ceur_journals.explode('conferences')

# Explode the lists of citing journals to count total appearance in the context of CEUR
value_counts_open_alex_ceur = exploded_df_open_alex_ceur['conferences'].value_counts()
value_counts_open_alex_ceur = value_counts_open_alex_ceur.reset_index()
value_counts_open_alex_ceur.columns = ['id', 'frequency']

In [109]:
exploded_df_open_alex_lncs = df_lncs_journals.explode('conferences')

# Häufigkeiten zählen
value_counts_open_alex_lncs = exploded_df_open_alex_lncs['conferences'].value_counts()
value_counts_open_alex_lncs = value_counts_open_alex_lncs.reset_index()
value_counts_open_alex_lncs.columns = ['id', 'frequency']

In [110]:
exploded_df_open_alex_trec = df_trec_journals.explode('conferences')

# Häufigkeiten zählen
value_counts_open_alex_trec = exploded_df_open_alex_trec['conferences'].value_counts()
value_counts_open_alex_trec = value_counts_open_alex_trec.reset_index()
value_counts_open_alex_trec.columns = ['id', 'frequency']

In [111]:
value_counts_open_alex_ceur = add_average_journal_impact(value_counts_open_alex_ceur, journal_dict_ceur, "conference")
value_counts_open_alex_lncs = add_average_journal_impact(value_counts_open_alex_lncs, journal_dict_lncs, "conference")
value_counts_open_alex_trec = add_average_journal_impact(value_counts_open_alex_trec, journal_dict_trec, "conference")

In [112]:
value_counts_open_alex_ceur = add_journal_ids(value_counts_open_alex_ceur, journal_dict_ceur, "conference")
top_10 = value_counts_open_alex_ceur.head(11)
top_10

Unnamed: 0,id,frequency,average_journal_impact,journal_names
0,journal_2,170,0.714286,"[[Cross-Language Evaluation Forum, https://ope..."
1,journal_11,74,0.418182,[[International Conference on Computational Li...
2,journal_20,65,0.0,"[[Text REtrieval Conference, https://openalex...."
3,journal_8,58,9.110706,[[Meeting of the Association for Computational...
4,journal_3,31,1.203488,[[Recent Advances in Natural Language Processi...
5,journal_10,25,2.073171,[[International Joint Conference on Natural La...
6,journal_64,24,10.494881,[[North American Chapter of the Association fo...
7,journal_9,19,0.0,"[[Pacific Asia Conference on Language, Informa..."
8,journal_13,17,1.461295,[[Empirical Methods in Natural Language Proces...
9,journal_4,15,6.388747,[[Conference of the European Chapter of the As...


In [113]:
value_counts_open_alex_lncs = add_journal_ids(value_counts_open_alex_lncs, journal_dict_lncs, "conference")
top_10 = value_counts_open_alex_lncs.head(11)
top_10

Unnamed: 0,id,frequency,average_journal_impact,journal_names
0,journal_0,56,0.714286,"[[Cross-Language Evaluation Forum, https://ope..."
1,journal_3,55,0.0,"[[Text REtrieval Conference, https://openalex...."
2,journal_7,45,9.110706,[[Meeting of the Association for Computational...
3,journal_1,37,0.418182,[[International Conference on Computational Li...
4,journal_11,17,10.494881,[[North American Chapter of the Association fo...
5,journal_8,14,2.073171,[[International Joint Conference on Natural La...
6,journal_9,13,1.203488,[[Recent Advances in Natural Language Processi...
7,journal_17,11,5.236902,[[Proceedings of the 45th International ACM SI...
8,journal_4,9,6.388747,[[Conference of the European Chapter of the As...
9,journal_14,9,0.0,"[[Geographic Information Retrieval, https://op..."


In [114]:
for i in top_10["journal_names"]:
    print(i)

[['Cross-Language Evaluation Forum', 'https://openalex.org/S4306418139', 0.7142857142857143]]
[['Text REtrieval Conference', 'https://openalex.org/S4306421043', 0.0]]
[['Meeting of the Association for Computational Linguistics', 'https://openalex.org/S4306420508', 9.110706482155862]]
[['International Conference on Computational Linguistics', 'https://openalex.org/S4306419219', 0.41818181818181815]]
[['North American Chapter of the Association for Computational Linguistics', 'https://openalex.org/S4306420633', 10.494880546075086]]
[['International Joint Conference on Natural Language Processing', 'https://openalex.org/S4306420006', 2.073170731707317]]
[['Recent Advances in Natural Language Processing', 'https://openalex.org/S4306420780', 1.2034883720930232]]
[['Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval', 'https://openalex.org/S4363608773', 5.236902050113895]]
[['Conference of the European Chapter of the Association fo

In [115]:
value_counts_open_alex_trec = add_journal_ids(value_counts_open_alex_trec, journal_dict_trec, "conference")
top_10 = value_counts_open_alex_trec.head(11)
top_10

Unnamed: 0,id,frequency,average_journal_impact,journal_names
0,journal_0,1528,0.0,"[[Text REtrieval Conference, https://openalex...."
1,journal_21,47,9.110706,[[Meeting of the Association for Computational...
2,journal_2,43,10.494881,[[North American Chapter of the Association fo...
3,journal_9,43,5.776542,[[Proceedings of the ... AAAI Conference on Ar...
4,journal_8,36,0.418182,[[International Conference on Computational Li...
5,journal_18,32,1.421965,[[National Conference on Artificial Intelligen...
6,journal_5,31,2.073171,[[International Joint Conference on Natural La...
7,journal_1,26,0.0,[[European Conference on Information Retrieval...
8,journal_84,26,0.0,"[[Conference on Email and Anti-Spam, https://o..."
9,journal_35,23,1.461295,[[Empirical Methods in Natural Language Proces...


In [116]:
expanded_data_lncs = []

for _, row in value_counts_open_alex_lncs.iterrows():
    expanded_data_lncs.extend([row['average_conference_impact']] * int(row['frequency']))

# Umwandeln der replizierten Daten in ein DataFrame
expanded_df_lncs = pd.DataFrame(expanded_data_lncs, columns=['average_conference_impact'])

In [117]:
expanded_data_ceur = []

for _, row in value_counts_open_alex_ceur.iterrows():
    expanded_data_ceur.extend([row['average_conference_impact']] * int(row['frequency']))

# Umwandeln der replizierten Daten in ein DataFrame
expanded_df_ceur = pd.DataFrame(expanded_data_ceur, columns=['average_conference_impact'])

In [118]:
expanded_data_trec = []

for _, row in value_counts_open_alex_trec.iterrows():
    expanded_data_trec.extend([row['average_conference_impact']] * int(row['frequency']))

# Umwandeln der replizierten Daten in ein DataFrame
expanded_df_trec = pd.DataFrame(expanded_data_trec, columns=['average_conference_impact'])

In [119]:
stat_a, p_value_a = shapiro(expanded_df_ceur['average_conference_impact'])
print(f"Shapiro-Wilk Test CEUR: Statistic = {stat_a}, p-value = {p_value_a}")

# Für Gruppe B
stat_b, p_value_b = shapiro(expanded_df_lncs['average_conference_impact'])
print(f"Shapiro-Wilk Test LNCS: Statistic = {stat_b}, p-value = {p_value_b}")

# Für Gruppe C
stat_c, p_value_c = shapiro(expanded_df_trec['average_conference_impact'])
print(f"Shapiro-Wilk Test TREC: Statistic = {stat_c}, p-value = {p_value_c}")

Shapiro-Wilk Test Gruppe A: Statistik = 0.6422074696963672, p-Wert = 9.128401133465151e-39
Shapiro-Wilk Test Gruppe B: Statistik = 0.7451575968040867, p-Wert = 1.5126682619096295e-25
Shapiro-Wilk Test Gruppe B: Statistik = 0.4070487730215465, p-Wert = 2.703474431339863e-64


In [120]:
stat, p_value = levene(expanded_df_trec['average_conference_impact'], expanded_df_ceur['average_conference_impact'])
print(f"Levene Test: Statistic = {stat}, p-value = {p_value}")

Levene Test: Statistik = 67.06426496893172, p-Wert = 3.8439755865753164e-16


In [121]:
stat, p_value = levene(expanded_df_trec['average_conference_impact'], expanded_df_lncs['average_conference_impact'])
print(f"Levene Test: Statistic = {stat}, p-value = {p_value}")

Levene Test: Statistik = 160.22756919458496, p-Wert = 1.1218566193223456e-35


In [122]:

# Vergleich Gruppe A und Gruppe B
u_stat_ab, p_value_ab = mannwhitneyu(expanded_df_trec['average_conference_impact'], expanded_df_ceur['average_conference_impact'])
print(f"Mann-Whitney U-Test TREC vs CEUR: U-Statistic = {u_stat_ab}, p-value = {p_value_ab}")

# Vergleich Gruppe A und Gruppe C
u_stat_ac, p_value_ac = mannwhitneyu(expanded_df_trec['average_conference_impact'], expanded_df_lncs['average_conference_impact'])
print(f"Mann-Whitney U-Test TREC vs LNCS: U-Statistic = {u_stat_ac}, p-value = {p_value_ac}")

Mann-Whitney U-Test A vs B: U-Statistik = 430348.0, p-Wert = 6.220563271611711e-153
Mann-Whitney U-Test A vs C: U-Statistik = 229475.0, p-Wert = 3.2740403253620714e-102


In [123]:
median_a = expanded_df_trec['average_conference_impact'].median()
median_b = expanded_df_ceur['average_conference_impact'].median()

print(f"Median TREC: {median_a}")
print(f"Median CEUR: {median_b}")

if median_a > median_b:
    print("TREC tends to have higher values than CEUR.")
else:
    print("CEUR tends to have higher values than TREC.")

Median A: 0.0
Median B: 0.7142857142857143
Gruppe B hat tendenziell höhere Werte als Gruppe A.


In [124]:
median_a = expanded_df_trec['average_conference_impact'].median()
median_b = expanded_df_lncs['average_conference_impact'].median()

print(f"Median TREC: {median_a}")
print(f"Median LNCS: {median_b}")

if median_a > median_b:
    print("TREC tends to have higher values than LNCS.")
else:
    print("LNCS tends to have higher values than TREC.")

Median A: 0.0
Median B: 0.7142857142857143
Gruppe B hat tendenziell höhere Werte als Gruppe A.
