In [20]:
'''
1. Levenshtein Distance (Edit Distance)
Measures the minimum number of single-character edits required to transform one string into another. A higher distance indicates higher similarity.
'''

from Levenshtein import distance

names = ["Alice", "Alicia", "Bob", "Robert"]
similarities = []

# Compute pairwise similarity
for i in range(len(names)):
    for j in range(i + 1, len(names)):
        sim = 1 - distance(names[i], names[j]) / max(len(names[i]), len(names[j]))
        similarities.append(sim)
        print(f"Similarity between {names[i]} and {names[j]}: {sim:.2f}")

similarities = np.array(similarities)
print(similarities.mean())


Similarity between Alice and Alicia: 0.67
Similarity between Alice and Bob: 0.00
Similarity between Alice and Robert: 0.00
Similarity between Alicia and Bob: 0.00
Similarity between Alicia and Robert: 0.00
Similarity between Bob and Robert: 0.33
0.16666666666666666


In [18]:
'''
2. Cosine Similarity on Embeddings
Convert names into numerical embeddings (e.g., using Word2Vec or Sentence Transformers) and compute cosine similarity.
'''


from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')  # Pretrained embedding model
names = ["Alice", "Alicia", "Bob", "Robert"]

# Generate embeddings
embeddings = model.encode(names)

# Compute pairwise similarity
similarities = cosine_similarity(embeddings)

# Display similarity matrix
for i, name1 in enumerate(names):
    for j, name2 in enumerate(names):
        if i < j:
            print(f"Similarity between {name1} and {name2}: {similarities[i][j]:.2f}")
print(similarities.mean())

Similarity between Alice and Alicia: 0.46
Similarity between Alice and Bob: 0.50
Similarity between Alice and Robert: 0.42
Similarity between Alicia and Bob: 0.20
Similarity between Alicia and Robert: 0.41
Similarity between Bob and Robert: 0.59
0.57176983


In [24]:
'''
3. Jaro-Winkler Similarity
A metric that gives more weight to characters at the start of strings.
'''
import jellyfish
import numpy as np

names = ["Alice", "Alicia", "Bob", "Robert"]
similarities = []

# Compute pairwise similarity
for i in range(len(names)):
    for j in range(i + 1, len(names)):
        sim = jellyfish.jaro_winkler_similarity(names[i], names[j])
        similarities.append(sim)
        print(f"Similarity between {names[i]} and {names[j]}: {sim:.2f}")

# similarities = np.array(similarities)
# print(similarities.mean())
np.mean(similarities)

Similarity between Alice and Alicia: 0.89
Similarity between Alice and Bob: 0.00
Similarity between Alice and Robert: 0.46
Similarity between Alicia and Bob: 0.00
Similarity between Alicia and Robert: 0.00
Similarity between Bob and Robert: 0.67


0.33592592592592596

In [21]:
'''
4. Token-Based Similarity (e.g., Jaccard Similarity)
Splits names into tokens (characters or words) and measures overlap.
'''

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

names = ["Alice", "Alicia", "Bob", "Robert"]

# Vectorize names into character-level tokens
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))  # Bi-grams and tri-grams
X = vectorizer.fit_transform(names)

# Compute cosine similarity on tokenized names
similarities = cosine_similarity(X)

# Display similarity matrix
for i, name1 in enumerate(names):
    for j, name2 in enumerate(names):
        if i < j:
            print(f"Similarity between {name1} and {name2}: {similarities[i][j]:.2f}")

print(similarities.mean())

Similarity between Alice and Alicia: 0.63
Similarity between Alice and Bob: 0.00
Similarity between Alice and Robert: 0.00
Similarity between Alicia and Bob: 0.00
Similarity between Alicia and Robert: 0.00
Similarity between Bob and Robert: 0.19
0.35279885975982334


In [23]:
'''
5. Phonetic Similarity (Soundex or Metaphone)
Compares names based on how they sound rather than their spelling.
'''

from fuzzy import Soundex

soundex = Soundex(4)
names = ["Alice", "Alicia", "Bob", "Robert"]

# Compute phonetic similarity
phonetic_codes = {name: soundex(name) for name in names}
print("Phonetic Codes:", phonetic_codes)


Phonetic Codes: {'Alice': 'A42', 'Alicia': 'A42', 'Bob': 'B1', 'Robert': 'R163'}


# Diversity

In [32]:
'''
1. Shannon Diversity Index (Entropy)
The Shannon Index measures uncertainty or entropy in the distribution of categories. 
It is sensitive to both richness (number of unique categories) and evenness (distribution of categories).
'''

from collections import Counter
import numpy as np

# Example data
labels = ['cat', 'dog', 'dog', 'cat', 'bird', 'cat', 'bird', 'dog', 'dog', 'dog', 'dog', 'dog', 'dog', 'dog', 'dog', 'dog', 'dog']

# Compute proportions
counts = Counter(labels)
total = sum(counts.values())
proportions = [count / total for count in counts.values()]

# Shannon Index
shannon_index = -sum(p * np.log(p) for p in proportions)
print(f"Shannon Diversity Index: {shannon_index:.4f}")


Shannon Diversity Index: 0.8037


In [33]:
'''
2. Simpson's Diversity Index
Simpson's Index measures the probability that two individuals randomly selected from the dataset belong to the same category.
'''

# Simpson's Index
simpson_index = 1 - sum(p**2 for p in proportions)
print(f"Simpson's Diversity Index: {simpson_index:.4f}")


Simpson's Diversity Index: 0.4567


In [29]:
'''
3. Gini-Simpson Index
The Gini-Simpson Index is related to Simpson's Index and represents the probability that two randomly chosen individuals belong to different categories.
'''
# Gini-Simpson Index
gini_simpson_index = sum(p**2 for p in proportions)
print(f"Gini-Simpson Index: {gini_simpson_index:.4f}")

Gini-Simpson Index: 0.5433


In [30]:
'''
4. Effective Number of Species
The effective number of species converts the Shannon Index into the equivalent number of equally abundant categories.
'''
# Effective Number of Species
effective_number_of_species = np.exp(shannon_index)
print(f"Effective Number of Species: {effective_number_of_species:.4f}")

Effective Number of Species: 2.2339


In [31]:
'''
5. Richness (Count of Unique Categories)
Richness is the simplest diversity measure and represents the number of unique categories.
'''
# Richness
richness = len(counts)
print(f"Richness: {richness}")


Richness: 3


# Scholarly similarity

In [74]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# Example DataFrame
data = {
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack"],
    "work_counts": [10, 15, 20, 25, 30, 35, 40, 45, 50, 55],
    "cited_by_counts": [200, 150, 300, 250, 400, 350, 500, 450, 600, 550],
    "h_index": [5, 7, 6, 8, 10, 9, 11, 12, 13, 14],
}
df = pd.DataFrame(data)

# Randomly select 10 scientists (or subset the DataFrame if already chosen)
selected_scientists = df.sample(n=10, random_state=42)

# Normalize the scholarly metrics
scaler = MinMaxScaler()
metrics = selected_scientists[["work_counts", "cited_by_counts", "h_index"]]
normalized_metrics = scaler.fit_transform(metrics)

# Compute pairwise cosine similarity
similarity_matrix = cosine_similarity(normalized_metrics)

# Compute average pairwise similarity
pairwise_combinations = list(combinations(range(len(selected_scientists)), 2))
average_similarity = np.mean([similarity_matrix[i, j] for i, j in pairwise_combinations])

similarity = np.dot(normalized_metrics, normalized_metrics.T)

print(f"Average Similarity Score among selected scientists: {average_similarity:.4f}")
print(np.mean(similarity_matrix))

# print(similarity)
# print()
# print(similarity_matrix)

Average Similarity Score among selected scientists: 0.8404
0.8563202927157882


In [76]:
cosine_similarity([[5,10,15,20]]), cosine_similarity([[1,5,10,15,20,60]])

(array([[1.]]), array([[1.]]))

In [37]:
from scipy.spatial.distance import pdist, squareform

distance_matrix = squareform(pdist(normalized_metrics, metric="euclidean"))
average_distance = np.mean([distance_matrix[i, j] for i, j in pairwise_combinations])
print(f"Average Euclidean Distance: {average_distance:.4f}")


Average Euclidean Distance: 0.7213


In [39]:
correlation_matrix = np.corrcoef(normalized_metrics, rowvar=False)
correlation_matrix

array([[1.        , 0.93939394, 0.97575758],
       [0.93939394, 1.        , 0.91515152],
       [0.97575758, 0.91515152, 1.        ]])

In [60]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Sample DataFrame (as before)
data = {
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack"],
    "work_counts": [10, 15, 20, 25, 30, 35, 40, 45, 50, 55],
    "cited_by_counts": [200, 150, 300, 250, 400, 350, 500, 450, 600, 550],
    "h_index": [5, 7, 6, 8, 10, 9, 11, 12, 13, 14],
}
df = pd.DataFrame(data)

# Normalize metrics
scaler = MinMaxScaler()
metrics = df[["work_counts", "cited_by_counts", "h_index"]]
normalized_metrics = scaler.fit_transform(metrics)

# Apply PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(normalized_metrics)

# Add PCA components back to the DataFrame
df["PC1"], df["PC2"] = pca_transformed[:, 0], pca_transformed[:, 1]

# Randomly select 10 scientists
selected_scientists = df.sample(n=10, random_state=42)

# Compute similarity among the selected scientists in the 2D PCA space
pca_metrics = selected_scientists[["PC1", "PC2"]]
similarity_matrix = cosine_similarity(pca_metrics)

# Compute average pairwise similarity
pairwise_combinations = list(combinations(range(len(selected_scientists)), 2))
average_similarity = np.mean([similarity_matrix[i, j] for i, j in pairwise_combinations])

print(pairwise_combinations)

print(f"Average Similarity Score in PCA Space: {average_similarity:.4f}")


[(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 9), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (5, 6), (5, 7), (5, 8), (5, 9), (6, 7), (6, 8), (6, 9), (7, 8), (7, 9), (8, 9)]
Average Similarity Score in PCA Space: -0.1111


In [42]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Example: Apply log transformation
data = [[10, 1000, 5], [15, 1500, 6], [20, 2000, 8]]
log_transformed = np.log1p(data)

# Optional: Normalize after transformation
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(log_transformed)
normalized_data

array([[0.        , 0.        , 0.        ],
       [0.57945826, 0.58490361, 0.38018236],
       [1.        , 1.        , 1.        ]])

In [44]:
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler

# Example data
data = np.array([10, 15, 20, 25, 30])

# Box-Cox transformation (requires all positive values)
boxcox_transformed, _ = boxcox(data)

# Normalize after transformation
scaler = StandardScaler()
normalized_data = scaler.fit_transform(boxcox_transformed.reshape(-1, 1))
normalized_data

array([[-1.47297643],
       [-0.66976095],
       [ 0.05534276],
       [ 0.72739861],
       [ 1.35999601]])

In [46]:
from sklearn.preprocessing import PowerTransformer

# Example: Yeo-Johnson transformation
data = [[10, 1000, 5], [15, 1500, 6], [20, 2000, 8]]

transformer = PowerTransformer(method='yeo-johnson')
transformed_data = transformer.fit_transform(data)
transformed_data

array([[-1.25039527, -1.25217013, -1.18159061],
       [ 0.05302306,  0.056829  , -0.08217032],
       [ 1.19737221,  1.19534113,  1.26376093]])

In [47]:
from scipy.stats import rankdata

# Example: Rank transformation
data = [10, 15, 20, 100, 5]
rank_transformed = rankdata(data, method='average')
rank_transformed

array([2., 3., 4., 5., 1.])

# Age similarity

In [79]:
import numpy as np
import pandas as pd

# Example series
ages = pd.Series([25, 30, 35, 40, 45, 2])

# Compute standard deviation
std_dev = ages.std()

# Normalized similarity index (1 means identical ages, closer to 0 means highly dispersed)
similarity_index = 1 - (std_dev / ages.max())
print(f"Similarity Index: {similarity_index:.4f}")


Similarity Index: 0.6619


In [80]:
from sklearn.metrics.pairwise import cosine_similarity

# Create age histogram (e.g., binning ages into 10-year intervals)
age_bins = pd.cut(ages, bins=5, labels=False)  # 5 bins
age_histogram = pd.Series(age_bins).value_counts().sort_index().values.reshape(1, -1)

# Compute cosine similarity (self-similarity for a single series = 1)
cosine_sim = cosine_similarity(age_histogram, age_histogram)[0, 0]
print(f"Cosine Similarity on Age Distribution: {cosine_sim:.4f}")


Cosine Similarity on Age Distribution: 1.0000


In [81]:
def gini_coefficient(array):
    sorted_array = np.sort(array)
    n = len(array)
    cumulative_sum = np.cumsum(sorted_array)
    gini = (2 / n) * np.sum((np.arange(1, n + 1) * sorted_array)) / cumulative_sum[-1] - (n + 1) / n
    return gini

# Compute Gini coefficient
gini = gini_coefficient(ages.values)
similarity_index = 1 - gini  # Transform to similarity (1 = identical ages, 0 = high disparity)
print(f"Gini-Based Similarity Index: {similarity_index:.4f}")


Gini-Based Similarity Index: 0.7505


In [90]:
from sklearn.metrics.pairwise import cosine_similarity

ages = pd.Series([1,1,1,1,1,1,1,0])

# Z-score normalization
# z_scores = (ages - ages.mean()) / ages.std()

# Compute pairwise cosine similarity
z_scores_matrix = ages.values.reshape(-1, 1)
cosine_sim = cosine_similarity(z_scores_matrix)
average_similarity = cosine_sim.mean()

print(f"Average Cosine Similarity (Z-scores): {average_similarity:.4f}")


Average Cosine Similarity (Z-scores): 0.7656


In [87]:
z_scores

0   NaN
1   NaN
dtype: float64

In [1]:
import random
from itertools import combinations

# Example data: List of authors and their affiliations
authors = [
    {"name": "Author 1", "affiliations": ["Univ A", "Univ B"]},
    {"name": "Author 2", "affiliations": ["Univ A", "Univ C"]},
    {"name": "Author 3", "affiliations": ["Univ D", "Univ E"]},
    {"name": "Author 4", "affiliations": ["Univ A", "Univ B", "Univ C"]},
    {"name": "Author 5", "affiliations": ["Univ F"]},
]

# 1. Sample a subset of authors
sample_size = 3
sampled_authors = random.sample(authors, sample_size)

# 2. Define a Jaccard similarity function for two lists
def jaccard_similarity(list1, list2):
    set1, set2 = set(list1), set(list2)
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union > 0 else 0

# 3. Compute pairwise Jaccard similarity
pairwise_similarities = []
for author1, author2 in combinations(sampled_authors, 2):
    similarity = jaccard_similarity(author1["affiliations"], author2["affiliations"])
    pairwise_similarities.append(similarity)

# 4. Calculate the average Jaccard similarity
average_similarity = sum(pairwise_similarities) / len(pairwise_similarities) if pairwise_similarities else 0

# Output the results
print("Sampled Authors:")
for author in sampled_authors:
    print(f"{author['name']}: {author['affiliations']}")

print(f"\nPairwise Jaccard Similarities: {pairwise_similarities}")
print(f"Average Jaccard Similarity: {average_similarity:.4f}")


Sampled Authors:
Author 1: ['Univ A', 'Univ B']
Author 5: ['Univ F']
Author 2: ['Univ A', 'Univ C']

Pairwise Jaccard Similarities: [0.0, 0.3333333333333333, 0.0]
Average Jaccard Similarity: 0.1111


___

In [1]:
import pandas as pd
from itertools import permutations

%load_ext autoreload
%autoreload 2
 
import sys
sys.path.append('../../code/')

from libs import io
from libs import constants
from postprocessing import similarity

In [2]:
from tqdm import tqdm
tqdm.pandas() 

In [19]:
aps_os_data_tar_gz = '../../data/final_dataset.tar.gz'
llm_valid_csv = '../../results/factuality/llama-3.1-8b_author.csv'

In [20]:
df_authorships = io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_AUTHORSHIPS_FN)
df_authorships.rename(columns={'id_author':'id_author_oa', 'id_institution':'id_institution_oa'}, inplace=True)

In [21]:
df_institutions = io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_INSTITUTIONS_FN)
df_institutions.rename(columns={'id_institution':'id_institution_oa'}, inplace=True)

In [22]:
df_responses = io.read_csv(llm_valid_csv, index_col=None).drop(columns=['Unnamed: 0'])
df_responses = df_responses.query("task_name == 'top_k'")
df_responses.head(2)

Unnamed: 0,date,time,llm_model,task_name,task_param,task_attempt,result_valid_flag,name,years,doi,...,h_index,i10_index,e_index,two_year_mean_citedness,year_first_publication,year_last_publication,academic_age,age_now,seniority_active,seniority_now
10,2024-12-09,00:00,llama-3.1-8b-instant,top_k,top_5,1,valid,Stephen Hawking,,,...,106.0,221.0,261.326342,0.0,1965.0,2016.0,52.0,61.0,senior,senior
11,2024-12-09,00:00,llama-3.1-8b-instant,top_k,top_5,1,valid,Richard Feynman,,,...,88.0,148.0,507.631749,0.0,1939.0,1986.0,48.0,87.0,senior,senior


In [23]:
df_responses.columns

Index(['date', 'time', 'llm_model', 'task_name', 'task_param', 'task_attempt',
       'result_valid_flag', 'name', 'years', 'doi', 'career_age', 'clean_name',
       'model', 'valid_attempt', 'id_author_oa', 'fact_author_score',
       'id_author_aps_list', 'ethnicity_dx', 'ethnicity_ec', 'ethnicity',
       'gender', 'works_count', 'cited_by_count', 'h_index', 'i10_index',
       'e_index', 'two_year_mean_citedness', 'year_first_publication',
       'year_last_publication', 'academic_age', 'age_now', 'seniority_active',
       'seniority_now'],
      dtype='object')

In [24]:
# def process_group(group, df_authorships, df_institutions):

#     print('==========================')

#     clean_group = group.dropna(subset=['id_author_oa'])
#     n_unique_author_recommendations = clean_group.id_author_oa.astype(int).nunique()
   
#     if clean_group.empty or n_unique_author_recommendations == 1:
#         institutions_share = None
#         coauthors_share = None
#         country_of_affiliation_share = None
    
#     else:
        
#         # insitutions and coauthors in common

#         print('group:\n', group[['name','id_author_oa']])
#         print('\n id_author_oa:', group.id_author_oa.nunique())

#         ids = clean_group.id_author_oa.dropna().unique()
#         df_authorships_filtered = df_authorships.query('id_author_oa in @ids').dropna(subset=['id_institution_oa'])

#         # df_institutions_authors = df_authorships_filtered[['id_author_oa','id_institution_oa']].drop_duplicates().groupby('id_author_oa').id_institution_oa.apply(list).reset_index(name='_items').astype(str).set_index('id_author_oa')
#         # df_institutions_authors._items = df_institutions_authors._items.astype(int)
#         df_institutions_authors = similarity.get_items_by_author(df_authorships_filtered.groupby('id_author_oa').id_institution_oa.unique(), df_institutions, 'id_institution_oa')
#         institutions_share = similarity.compute_average_jaccard_similarity(df_institutions_authors)
        
#         # print("\n df_institutions_authors:\n", df_institutions_authors)
#         # print('institutions_share:\n', institutions_share)


#         df_countries = similarity.get_items_by_author(df_authorships_filtered.groupby('id_author_oa').id_institution_oa.unique(), df_institutions, 'country_code')
#         country_of_affiliation_share = similarity.compute_average_jaccard_similarity(df_countries)

#         # print("\n df_countries:\n", df_countries)
#         # print('country_of_affiliation_share:\n', country_of_affiliation_share)


#         df_coauthors = similarity.get_items_by_author(df_authorships_filtered.groupby('id_author_oa').id_institution_oa.unique(), df_authorships, 'id_author_oa', column_item_cast=int)
#         coauthors_share = similarity.compute_average_jaccard_similarity(df_coauthors)

#         print("\n df_coauthors:\n", df_coauthors)
#         print('coauthors_share:\n', coauthors_share)
        
#         # coauthors among the recommendations
#         df_coauthors_recommended = pd.DataFrame(df_coauthors.apply(lambda row: list(set(row._items).intersection(set(ids)) - set([row.name])), axis=1), columns=['_items'])
#         coauthors_share_recommended = similarity.compute_average_jaccard_similarity(df_coauthors_recommended)

#         print(ids)
#         print("\n df_coauthors_recommended:\n", df_coauthors_recommended)
#         print('coauthors_share_recommended:\n', coauthors_share_recommended)

#         # import sys
#         # sys.exit(0)
        
#     # Return a DataFrame with one row and multiple columns
#     df = pd.DataFrame({
#         'institutions_share': [institutions_share],
#         'country_of_affiliation_share': [country_of_affiliation_share],
#         'coauthors_share': [coauthors_share]
#     })
#     return df

In [31]:
def process_group(group, df_authorships, df_institutions):    
    # Remove rows with missing author ids
    clean_group = group.dropna(subset=['id_author_oa'])
    
    # Compute the number of name recommendations and author hallucinations
    n_unique_author_recommendations = clean_group.id_author_oa.nunique()
    
    if clean_group.empty or n_unique_author_recommendations == 1:
        gender_diversity = None
        ethnicity_diversity = None
        scholarly_similarity = None
        aps_similarity = None
        oa_similarity = None
        aps_career_age_similarity = None
        oa_career_age_similarity = None
        institutions_share = None
        coauthors_share = None
        country_of_affiliation_share = None
    
    else:
        
        
        ids = clean_group.id_author_oa.dropna().unique()
        df_authorships_filtered = df_authorships.query('id_author_oa in @ids').dropna(subset=['id_institution_oa'])

        # shared institutions
        df_institutions_authors = similarity.get_items_by_author(df_authorships_filtered.groupby('id_author_oa').id_institution_oa.unique(), df_institutions, 'id_institution_oa')
        institutions_share = similarity.compute_average_jaccard_similarity(df_institutions_authors)
        
        #print('institutions_share:', institutions_share)

        # shared institutions' countries
        df_countries = similarity.get_items_by_author(df_authorships_filtered.groupby('id_author_oa').id_institution_oa.unique(), df_institutions, 'country_code')
        country_of_affiliation_share = similarity.compute_average_jaccard_similarity(df_countries)

        # print('country_of_affiliation_share:', country_of_affiliation_share)

        # shared coauthors
        df_coauthors = similarity.get_items_by_author(df_authorships_filtered.groupby('id_author_oa').id_institution_oa.unique(), df_authorships, 'id_author_oa', column_item_cast=int)
        coauthors_share = similarity.compute_average_jaccard_similarity(df_coauthors)

        # print('coauthors_share:', coauthors_share)

        # coauthors among the recommendations
        df_coauthors_recommended = pd.DataFrame(df_coauthors.apply(lambda row: list(set(row._items).intersection(set(ids)) - set([row.name])), axis=1), columns=['_items'])
        coauthors_recommended_share = similarity.compute_average_jaccard_similarity(df_coauthors_recommended)
        
        all_possible_pairs = len(list(permutations(ids, 2)))
        recommended_authors_are_coauthors = df_coauthors_recommended._items.apply(lambda x: len(x) > 0).sum() / all_possible_pairs

        if coauthors_recommended_share > 0:
            print(df_coauthors_recommended)

            print('recommended_authors_are_coauthors:', recommended_authors_are_coauthors)
            print('coauthors_recommended_share: ', coauthors_recommended_share)

            import sys
            sys.exit(0)


In [32]:
cols = ['model', 'task_name', 'task_param', 'date', 'time']
df_request_stats = df_responses.groupby(cols).progress_apply(lambda row: process_group(row, 
                                                                                        df_authorships=df_authorships,
                                                                                        df_institutions=df_institutions,
                                                                                        )).reset_index()


  0%|          | 0/81 [00:00<?, ?it/s]

  1%|          | 1/81 [00:03<05:08,  3.85s/it]

                                                         _items
id_author_oa                                                   
5001536933    [5004950660.0, 5110862535.0, 5106849896.0, 503...
5004950660    [5043077347.0, 5012326853.0, 5051731365.0, 500...
5006656464    [5004950660.0, 5012326853.0, 5110862535.0, 511...
5012326853    [5004950660.0, 5110862535.0, 5041516104.0, 511...
5012461168                                                   []
5014383473    [5012326853.0, 5110862535.0, 5041516104.0, 511...
5017158718                                                   []
5023914308                                                   []
5027643641                                       [5043077347.0]
5029910389                                                   []
5031678863                                                   []
5032512426    [5110436352.0, 5012326853.0, 5110862535.0, 511...
5037710835    [5004950660.0, 5012326853.0, 5110862535.0, 504...
5041516104    [5012326853.0, 5066175077.




SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
