In [None]:
import pandas as pd
import numpy as np
import pickle as pkl

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Upload the data files when the notebook runs on Google Colab.
# Outside Colab the `google.colab` package does not exist, so the import is guarded:
# in that case the files only have to be present in the working directory.
try:
  from google.colab import files
except ImportError:
  files = None

### Files needed (9 files):
# subsampled_previous_work_abstracts.pkl: abstracts used to build the researcher profiles
# co_authors_dict.pkl: dictionary containing the co-authors for the co-author heuristic
# user_profiles_based_on_previous_work_abstracts_based_on_new_papers.pkl: mean of abstract embeddings used for testing vanilla sentence-BERT
# subsampled_previous_work_abstracts_ner_clean.pkl: previous work abstracts with keywords extracted using the NER tool of Raza et al. (2022)
# new_subsampled_data.pkl: subsampled dataset of papers that contain SOIs
# subsampled_data_ner_clean.pkl: NER concepts for the sentences in the subsampled dataset of papers
# sentence_embeddings.pkl: embeddings of all sentences in the subsampled dataset of papers
# LR_model.pkl: Logistic regression
# list_of_weigthed_BERT_embeddings_profiles.pkl: embeddings of the researcher profiles
if files is not None:
  # The returned value is deliberately not displayed.
  files.upload()
else:
  print("google.colab is not available: make sure the 9 files listed in this cell are in the working directory.")

In [None]:
# Once all the files are on Colab (or are present locally), load them in the notebook.
# NOTE: unpickling can execute arbitrary code, so only load files that come from a trusted source.

def _load_pickle(path):
  """Return the object stored in the pickle file located at `path`."""
  with open(path, "rb") as f:
    return pkl.load(f)

subsampled_previous_work_abstracts = _load_pickle("subsampled_previous_work_abstracts.pkl")
co_authors_dict = _load_pickle("co_authors_dict.pkl")
user_profiles_based_on_previous_work_abstracts = _load_pickle("user_profiles_based_on_previous_work_abstracts_based_on_new_papers.pkl")
subsampled_data = _load_pickle("new_subsampled_data.pkl")
subsampled_data_NER = _load_pickle("subsampled_data_ner_clean.pkl")
abstracts_NER = _load_pickle("subsampled_previous_work_abstracts_ner_clean.pkl")
sentence_embeddings = _load_pickle("sentence_embeddings.pkl")
LR_model = _load_pickle("LR_model.pkl")
weighted_BERT_embeddings_profiles = _load_pickle("list_of_weigthed_BERT_embeddings_profiles.pkl")

In [None]:
# Attach the concepts extracted by the NER tool of Raza et al. (2022) to the DataFrame of abstracts.
# The NER output first takes over the index of the abstracts, so the index-aligned assignment below pairs them up.

abstracts_NER.index = subsampled_previous_work_abstracts.index
subsampled_previous_work_abstracts["Concepts"] = abstracts_NER.loc[:, "Word"]

In [None]:
# Attach the concepts extracted by the NER tool of Raza et al. (2022) to the DataFrame of papers containing the SOIs.
# The NER output first takes over the index of the papers, so the index-aligned assignment below pairs them up.

subsampled_data_NER.index = subsampled_data.index
subsampled_data["Concepts"] = subsampled_data_NER.loc[:, "Word"]

In [None]:
# Count the distinct first authors found in the previous-work abstracts

classes = subsampled_previous_work_abstracts["First Author"].tolist()
authors = np.unique(classes)
print(f"Number of authors = {len(authors)}")

In [None]:
# Fit the TF-IDF vectorizer on the abstracts and keep both the resulting matrix and the vocabulary

vectorizer = TfidfVectorizer()
TFIDF_previous_work_abstracts = vectorizer.fit_transform(
  subsampled_previous_work_abstracts["Abstract"].tolist()
)
previous_work_abstracts_column_names = vectorizer.get_feature_names_out()

In [None]:
# Main part of the code to perform the experiment.
# For every author, all the SOIs of the dataset are ranked with three recommenders
# (vanilla sentence-BERT, logistic regression and RecSOI). For each heuristic
# (author, co-author, concept), the best rank reached by a relevant SOI is stored.

def _best_rank(ranks, indices):
  """Return the lowest rank of `ranks` found at `indices`, or NaN when `indices` is empty."""
  selected_ranks = ranks.loc[indices]
  return min(selected_ranks) if len(selected_ranks) > 0 else np.nan


def _store_expectancy(table, author, K, N):
  """Store, for top 5/10/20, 1 when top*(K/N) >= 1 (K relevant SOIs among N) and 0 otherwise."""
  for top in (5, 10, 20):
    table.loc[author, f"Expectancy Top{top}"] = 0 if top*(K/N) < 1 else 1


# The following accumulators are not filled in this cell; they are only kept so that
# the names defined by the cell stay the same.
ranking = pd.DataFrame(columns=["Best Rank", "Number of Abstracts"])
list_of_sentence_BERT_ranks = []
list_of_best_sentence_BERT_ranks = []

list_of_LR_ranks = []
list_of_best_LR_ranks = []

list_of_RecSOI_ranks = []
list_of_best_RecSOI_ranks = []

number_of_abstracts = []
baseline_results = []

# The set of SOIs does not depend on the author, so it is computed once.
# We test for 0 because this is the label for SOIs in Boguslav et al.'s dataset.
SoIs_in_paper = subsampled_data[subsampled_data["IGNORANCE_TYPE"] == 0]
statements_of_ignorance_indices = SoIs_in_paper.index
N = len(statements_of_ignorance_indices)

df_of_best_ranked_SoIs_by_sentence_BERT = pd.DataFrame(index=statements_of_ignorance_indices)
df_of_best_ranked_SoIs_by_RecSOI = pd.DataFrame(index=statements_of_ignorance_indices)

all_results = pd.DataFrame(columns=["Expectancy Top5", "Expectancy Top10", "Expectancy Top20", "sentence-BERT Ranks", "LR Ranks", "RecSOI Ranks"])
all_co_author_results = pd.DataFrame(columns=["Expectancy Top5", "Expectancy Top10", "Expectancy Top20", "sentence-BERT Co-Author Ranks", "LR Co-Author Ranks", "RecSOI Co-Author Ranks"])
all_concept_results = pd.DataFrame(columns=["Expectancy Top5", "Expectancy Top10", "Expectancy Top20", "sentence-BERT Concept Ranks", "LR Concept Ranks", "RecSOI Concept Ranks"])

scaler = StandardScaler(with_mean=False).fit(TFIDF_previous_work_abstracts)

# Concepts of every SOI, as sets, computed once instead of once per author
SoI_concepts = [(index, set(concepts)) for index, concepts in SoIs_in_paper["Concepts"].items()]

# The LR probabilities of the SOIs are the same for every author (only the column that is read
# changes). They are computed once, when the first non-skipped author is met.
LR_classes = list(LR_model.classes_)
LR_probabilities = None

for counter, author in enumerate(authors, start=1): # The recommendation is performed for each author
  print("Working on", author, "(", counter, "on", len(authors), ")")

  # Author heuristic: SOIs found in the papers of the author
  papers = subsampled_data[subsampled_data["First Author"] == author]
  investigated_SoI_indices = papers.loc[papers['IGNORANCE_TYPE'] == 0].index

  # Co-author heuristic: SOIs found in the papers whose first author is a co-author of the author
  co_author_papers = subsampled_data[subsampled_data["First Author"].isin(co_authors_dict[author])]
  investigated_co_author_SoI_indices = co_author_papers.loc[co_author_papers['IGNORANCE_TYPE'] == 0].index

  # Concept heuristic: SOIs sharing at least one concept with the abstracts of the author
  author_abstracts = subsampled_previous_work_abstracts[subsampled_previous_work_abstracts["First Author"] == author]
  author_concepts = set().union(*author_abstracts["Concepts"])
  investigated_concept_SoI_indices = [index for index, concepts in SoI_concepts if not concepts.isdisjoint(author_concepts)]

  # Nothing can be evaluated for this author
  if len(investigated_SoI_indices) == 0 and len(investigated_co_author_SoI_indices) == 0 and len(investigated_concept_SoI_indices) == 0:
    continue

  # Perform the recommendation for sentence-BERT: distance between the author profile and each SOI (lower is better)
  author_profile = user_profiles_based_on_previous_work_abstracts[author]
  distances = pd.Series(
    [np.linalg.norm(author_profile - sentence_embeddings[index]) for index in statements_of_ignorance_indices],
    index=statements_of_ignorance_indices,
  )
  sentence_BERT_ranks = distances.rank()

  # Perform the recommendation for LR: probability of the author's class for each SOI (higher is better)
  if LR_probabilities is None:
    TFIDF_statements_of_ignorance = vectorizer.transform(subsampled_data.loc[statements_of_ignorance_indices, "SENTENCE"])
    scaled_TFIDF_statements_of_ignorance = scaler.transform(TFIDF_statements_of_ignorance)
    LR_probabilities = LR_model.predict_proba(scaled_TFIDF_statements_of_ignorance)
  results = pd.Series(LR_probabilities[:, LR_classes.index(author)], index=statements_of_ignorance_indices)
  LR_ranks = results.rank(ascending=False)

  # Perform the recommendation for RecSOI: distance between each SOI and the closest profile embedding of the author
  author_embeddings = weighted_BERT_embeddings_profiles[author]
  RecSOI_results = pd.Series(
    [min(np.linalg.norm(author_embedding - sentence_embeddings[index]) for author_embedding in author_embeddings)
     for index in statements_of_ignorance_indices],
    index=statements_of_ignorance_indices,
  )
  RecSOI_ranks = RecSOI_results.rank()

  # Keep the complete rankings of sentence-BERT and RecSOI for this author
  df_of_best_ranked_SoIs_by_sentence_BERT.loc[statements_of_ignorance_indices, author] = sentence_BERT_ranks
  df_of_best_ranked_SoIs_by_RecSOI.loc[statements_of_ignorance_indices, author] = RecSOI_ranks

  # For each heuristic, store the expectancy flags and the best rank reached by a relevant SOI
  # with each recommender (NaN when the heuristic has no relevant SOI for this author).
  # The column names are "<recommender> Ranks", "<recommender> Co-Author Ranks" and "<recommender> Concept Ranks".
  ranks_per_recommender = (("sentence-BERT", sentence_BERT_ranks), ("LR", LR_ranks), ("RecSOI", RecSOI_ranks))
  heuristics = (
    (all_results, "", investigated_SoI_indices),
    (all_co_author_results, "Co-Author ", investigated_co_author_SoI_indices),
    (all_concept_results, "Concept ", investigated_concept_SoI_indices),
  )
  for table, heuristic_name, relevant_SoI_indices in heuristics:
    _store_expectancy(table, author, len(relevant_SoI_indices), N)
    for recommender_name, recommender_ranks in ranks_per_recommender:
      table.loc[author, f"{recommender_name} {heuristic_name}Ranks"] = _best_rank(recommender_ranks, relevant_SoI_indices)

In [None]:
# Get the total number of abstracts. We need this info because we want to be flexible in the number of maximum abstracts we allow during the evaluation.
# The abstracts are counted once per author and the counts are then looked up for each results table
# (an author without any abstract gets 0).

abstracts_per_author = subsampled_previous_work_abstracts["First Author"].value_counts()
for results_table in (all_results, all_co_author_results, all_concept_results):
  results_table["Number of Abstracts"] = abstracts_per_author.reindex(results_table.index, fill_value=0).to_numpy()

In [None]:
def author_heuristic(top=5, max_number_abstracts=5, results=None):
  """
  Compute the author heuristic: how good are we at recommending the SOIs of author A with author A's profile?
  Then show the results.

  Parameters:
    top: an author counts as a success for a recommender when the stored rank is <= top.
      A matching "Expectancy Top{top}" column must exist (5, 10 and 20 in this notebook).
    max_number_abstracts: only the authors with at most this number of abstracts are considered.
    results: DataFrame with the columns "Number of Abstracts", "Expectancy Top{top}",
      "sentence-BERT Ranks", "LR Ranks" and "RecSOI Ranks".
      Defaults to the notebook-level `all_results`.

  Returns nothing: the summary is printed. The printed ratios are NaN when no author is selected.
  """
  if results is None:
    results = all_results

  print(f"Results for the author heuristic, a number of abstracts = {max_number_abstracts}, and MAP@{top}")
  temp_all_results = results[results["Number of Abstracts"] <= max_number_abstracts]
  number_of_authors = len(temp_all_results)

  def ratio(count):
    # Without any selected author the ratio is undefined: report NaN instead of dividing by zero
    return count/number_of_authors if number_of_authors > 0 else float("nan")

  # A missing (NaN) rank never satisfies `<= top`, so it is not counted as a success
  num_best_sentence_BERT_ranks = int((temp_all_results["sentence-BERT Ranks"] <= top).sum())
  num_best_LR_ranks = int((temp_all_results["LR Ranks"] <= top).sum())
  num_best_RecSOI_ranks = int((temp_all_results["RecSOI Ranks"] <= top).sum())

  print("Total number of authors:", number_of_authors)
  print("Percentage of authors for which expectancy of the number of random successes >= 1:", np.mean(temp_all_results[f"Expectancy Top{top}"], axis=0))
  print(f"Number of authors with at least one SoI ranked <= {top} for sentence-BERT:", num_best_sentence_BERT_ranks, "("+str(ratio(num_best_sentence_BERT_ranks))+")")
  print(f"Number of authors with at least one SoI ranked <= {top} for LR:", num_best_LR_ranks, "("+str(ratio(num_best_LR_ranks))+")")
  print(f"Number of authors with at least one SoI ranked <= {top} for RecSOI:", num_best_RecSOI_ranks, "("+str(ratio(num_best_RecSOI_ranks))+")")
  print("-------------------")

In [None]:
# Report the author heuristic for the top 5, 10 and 20 recommendations
for top in (5, 10, 20):
  author_heuristic(top)

Results for the author heuristic, a number of abstracts = 5, and MAP@5
Total number of authors: 500
Percentage of authors for which expectancy of the number of random successes >= 1: 0.0
Number of authors with at least one SoI ranked <= 5 for sentence-BERT: 156 (0.312)
Number of authors with at least one SoI ranked <= 5 for LR: 115 (0.23)
Number of authors with at least one SoI ranked <= 5 for RecSOI: 173 (0.346)
-------------------
Results for the author heuristic, a number of abstracts = 5, and MAP@10
Total number of authors: 500
Percentage of authors for which expectancy of the number of random successes >= 1: 0.0
Number of authors with at least one SoI ranked <= 10 for sentence-BERT: 189 (0.378)
Number of authors with at least one SoI ranked <= 10 for LR: 141 (0.282)
Number of authors with at least one SoI ranked <= 10 for RecSOI: 210 (0.42)
-------------------
Results for the author heuristic, a number of abstracts = 5, and MAP@20
Total number of authors: 500
Percentage of authors

In [None]:
def co_author_heuristic(top=5, max_number_abstracts=5, results=None):
  """
  Compute the co-author heuristic: how good are we at recommending the SOIs of the co-authors of author A with author A's profile?
  Then show the results.

  Parameters:
    top: an author counts as a success for a recommender when the stored rank is <= top.
      A matching "Expectancy Top{top}" column must exist (5, 10 and 20 in this notebook).
    max_number_abstracts: only the authors with at most this number of abstracts are considered.
    results: DataFrame with the columns "Number of Abstracts", "Expectancy Top{top}",
      "sentence-BERT Co-Author Ranks", "LR Co-Author Ranks" and "RecSOI Co-Author Ranks".
      Defaults to the notebook-level `all_co_author_results`.

  Returns nothing: the summary is printed. The printed ratios are NaN when no author is selected.
  """
  if results is None:
    results = all_co_author_results

  print(f"Results for the co-author heuristic, a number of abstracts = {max_number_abstracts}, and MAP@{top}")
  # We only want to consider authors that have a co-author as first-author in the dataset
  # (the others have a missing sentence-BERT co-author rank)
  temp_all_co_author_results = results[results["sentence-BERT Co-Author Ranks"].notna()]
  temp_all_co_author_results = temp_all_co_author_results[temp_all_co_author_results["Number of Abstracts"] <= max_number_abstracts]
  number_of_authors = len(temp_all_co_author_results)

  def ratio(count):
    # Without any selected author the ratio is undefined: report NaN instead of dividing by zero
    return count/number_of_authors if number_of_authors > 0 else float("nan")

  num_best_sentence_BERT_co_author_ranks = int((temp_all_co_author_results["sentence-BERT Co-Author Ranks"] <= top).sum())
  num_best_LR_co_author_ranks = int((temp_all_co_author_results["LR Co-Author Ranks"] <= top).sum())
  num_best_RecSOI_co_author_ranks = int((temp_all_co_author_results["RecSOI Co-Author Ranks"] <= top).sum())

  print("Total number of authors:", number_of_authors)
  print("Percentage of authors for which expectancy of the number of random successes >= 1:", np.mean(temp_all_co_author_results[f"Expectancy Top{top}"], axis=0))
  print(f"Number of authors with at least one SoI ranked <= {top} for sentence-BERT:", num_best_sentence_BERT_co_author_ranks, "("+str(ratio(num_best_sentence_BERT_co_author_ranks))+")")
  print(f"Number of authors with at least one SoI ranked <= {top} for LR:", num_best_LR_co_author_ranks, "("+str(ratio(num_best_LR_co_author_ranks))+")")
  print(f"Number of authors with at least one SoI ranked <= {top} for RecSOI:", num_best_RecSOI_co_author_ranks, "("+str(ratio(num_best_RecSOI_co_author_ranks))+")")
  print("-------------------")

In [None]:
# Report the co-author heuristic for the top 5, 10 and 20 recommendations
for top in (5, 10, 20):
  co_author_heuristic(top)

Results for the co-author heuristic, a number of abstracts = 5, and MAP@5
Total number of authors: 59
Percentage of authors for which expectancy of the number of random successes >= 1: 0.0
Number of authors with at least one SoI ranked <= 5 for sentence-BERT: 8 (0.13559322033898305)
Number of authors with at least one SoI ranked <= 5 for LR: 4 (0.06779661016949153)
Number of authors with at least one SoI ranked <= 5 for RecSOI: 6 (0.1016949152542373)
-------------------
Results for the co-author heuristic, a number of abstracts = 5, and MAP@10
Total number of authors: 59
Percentage of authors for which expectancy of the number of random successes >= 1: 0.0
Number of authors with at least one SoI ranked <= 10 for sentence-BERT: 9 (0.15254237288135594)
Number of authors with at least one SoI ranked <= 10 for LR: 5 (0.0847457627118644)
Number of authors with at least one SoI ranked <= 10 for RecSOI: 11 (0.1864406779661017)
-------------------
Results for the co-author heuristic, a number 

In [None]:
def concept_heuristic(top=5, max_number_abstracts=5, results=None):
  """
  Compute the concept heuristic: how good are we at recommending SOIs with relevant concepts for each author?
  Then show the results.

  Parameters:
    top: an author counts as a success for a recommender when the stored rank is <= top.
      A matching "Expectancy Top{top}" column must exist (5, 10 and 20 in this notebook).
    max_number_abstracts: only the authors with at most this number of abstracts are considered.
    results: DataFrame with the columns "Number of Abstracts", "Expectancy Top{top}",
      "sentence-BERT Concept Ranks", "LR Concept Ranks" and "RecSOI Concept Ranks".
      Defaults to the notebook-level `all_concept_results`.

  Returns nothing: the summary is printed. The printed ratios are NaN when no author is selected.
  """
  if results is None:
    results = all_concept_results

  print(f"Results for the concept heuristic, a number of abstracts = {max_number_abstracts}, and MAP@{top}")
  # We only want to consider authors that have matching concepts with at least one SOI in the dataset
  # (the others have a missing sentence-BERT concept rank)
  tmp_all_concept_results = results[results["sentence-BERT Concept Ranks"].notna()]
  tmp_all_concept_results = tmp_all_concept_results[tmp_all_concept_results["Number of Abstracts"] <= max_number_abstracts]
  number_of_authors = len(tmp_all_concept_results)

  def ratio(count):
    # Without any selected author the ratio is undefined: report NaN instead of dividing by zero
    return count/number_of_authors if number_of_authors > 0 else float("nan")

  num_best_sentence_BERT_concept_ranks = int((tmp_all_concept_results["sentence-BERT Concept Ranks"] <= top).sum())
  num_best_LR_concept_ranks = int((tmp_all_concept_results["LR Concept Ranks"] <= top).sum())
  num_best_RecSOI_concept_ranks = int((tmp_all_concept_results["RecSOI Concept Ranks"] <= top).sum())

  print("Total number of authors:", number_of_authors)
  print("Percentage of authors for which expectancy of the number of random successes >= 1:", np.mean(tmp_all_concept_results[f"Expectancy Top{top}"], axis=0))
  print(f"Number of authors with at least one SoI ranked <= {top} for sentence-BERT:", num_best_sentence_BERT_concept_ranks, "("+str(ratio(num_best_sentence_BERT_concept_ranks))+")")
  print(f"Number of authors with at least one SoI ranked <= {top} for LR:", num_best_LR_concept_ranks, "("+str(ratio(num_best_LR_concept_ranks))+")")
  print(f"Number of authors with at least one SoI ranked <= {top} for RecSOI:", num_best_RecSOI_concept_ranks, "("+str(ratio(num_best_RecSOI_concept_ranks))+")")
  print("-------------------")

In [None]:
# Report the concept heuristic for the top 5, 10 and 20 recommendations
for top in (5, 10, 20):
  concept_heuristic(top)

Results for the concept heuristic, a number of abstracts = 5, and MAP@5
Total number of authors: 496
Percentage of authors for which expectancy of the number of random successes >= 1: 0.0
Number of authors with at least one SoI ranked <= 5 for sentence-BERT: 324 (0.6532258064516129)
Number of authors with at least one SoI ranked <= 5 for LR: 215 (0.4334677419354839)
Number of authors with at least one SoI ranked <= 5 for RecSOI: 375 (0.7560483870967742)
-------------------
Results for the concept heuristic, a number of abstracts = 5, and MAP@10
Total number of authors: 496
Percentage of authors for which expectancy of the number of random successes >= 1: 0.1592741935483871
Number of authors with at least one SoI ranked <= 10 for sentence-BERT: 369 (0.7439516129032258)
Number of authors with at least one SoI ranked <= 10 for LR: 288 (0.5806451612903226)
Number of authors with at least one SoI ranked <= 10 for RecSOI: 418 (0.842741935483871)
-------------------
Results for the concept he