In [1]:
import pandas as pd
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
from thefuzz import fuzz
import json
import re
import urllib.parse

In [2]:
# CEUR metadata table; later cells read its "ID", "Title", "Authors" and "url" columns.
df_metadata_ceur = pd.read_parquet(path="../../../data/metadata_CEUR.parquet")

In [3]:
# Load the Semantic Scholar dump; later cells index it by record ID.
with open("../../../data/SemanticScholar_CEUR.json", mode="r", encoding="utf-8") as semantic_scholar_file:
    SemanticScholar_FINAL = json.loads(semantic_scholar_file.read())

In [None]:
# Build [record ID, OpenAlex work ID] pairs for every Semantic Scholar record that
# carries a MAG ID. The OpenAlex work ID is formed here as "W" + MAG ID.
merge_list = []       # [record ID, "W<MAG ID>"] pairs
ids_not_scraped = []  # record IDs for which no MAG ID could be read

# Iterate through each entry in the SemanticScholar_FINAL dictionary
for record_id in SemanticScholar_FINAL:
    try:
        # Only the first record stored under each ID is consulted.
        mag_id = SemanticScholar_FINAL[record_id][0]["externalIds"]["MAG"]
        merge_list.append([record_id, "W" + mag_id])
    except (KeyError, IndexError, TypeError):
        # KeyError: "externalIds" or "MAG" is missing.
        # IndexError: the record list is empty.
        # TypeError: a level is None / not subscriptable, or MAG is not a string.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        ids_not_scraped.append(record_id)

In [None]:
# Tabulate the [record ID, OpenAlex work ID] pairs so they can be joined on "ID".
df_has_mag = pd.DataFrame(data=merge_list, columns=["ID", "MAG"])

In [None]:
# Inner join: keep only the records present in both the MAG pairs and the CEUR metadata.
df_has_mag_merged = df_has_mag.merge(
    df_metadata_ceur,
    how="inner",
    left_on="ID",
    right_on="ID",
)

In [None]:
def extract_years(title):
    """Return all standalone four-digit numbers found in ``title``.

    A match is exactly four digits delimited by word boundaries, so longer digit
    runs such as ``12345`` are not matched. The function does not check that a
    match is a plausible year.

    Args:
        title: The string to scan.

    Returns:
        A list of the matched four-digit strings in order of appearance
        (empty when there is none).
    """
    four_digit_token = re.compile(r'\b\d{4}\b')
    return four_digit_token.findall(title)

In [None]:
# Fetch the OpenAlex work for every row that has a MAG-derived work ID and keep it
# only when its title closely matches the CEUR title and both contain the same years.

OpenAlex_by_MAG = {}      # record ID -> matched OpenAlex work
ids_found = []            # record IDs that were successfully matched
ids_not_scraped = []      # record IDs whose lookup failed (replaces the list built earlier)

# Iterate through each row in the DataFrame df_has_mag_merged (documents already found
# in Semantic Scholar that provide a MAG ID)
for _, row in df_has_mag_merged.iterrows():

    try:
        # Fetch the OpenAlex work using the MAG-derived ID from the current row
        work = Works()[row["MAG"]]

        # Four-digit numbers in each title; the two lists must be identical for a match
        years_openalex = extract_years(work["title"])
        years_df = extract_years(row["Title"])

        # Accept only near-identical titles (fuzz.ratio above 95) with matching years
        if fuzz.ratio(row["Title"], work["title"]) > 95 and years_df == years_openalex:
            OpenAlex_by_MAG[row["ID"]] = work
            ids_found.append(row["ID"])
    except Exception as exc:
        # Best-effort: record the failure and move on. `Exception` rather than a bare
        # `except` so that interrupting the kernel still stops this long-running loop.
        ids_not_scraped.append(row["ID"])
        print("ID not given or found:", row["ID"], repr(exc))
        continue


In [None]:
# Split the matched works that have a DOI into two groups, depending on whether the
# CEUR URL of the record appears among the work's open-access PDF URLs.
has_MAG_has_DOI_has_no_ceur = []  # has a DOI, CEUR URL not among the collected URLs
has_doi_has_ceur = []             # has a DOI, CEUR URL found among the collected URLs

# Iterate through each document in the OpenAlex_by_MAG dictionary
for record_id, work in OpenAlex_by_MAG.items():
    # Works without a DOI are not categorized here
    if not work["doi"]:
        continue

    # Retrieve possible open access locations for the document
    best_oa_location = work.get("best_oa_location", {})
    oa_access = work.get("open_access", {})
    locations = work.get("locations", [])

    # Collect the unique URLs. The isinstance checks skip fields that are present
    # but hold something other than the expected container (e.g. None).
    collected_urls = set()
    if isinstance(best_oa_location, dict):
        pdf_url = best_oa_location.get("pdf_url")
        if pdf_url is not None:
            collected_urls.add(pdf_url)
    if isinstance(oa_access, dict):
        oa_url = oa_access.get("oa_url")
        if oa_url is not None:
            collected_urls.add(oa_url)
    if isinstance(locations, list):
        for location in locations:
            if location["is_oa"] and location["pdf_url"] is not None:
                collected_urls.add(location["pdf_url"])

    # Get the CEUR URL for the current document from the merged DataFrame
    ceur_url = df_has_mag_merged.loc[df_has_mag_merged['ID'] == record_id, 'url'].values[0]

    # Substring test: the CEUR URL must be contained in one of the collected URLs.
    # NOTE(review): a later cell rewrites "https" to "http" before the same test;
    # this one compares the URL as stored - confirm which is intended.
    found = any(ceur_url in url for url in collected_urls)

    # Dictionary keys are unique and both lists start empty, so no membership
    # check is needed before appending.
    if found:
        has_doi_has_ceur.append(record_id)
    else:
        has_MAG_has_DOI_has_no_ceur.append(record_id)
                

In [None]:
# Filter DataFrame to include only rows where the ID is in the list of documents with DOI but no CEUR link
df_has_mag_has_doi = df_has_mag_merged[df_has_mag_merged["ID"].isin(has_MAG_has_DOI_has_no_ceur)]

# [record ID, corrected OpenAlex work ID] pairs entered by the user
mag_and_id = []

# The IDs printed below have the form "https://openalex.org/W...", so that prefix is
# stripped as well as the longer ".../works/" form (longer prefix first).
openalex_url_prefixes = ("https://openalex.org/works/", "https://openalex.org/")

# Iterate over each row in the filtered DataFrame
for _, row in df_has_mag_has_doi.iterrows():

    # Clean up the title by removing commas, then URL-encode it for the search query.
    # NOTE(review): confirm pyalex does not URL-encode filter values itself, which
    # would double-encode the title.
    title = row["Title"].replace(",", "")
    title_encoded = urllib.parse.quote(title)

    work = Works().search_filter(title=title_encoded).get()

    # If no results are found, skip to the next entry
    if len(work) == 0:
        print("Skip - no results found")
        continue

    if len(work) == 1:
        # .get() avoids a KeyError for works without a MAG ID; str() keeps the
        # comparison valid whether the API returns the MAG ID as text or a number.
        found_mag = str(work[0]["ids"].get("mag"))

        # Exactly one result carrying the MAG ID we already have: nothing to correct
        if row["MAG"][1:] == found_mag:
            print("Skip - result matches existing MAG ID")
            continue

        # Exactly one result with a different MAG ID: ask the user which is correct
        print("Results differ")
        print("Current ID:", "https://openalex.org/" + row["MAG"])
        print("New ID:", work[0]["id"])
        user_input = input("Enter the correct OpenAlex ID: ").strip()
        for prefix in openalex_url_prefixes:
            user_input = user_input.replace(prefix, "")
        mag_and_id.append([row["ID"], user_input])
    else:
        # Multiple results: list them and let the user choose
        print("Multiple results found:")
        for candidate in work:
            print(candidate["id"])
        print(row["Authors"])
        print(row["Title"])

        print("Current ID:", "https://openalex.org/" + row["MAG"])
        user_input = input("Enter the correct OpenAlex ID: ").strip()
        for prefix in openalex_url_prefixes:
            user_input = user_input.replace(prefix, "")
        # Add the chosen ID to the list if it is different from the current MAG ID
        if user_input != row["MAG"]:
            mag_and_id.append([row["ID"], user_input])
        


In [None]:
# Hard-coded [record ID, OpenAlex work ID] corrections. Assigning here replaces
# whatever the interactive cell above collected in `mag_and_id`.
# An OpenAlex ID of '-' marks a record that the next cell removes from OpenAlex_by_MAG.
mag_and_id = [
    list(pair)
    for pair in (
        ('ceur_7', 'W3097562850'),
        ('ceur_14', 'W3095609828'),
        ('ceur_41', '-'),
        ('ceur_153', '-'),
        ('ceur_2923', 'W2292665776'),
        ('ceur_1681', 'W216921825'),
        ('ceur_2570', 'W2186486156'),
        ('ceur_743', '-'),
        ('ceur_1524', '-'),
        ('ceur_899', 'W2734855474'),
        ('ceur_2686', 'W2964258545'),
        ('ceur_864', 'W2951384152'),
    )
]

In [None]:
# Apply the manual corrections in mag_and_id to OpenAlex_by_MAG.
for record_id, openalex_id in mag_and_id:
    if openalex_id == "-":
        # "-" means the record has no valid OpenAlex match: drop it. pop() with a
        # default tolerates IDs that are not (or no longer) in the dictionary, so
        # the cell can be re-run and does not depend on earlier API results.
        OpenAlex_by_MAG.pop(record_id, None)
    else:
        # Fetch first, so a failed request does not discard the existing entry.
        corrected_work = Works()[openalex_id]
        # Remove-then-insert keeps the original effect of moving a corrected
        # entry to the end of the dictionary.
        OpenAlex_by_MAG.pop(record_id, None)
        OpenAlex_by_MAG[record_id] = corrected_work

In [None]:
# Copy of OpenAlex_by_MAG without the entries listed in has_MAG_has_DOI_has_no_ceur
filtered_dict = {
    record_id: work
    for record_id, work in OpenAlex_by_MAG.items()
    if record_id not in has_MAG_has_DOI_has_no_ceur
}

In [None]:
# Re-check, after the manual corrections, whether each work's open-access URLs
# contain the CEUR URL of its record.
# NOTE: the list names are kept as they were; read the comments for what each holds.
documents_with_doi_no_ceur = []  # CEUR URL found, and ID not already in has_doi_has_ceur
doi_without_ceur_documents = []  # CEUR URL not found, and ID not in has_MAG_has_DOI_has_no_ceur

# NOTE(review): this iterates over all of OpenAlex_by_MAG, not over filtered_dict -
# confirm which was intended. Unlike the earlier categorization, works without a
# DOI are included here.
for record_id, work in OpenAlex_by_MAG.items():
    # Extract open access URLs from various sources in the OpenAlex entry
    best_oa_location = work.get("best_oa_location", {})
    oa_access = work.get("open_access", {})
    locations = work.get("locations", [])

    # The isinstance checks skip fields that hold something other than the
    # expected container (e.g. None).
    list_urls = set()
    if isinstance(best_oa_location, dict):
        pdf_url = best_oa_location.get("pdf_url")
        if pdf_url is not None:
            list_urls.add(pdf_url)
    if isinstance(oa_access, dict):
        oa_url = oa_access.get("oa_url")
        if oa_url is not None:
            list_urls.add(oa_url)
    if isinstance(locations, list):
        for location in locations:
            if location["is_oa"] and location["pdf_url"] is not None:
                list_urls.add(location["pdf_url"])

    # Materialize the unique URLs as a list for printing
    list_urls = list(list_urls)

    # Get the URL from the DataFrame for the current record ID
    url_value = df_has_mag_merged.loc[df_has_mag_merged['ID'] == record_id, 'url'].values[0]
    url_value = url_value.replace("https", "http")  # Standardize the URL scheme

    # Print the collected URLs and the corresponding CEUR URL for debugging
    if len(list_urls) > 0:
        print(list_urls)
        print(url_value)
        print("\n")

    # Substring test: the CEUR URL must be contained in one of the collected URLs
    found = any(url_value in url for url in list_urls)

    if found:
        if record_id not in has_doi_has_ceur:
            documents_with_doi_no_ceur.append(record_id)  # newly found CEUR URL
    else:
        if record_id not in has_MAG_has_DOI_has_no_ceur:
            doi_without_ceur_documents.append(record_id)  # CEUR URL still not found

In [None]:
# IDs from OpenAlex that we have already found and processed by MAG ID
ids_found_with_MAG = list(OpenAlex_by_MAG.keys())

# Keep only the CEUR records that were not matched via a MAG ID
df_no_mag = df_metadata_ceur[~df_metadata_ceur["ID"].isin(ids_found_with_MAG)]

# record ID -> OpenAlex work found by title search
documents_without_MAG = {}

# Iterate through the filtered DataFrame
for _, row in df_no_mag.iterrows():

    # Clean and encode the title for the search query
    title = row["Title"].replace(",", "")
    title_encoded = urllib.parse.quote(title)
    try:
        # Search for works in OpenAlex by the cleaned and encoded title
        work = Works().search_filter(title=title_encoded).get()

        if len(work) == 0:
            # If no results are found, print a message and continue to the next document
            print("no matches")
            continue

        if len(work) == 1:
            # Exactly one result: accept it only if title and years match closely
            print("One option")
            years_openalex = extract_years(work[0]["title"])
            years_df = extract_years(row["Title"])
            if fuzz.ratio(row["Title"], work[0]["title"]) > 95 and years_df == years_openalex:
                # Store the work itself (not the one-element result list), so both
                # branches put the same kind of value into the dictionary.
                documents_without_MAG[row["ID"]] = work[0]

        else:
            # Multiple results: display them for user selection
            for candidate in work:
                print(candidate["id"])
            print(row["Title"])
            print(row["Authors"])

            # The user enters the position of the correct result in the printed
            # list (0-based), or "-" to skip. "-" is checked BEFORE converting to
            # int, so skipping no longer relies on int("-") raising.
            choice = input().strip()
            if choice != "-":
                documents_without_MAG[row["ID"]] = work[int(choice)]
    except Exception as exc:
        # Best-effort: a failed request or invalid selection skips this record.
        # `Exception` rather than a bare `except` so the kernel can be interrupted.
        print("Skipped", row["ID"], repr(exc))
        continue



In [None]:
# Normalize documents_without_MAG so that every value is a single work (dict):
# entries stored as a result list are replaced by the list's first element.
uniform_documents_without_MAG = {}

for record_id, match in documents_without_MAG.items():
    if isinstance(match, dict):
        # Already a single work
        uniform_documents_without_MAG[record_id] = match
    elif isinstance(match, list) and match:
        # isinstance (not `type(...) == list`) also accepts list subclasses, which
        # would otherwise be dropped silently; empty lists are skipped rather than
        # raising IndexError.
        uniform_documents_without_MAG[record_id] = match[0]
    

In [None]:
# Sanity check on uniform_documents_without_MAG: count how many values are lists
# (expected: none after normalization) and how many are dicts.
count_lists = 0
count_dicts = 0

for record_id, entry in uniform_documents_without_MAG.items():
    if isinstance(entry, list):
        # A list here means the normalization missed this entry
        print(record_id)
        count_lists += 1
    elif isinstance(entry, dict):
        count_dicts += 1
    else:
        # Report the type of the offending entry, not of the whole dictionary
        print(record_id, type(entry))

## Manually add OpenAlex documents that are still missing

In [None]:
# Combine IDs from OpenAlex and documents without MAG IDs
all_found_ids = list(OpenAlex_by_MAG.keys()) + list(uniform_documents_without_MAG.keys())

# Records of the original DataFrame that none of the previous steps matched
df_missing_records = df_metadata_ceur[~df_metadata_ceur["ID"].isin(all_found_ids)]

# record ID -> OpenAlex work entered manually in this cell
still_missing_records = {}

for _, row in df_missing_records.iterrows():
    # Print the title and authors of the missing record
    print(row["Title"])
    print(row["Authors"])

    # Prompt for a potential OpenAlex ID
    potential_id = input("Enter the OpenAlex ID (or '-' to skip): ").strip()

    # "-" skips the record; empty input is also treated as a skip, since it
    # cannot identify a work.
    if potential_id in ("", "-"):
        continue

    # Accept a full URL, a "W..."-prefixed ID or the bare number. Normalizing to
    # exactly one leading "W" avoids building "WW..." when the user types the
    # full ID the prompt asks for.
    for prefix in ("https://openalex.org/works/", "https://openalex.org/"):
        potential_id = potential_id.replace(prefix, "")
    potential_id = "W" + potential_id.lstrip("Ww")

    # Add the record to still_missing_records using the provided OpenAlex ID
    still_missing_records[row["ID"]] = Works()[potential_id]

# Get the list of IDs found in the last step
last_found_ids = list(still_missing_records.keys())

# Records that remain unmatched after the manual step
final_openalex_ceur = df_missing_records[~df_missing_records["ID"].isin(last_found_ids)]

# Merge the dictionaries to create a final JSON output (on a duplicate key the
# right-most dictionary wins)
final_json_openalex_ceur = still_missing_records | OpenAlex_by_MAG | uniform_documents_without_MAG

# Save the final combined dictionary to a JSON file
with open('../../../data/OpenAlex_CEUR.json', 'w', encoding="utf-8") as f:
    json.dump(final_json_openalex_ceur, f, ensure_ascii=False, indent=4)

# Save the records that could not be found (previously referenced an undefined name)
final_openalex_ceur.to_parquet("../../../data/OpenAlex_CEUR_not_found.parquet")