In [2]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
import urllib.parse
from thefuzz import fuzz

# Extract CEUR metadata from Semantic Scholar

In [3]:
# Load the CEUR metadata table that the Semantic Scholar lookups below iterate over.
df_ceur = pd.read_parquet(path="../../../data/metadata_CEUR.parquet")

In [None]:
# Define thresholds and variables for storage

# Accepted Semantic Scholar results, keyed by the "ID" column of the matched row.
# Rows for which no acceptable result was found.
data_CEUR, non_matching_rows = dict(), list()

# Both values are handed to fuzzy_match_lists by the matching loop:
#   threshhold    - lowest fuzz.ratio score tolerated for a single pair of author names
#   threshold_avg - value the average score over all pairs has to exceed
# (The spelling `threshhold` is kept because the matching loop refers to it by that name.)
threshhold, threshold_avg = 75, 85

In [None]:
# Helper that pulls years (or any other standalone 4-digit number) out of a title

def extract_years(title):
    """Return every standalone 4-digit number found in ``title``.

    A number counts as standalone when it is delimited by word boundaries on
    both sides, so digits embedded in a longer number or glued to letters are
    ignored.

    Args:
        title: The string to scan.

    Returns:
        list[str]: The matches as strings, in order of appearance (empty list
        when there are none).
    """
    return [hit.group(0) for hit in re.finditer(r'\b\d{4}\b', title)]

In [None]:
# Function to fuzzy match two lists of strings position by position
# (the matching loops call it with sorted author surnames from CEUR and from Semantic Scholar)

def fuzzy_match_lists(list1, list2, threshold_single, threshhold_avg):
    """Decide whether two lists of strings match pairwise.

    The lists are compared element by element (first with first, second with
    second, ...) using ``fuzz.ratio``. They match when every pair scores at
    least ``threshold_single`` and the mean score over all pairs is strictly
    greater than ``threshhold_avg``.

    Args:
        list1: First list of strings.
        list2: Second list of strings.
        threshold_single: Lowest score tolerated for an individual pair.
        threshhold_avg: Value the average score has to exceed. The spelling of
            this parameter name is kept so that keyword callers keep working.

    Returns:
        bool: ``True`` for a match, ``False`` otherwise. Lists of different
        length never match, and neither do two empty lists, because there is
        nothing that could confirm the match.
    """
    # Guard against the division by zero below and against silently truncating zip().
    if len(list1) != len(list2) or len(list1) == 0:
        return False

    total_similarity = 0
    for item1, item2 in zip(list1, list2):
        similarity_score = fuzz.ratio(item1, item2)
        # One pair below the per-item threshold rules out a match.
        if similarity_score < threshold_single:
            return False
        total_similarity += similarity_score

    # Always return an explicit bool (never fall through and return None).
    return total_similarity / len(list1) > threshhold_avg

In [None]:
# Automatic pass: search Semantic Scholar for every paper in df_ceur by title and keep the top
# hit only if the title, the 4-digit numbers in the title and the author surnames all agree.
#
# NOTE: `url` and `headers` are read from the kernel namespace. In this notebook they are
# assigned in the "Set API parameters for Semantic Scholar" cell further down, so that cell
# has to be executed before this one.

for _, row in df_ceur.iterrows():
    # Define parameters for the API request
    params = {
        "query": f'title:("{row["Title"]}")',  # Search query using the title from the DataFrame
        "limit": 1,                            # Only the top-ranked hit is inspected
        "fields": "corpusId,externalIds,citationCount,influentialCitationCount,authors,journal,title"   # Fields to retrieve
    }

    try:
        # The request sits inside the try block so that a network error or a non-JSON reply
        # marks this row as unmatched instead of aborting the whole run. The timeout keeps a
        # stalled connection from blocking the loop forever.
        resp = requests.get(url, params=params, headers=headers, timeout=30).json()
        top_hit = resp["data"][0]

        # Titles from the API response and from the DataFrame
        title_semantic = top_hit["title"]
        title_df = row["Title"]

        # 4-digit numbers (years) contained in both titles
        years_semantic = extract_years(title_semantic)
        years_df = extract_years(title_df)

        # Sorted surnames (last whitespace-separated token of each name) from both sources
        sorted_author_list1 = sorted(author["name"].split()[-1] for author in top_hit["authors"])
        sorted_author_list2 = sorted(name.split()[-1] for name in row["Authors"])

        # Print the sorted author lists for debugging
        print(sorted_author_list1)
        print(sorted_author_list2)

        # The title must be nearly identical, the years must be the same and the authors must match
        is_match = (
            fuzz.ratio(title_df, title_semantic) > 90
            and years_df == years_semantic
            and fuzzy_match_lists(sorted_author_list1, sorted_author_list2, threshhold, threshold_avg)
        )
        if is_match:
            # Store the complete result list of the API response under the paper's ID
            data_CEUR[row["ID"]] = resp["data"]
        else:
            non_matching_rows.append(row)
    except (
        requests.RequestException,  # connection problems, timeouts, undecodable body
        ValueError,                 # body is not valid JSON (older requests versions)
        KeyError,                   # expected field missing in the response or the row
        IndexError,                 # empty result list or empty author name
        TypeError,                  # a title or the author list is missing (None)
        AttributeError,             # an author name is not a string
        ZeroDivisionError,          # author comparison attempted on two empty lists
    ) as exc:
        # Best effort: the row is kept for the manual review pass. Anything not listed above
        # (e.g. a NameError because `url` is undefined, or Ctrl-C) still stops the loop.
        print(f"No usable result for {row.get('ID')}: {exc!r}")
        non_matching_rows.append(row)

    # Pause for 3.5 seconds between requests to avoid overwhelming the API
    time.sleep(3.5)

# Create a DataFrame from the non-matching rows and save it as a parquet file.
# The rows were taken from df_ceur, so its columns define the layout.
non_matching_df = pd.DataFrame(non_matching_rows, columns=df_ceur.columns)
non_matching_df.to_parquet("../../../data/metadata_CEUR_not_found.parquet")

# Save the successfully matched data to a JSON file
with open('../../../data/SemanticScholar_CEUR_found.json', 'w', encoding='utf-8') as f:
    json.dump(data_CEUR, f, ensure_ascii=False, indent=4)

## Retrieve the still-missing CEUR data from Semantic Scholar

In [4]:
# Reload the rows that the automatic matching loop saved as not found.
df_CEUR_missing = pd.read_parquet(path="../../../data/metadata_CEUR_not_found.parquet")

In [9]:
# Left-join the unmatched rows with df_ceur on the shared descriptive columns. Every unmatched
# row is kept; columns of df_ceur whose names clash with the left side get the suffix "_df2".
ceur_proceedings_df_with_ID_missing = df_CEUR_missing.merge(
    df_ceur,
    how='left',
    on=['PubYear', 'Title', 'CEUR Title', 'Volume', 'filename', 'Section'],
    suffixes=('', '_df2'),
)

In [10]:
# Remove the duplicated "Authors" and "url" columns that the merge added with the "_df2" suffix.
# The frame is modified in place, so running this cell a second time raises a KeyError.
ceur_proceedings_df_with_ID_missing.drop(
    labels=['Authors_df2', "url_df2"],
    axis="columns",
    inplace=True,
)

In [None]:
# Load the matches stored by the automatic pass. That pass writes the file to ../../../data/
# (and it is read from there again further down), so the same directory is used here.
with open("../../../data/SemanticScholar_CEUR_found.json", encoding="utf-8") as f:
    Semantic_Scholar_CEUR_found = json.load(f)

In [None]:
# IDs that already have a stored match, and IDs of the rows that still need one
found_ids = [*Semantic_Scholar_CEUR_found]
not_found_ids = ceur_proceedings_df_with_ID_missing["ID"].tolist()

In [None]:
# Start the manual review pass with empty containers. This replaces whatever the automatic
# pass left in these two variables in the running kernel.
data_CEUR, non_matching_rows = dict(), list()

In [None]:
# Set API parameters for Semantic Scholar

# Endpoint of the paper search
url = "https://api.semanticscholar.org/graph/v1/paper/search"

# NOTE(review): `api_key` is not assigned anywhere in the visible notebook. It has to exist in
# the kernel before this cell runs - TODO confirm where it is meant to come from.
headers = {"x-api-key": api_key}

In [None]:
# Manual review pass: show the top Semantic Scholar hit for every unmatched paper and let the
# reviewer decide what to store in data_CEUR. Accepted answers:
#   "+"            accept the displayed hit (its paperId is stored)
#   "-" or empty   skip this paper
#   anything else  is stored as entered (e.g. a paper ID looked up by hand)
# Ctrl-C / interrupting the kernel stops the loop; answers given so far stay in data_CEUR.

for _, row in ceur_proceedings_df_with_ID_missing.iterrows():

    # Define parameters for the API request
    params = {
        "query": f'title:("{row["Title"]}")',  # Search query using the title from the DataFrame
        "limit": 1,  # Limit the results to 1
        "fields": "corpusId,externalIds,citationCount,influentialCitationCount,authors,journal,title"  # Fields to retrieve
    }

    try:
        # Request and parsing are both guarded, so a failed lookup falls back to manual entry.
        resp = requests.get(url, params=params, headers=headers, timeout=30).json()
        top_hit = resp["data"][0]
        title_semantic = top_hit["title"]
        title_df = row["Title"]
        years_semantic = extract_years(title_semantic)
        years_df = extract_years(title_df)
        # Sorted surnames (last whitespace-separated token of each name) from both sources
        sorted_author_list1 = sorted(author["name"].split()[-1] for author in top_hit["authors"])
        sorted_author_list2 = sorted(name.split()[-1] for name in row["Authors"])
        paper_id = top_hit["paperId"]
        title_ratio = fuzz.ratio(title_df, title_semantic)
    except (
        requests.RequestException,  # connection problems, timeouts, undecodable body
        ValueError,                 # body is not valid JSON (older requests versions)
        KeyError,                   # expected field missing in the response or the row
        IndexError,                 # empty result list or empty author name
        TypeError,                  # a title or the author list is missing (None)
        AttributeError,             # an author name is not a string
    ) as exc:
        # No usable hit: show what is known about the paper so an ID can be entered by hand
        top_hit = None
        print("Except:", repr(exc))
        print(row["Title"])
        print(row["PubYear"])
        print(row["Authors"])
    else:
        # Print the extracted information for manual control
        print("Title SemanticScholar:", title_semantic, "\t", "Years:", years_semantic)
        print(sorted_author_list1)
        print("Title Given:          ", title_df, "\t", "Years:", years_df)
        print(sorted_author_list2)
        print("Fuzz Ratio:", title_ratio)
        print(paper_id)

    # The prompt is outside the try block so that interrupting it is never swallowed
    answer = input().strip()
    if answer == "+":
        # "+" accepts the displayed hit; without a hit there is nothing to accept
        answer = paper_id if top_hit is not None else ""

    if not answer or answer == "-":
        # Skip to the next record without storing anything
        continue

    data_CEUR[row["ID"]] = answer

    # Pause for 2.5 seconds between requests to avoid overwhelming the API
    time.sleep(2.5)

In [None]:
# Manual correction: discard whatever was stored for 'ceur_3368' during the review pass.
# NOTE(review): the reason for discarding this entry is not recorded in the notebook.
# The membership check keeps the cell re-runnable and tolerates review sessions in which
# nothing was stored for this ID.
if 'ceur_3368' in data_CEUR:
    del data_CEUR['ceur_3368']

In [None]:
# Query parameters shared by all per-paper requests in the next cell
params = dict(
    limit=1,
    fields="corpusId,externalIds,citationCount,influentialCitationCount,authors,journal,title",
)

In [None]:
# Fetch the Semantic Scholar record for every ID collected in data_CEUR
data_ceur_Semantic = {}
for ceur_id, paper_id in data_CEUR.items():
    # A dedicated name is used on purpose: assigning to `url` here would overwrite the search
    # endpoint that the title-search loops read from the global `url`.
    paper_url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}"
    try:
        response = requests.get(paper_url, params=params, headers=headers, timeout=30)
        # Turn HTTP error statuses (unknown ID, rate limit, ...) into exceptions so that an
        # error payload is never stored as if it were a paper record.
        response.raise_for_status()
        # Wrapped in a one-element list, the same shape as the resp["data"] lists that the
        # automatic pass stores.
        data_ceur_Semantic[ceur_id] = [response.json()]
    except (requests.RequestException, ValueError) as exc:
        # Report the failed lookup and carry on with the remaining IDs
        print(f"Lookup failed for {ceur_id} ({paper_id!r}): {exc!r}")

    # Pause between requests to avoid overwhelming the API
    time.sleep(2.5)

In [None]:
# Read the automatically matched records back from disk
with open("../../../data/SemanticScholar_CEUR_found.json", encoding="utf-8") as found_file:
    SemanticScholar_found_by_title = json.loads(found_file.read())

In [None]:
# Manual corrections: remove three entries from the automatically matched records.
# NOTE(review): the reasons are not recorded in the notebook - presumably wrong matches, confirm.
# pop() with a default keeps the cell re-runnable and tolerates IDs that are not present.
for excluded_id in ('ceur_1652', 'ceur_2258', 'ceur_2324'):
    SemanticScholar_found_by_title.pop(excluded_id, None)

In [None]:
# Combine both result sets; for an ID present in both, the entry from data_ceur_Semantic wins.
SemanticScholar_FINAL_json = {**SemanticScholar_found_by_title, **data_ceur_Semantic}

In [None]:
# Refers to QALD-4 instead of QALD-5
# The removed record (if any) is shown as the cell output; the default keeps the cell
# re-runnable once the entry is gone.
SemanticScholar_FINAL_json.pop("ceur_2974", None)

In [None]:
# Write the combined records to disk as indented UTF-8 JSON (non-ASCII characters kept as-is)
with open('../../../data/SemanticScholar_CEUR.json', mode='w', encoding='utf-8') as out_file:
    json.dump(
        obj=SemanticScholar_FINAL_json,
        fp=out_file,
        indent=4,
        ensure_ascii=False,
    )