In [28]:
import json
import os
import re
import time
import urllib.parse

import pandas as pd
import requests
from thefuzz import fuzz

In [None]:
# Endpoint used for the title-based paper lookups further down
url = "https://api.semanticscholar.org/graph/v1/paper/search/match"

# Read the API key from the environment instead of pasting it into the notebook,
# so the secret never ends up in a saved or shared copy of this file.
# Set it before starting Jupyter, e.g.  export SEMANTIC_SCHOLAR_API_KEY=...
api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "")

# Only attach the key header when a key is configured; without a key the
# requests go out anonymously instead of carrying an empty credential.
headers = {"x-api-key": api_key} if api_key else {}

In [8]:
def extract_years(title):
    """Return all standalone four-digit numbers that occur in ``title``.

    Args:
        title: The string to scan.

    Returns:
        A list of the matched four-digit tokens as strings, in order of
        appearance. Digits that are part of a longer run of word characters
        (e.g. ``12345`` or ``TREC2004``) are not matched because of the word
        boundaries in the pattern.
    """
    four_digit_token = re.compile(r'\b\d{4}\b')
    return four_digit_token.findall(title)

In [9]:
def fuzzy_match_lists(list1, list2, threshold_single, threshhold_avg):
    """Decide whether two equally long lists of strings match pairwise.

    Items are compared position by position with ``fuzz.ratio``.

    Args:
        list1: First list of strings.
        list2: Second list of strings, compared index-wise against ``list1``.
        threshold_single: Minimum similarity every single pair must reach;
            a pair scoring below it rejects the match immediately.
        threshhold_avg: Value the average similarity over all pairs must
            exceed (strictly) for the lists to count as matching.

    Returns:
        ``True`` if the lists have the same non-zero length, every pair reaches
        ``threshold_single`` and the average similarity is greater than
        ``threshhold_avg``; ``False`` otherwise.
    """
    # Lists of different length can never match; two empty lists carry no
    # evidence of a match (and would otherwise divide by zero below).
    if len(list1) != len(list2) or len(list1) == 0:
        return False

    total_similarity = 0
    for item1, item2 in zip(list1, list2):
        # Score each pair once and reuse the value for both checks
        similarity_score = fuzz.ratio(item1, item2)
        if similarity_score < threshold_single:
            return False
        total_similarity += similarity_score

    # Always hand back a real boolean instead of falling through to None
    return total_similarity / len(list1) > threshhold_avg

In [10]:
# Load the TREC metadata table; the matching loop below reads its
# "Title", "Authors" and "ID" columns.
df_trec = pd.read_parquet("../../../data/metadata_TREC.parquet")

In [41]:
data_trec = {}                    # ID -> "data" list of the API response for confirmed matches
non_matching_rows = []            # Rows that could not be confirmed automatically
threshhold = 75                   # Minimum similarity for every single pair of author last names
threshold_avg = 85                # Value the average author-name similarity has to exceed

for _, row in df_trec.iterrows():

    # Parameters for the API request to search for the paper by title
    params = {
        "query": row["Title"],
        "limit": 1,
        "fields": "corpusId,externalIds,citationCount,influentialCitationCount,authors,journal,title,citations.paperId"
    }

    try:
        # The request lives inside the try block so that a network error, a
        # timeout or a non-JSON reply marks only this row as unmatched instead
        # of aborting the whole run before any result has been written to disk.
        resp = requests.get(url, params=params, headers=headers, timeout=30).json()

        # Best candidate returned by Semantic Scholar for this title
        candidate = resp["data"][0]
        title_semantic = candidate["title"]
        title_df = row["Title"]

        # Four-digit numbers (years) occurring in both titles
        years_semantic = extract_years(title_semantic)
        years_df = extract_years(title_df)

        # Sorted last names (last whitespace-separated token) from both sources
        sorted_author_list1 = sorted(author["name"].split()[-1] for author in candidate["authors"])
        sorted_author_list2 = sorted(name.split()[-1] for name in row["Authors"])

        print(sorted_author_list1)
        print(sorted_author_list2)

        # Accept the candidate only if title, years and author lists all agree
        if (
            fuzz.ratio(title_df, title_semantic) > 90
            and years_df == years_semantic
            and fuzzy_match_lists(sorted_author_list1, sorted_author_list2, threshhold, threshold_avg)
        ):
            data_trec[row["ID"]] = resp["data"]
        else:
            non_matching_rows.append(row)
    except Exception:
        # Best effort: any failure for this row (no result, missing field,
        # request problem) sends it to the manual follow-up list.
        # `Exception` rather than a bare `except` keeps Ctrl-C / kernel
        # interrupts working during this long-running loop.
        non_matching_rows.append(row)

    # Wait for 3.5 seconds between API requests to avoid overloading the server
    time.sleep(3.5)

# Convert the non-matching rows into a DataFrame and save it as a Parquet file
non_matching_df = pd.DataFrame(non_matching_rows, columns=df_trec.columns)
non_matching_df.to_parquet("../../../data/metadata_TREC_SemanticScholar_not_found.parquet")

# Save the matching data (found results) into a JSON file
with open('../../../data/SemanticScholar_TREC_found.json', 'w', encoding='utf-8') as f:
    json.dump(data_trec, f, ensure_ascii=False, indent=4)

1 / 1972
['Harman', 'Voorhees']
['Harman', 'Voorhees']
+
2 / 1972
['Chen', 'Gey']
['Chen', 'Gey']
+
3 / 1972
['Hull', 'Robertson']
['Hull', 'Robertson']
+
4 / 1972
['Hersh', 'Over']
['Hersh', 'Over']
+
5 / 1972
['Buckley', 'Liggett']
['Buckley', 'Liggett']
+
6 / 1972
['Voorhees']
['Voorhees']
+
7 / 1972
['Buckley']
['Buckley']
+
8 / 1972
['Hawking']
['Hawking']
+
9 / 1972
['Honma', 'Mano', 'Narita', 'Ogawa']
['Honma', 'Mano', 'Narita', 'Ogawa']
+
10 / 1972
['Beitzel', 'Chowdhury', 'Frieder', 'Grossman', 'Holmes', 'Jensen', 'McCabe', 'Sailee']
['Beitzel', 'Chowdhury', 'Frieder', 'Grossman', 'Holmes', 'Jensen', 'McCabe', 'Sailee']
+
11 / 1972
['Chan', 'Dinstl', 'Grunfeld', 'Kwok']
['Chan', 'Dinstl', 'Grunfeld', 'Kwok']
+
12 / 1972
['Gao', 'Huang', 'Nie', 'Su', 'Xun', 'Zhang', 'Zhou']
['Gao', 'Huang', 'Nie', 'Su', 'Xun', 'Zhang', 'Zhou']
+
13 / 1972
['Bunescu', 'Girju', 'Harabagiu', 'Mihalcea', 'Moldovan', 'Morarescu', 'Pasca', 'Rus', 'Surdeanu']
['Bunescu', 'Girju', 'Harabagiu', 'Mihalce

In [2]:
# Reload the rows that the automatic title matching could not confirm
# (this file is written at the end of the matching cell above).
df_not_found = pd.read_parquet("../../../data/metadata_TREC_SemanticScholar_not_found.parquet")

In [3]:
# Load the document IDs that were already looked up by hand on Semantic Scholar

file_path = '../../../data/TREC_SemanticScholar_manually_seach_IDs.txt'

# The file is comma-separated and has no header row, so pandas numbers the columns
df_found = pd.read_csv(
    file_path,
    header=None,
    sep=',',
)

# Display the DataFrame
df_found

Unnamed: 0,0,1,2
0,trec_76,d19009c71873e08035a75e8ec55bda650bdc5f9,OK
1,trec_132,7e525f13fcd4e477e75d3a8b1a6ad136a8edf680,OK
2,trec_270,16172a45df487449b91b59706ed9d071e5af468c,draft
3,trec_293,bd608f980ca65d20be920d4fac16df4bd0a5ea10,OK
4,trec_329,8cfbfc2a90b18d154a12761da44204d55179ea45,OK
5,trec_362,4e4152d168b0ca49a0718629814de868793cc8a8,OK
6,trec_398,5fc936d6a3d9276c4ef9c9bae62dc3cad70786dd,OK
7,trec_435,9e0bd7808afb5cb28aeb811cbbe1a552d0f27ad8,OK
8,trec_454,a4f22fb9631dea6cf2496055c1a20bf3177cdab3,OK
9,trec_459,27853481007662b26f7707ad4a39ccb881ff48f9,OK


In [4]:
# Narrow down the IDs to the ones fitting the requirements,
# i.e. the rows whose column 2 holds "OK"

df_found = df_found[df_found[2].eq("OK")]

In [14]:
# Replace the positional column labels (0, 1, 2) with descriptive names
df_found.columns = ['ID', 'Semantic ID', 'Notes']

In [6]:
# Drop the rows whose ID was already searched manually.
# The ID column of df_found is addressed by position because its label is
# 0 right after loading and "ID" once the columns have been renamed.

df_not_found = df_not_found[~df_not_found["ID"].isin(df_found.iloc[:, 0].tolist())]

In [1]:
 # Initialize an empty list to store ID, semantic_id, and note
ids = []

for _, row in df_not_found.iterrows():

    # Show what to search for
    print(row["Title"])
    print(row["Authors"])

    # Manual input of the Semantic Scholar paper ID. The prompts make it clear
    # which of the two values is expected, and surrounding whitespace is
    # stripped so a stray space is not mistaken for a real ID or note.
    semantic_id = input("Semantic Scholar paper ID (leave empty to skip): ").strip()

    # Manual input of an optional note about the quality of the record
    note = input("Note (leave empty if none): ").strip()

    # Remember the row's ID together with the entered paper ID and note
    ids.append([row["ID"], semantic_id, note])

NameError: name 'df_not_found' is not defined

In [10]:
# Build a table from the entries collected in the manual input loop (`ids`)
# and drop the rows for which no Semantic Scholar ID was entered
df_manual = pd.DataFrame(ids, columns=["ID","Semantic ID", "Notes"])
df_manual = df_manual[df_manual["Semantic ID"] != ""]

# Concatenate the manually entered data (df_manual) with the previously manually entered data (df_found)
df_concat = pd.concat([df_manual, df_found])

# Define a list of notes that indicate papers requiring additional scraping
notes_to_scrape = ['','WrongLinkRestGood', 'NoSourceButLink', 'LinkArxivRestGood',  'GoodAberVieleJahreLinks', 'LinkArxivSonstGoost', 'TablesCorrectTitleWrong',  'AbstractCorrectLinkAndTitleWrong', 'OK']

# Filter rows where the "Notes" column contains entries in the 'notes_to_scrape' list
df_to_additionally_scrape = df_concat[df_concat["Notes"].isin(notes_to_scrape)]

# Filter rows where the "Notes" column does NOT contain entries in the 'notes_to_scrape' list
df_not_sufficient = df_concat[~df_concat["Notes"].isin(notes_to_scrape)]

In [None]:
# NOTE(review): this cell has no execution count and does not run as written:
# `counter += 1` and the request below are indented without an enclosing block
# (IndentationError), and neither `j` nor `counter` is defined in this cell.
# It repeats the request set-up of the title-matching loop above - presumably a
# leftover fragment; TODO confirm, then remove or repair it before "Run All".
params = {
        "query": j["Title"],
        "limit": 1,
        "fields": "corpusId,externalIds,citationCount,influentialCitationCount,authors,journal,title,citations.paperId"
    }
    counter += 1
    resp = requests.get(url, params=params, headers=headers).json()

In [22]:
# Inspect the rows selected for the follow-up lookup by Semantic Scholar ID
df_to_additionally_scrape

Unnamed: 0,ID,Semantic ID,Notes
12,trec_521,b05c34e18c5fd407c0f70360c1b175afb7749ccc,TablesCorrectTitleWrong
21,trec_652,b55686bc31771814626a684a00c93c17d3d19c5d,NoSourceButLink
27,trec_770,704a49e2c70ef81dc0b768a6a3590a3e570e903c,
32,trec_849,ccaa9caf84876897c8740d86b6cb681d32845db3,
33,trec_860,a151b4d68bff5a052827c3d35ab82d769ecfcf6f,
...,...,...,...
34,trec_1123,0737a9b31b84e32db39c813ee9397b70a7e95a08,OK
35,trec_1130,2e9de90b6a53bf693f7d4efce61e8a14a00ba258,OK
36,trec_1142,f325d7ef8427d6ed83d98107a424d99ddd8b8e66,OK
37,trec_1172,b384faf411c959a4f05c843f40059864c48ab6c9,OK


In [27]:
# Second pass: fetch the paper record for every manually identified Semantic Scholar ID.
# `data_trec` is started fresh here; the first-pass matches were already saved to
# SemanticScholar_TREC_found.json by the matching cell.
data_trec = {}

params = {
        "limit": 1,
        "fields": "corpusId,externalIds,citationCount,influentialCitationCount,authors,journal,title,citations.paperId"
    }

for _, row in df_to_additionally_scrape.iterrows():

    # Construct the API URL using the "Semantic ID" from the current row.
    # A separate name is used so the global `url` of the title search stays intact.
    paper_url = f'https://api.semanticscholar.org/graph/v1/paper/{row["Semantic ID"]}'

    try:
        # Send a GET request to the Semantic Scholar API with the specified parameters and headers
        response = requests.get(paper_url, params=params, headers=headers, timeout=30)
    except requests.RequestException as error:
        # Report the failed lookup and carry on with the remaining rows
        print(f'{row["ID"]}: request failed ({error})')
    else:
        if response.ok:
            # Store the API response in the data_trec dictionary with the current row's "ID" as the key
            data_trec[row["ID"]] = response.json()
        else:
            # Do not store error payloads (unknown ID, rate limit, ...) as if they were paper records
            print(f'{row["ID"]}: lookup failed with HTTP status {response.status_code}')

    # Pause for 3.5 seconds to avoid overloading the API with too many requests in a short time
    time.sleep(3.5)

In [32]:
# Reload the automatically matched records that the matching cell saved as JSON
with open('../../../data/SemanticScholar_TREC_found.json', 'r', encoding="utf-8") as found_file:
    Semantic_Trec = json.loads(found_file.read())

In [39]:
# Bring every entry into the same shape: one dict per paper ID
unified_semantic_trec = {}

for paper_key, record in Semantic_Trec.items():
    if type(record) == list:
        # List-valued entries are reduced to their first element
        unified_semantic_trec[paper_key] = record[0]
    elif isinstance(record, dict):
        # Dict-valued entries are taken over unchanged
        unified_semantic_trec[paper_key] = record

In [48]:
# Merge both result sets; if an ID occurs in both, the entry from
# unified_semantic_trec takes precedence
final_semantic_trec = {**data_trec, **unified_semantic_trec}

In [49]:
# Number of papers for which a Semantic Scholar record is available after merging
len(final_semantic_trec)

1871

In [50]:
# Write the merged records to disk as indented, non-ASCII-escaped JSON
with open('../../../data/SemanticScholar_TREC.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(final_semantic_trec, ensure_ascii=False, indent=4))

In [51]:
# Manually found papers whose note is not in notes_to_scrape and that were therefore not scraped
df_not_scraped_for_reasons = df_concat.loc[~df_concat["Notes"].isin(notes_to_scrape)]

In [52]:
# Inspect the papers that were found manually but excluded from scraping
df_not_scraped_for_reasons

Unnamed: 0,ID,Semantic ID,Notes
2,trec_270,16172a45df487449b91b59706ed9d071e5af468c,draft
3,trec_279,991665892b9d234217a4313730824173ef409ee0,missingSource
11,trec_508,bde127a84030270714180e0b4eff187a127a6106,missingSource
35,trec_884,1485f07d7d9194d15f09ed222388700585f457f3,WrongTitleCorrectAbstractETC
42,trec_993,7fa42baa9c5e7e8efb0669219b17f5b32799f085,MissingSourceAndLink
45,trec_1026,1485f07d7d9194d15f09ed222388700585f457f3,CorrectTitleWrongAbstract
75,trec_1309,3e8df0ab78ea7f40595dd6958196bc3d98889500,TitleHalfCorrectWrongLinkNoSource
156,trec_1955,0ef581a8a1640f049d313790b0263b77939cb919,AbstractCorrectTitleWrongLinksWrong
164,trec_1968,24d1b149a9f25288542785fd16e33ab7eb93563f,NoSourceWrongLink


In [53]:
# Store the manually found papers that were excluded from scraping because of their note
df_not_scraped_for_reasons.to_parquet("../../../data/SemanticScholar_TREC_found_but_not_scraped.parquet")