In [8]:
# Parameters
# numPapers counts papers whose full text is loaded, i.e. metadata rows that
# carry a sha key; rows without a sha key are skipped and do not count.
numPapers = 100   # Number of papers to be loaded

In [9]:
import pandas as pd
import os

# Git root directory. Set the SOLID_PANCOVID_ROOT environment variable to point
# at your own checkout; when it is unset, the original hard-coded location is
# used so existing setups keep working unchanged.
rootDir = os.environ.get("SOLID_PANCOVID_ROOT", "C:/Users/Simon/Git/solid-pancovid-19")

In [10]:
# Read the metadata file into a pandas dataframe and, in the same step, append
# the two columns that later cells fill in per paper:
#   fullText      - starts empty; an empty string marks "no full text loaded"
#   tfidfKeywords - starts with a placeholder message
df = pd.read_csv(os.path.join(rootDir, "src", "metadata.csv")).assign(
    fullText="",
    tfidfKeywords="No tf-idf keywords available",
)

In [11]:
# Cache sha keys and directory names for the requested number of papers.
# Rows without a sha key (NaN in the "sha" column) are skipped; for the first
# numPapers rows that do have one, remember the row's positional (iloc) index.
listIlocIdx = []
for idx, hasSha in enumerate(df["sha"].notna()):
    if len(listIlocIdx) >= numPapers:
        break
    if hasSha:
        listIlocIdx.append(idx)

# The scan stops at the end of the table instead of raising IndexError when
# fewer than numPapers rows carry a sha key; report the shortfall.
if len(listIlocIdx) < numPapers:
    print("Only " + str(len(listIlocIdx)) + " of " + str(numPapers)
          + " requested papers have a sha key.")

# Sha key and full-text directory name of each selected row, in row order
listShaKeys = df["sha"].iloc[listIlocIdx].tolist()
listDirectory = df["full_text_file"].iloc[listIlocIdx].tolist()

In [12]:
# Load full texts for the requested number of papers
import json

# Column position of 'fullText', looked up once instead of once per paper
fullTextCol = df.columns.get_loc('fullText')

listFullText = []
# Loop through the papers
for idx, shaKey in enumerate(listShaKeys):
    # Identify json file: <rootDir>/src/<directory>/<directory>/<sha>.json
    # NOTE(review): assumes every sha entry is a single key that matches one
    # file name - confirm for metadata rows that list several keys.
    json_file = os.path.join(rootDir, "src", listDirectory[idx], listDirectory[idx], shaKey + ".json")

    # Load json to python dict. The encoding is given explicitly because the
    # platform default (e.g. cp1252 on Windows) is not guaranteed to be UTF-8.
    with open(json_file, encoding="utf-8") as json_data:
        data = json.load(json_data)

    # Concatenate all body text entries, each one preceded by a newline
    string = "".join("\n" + entry["text"] for entry in data["body_text"])

    # Add full text to list
    listFullText.append(string)

    # Store the full text in the dataframe row the sha key was taken from
    df.iloc[listIlocIdx[idx], fullTextCol] = string

In [13]:
"""
Extract keywords with tfidf

Step 1: Build corpus from all available fullTexts
Step 2: Extract keywords iteratively for each paper
"""

# PARAMETERS
numKeywords = 8 # Number of keywords to be extracted

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Positional (iloc) indices of every paper that has a fullText.
# The whole dataframe is scanned rather than only its first numPapers rows:
# rows without full text do not count towards numPapers, so papers with full
# text can sit beyond row numPapers and would otherwise be left out of both the
# corpus and the keyword extraction.
fullTextRows = np.flatnonzero((df["fullText"] != "").to_numpy())

# Create corpus from all available fullTexts (same order as fullTextRows)
corpus = df["fullText"].iloc[fullTextRows].tolist()

# Initialize tfidf transformer
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.85, max_features=20000)

# Apply tfidf to corpus; row k of X belongs to dataframe row fullTextRows[k]
X = vectorizer.fit_transform(corpus)

# Look up the vocabulary once instead of once per keyword.
# get_feature_names() is not available in newer scikit-learn releases, so
# prefer get_feature_names_out() and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = vectorizer.get_feature_names_out()
else:
    featureNames = vectorizer.get_feature_names()

# Extract keywords for each paper
tfidfCol = df.columns.get_loc('tfidfKeywords')
for paperCounter, i in enumerate(fullTextRows):
    # Extract row from tfidf matrix as a flat 1-D array of scores
    arr = X[paperCounter].toarray().ravel()

    # Sort entries (descending)
    maxArgs = np.argsort(-arr)

    # Join the top-scoring terms with single spaces (no leading space)
    tfidfKeywords = " ".join(featureNames[j] for j in maxArgs[:numKeywords])

    # Write keywords to dataframe
    df.iloc[i, tfidfCol] = tfidfKeywords

df.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,fullText,tfidfKeywords
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license,,No tf-idf keywords available
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license,,No tf-idf keywords available
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license,,No tf-idf keywords available
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license,"\nThe patient (Fo, ) was a 58 year old mentall...",klh twin sle ml patient blood loo mg
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license,,No tf-idf keywords available
