In [4]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict 
from gensim.utils import lemmatize
from gensim import corpora 
from gensim import models 
from nltk.corpus import wordnet
import nltk 

In [2]:
#Load data
data = pd.read_csv("./data/papers.csv")
papers = data['text'][:100]

In [5]:
#Tokenize
punctuation = ",.?!()-_\"\'\\\n\r\t;:+*<>@#§^$%&|/"
processed = [[w.lower() for w in word_tokenize(document)] for document in papers]


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()
processed = [[lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in doc] for doc in processed]

processed = [[w for w in doc if (w not in stopwords.words('english')) and (w not in punctuation)] for doc in processed]

In [6]:
#Count 
frequency = defaultdict(int)
for document in processed:
    for token in document:
        frequency[token] += 1


In [8]:
#Get only words with frequency >1 
processed_corpus = [[w for w in document if frequency[w]>1] for document in processed]

In [33]:
#Save it into dictionary 
dictionary= corpora.Dictionary(processed_corpus)

#Create BoW vectors 
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

### Transformations


- TF-IDF

In [10]:
tfidf = models.TfidfModel(bow_corpus, id2word=dictionary, normalize=True,slope=0.25)

- LSA

In [None]:
lsa = ...

- LDA 

In [None]:
lda =...

- HDP

In [None]:
hdp = ...

- RP

In [None]:
rp = ...

### Preparation for similarity structure

In [11]:
from gensim import similarities 

In [12]:
index = similarities.Similarity(None, tfidf[bow_corpus],num_features=len(dictionary))

In [13]:
path_save = "./data/similarities_1000.index"
index.save(path_save)

### Queries 

In [14]:
example = "risk factor virus corona 2019"
exp_bow = dictionary.doc2bow(example.lower().split())
exp_tfidf = tfidf[exp_bow]

In [15]:
sims = index[exp_tfidf]
sims = sorted(enumerate(sims), key=lambda item: -item[1])


In [16]:
#Get relevant paper_id's
ids = []
for i, s in enumerate(sims):
    print(s, papers[s[0]][:150])
    ids.append(s[0])

(80, 0.19489084) Epidemiological status of the Middle East respiratory syndrome coronavirus in 2019: an update from January 1 to March 31, 2019 

Human coronaviruses (
(82, 0.18770048) First Pediatric Case of Coronavirus Disease 2019 in Korea 

Seoul National University Bundang Hospital Institutional Review Board approved this study 
(32, 0.06666849) Challenges presented by MERS corona virus, and SARS corona virus to global health 

In recent times, several life threatening viruses have emerged. Th
(94, 0.023345295) Major advances in managing community-acquired pneumonia 

A knowledge of the risk factors for community-acquired pneumonia and those at risk of worse 
(74, 0.020729087) Systemic resilience to cross‐border infectious disease threat events in Europe 

Global health security has been undermined by infectious disease thre
(4, 0.017106785) Risk and Outbreak Communication: Lessons from Taiwan's Experiences in the Post-SARS Era 

Since 2004, a 24/7 toll-free hotline has been opera

In [17]:
ids = ids[:10]

### Look into the queried documents

In [18]:
#look at the tf-idf scores of words in a document for the query


In [19]:
query = example.lower().split(' ')

In [20]:
id_list = [ s[0] for s in sorted(tfidf[bow_corpus[80]],key=lambda tup: -tup[-1])][:10]

In [21]:
id_list

[3180, 8865, 9132, 2241, 3099, 3220, 3091, 2560, 6204, 636]

In [53]:
keywords = []
for i in id_list:
    keywords.append(dictionary[i])
#     print(dictionary.id2token[3180])

In [48]:
exp_pap = papers[80].split('.')
sentences = []

In [49]:
#Find parts that include keywords
for s in exp_pap:
    for k in keywords:
        if k in s:
            sentences.append(s)


In [50]:
sentences

['Epidemiological status of the Middle East respiratory syndrome coronavirus in 2019: an update from January 1 to March 31, 2019 \n\nHuman coronaviruses (hCoV) usually causes mild-to-moderate upper respiratory tract illnesses',
 '  The patterns of transmission and origins of MERS CoV remain unclear, and based on the analysis of different virus genomes, it is declared that it may have originated in bats and was transmitted to camels sometime in the distant past',
 '  Globally, from September 2012 until 24 April 2019, WHO has been notified of 2374 laboratory-confirmed cases of infection with MERS-CoV including 823 associated deaths from 27 countries worldwide',
 '  Globally, from September 2012 until 24 April 2019, WHO has been notified of 2374 laboratory-confirmed cases of infection with MERS-CoV including 823 associated deaths from 27 countries worldwide',
 ' After 8 years from the onset of the MERS-CoV outbreak, this infection is still considered as a public health threat with no vacc

In [None]:
lemmatized

'completely'

In [54]:
tfidf.save("./data/tfidf.pkl")

In [55]:
mod = models.TfidfModel.load("./data/tfidf.pkl")

In [56]:
mod

<gensim.models.tfidfmodel.TfidfModel at 0x7f76dfb2f7d0>

In [62]:
mod[exp_bow]

[(221, 0.09828000450893809),
 (1506, 0.1687864636916478),
 (1685, 0.06470179296371363),
 (6181, 0.7209178167819962),
 (8865, 0.6617728838670803)]

In [63]:
tfidf[exp_bow]

[(221, 0.09828000450893809),
 (1506, 0.1687864636916478),
 (1685, 0.06470179296371363),
 (6181, 0.7209178167819962),
 (8865, 0.6617728838670803)]

## Paper Extraction


In [95]:
#Filter the papers from 2019 and 2020 
import pandas as pd 
import numpy as np 

metadata = pd.read_csv("./data/metadata.csv",low_memory=False)
to_drop = list(metadata[pd.isna(metadata['publish_time'])].index)
to_drop = to_drop + list(metadata[pd.isna(metadata['sha'])].index)
to_drop = np.array(to_drop)
to_drop = list(np.unique(to_drop))
metadata = metadata.drop(to_drop,axis=0)

p19 = metadata[metadata['publish_time'].str.contains('2019',regex=False)]
p20 = metadata[metadata['publish_time'].str.contains('2020',regex=False)]

metadata_papers = pd.concat([p19,p20],axis=0)


In [26]:
import os 
path_csv = "./data/papers.csv"
files = pd.DataFrame(columns=["name", "path"])

#Get JSON Files' paths
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        files = files.append({'name': filename, 'path': os.path.join(dirname, filename)}, ignore_index=True)
print("Got file paths")

#Drop irrelevant files that are not json
idx = files[files.name.str.contains(".json") == False].index
files.drop(index=idx, inplace=True)
files.reset_index(inplace=True)
files.drop("index", axis=1, inplace=True)

Got file paths


In [57]:
def getName(string):
    for i in range(len(string)):
        if string[i] == ".":
            return string[:i]

files['file_name'] = files['name'].map(lambda file_name: getName(file_name))

In [69]:
files['sha_flag'] = files['file_name'].isin(list(metadata_papers['sha']))
files['pmc_flag'] = files['file_name'].isin(list(metadata_papers['pmcid']))

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_pdf_parse',
       'has_pmc_xml_parse', 'full_text_file', 'url'],
      dtype='object')

In [93]:
len(to_drop)

13006