In [1]:
!pip install unidecode
!pip install gensim
!pip install flashtext



In [2]:
import json
import re
import traceback 
from pyspark.sql import SparkSession
from elasticsearch import Elasticsearch, helpers
import requests, json, os, csv
from nltk.corpus import stopwords
import string
import unidecode
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
from flashtext import KeywordProcessor
import spacy
import pytextrank
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import pprint
from IPython.display import display 

In [3]:
s_top_words = set(stopwords.words('english'))

## Data loading

In [4]:
def load_data(path = 'metadata.csv'):
    spark = SparkSession \
    .builder \
    .appName("ElasticSpark-1") \
    .config("spark.driver.extraClassPath", "/path/elasticsearch-hadoop-7.6.2/dist/elasticsearch-spark-20_2.11-7.6.2.jar") \
    .config("spark.es.port","9200") \
    .config("spark.driver.memory", "8G") \
    .config("spark.executor.memory", "12G") \
    .getOrCreate()
    metadata_df = spark.read.csv(path, multiLine=True, header=True)
    metadata_df.show(1)
    metadata_df = metadata_df.select("*").limit(1000)
    metadata_df.show(3)
    metadata_table = metadata_df.toPandas()
    print("Data loaded.")
    display(metadata_table)
    metadata_table["pr_title"]=metadata_table["title"]
    metadata_table["pr_abstract"]=metadata_table["abstract"]
    return metadata_table

In [5]:
def create_index(es_index="covid"):
    res = requests.get('http://localhost:9200')
    print (res.content)
    es = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
    mapping = {
        'settings':{
            'number_of_shards': 1, 
            'number_of_replicas': 1
        },
        'mappings': {
            'properties': {
                'cord_uid': {
                    'index': 'true', 
                    'type': 'text'
                },
                'sha': {
                    'index': 'true', 
                    'type': 'text'
                },
                'source_x': {
                    'index': 'true', 
                    'type': 'text'
                },
                'title': {
                    'index': 'true',
                    'type': 'text', 
                    'similarity': 'BM25'
                },
                'pr_title': {
                    'index': 'true',
                    'type': 'text', 
                    'similarity': 'BM25'
                },
                'doi': {
                    'index': 'true', 
                    'type': 'text'
                },
                'pmcid': {
                    'index': 'true',
                    'type': 'text'
                },
                'license': {
                    'index': 'true',
                    'type': 'text'
                },
                'abstract': {
                    'index': 'true',
                    'type': 'text',
                    'similarity': 'BM25'
                },
                'pr_abstract': {
                    'index': 'true',
                    'type': 'text',
                    'similarity': 'BM25'
                },
                'publish_time': {
                    'index': 'true', 
                    'type': 'text'
                },
                'authors': {
                    'index': 'true',
                    'type': 'text'
                },
                'journal': {
                    'index': 'true',
                    'type': 'text'
                },
                'who_covidence_id': {
                    'index': 'true',
                    'type': 'text'
                },
                'arxiv_id': {
                    'index': 'true',
                    'type': 'text'
                },
                'pdf_json_files': {
                    'index': 'true',
                    'type': 'text'
                },
                'pmc_json_files': {
                    'index': 'true', 
                    'type': 'text'
                },
                'url': {
                    'index': 'true',
                    'type': 'text'
                },
                's2_id': {
                    'index': 'true', 
                    'type': 'text'
                }
             }
         }
    }
    if es.indices.exists(es_index):
        es.indices.delete(es_index) 

    es.indices.create(index=es_index,body=mapping)
    return es

## Sentence Splitting, Tokenization and Normalization

In [6]:
class TextNormalizer:
    def __init__(self):
        self.punctuation_table = str.maketrans('','',string.punctuation)

    def normalize_text(self,text):
        if text==None:
            return None
        try: 
            normalized_sentences = []
            text = re.sub(' +',' ', text)
            text = unidecode.unidecode(text)
            text = text.lower()
            sentences = sent_tokenize(text)
        except:
            print("ERROR:", text)
            traceback.print_exc()
            return None
        
        for sentence in sentences:
            #remove punctuation
            sentence=re.sub("["+string.punctuation+"\d*]"," ",sentence)
            #strip leading/trailing whitespace
            sentence = sentence.strip()
            words = word_tokenize(sentence)
            new_sentence = ' '.join(words) #we want to keep it as before to extract phrases
            normalized_sentences.append(new_sentence)
        return normalized_sentences

In [7]:
def normalize_table(metadata_table):
    normaliser = TextNormalizer()
    
    table_to_process=metadata_table[["pr_title","pr_abstract"]]
    table_to_process["pr_title"]=table_to_process["pr_title"].apply(lambda x: normaliser.normalize_text(x))
    table_to_process["pr_abstract"]=table_to_process["pr_abstract"].apply(lambda x: normaliser.normalize_text(x))
    
    for i in range(0, len(table_to_process)):
        metadata_table.loc[i,"pr_title"] = table_to_process.loc[i,"pr_title"]
        metadata_table.loc[i,"pr_abstract"] = table_to_process.loc[i,"pr_abstract"]
    return metadata_table

## Selecting key words

In [8]:
def remove_stop_words(text):
    if text==None:
        return
    for index,sentence in enumerate(text):
        sentence = sentence.split(" ") #performing tokenisation
        sentence = [word for word in sentence if word not in s_top_words and len(word)>2]
        sentence=" ".join(sentence)
        text[index]=sentence
    return text

In [9]:
def get_words_corpus(table):
    words_corpus=[]
    for i in range(0, len(table)):
        row=table.loc[i]
        title_sentences = row["pr_title"]
        abstract_sentences = row["pr_abstract"]
        
        if title_sentences!=None:
            for i in range(0,len(title_sentences)):
                words_corpus.extend(title_sentences[i].split())
                
        if  abstract_sentences!=None:
            for i in range(0,len(abstract_sentences)):
                words_corpus.extend(abstract_sentences[i].split())
    return words_corpus
        

In [10]:
def get_keywords_by_textrank(sentences):
    if sentences==None:
        return None
    keywords=dict()
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe("textrank", last=True)
    doc = nlp(" ".join(sentences))

    # examine the top-ranked phrases in the document

    for p in doc._.phrases:
        if p.rank>=0.05:
            keywords[p.text]=p.rank
    #         print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
    #         print(p.text)
    return keywords

In [11]:
def extract_keywords(text,keyword_processor):
    sentences=[]
    if text==None:
        return None
    for i in range(0, len(text)):
        keywords_found = keyword_processor.extract_keywords(text[i])
        sentences.append(" ".join(keywords_found))
    return sentences
    

In [12]:
def merge_two_keywords_methods(sentences, text_rank_key_word_processor, frequent_key_words_processor):
    if sentences==None:
        return None
    text_rank_version = extract_keywords(sentences,text_rank_key_word_processor)
    frequent_key_words_version = extract_keywords(sentences,frequent_key_words_processor)
    intersect = set(frequent_key_words_version) - set(text_rank_version)

    merged_version = text_rank_version + list(intersect)
    return merged_version

In [13]:
def retain_best_tf_idf_keywords(sentences, index, tfIdf,tfIdfVectorizer):
    if sentences==None:
        return None
    tf_idf_keyword_processor = KeywordProcessor()
    df = pd.DataFrame(tfIdf[index].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF_IDF"])
    df = df.sort_values('TF_IDF', ascending=False)
    df = df[df.TF_IDF>0.09]
    tf_idf_dict=df.T.to_dict('list')    
    for keyword in tf_idf_dict.keys():
        parts = " ".join(keyword.split("_"))
        tf_idf_keyword_processor.add_keyword(keyword,parts)
    sentences = extract_keywords(sentences,tf_idf_keyword_processor)
    return sentences

In [14]:
def select_best_keywords(metadata_table):
    table_to_process=metadata_table[["pr_title","pr_abstract"]]
    table_to_process["pr_title"]=table_to_process["pr_title"].apply(lambda x: remove_stop_words(x))
    table_to_process["pr_abstract"]=table_to_process["pr_abstract"].apply(lambda x: remove_stop_words(x))
    
    print("Text Data after removing of stop-words")
    display(table_to_process)

    words_corpus=get_words_corpus(table_to_process)
    print(len(words_corpus))

    dist = nltk.FreqDist(words_corpus) #Creating a distribution of words' frequencies
    grams=dist.most_common(1000) #Obtaining the most frequent words
    bigrams = nltk.collocations.BigramAssocMeasures()
    trigrams = nltk.collocations.TrigramAssocMeasures()

    bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(words_corpus)
    trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(words_corpus)

    print("Showing first",2000,"top-freqent words in the corpus")
    grams = pd.DataFrame(grams) 
    grams.index = range(1,len(grams)+1)
    grams.columns = ["Word", "Frequency"]
    display(grams)
    
    bi_filter=7
    print("Showing bigrams in the corpus found by Pointwise Mutual Information method")
    print("Applying frequency filter: a bigramm occurs more than",bi_filter,"times")
    bigramFinder.apply_freq_filter(bi_filter)
    bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)
    bigramPMITable["bigram"]=bigramPMITable["bigram"].apply(lambda x: ' '.join(x))
    display(bigramPMITable)

    tri_filter=5
    print("Showing trigrams in the corpus found by Pointwise Mutual Information method")
    print("Applying frequency filter: a trigramm occurs more than",tri_filter,"times")
    trigramFinder.apply_freq_filter(tri_filter)
    trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)
    trigramPMITable["trigram"]=trigramPMITable["trigram"].apply(lambda x: ' '.join(x))
    display(trigramPMITable)

    gram_dict=grams.set_index('Word').T.to_dict('list')
    bigramPMIDict=bigramPMITable.set_index('bigram').T.to_dict('list')
    trigramPMIDict=trigramPMITable.set_index('trigram').T.to_dict('list')

    keyword_processor = KeywordProcessor()
    textrank_keyword_processor = KeywordProcessor()

    gram_dict.update(bigramPMIDict)
    bigramPMIDict.update(trigramPMIDict)

#     print(gram_dict)
    print("Extracting keywords from texts using Pointwise Mutual Information method and TextRank")
    text_rank_key_words=dict()
    for i in range(0, len(table_to_process)):
        sentences=table_to_process.loc[i,"pr_abstract"]
        if sentences!=None:
            keywords=get_keywords_by_textrank(sentences)
            if keywords!=None:
                text_rank_key_words.update(keywords)
                print("Text",i,"- Done")

    for keyword in gram_dict.keys():
        parts=keyword.split()
        parts="_".join(parts)
        keyword_processor.add_keyword(keyword,parts)

    for keyword in text_rank_key_words.keys():
        parts=keyword.split()
        parts="_".join(parts)
        textrank_keyword_processor.add_keyword(keyword,parts)

    
    print(len(keyword_processor.get_all_keywords()))
    print(len(textrank_keyword_processor.get_all_keywords()))
    print(len(text_rank_key_words))

    table_to_process["pr_abstract"]=table_to_process["pr_abstract"].apply(lambda x: merge_two_keywords_methods(x, textrank_keyword_processor, keyword_processor))     

    for i in range(0, len(table_to_process)):
        metadata_table.loc[i,"pr_title"] = table_to_process.loc[i,"pr_title"]
        metadata_table.loc[i,"pr_abstract"] = table_to_process.loc[i,"pr_abstract"]

    print("Comparison of Text Data after Keywords Extraction using Pointwise Mutual Information method and TextRank")
    display(metadata_table[["title","pr_title","abstract","pr_abstract"]])

    print("Extracting keywords from texts using TF/IDF")
    dataset = []
    for i in range(0, len(table_to_process["pr_abstract"])):
        sentences = table_to_process.loc[i,"pr_abstract"]
        if sentences!=None:
            sentences=" ".join(sentences)
            dataset.append(sentences)

    tfIdfVectorizer=TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(dataset)

    index=0
    for i in range(0,len(metadata_table)):
        if table_to_process.loc[i,"pr_abstract"]==None:
            continue
        metadata_table.loc[i,"pr_abstract"]=retain_best_tf_idf_keywords(table_to_process.loc[i,"pr_abstract"], index,tfIdf,tfIdfVectorizer)
        index+=1
    return metadata_table

## Stemming or Morphological Analysis (Lemmatisation) 

In [15]:
def lemmatise_text(sentences):
    if sentences==None:
        return None
    lemmatizer = WordNetLemmatizer()
    for i in range(0, len(sentences)):
        try:
            if sentences[i] == "":
                continue
            words=sentences[i].split()
            lemmatised_words = [lemmatizer.lemmatize(word) for word in words]
            lemmatised_words = ' '.join(lemmatised_words)
            sentences[i]=lemmatised_words
        except:
            print(sentences)
            print(sentences[i])
            traceback.print_exc()
            break
    return sentences


## Indexing

In [16]:
def index_table(es,metadata_table,es_index="covid"):
    for i in range(0,len(metadata_table)):
        metadata_table.iloc[i].to_json(es_index+'.json')
        f = open(es_index+'.json')
        docket_content = f.read()
        row=json.loads(docket_content)
        try:
            es.index(index=es_index, id=i, body=row)
        except:
            traceback.print_exc() 
            print("Error:", "row #"+str(i))

## Searching

In [17]:
def search(es, es_index="covid",
    query={
          "query": {
            "match_phrase":{"publish_time":"2000-08-15"}
          }
        }):
    res = es.search(index=es_index, body=query)
    documents=[]
    for i in range(0, len(res['hits']['hits'])):
        doc=res['hits']['hits'][i]['_source']
        documents.append(doc)
    return documents

## Running of the program

In [18]:
def run_program(path = 'metadata.csv',es_index="covid", query={
              "query": {
                "match_phrase":{"publish_time":"2000-08-15"}
              }
            }):
    #Data loading
    print("Data loading")
    metadata_table= load_data(path)
    print("pr_title and pr_abstract columns have been added")
    display(metadata_table)
    
    #Indexing
    print("Creating index -",es_index)
    es=create_index(es_index)
    print("Indexing")
    index_table(es,metadata_table,es_index=es_index)
    print("Data indexed.")

    #Sentence splitting, text tokenisation and normalisation
    print("Sentence splitting, text tokenisation and normalisation")
    metadata_table=normalize_table(metadata_table)
    print("Comparison of Text Data after Sentence splitting, text tokenisation and normalisation step")
    display(metadata_table[["title","pr_title","abstract","pr_abstract"]])

    #Selecting keywords
    print("Selecting keywords")
    metadata_table=select_best_keywords(metadata_table)
    print("Comparison of Text Data after Selecting keywords step")
    display(metadata_table[["title","pr_title","abstract","pr_abstract"]])
    
    #Text lemmatisation
    print("Text lemmatisation")
    metadata_table["pr_abstract"]=metadata_table["pr_abstract"].apply(lambda x: lemmatise_text(x))
    metadata_table["pr_title"]=metadata_table["pr_title"].apply(lambda x: lemmatise_text(x))
    print("Comparison of Text Data after Applied Lemmatisation")
    display(metadata_table[["title","pr_title","abstract","pr_abstract"]])

    #Indexing
    print("Creating index -",es_index)
    es=create_index(es_index)
    print("Indexing")
    index_table(es,metadata_table,es_index=es_index)
    
    #Searching in ElasticSearch
    print("Searching in ElasticSearch")
    documents=search(es, es_index=es_index, query=query)
    print("Retrieved documents:")
    pprint.pprint(documents)
    
    return documents,es

In [19]:
documents,es=run_program(path = 'metadata.csv',es_index="covid", query={
              "query": {
                "match_phrase":{"publish_time":"2000-08-15"}
              }
            })
print("Retrieved documents:")
pprint.pprint(documents)

Data loading
+--------+--------------------+--------+--------------------+--------------------+--------+---------+-------+--------------------+------------+--------------------+--------------+------+----------------+--------+--------------------+--------------------+--------------------+-----+
|cord_uid|                 sha|source_x|               title|                 doi|   pmcid|pubmed_id|license|            abstract|publish_time|             authors|       journal|mag_id|who_covidence_id|arxiv_id|      pdf_json_files|      pmc_json_files|                 url|s2_id|
+--------+--------------------+--------+--------------------+--------------------+--------+---------+-------+--------------------+------------+--------------------+--------------+------+----------------+--------+--------------------+--------------------+--------------------+-----+
|ug7v899j|d1aafb70c066a2068...|     PMC|Clinical features...|10.1186/1471-2334...|PMC35282| 11472636|  no-cc|OBJECTIVE: This r...|  2001-07-0

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,880nqc0f,ed2de3694f5580ea38f4adf24bd1c8b46862df1f,PMC,Mannose-binding lectin deficiency and acute ex...,10.2147/copd.s33714,PMC3514010,23226013,no-cc,BACKGROUND: Mannose-binding lectin is a collec...,2012-11-23,"Albert, Richard K; Connett, John; Curtis, Jeff...",Int J Chron Obstruct Pulmon Dis,,,,document_parses/pdf_json/ed2de3694f5580ea38f4a...,document_parses/pmc_json/PMC3514010.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
996,01b0vnnm,2a7c951e191425fd9fa5ac108f07a1f02eb75872,PMC,The changing phenotype of microglia from homeo...,10.1186/2047-9158-1-9,PMC3514090,23210447,cc-by,It has been nearly a century since the early d...,2012-04-24,"Luo, Xiao-Guang; Chen, Sheng-Di",Transl Neurodegener,,,,document_parses/pdf_json/2a7c951e191425fd9fa5a...,document_parses/pmc_json/PMC3514090.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
997,5b29wtim,854e623d1f875e4605b2ffd3f72599d063a56cc0,PMC,Diversity of Salmonella spp. serovars isolated...,10.1186/1746-6148-8-201,PMC3514206,23098237,cc-by,BACKGROUND: Salmonellosis in water buffalo (Bu...,2012-10-25,"Borriello, Giorgia; Lucibelli, Maria G; Pescia...",BMC Vet Res,,,,document_parses/pdf_json/854e623d1f875e4605b2f...,document_parses/pmc_json/PMC3514206.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
998,z65xxk1f,3695704554777889f8232a9ea086df70bf17ff58,PMC,Severe Childhood Malaria Syndromes Defined by ...,10.1371/journal.pone.0049778,PMC3514223,23226502,cc-by,BACKGROUND: Cerebral malaria (CM) and severe m...,2012-12-04,"Burté, Florence; Brown, Biobele J.; Orimadegun...",PLoS One,,,,document_parses/pdf_json/3695704554777889f8232...,document_parses/pmc_json/PMC3514223.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,


pr_title and pr_abstract columns have been added


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,pr_title,pr_abstract
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,...,BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,...,Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,...,Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,...,Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,...,Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,880nqc0f,ed2de3694f5580ea38f4adf24bd1c8b46862df1f,PMC,Mannose-binding lectin deficiency and acute ex...,10.2147/copd.s33714,PMC3514010,23226013,no-cc,BACKGROUND: Mannose-binding lectin is a collec...,2012-11-23,...,Int J Chron Obstruct Pulmon Dis,,,,document_parses/pdf_json/ed2de3694f5580ea38f4a...,document_parses/pmc_json/PMC3514010.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,Mannose-binding lectin deficiency and acute ex...,BACKGROUND: Mannose-binding lectin is a collec...
996,01b0vnnm,2a7c951e191425fd9fa5ac108f07a1f02eb75872,PMC,The changing phenotype of microglia from homeo...,10.1186/2047-9158-1-9,PMC3514090,23210447,cc-by,It has been nearly a century since the early d...,2012-04-24,...,Transl Neurodegener,,,,document_parses/pdf_json/2a7c951e191425fd9fa5a...,document_parses/pmc_json/PMC3514090.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,The changing phenotype of microglia from homeo...,It has been nearly a century since the early d...
997,5b29wtim,854e623d1f875e4605b2ffd3f72599d063a56cc0,PMC,Diversity of Salmonella spp. serovars isolated...,10.1186/1746-6148-8-201,PMC3514206,23098237,cc-by,BACKGROUND: Salmonellosis in water buffalo (Bu...,2012-10-25,...,BMC Vet Res,,,,document_parses/pdf_json/854e623d1f875e4605b2f...,document_parses/pmc_json/PMC3514206.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,Diversity of Salmonella spp. serovars isolated...,BACKGROUND: Salmonellosis in water buffalo (Bu...
998,z65xxk1f,3695704554777889f8232a9ea086df70bf17ff58,PMC,Severe Childhood Malaria Syndromes Defined by ...,10.1371/journal.pone.0049778,PMC3514223,23226502,cc-by,BACKGROUND: Cerebral malaria (CM) and severe m...,2012-12-04,...,PLoS One,,,,document_parses/pdf_json/3695704554777889f8232...,document_parses/pmc_json/PMC3514223.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,Severe Childhood Malaria Syndromes Defined by ...,BACKGROUND: Cerebral malaria (CM) and severe m...


Creating index - covid
b'{\n  "name" : "DESKTOP-K0D65LT",\n  "cluster_name" : "elasticsearch",\n  "cluster_uuid" : "P8VwA_sWSl-ykHakwwPzXQ",\n  "version" : {\n    "number" : "7.10.2",\n    "build_flavor" : "default",\n    "build_type" : "zip",\n    "build_hash" : "747e1cc71def077253878a59143c1f785afa92b9",\n    "build_date" : "2021-01-13T00:42:12.435326Z",\n    "build_snapshot" : false,\n    "lucene_version" : "8.7.0",\n    "minimum_wire_compatibility_version" : "6.8.0",\n    "minimum_index_compatibility_version" : "6.0.0-beta1"\n  },\n  "tagline" : "You Know, for Search"\n}\n'
Indexing
Data indexed.
Sentence splitting, text tokenisation and normalisation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Comparison of Text Data after Sentence splitting, text tokenisation and normalisation step


Unnamed: 0,title,pr_title,abstract,pr_abstract
0,Clinical features of culture-proven Mycoplasma...,[clinical features of culture proven mycoplasm...,OBJECTIVE: This retrospective chart review des...,[objective this retrospective chart review des...
1,Nitric oxide: a pro-inflammatory mediator in l...,[nitric oxide a pro inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,[inflammatory diseases of the respiratory trac...
2,Surfactant protein-D and pulmonary host defense,[surfactant protein d and pulmonary host defense],Surfactant protein-D (SP-D) participates in th...,[surfactant protein d sp d participates in the...
3,Role of endothelin-1 in lung disease,[role of endothelin in lung disease],Endothelin-1 (ET-1) is a 21 amino acid peptide...,[endothelin et is a amino acid peptide with di...
4,Gene expression in epithelial cells in respons...,[gene expression in epithelial cells in respon...,Respiratory syncytial virus (RSV) and pneumoni...,[respiratory syncytial virus rsv and pneumonia...
...,...,...,...,...
995,Mannose-binding lectin deficiency and acute ex...,[mannose binding lectin deficiency and acute e...,BACKGROUND: Mannose-binding lectin is a collec...,[background mannose binding lectin is a collec...
996,The changing phenotype of microglia from homeo...,[the changing phenotype of microglia from home...,It has been nearly a century since the early d...,[it has been nearly a century since the early ...
997,Diversity of Salmonella spp. serovars isolated...,"[diversity of salmonella spp, serovars isolate...",BACKGROUND: Salmonellosis in water buffalo (Bu...,[background salmonellosis in water buffalo bub...
998,Severe Childhood Malaria Syndromes Defined by ...,[severe childhood malaria syndromes defined by...,BACKGROUND: Cerebral malaria (CM) and severe m...,[background cerebral malaria cm and severe mal...


Selecting keywords
Text Data after removing of stop-words


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,pr_title,pr_abstract
0,[clinical features culture proven mycoplasma p...,[objective retrospective chart review describe...
1,[nitric oxide pro inflammatory mediator lung d...,[inflammatory diseases respiratory tract commo...
2,[surfactant protein pulmonary host defense],[surfactant protein participates innate respon...
3,[role endothelin lung disease],[endothelin amino acid peptide diverse biologi...
4,[gene expression epithelial cells response pne...,[respiratory syncytial virus rsv pneumonia vir...
...,...,...
995,[mannose binding lectin deficiency acute exace...,[background mannose binding lectin collectin i...
996,[changing phenotype microglia homeostasis dise...,[nearly century since early description microg...
997,"[diversity salmonella spp, serovars isolated i...",[background salmonellosis water buffalo bubalu...
998,[severe childhood malaria syndromes defined pl...,[background cerebral malaria severe malarial a...


133627
Showing first 2000 top-freqent words in the corpus


Unnamed: 0,Word,Frequency
1,virus,985
2,influenza,945
3,infection,740
4,cells,711
5,patients,587
...,...,...
996,find,28
997,great,28
998,duration,28
999,initiation,28


Showing bigrams in the corpus found by Pointwise Mutual Information method
Applying frequency filter: a bigramm occurs more than 7 times


Unnamed: 0,bigram,PMI
0,rift valley,13.857927
1,endoplasmic reticulum,13.705924
2,bronchoalveolar lavage,13.327412
3,coronary artery,13.327412
4,vesicular stomatitis,13.327412
...,...,...
863,cells cells,1.665403
864,disease virus,1.581372
865,infection results,1.548103
866,pandemic virus,1.325173


Showing trigrams in the corpus found by Pointwise Mutual Information method
Applying frequency filter: a trigramm occurs more than 5 times


Unnamed: 0,trigram,PMI
0,mov avg cusum,29.411848
1,cortex magnoliae officinalis,29.148813
2,systemic lupus erythematosus,25.663387
3,autoimmune hemolytic anemia,25.052889
4,methicillin resistant staphylococcus,24.947180
...,...,...
219,influenza virus replication,9.049636
220,influenza virus infections,8.849042
221,pandemic influenza infection,8.659061
222,response influenza virus,8.532673


Extracting keywords from texts using Pointwise Mutual Information method and TextRank
Text 0 - Done
Text 1 - Done
Text 2 - Done
Text 3 - Done
Text 4 - Done
Text 5 - Done
Text 6 - Done
Text 7 - Done
Text 8 - Done
Text 9 - Done
Text 10 - Done
Text 11 - Done
Text 12 - Done
Text 13 - Done
Text 15 - Done
Text 16 - Done
Text 17 - Done
Text 18 - Done
Text 19 - Done
Text 20 - Done
Text 21 - Done
Text 22 - Done
Text 23 - Done
Text 24 - Done
Text 25 - Done
Text 26 - Done
Text 27 - Done
Text 28 - Done
Text 29 - Done
Text 30 - Done
Text 31 - Done
Text 32 - Done
Text 33 - Done
Text 34 - Done
Text 35 - Done
Text 36 - Done
Text 37 - Done
Text 38 - Done
Text 39 - Done
Text 40 - Done
Text 41 - Done
Text 42 - Done
Text 43 - Done
Text 44 - Done
Text 45 - Done
Text 46 - Done
Text 47 - Done
Text 48 - Done
Text 49 - Done
Text 50 - Done
Text 57 - Done
Text 58 - Done
Text 59 - Done
Text 60 - Done
Text 61 - Done
Text 62 - Done
Text 64 - Done
Text 65 - Done
Text 66 - Done
Text 67 - Done
Text 68 - Done
Text 69 -

Text 537 - Done
Text 538 - Done
Text 539 - Done
Text 540 - Done
Text 541 - Done
Text 542 - Done
Text 543 - Done
Text 544 - Done
Text 545 - Done
Text 546 - Done
Text 547 - Done
Text 548 - Done
Text 549 - Done
Text 550 - Done
Text 551 - Done
Text 552 - Done
Text 553 - Done
Text 554 - Done
Text 555 - Done
Text 556 - Done
Text 557 - Done
Text 558 - Done
Text 559 - Done
Text 560 - Done
Text 561 - Done
Text 562 - Done
Text 563 - Done
Text 564 - Done
Text 566 - Done
Text 567 - Done
Text 568 - Done
Text 569 - Done
Text 571 - Done
Text 572 - Done
Text 573 - Done
Text 574 - Done
Text 575 - Done
Text 576 - Done
Text 577 - Done
Text 578 - Done
Text 579 - Done
Text 580 - Done
Text 581 - Done
Text 582 - Done
Text 583 - Done
Text 584 - Done
Text 585 - Done
Text 586 - Done
Text 587 - Done
Text 588 - Done
Text 589 - Done
Text 590 - Done
Text 591 - Done
Text 592 - Done
Text 593 - Done
Text 594 - Done
Text 595 - Done
Text 596 - Done
Text 597 - Done
Text 598 - Done
Text 599 - Done
Text 600 - Done
Text 601

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Comparison of Text Data after Keywords Extraction using Pointwise Mutual Information method and TextRank


Unnamed: 0,title,pr_title,abstract,pr_abstract
0,Clinical features of culture-proven Mycoplasma...,[clinical features culture proven mycoplasma p...,OBJECTIVE: This retrospective chart review des...,[objective_retrospective_chart_review epidemio...
1,Nitric oxide: a pro-inflammatory mediator in l...,[nitric oxide pro inflammatory mediator lung d...,Inflammatory diseases of the respiratory tract...,[inflammatory_diseases respiratory_tract commo...
2,Surfactant protein-D and pulmonary host defense,[surfactant protein pulmonary host defense],Surfactant protein-D (SP-D) participates in th...,[surfactant_protein_participates response micr...
3,Role of endothelin-1 in lung disease,[role endothelin lung disease],Endothelin-1 (ET-1) is a 21 amino acid peptide...,[endothelin_amino_acid_peptide_diverse_biologi...
4,Gene expression in epithelial cells in respons...,[gene expression epithelial cells response pne...,Respiratory syncytial virus (RSV) and pneumoni...,[respiratory_syncytial_virus_rsv_pneumonia_vir...
...,...,...,...,...
995,Mannose-binding lectin deficiency and acute ex...,[mannose binding lectin deficiency acute exace...,BACKGROUND: Mannose-binding lectin is a collec...,[background_mannose_binding_lectin_collectin h...
996,The changing phenotype of microglia from homeo...,[changing phenotype microglia homeostasis dise...,It has been nearly a century since the early d...,[century early_description_microglia_rio many_...
997,Diversity of Salmonella spp. serovars isolated...,"[diversity salmonella spp, serovars isolated i...",BACKGROUND: Salmonellosis in water buffalo (Bu...,[background_salmonellosis_water_buffalo_bubalu...
998,Severe Childhood Malaria Syndromes Defined by ...,[severe childhood malaria syndromes defined pl...,BACKGROUND: Cerebral malaria (CM) and severe m...,[background_cerebral_malaria serious_life clin...


Extracting keywords from texts using TF/IDF
Comparison of Text Data after Selecting keywords step


Unnamed: 0,title,pr_title,abstract,pr_abstract
0,Clinical features of culture-proven Mycoplasma...,[clinical features culture proven mycoplasma p...,OBJECTIVE: This retrospective chart review des...,[objective retrospective chart review epidemio...
1,Nitric oxide: a pro-inflammatory mediator in l...,[nitric oxide pro inflammatory mediator lung d...,Inflammatory diseases of the respiratory tract...,[inflammatory diseases respiratory tract commo...
2,Surfactant protein-D and pulmonary host defense,[surfactant protein pulmonary host defense],Surfactant protein-D (SP-D) participates in th...,[surfactant protein participates response micr...
3,Role of endothelin-1 in lung disease,[role endothelin lung disease],Endothelin-1 (ET-1) is a 21 amino acid peptide...,[endothelin amino acid peptide diverse biologi...
4,Gene expression in epithelial cells in respons...,[gene expression epithelial cells response pne...,Respiratory syncytial virus (RSV) and pneumoni...,[respiratory syncytial virus rsv pneumonia vir...
...,...,...,...,...
995,Mannose-binding lectin deficiency and acute ex...,[mannose binding lectin deficiency acute exace...,BACKGROUND: Mannose-binding lectin is a collec...,[background mannose binding lectin collectin h...
996,The changing phenotype of microglia from homeo...,[changing phenotype microglia homeostasis dise...,It has been nearly a century since the early d...,[century early description microglia rio many ...
997,Diversity of Salmonella spp. serovars isolated...,"[diversity salmonella spp, serovars isolated i...",BACKGROUND: Salmonellosis in water buffalo (Bu...,"[, , present study, water buffalo calves typhi..."
998,Severe Childhood Malaria Syndromes Defined by ...,[severe childhood malaria syndromes defined pl...,BACKGROUND: Cerebral malaria (CM) and severe m...,[background cerebral malaria serious life clin...


Text lemmatisation
Comparison of Text Data after Applied Lemmatisation


Unnamed: 0,title,pr_title,abstract,pr_abstract
0,Clinical features of culture-proven Mycoplasma...,[clinical feature culture proven mycoplasma pn...,OBJECTIVE: This retrospective chart review des...,[objective retrospective chart review epidemio...
1,Nitric oxide: a pro-inflammatory mediator in l...,[nitric oxide pro inflammatory mediator lung d...,Inflammatory diseases of the respiratory tract...,[inflammatory disease respiratory tract common...
2,Surfactant protein-D and pulmonary host defense,[surfactant protein pulmonary host defense],Surfactant protein-D (SP-D) participates in th...,[surfactant protein participates response micr...
3,Role of endothelin-1 in lung disease,[role endothelin lung disease],Endothelin-1 (ET-1) is a 21 amino acid peptide...,[endothelin amino acid peptide diverse biologi...
4,Gene expression in epithelial cells in respons...,[gene expression epithelial cell response pneu...,Respiratory syncytial virus (RSV) and pneumoni...,[respiratory syncytial virus rsv pneumonia vir...
...,...,...,...,...
995,Mannose-binding lectin deficiency and acute ex...,[mannose binding lectin deficiency acute exace...,BACKGROUND: Mannose-binding lectin is a collec...,[background mannose binding lectin collectin h...
996,The changing phenotype of microglia from homeo...,[changing phenotype microglia homeostasis dise...,It has been nearly a century since the early d...,[century early description microglia rio many ...
997,Diversity of Salmonella spp. serovars isolated...,"[diversity salmonella spp, serovars isolated i...",BACKGROUND: Salmonellosis in water buffalo (Bu...,"[, , present study, water buffalo calf typhimu..."
998,Severe Childhood Malaria Syndromes Defined by ...,[severe childhood malaria syndrome defined pla...,BACKGROUND: Cerebral malaria (CM) and severe m...,[background cerebral malaria serious life clin...


Creating index - covid
b'{\n  "name" : "DESKTOP-K0D65LT",\n  "cluster_name" : "elasticsearch",\n  "cluster_uuid" : "P8VwA_sWSl-ykHakwwPzXQ",\n  "version" : {\n    "number" : "7.10.2",\n    "build_flavor" : "default",\n    "build_type" : "zip",\n    "build_hash" : "747e1cc71def077253878a59143c1f785afa92b9",\n    "build_date" : "2021-01-13T00:42:12.435326Z",\n    "build_snapshot" : false,\n    "lucene_version" : "8.7.0",\n    "minimum_wire_compatibility_version" : "6.8.0",\n    "minimum_index_compatibility_version" : "6.0.0-beta1"\n  },\n  "tagline" : "You Know, for Search"\n}\n'
Indexing
Searching in ElasticSearch
Retrieved documents:
[{'abstract': 'Inflammatory diseases of the respiratory tract are commonly '
              'associated with elevated production of nitric oxide (NO•) and '
              'increased indices of NO• -dependent oxidative stress. Although '
              'NO• is known to have anti-microbial, anti-inflammatory and '
              'anti-oxidant properties, variou

In [20]:
documents=search(es, es_index="covid",
    query={
          "query": {
            "match_phrase":{"publish_time":"2000-08-15"}
          }
        })
print("Retrieved documents:")
pprint.pprint(documents)

Retrieved documents:
[{'abstract': 'Inflammatory diseases of the respiratory tract are commonly '
              'associated with elevated production of nitric oxide (NO•) and '
              'increased indices of NO• -dependent oxidative stress. Although '
              'NO• is known to have anti-microbial, anti-inflammatory and '
              'anti-oxidant properties, various lines of evidence support the '
              'contribution of NO• to lung injury in several disease models. '
              'On the basis of biochemical evidence, it is often presumed that '
              'such NO• -dependent oxidations are due to the formation of the '
              'oxidant peroxynitrite, although alternative mechanisms '
              'involving the phagocyte-derived heme proteins myeloperoxidase '
              'and eosinophil peroxidase might be operative during conditions '
              'of inflammation. Because of the overwhelming literature on NO• '
              'generation and activi

In [21]:
documents=search(es, es_index="covid",
    query={
          "query": {
            "match_phrase":{"pr_abstract":"biological diversity"}
          }
        })
print("Retrieved documents:")
pprint.pprint(documents)

Retrieved documents:
[{'abstract': 'Sapovirus is a genus of caliciviruses that are known to cause '
              'enteric disease in humans and animals. There is considerable '
              'genetic diversity among the sapoviruses, which are classified '
              'into different genogroups based on phylogenetic analysis of the '
              'full-length capsid protein sequence. While several mammalian '
              'species, including humans, pigs, minks, and dogs, have been '
              'identified as animal hosts for sapoviruses, there were no '
              'reports of sapoviruses in bats in spite of their biological '
              'diversity. In this report, we present the results of a targeted '
              'surveillance study in different bat species in Hong Kong. Five '
              'of the 321 specimens from the bat species, Hipposideros pomona, '
              'were found to be positive for sapoviruses by RT-PCR. Complete '
              'or nearly full-leng

In [22]:
documents=search(es, es_index="covid",
    query={
          "query": {
            "match_phrase":{"abstract":"biological diversity"}
          }
        })
print("Retrieved documents:")
pprint.pprint(documents)

Retrieved documents:
[{'abstract': 'Horizontal DNA transfer is an important factor of evolution and '
              'participates in biological diversity. Unfortunately, the '
              'location and length of horizontal transfers (HTs) are known for '
              'very few species. The usage of short oligonucleotides in a '
              'sequence (the so-called genomic signature) has been shown to be '
              'species-specific even in DNA fragments as short as 1 kb. The '
              'genomic signature is therefore proposed as a tool to detect '
              'HTs. Since DNA transfers originate from species with a '
              'signature different from those of the recipient species, the '
              'analysis of local variations of signature along recipient '
              'genome may allow for detecting exogenous DNA. The strategy '
              'consists in (i) scanning the genome with a sliding window, and '
              'calculating the corresponding local

In [23]:
documents=search(es, es_index="covid",
    query={
          "query": {
            "match_all":{}
          }
        })
print("Retrieved documents:")
pprint.pprint(documents)

Retrieved documents:
[{'abstract': 'Nidovirus subgenomic mRNAs contain a leader sequence derived '
              'from the 5′ end of the genome fused to different sequences '
              '(‘bodies’) derived from the 3′ end. Their generation involves a '
              'unique mechanism of discontinuous subgenomic RNA synthesis that '
              'resembles copy-choice RNA recombination. During this process, '
              'the nascent RNA strand is transferred from one site in the '
              'template to another, during either plus or minus strand '
              'synthesis, to yield subgenomic RNA molecules. Central to this '
              'process are transcription-regulating sequences (TRSs), which '
              'are present at both template sites and ensure the fidelity of '
              'strand transfer. Here we present results of a comprehensive '
              'co-variation mutagenesis study of equine arteritis virus TRSs, '
              'demonstrating that disconti