## Data loading

In [119]:
import json
import re
import traceback 
from pyspark.sql import SparkSession
from elasticsearch import Elasticsearch, helpers
import requests, json, os, csv
from nltk.corpus import stopwords
import string
import unidecode
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
from flashtext import KeywordProcessor
import spacy
import pytextrank
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem import WordNetLemmatizer
import pprint


In [120]:
spark = SparkSession \
    .builder \
    .appName("ElasticSpark-1") \
    .config("spark.driver.extraClassPath", "/path/elasticsearch-hadoop-7.6.2/dist/elasticsearch-spark-20_2.11-7.6.2.jar") \
    .config("spark.es.port","9200") \
    .config("spark.driver.memory", "8G") \
    .config("spark.executor.memory", "12G") \
    .getOrCreate()


In [121]:
path = 'metadata.csv'
metadata_df = spark.read.csv(path, multiLine=True, header=True)
metadata_df.show(1)

+--------+--------------------+--------+--------------------+--------------------+--------+---------+-------+--------------------+------------+--------------------+--------------+------+----------------+--------+--------------------+--------------------+--------------------+-----+
|cord_uid|                 sha|source_x|               title|                 doi|   pmcid|pubmed_id|license|            abstract|publish_time|             authors|       journal|mag_id|who_covidence_id|arxiv_id|      pdf_json_files|      pmc_json_files|                 url|s2_id|
+--------+--------------------+--------+--------------------+--------------------+--------+---------+-------+--------------------+------------+--------------------+--------------+------+----------------+--------+--------------------+--------------------+--------------------+-----+
|ug7v899j|d1aafb70c066a2068...|     PMC|Clinical features...|10.1186/1471-2334...|PMC35282| 11472636|  no-cc|OBJECTIVE: This r...|  2001-07-04|Madani, Tar

In [122]:
metadata_df = metadata_df.select("*").limit(1000)

In [123]:
metadata_df.show(3)

+--------+--------------------+--------+--------------------+--------------------+--------+---------+-------+--------------------+------------+--------------------+--------------+------+----------------+--------+--------------------+--------------------+--------------------+-----+
|cord_uid|                 sha|source_x|               title|                 doi|   pmcid|pubmed_id|license|            abstract|publish_time|             authors|       journal|mag_id|who_covidence_id|arxiv_id|      pdf_json_files|      pmc_json_files|                 url|s2_id|
+--------+--------------------+--------+--------------------+--------------------+--------+---------+-------+--------------------+------------+--------------------+--------------+------+----------------+--------+--------------------+--------------------+--------------------+-----+
|ug7v899j|d1aafb70c066a2068...|     PMC|Clinical features...|10.1186/1471-2334...|PMC35282| 11472636|  no-cc|OBJECTIVE: This r...|  2001-07-04|Madani, Tar

In [124]:
metadata_table = metadata_df.toPandas()

In [125]:
metadata_table

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,880nqc0f,ed2de3694f5580ea38f4adf24bd1c8b46862df1f,PMC,Mannose-binding lectin deficiency and acute ex...,10.2147/copd.s33714,PMC3514010,23226013,no-cc,BACKGROUND: Mannose-binding lectin is a collec...,2012-11-23,"Albert, Richard K; Connett, John; Curtis, Jeff...",Int J Chron Obstruct Pulmon Dis,,,,document_parses/pdf_json/ed2de3694f5580ea38f4a...,document_parses/pmc_json/PMC3514010.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
996,01b0vnnm,2a7c951e191425fd9fa5ac108f07a1f02eb75872,PMC,The changing phenotype of microglia from homeo...,10.1186/2047-9158-1-9,PMC3514090,23210447,cc-by,It has been nearly a century since the early d...,2012-04-24,"Luo, Xiao-Guang; Chen, Sheng-Di",Transl Neurodegener,,,,document_parses/pdf_json/2a7c951e191425fd9fa5a...,document_parses/pmc_json/PMC3514090.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
997,5b29wtim,854e623d1f875e4605b2ffd3f72599d063a56cc0,PMC,Diversity of Salmonella spp. serovars isolated...,10.1186/1746-6148-8-201,PMC3514206,23098237,cc-by,BACKGROUND: Salmonellosis in water buffalo (Bu...,2012-10-25,"Borriello, Giorgia; Lucibelli, Maria G; Pescia...",BMC Vet Res,,,,document_parses/pdf_json/854e623d1f875e4605b2f...,document_parses/pmc_json/PMC3514206.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
998,z65xxk1f,3695704554777889f8232a9ea086df70bf17ff58,PMC,Severe Childhood Malaria Syndromes Defined by ...,10.1371/journal.pone.0049778,PMC3514223,23226502,cc-by,BACKGROUND: Cerebral malaria (CM) and severe m...,2012-12-04,"Burté, Florence; Brown, Biobele J.; Orimadegun...",PLoS One,,,,document_parses/pdf_json/3695704554777889f8232...,document_parses/pmc_json/PMC3514223.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,


In [126]:
for i in range(0,len(metadata_table)):
    metadata_table.at[i,'publish_time']=str(metadata_table.at[i,'publish_time'])+" "
    metadata_table.at[i,'arxiv_id']=str(metadata_table.at[i,'arxiv_id'])+" "

In [127]:
metadata_table[:50]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
5,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998,green-oa,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,,document_parses/pdf_json/b2897e1277f56641193a6...,document_parses/pmc_json/PMC125340.xml.json,http://europepmc.org/articles/pmc125340?pdf=re...,
6,5yhe786e,3bb07ea10432f7738413dff9816809cc90f03f99,PMC,Debate: Transfusing to normal haemoglobin leve...,10.1186/cc987,PMC137267,11299062,no-cc,Recent evidence suggests that critically ill p...,2001-03-08,"Alvarez, Gonzalo; Hébert, Paul C; Szick, Sharyn",Crit Care,,,,document_parses/pdf_json/3bb07ea10432f7738413d...,document_parses/pmc_json/PMC137267.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,
7,8zchiykl,5806726a24dc91de3954001effbdffd7a82d54e2,PMC,The 21st International Symposium on Intensive ...,10.1186/cc1013,PMC137274,11353930,no-cc,The 21st International Symposium on Intensive ...,2001-05-02,"Ball, Jonathan; Venn, Richard",Crit Care,,,,document_parses/pdf_json/5806726a24dc91de39540...,document_parses/pmc_json/PMC137274.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,
8,8qnrcgnk,faaf1022ccfe93b032c5608097a53543ba24aedb,PMC,Heme oxygenase-1 and carbon monoxide in pulmon...,10.1186/1465-9921-4-7,PMC193681,12964953,no-cc,"Heme oxygenase-1 (HO-1), an inducible stress p...",2003-08-07,"Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augus...",Respir Res,,,,document_parses/pdf_json/faaf1022ccfe93b032c56...,document_parses/pmc_json/PMC193681.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,
9,jg13scgo,5b44feca5d6ffaaeb66501fa84cc6dd44d06660a,PMC,Technical Description of RODS: A Real-time Pub...,10.1197/jamia.m1345,PMC212776,12807803,bronze-oa,This report describes the design and implement...,2003-09-01,"Tsui, Fu-Chiang; Espino, Jeremy U.; Dato, Virg...",Journal of the American Medical Informatics As...,,,,document_parses/pdf_json/5b44feca5d6ffaaeb6650...,document_parses/pmc_json/PMC212776.xml.json,https://academic.oup.com/jamia/article-pdf/10/...,


In [128]:
res = requests.get('http://localhost:9200')
print (res.content)
es = Elasticsearch([{'host': 'localhost', 'port': '9200'}])

b'{\n  "name" : "DESKTOP-K0D65LT",\n  "cluster_name" : "elasticsearch",\n  "cluster_uuid" : "P8VwA_sWSl-ykHakwwPzXQ",\n  "version" : {\n    "number" : "7.10.2",\n    "build_flavor" : "default",\n    "build_type" : "zip",\n    "build_hash" : "747e1cc71def077253878a59143c1f785afa92b9",\n    "build_date" : "2021-01-13T00:42:12.435326Z",\n    "build_snapshot" : false,\n    "lucene_version" : "8.7.0",\n    "minimum_wire_compatibility_version" : "6.8.0",\n    "minimum_index_compatibility_version" : "6.0.0-beta1"\n  },\n  "tagline" : "You Know, for Search"\n}\n'


In [129]:
mapping = {
    'settings':{
        'number_of_shards': 1, 
        'number_of_replicas': 1
    },
    'mappings': {
        'properties': {
            'cord_uid': {
                'index': 'false', 
                'type': 'text'
            },
            'sha': {
                'index': 'true', 
                'type': 'text'
            },
            'source_x': {
                'index': 'true', 
                'type': 'text'
            },
            'title': {
                'index': 'true',
                'type': 'text', 
                'similarity': 'BM25'
            },
            'doi': {
                'index': 'true', 
                'type': 'text'
            },
            'pmcid': {
                'index': 'true',
                'type': 'text'
            },
            'license': {
                'index': 'true',
                'type': 'text'
            },
            'abstract': {
                'index': 'true',
                'type': 'text',
                'similarity': 'BM25'
            },
            'publish_time': {
                'index': 'true', 
                'type': 'text'
            },
            'authors': {
                'index': 'true',
                'type': 'text'
            },
            'journal': {
                'index': 'true',
                'type': 'text'
            },
            'who_covidence_id': {
                'index': 'true',
                'type': 'text'
            },
            'arxiv_id': {
                'index': 'true',
                'type': 'text'
            },
            'pdf_json_files': {
                'index': 'true',
                'type': 'text'
            },
            'pmc_json_files': {
                'index': 'true', 
                'type': 'text'
            },
            'url': {
                'index': 'true',
                'type': 'text'
            },
            's2_id': {
                'index': 'true', 
                'type': 'text'
            }
         }
     }
}

In [130]:
if es.indices.exists("covid"):
    es.indices.delete("covid") 
    
es.indices.create(index="covid",body=mapping)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'covid'}

## Sentence Splitting, Tokenization and Normalization

In [131]:
!pip install unidecode



In [132]:
!pip install gensim



In [133]:
class TextNormalizer:
    def __init__(self):
        self.punctuation_table = str.maketrans('','',string.punctuation)

    def normalize_text(self,text):
        if text==None:
            return None
        try: 
            normalized_sentences = []
            text = re.sub(' +',' ', text)
            text = unidecode.unidecode(text)
            text = text.lower()
            sentences = sent_tokenize(text)
        except:
            print("ERROR:", text)
            traceback.print_exc()
            return None
        
        for sentence in sentences:
            #remove punctuation
            sentence=re.sub("["+string.punctuation+"\d*]"," ",sentence)
            #strip leading/trailing whitespace
            sentence = sentence.strip()
            words = word_tokenize(sentence)
            new_sentence = ' '.join(words)
            normalized_sentences.append(new_sentence)
        return normalized_sentences

In [134]:
normaliser = TextNormalizer()

In [135]:
table_to_process=metadata_table[["title","abstract"]]

In [136]:
table_to_process

Unnamed: 0,title,abstract
0,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...
1,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...
2,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...
3,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...
4,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...
...,...,...
995,Mannose-binding lectin deficiency and acute ex...,BACKGROUND: Mannose-binding lectin is a collec...
996,The changing phenotype of microglia from homeo...,It has been nearly a century since the early d...
997,Diversity of Salmonella spp. serovars isolated...,BACKGROUND: Salmonellosis in water buffalo (Bu...
998,Severe Childhood Malaria Syndromes Defined by ...,BACKGROUND: Cerebral malaria (CM) and severe m...


In [137]:
table_to_process["title"]=table_to_process["title"].apply(lambda x: normaliser.normalize_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [138]:
table_to_process["abstract"]=table_to_process["abstract"].apply(lambda x: normaliser.normalize_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [139]:
table_to_process

Unnamed: 0,title,abstract
0,[clinical features of culture proven mycoplasm...,[objective this retrospective chart review des...
1,[nitric oxide a pro inflammatory mediator in l...,[inflammatory diseases of the respiratory trac...
2,[surfactant protein d and pulmonary host defense],[surfactant protein d sp d participates in the...
3,[role of endothelin in lung disease],[endothelin et is a amino acid peptide with di...
4,[gene expression in epithelial cells in respon...,[respiratory syncytial virus rsv and pneumonia...
...,...,...
995,[mannose binding lectin deficiency and acute e...,[background mannose binding lectin is a collec...
996,[the changing phenotype of microglia from home...,[it has been nearly a century since the early ...
997,"[diversity of salmonella spp, serovars isolate...",[background salmonellosis in water buffalo bub...
998,[severe childhood malaria syndromes defined by...,[background cerebral malaria cm and severe mal...


## Selecting key words

In [140]:
table_to_process

Unnamed: 0,title,abstract
0,[clinical features of culture proven mycoplasm...,[objective this retrospective chart review des...
1,[nitric oxide a pro inflammatory mediator in l...,[inflammatory diseases of the respiratory trac...
2,[surfactant protein d and pulmonary host defense],[surfactant protein d sp d participates in the...
3,[role of endothelin in lung disease],[endothelin et is a amino acid peptide with di...
4,[gene expression in epithelial cells in respon...,[respiratory syncytial virus rsv and pneumonia...
...,...,...
995,[mannose binding lectin deficiency and acute e...,[background mannose binding lectin is a collec...
996,[the changing phenotype of microglia from home...,[it has been nearly a century since the early ...
997,"[diversity of salmonella spp, serovars isolate...",[background salmonellosis in water buffalo bub...
998,[severe childhood malaria syndromes defined by...,[background cerebral malaria cm and severe mal...


In [141]:
table_to_process.loc[0,"abstract"]

['objective this retrospective chart review describes the epidemiology and clinical features of patients with culture proven mycoplasma pneumoniae infections at king abdulaziz university hospital jeddah saudi arabia',
 'methods patients with positive m pneumoniae cultures from respiratory specimens from january through december were identified through the microbiology records',
 'charts of patients were reviewed',
 'results patients were identified of whom required admission',
 'most infections were community acquired',
 'the infection affected all age groups but was most common in infants and pre school children',
 'it occurred year round but was most common in the fall and spring',
 'more than three quarters of patients had comorbidities',
 'twenty four isolates were associated with pneumonia with upper respiratory tract infections and with bronchiolitis',
 'cough fever and malaise were the most common symptoms and crepitations and wheezes were the most common signs',
 'most patients

In [142]:
s_top_words = set(stopwords.words('english'))

In [143]:
def remove_stop_words(text):
    if text==None:
        return
    for index,sentence in enumerate(text):
        sentence = sentence.split(" ")
        sentence = [word for word in sentence if word not in s_top_words and len(word)>2]
        sentence=" ".join(sentence)
        text[index]=sentence
    return text
    

In [144]:
table_to_process["title"]=table_to_process["title"].apply(lambda x: remove_stop_words(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [145]:
table_to_process["abstract"]=table_to_process["abstract"].apply(lambda x: remove_stop_words(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [146]:
table_to_process

Unnamed: 0,title,abstract
0,[clinical features culture proven mycoplasma p...,[objective retrospective chart review describe...
1,[nitric oxide pro inflammatory mediator lung d...,[inflammatory diseases respiratory tract commo...
2,[surfactant protein pulmonary host defense],[surfactant protein participates innate respon...
3,[role endothelin lung disease],[endothelin amino acid peptide diverse biologi...
4,[gene expression epithelial cells response pne...,[respiratory syncytial virus rsv pneumonia vir...
...,...,...
995,[mannose binding lectin deficiency acute exace...,[background mannose binding lectin collectin i...
996,[changing phenotype microglia homeostasis dise...,[nearly century since early description microg...
997,"[diversity salmonella spp, serovars isolated i...",[background salmonellosis water buffalo bubalu...
998,[severe childhood malaria syndromes defined pl...,[background cerebral malaria severe malarial a...


In [147]:
def get_words_corpus(table):
    words_corpus=[]
    for i in range(0, len(table)):
        row=table.loc[i]
        title_sentences = row["title"]
        abstract_sentences = row["abstract"]
        
        if title_sentences!=None:
            for i in range(0,len(title_sentences)):
                words_corpus.extend(title_sentences[i].split())
                
        if  abstract_sentences!=None:
            for i in range(0,len(abstract_sentences)):
                words_corpus.extend(abstract_sentences[i].split())
    return words_corpus
        

In [148]:
words_corpus=get_words_corpus(table_to_process)

In [149]:
len(words_corpus)

133627

In [150]:
dist = nltk.FreqDist(words_corpus) #Creating a distribution of words' frequencies
grams=dist.most_common(1000) #Obtaining the most frequent words
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [151]:
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(words_corpus)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(words_corpus)

In [152]:
print("Showing first",2000,"top-freqent words")
grams = pd.DataFrame(grams) #Building data table to represent selected by POS tagger word features 
grams.index = range(1,len(grams)+1)
grams.columns = ["Word", "Frequency"]
grams

Showing first 2000 top-freqent words


Unnamed: 0,Word,Frequency
1,virus,985
2,influenza,945
3,infection,740
4,cells,711
5,patients,587
...,...,...
996,find,28
997,great,28
998,duration,28
999,initiation,28


In [153]:
bigramFinder.apply_freq_filter(7)

In [154]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [155]:
bigramPMITable

Unnamed: 0,bigram,PMI
0,"(rift, valley)",13.857927
1,"(endoplasmic, reticulum)",13.705924
2,"(bronchoalveolar, lavage)",13.327412
3,"(coronary, artery)",13.327412
4,"(vesicular, stomatitis)",13.327412
...,...,...
863,"(cells, cells)",1.665403
864,"(disease, virus)",1.581372
865,"(infection, results)",1.548103
866,"(pandemic, virus)",1.325173


In [156]:
trigramFinder.apply_freq_filter(5)

In [157]:
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [158]:
len(trigramPMITable)

224

In [159]:
trigramPMITable

Unnamed: 0,trigram,PMI
0,"(mov, avg, cusum)",29.411848
1,"(cortex, magnoliae, officinalis)",29.148813
2,"(systemic, lupus, erythematosus)",25.663387
3,"(autoimmune, hemolytic, anemia)",25.052889
4,"(methicillin, resistant, staphylococcus)",24.947180
...,...,...
219,"(influenza, virus, replication)",9.049636
220,"(influenza, virus, infections)",8.849042
221,"(pandemic, influenza, infection)",8.659061
222,"(response, influenza, virus)",8.532673


In [160]:
bigramPMITable["bigram"]=bigramPMITable["bigram"].apply(lambda x: ' '.join(x))

In [161]:
bigramPMITable

Unnamed: 0,bigram,PMI
0,rift valley,13.857927
1,endoplasmic reticulum,13.705924
2,bronchoalveolar lavage,13.327412
3,coronary artery,13.327412
4,vesicular stomatitis,13.327412
...,...,...
863,cells cells,1.665403
864,disease virus,1.581372
865,infection results,1.548103
866,pandemic virus,1.325173


In [162]:
trigramPMITable["trigram"]=trigramPMITable["trigram"].apply(lambda x: ' '.join(x))

In [163]:
trigramPMITable

Unnamed: 0,trigram,PMI
0,mov avg cusum,29.411848
1,cortex magnoliae officinalis,29.148813
2,systemic lupus erythematosus,25.663387
3,autoimmune hemolytic anemia,25.052889
4,methicillin resistant staphylococcus,24.947180
...,...,...
219,influenza virus replication,9.049636
220,influenza virus infections,8.849042
221,pandemic influenza infection,8.659061
222,response influenza virus,8.532673


In [164]:
gram_dict=grams.set_index('Word').T.to_dict('list')

In [165]:
bigramPMIDict=bigramPMITable.set_index('bigram').T.to_dict('list')


In [166]:
trigramPMIDict=trigramPMITable.set_index('trigram').T.to_dict('list')

In [167]:
!pip install flashtext



In [168]:
keyword_processor = KeywordProcessor()

In [169]:
textrank_keyword_processor = KeywordProcessor()

In [170]:
gram_dict.update(bigramPMIDict)

In [171]:
bigramPMIDict.update(trigramPMIDict)

In [172]:
gram_dict

{'virus': [985],
 'influenza': [945],
 'infection': [740],
 'cells': [711],
 'patients': [587],
 'human': [558],
 'disease': [544],
 'viral': [525],
 'cell': [509],
 'results': [494],
 'protein': [484],
 'study': [441],
 'using': [437],
 'data': [389],
 'pandemic': [379],
 'viruses': [376],
 'also': [366],
 'health': [365],
 'associated': [351],
 'may': [351],
 'expression': [332],
 'analysis': [322],
 'used': [322],
 'specific': [320],
 'high': [317],
 'based': [309],
 'respiratory': [305],
 'host': [299],
 'proteins': [297],
 'gene': [296],
 'background': [294],
 'rna': [288],
 'methods': [287],
 'two': [282],
 'clinical': [280],
 'time': [273],
 'response': [259],
 'new': [259],
 'model': [259],
 'hiv': [254],
 'different': [249],
 'activity': [247],
 'role': [245],
 'immune': [239],
 'risk': [237],
 'however': [236],
 'one': [235],
 'studies': [234],
 'genes': [229],
 'acute': [228],
 'system': [228],
 'non': [226],
 'novel': [223],
 'infected': [223],
 'lung': [221],
 'type': [221

In [173]:
text_rank_key_words=dict()

In [174]:
def get_keywords_by_textrank(sentences):
    if sentences==None:
        return None
    keywords=dict()
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe("textrank", last=True)
    doc = nlp(" ".join(sentences))

    # examine the top-ranked phrases in the document

    for p in doc._.phrases:
        if p.rank>=0.05:
            keywords[p.text]=p.rank
    #         print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
    #         print(p.text)
    return keywords

In [175]:
 for i in range(0, len(table_to_process)):
        sentences=table_to_process.loc[i,"abstract"]
        if sentences!=None:
            keywords=get_keywords_by_textrank(sentences)
            if keywords!=None:
                text_rank_key_words.update(keywords)
                print("Text",i,"- Done")


Text 0 - Done
Text 1 - Done
Text 2 - Done
Text 3 - Done
Text 4 - Done
Text 5 - Done
Text 6 - Done
Text 7 - Done
Text 8 - Done
Text 9 - Done
Text 10 - Done
Text 11 - Done
Text 12 - Done
Text 13 - Done
Text 15 - Done
Text 16 - Done
Text 17 - Done
Text 18 - Done
Text 19 - Done
Text 20 - Done
Text 21 - Done
Text 22 - Done
Text 23 - Done
Text 24 - Done
Text 25 - Done
Text 26 - Done
Text 27 - Done
Text 28 - Done
Text 29 - Done
Text 30 - Done
Text 31 - Done
Text 32 - Done
Text 33 - Done
Text 34 - Done
Text 35 - Done
Text 36 - Done
Text 37 - Done
Text 38 - Done
Text 39 - Done
Text 40 - Done
Text 41 - Done
Text 42 - Done
Text 43 - Done
Text 44 - Done
Text 45 - Done
Text 46 - Done
Text 47 - Done
Text 48 - Done
Text 49 - Done
Text 50 - Done
Text 57 - Done
Text 58 - Done
Text 59 - Done
Text 60 - Done
Text 61 - Done
Text 62 - Done
Text 64 - Done
Text 65 - Done
Text 66 - Done
Text 67 - Done
Text 68 - Done
Text 69 - Done
Text 70 - Done
Text 71 - Done
Text 72 - Done
Text 73 - Done
Text 74 - Done
Text 

Text 542 - Done
Text 543 - Done
Text 544 - Done
Text 545 - Done
Text 546 - Done
Text 547 - Done
Text 548 - Done
Text 549 - Done
Text 550 - Done
Text 551 - Done
Text 552 - Done
Text 553 - Done
Text 554 - Done
Text 555 - Done
Text 556 - Done
Text 557 - Done
Text 558 - Done
Text 559 - Done
Text 560 - Done
Text 561 - Done
Text 562 - Done
Text 563 - Done
Text 564 - Done
Text 566 - Done
Text 567 - Done
Text 568 - Done
Text 569 - Done
Text 571 - Done
Text 572 - Done
Text 573 - Done
Text 574 - Done
Text 575 - Done
Text 576 - Done
Text 577 - Done
Text 578 - Done
Text 579 - Done
Text 580 - Done
Text 581 - Done
Text 582 - Done
Text 583 - Done
Text 584 - Done
Text 585 - Done
Text 586 - Done
Text 587 - Done
Text 588 - Done
Text 589 - Done
Text 590 - Done
Text 591 - Done
Text 592 - Done
Text 593 - Done
Text 594 - Done
Text 595 - Done
Text 596 - Done
Text 597 - Done
Text 598 - Done
Text 599 - Done
Text 600 - Done
Text 601 - Done
Text 602 - Done
Text 603 - Done
Text 604 - Done
Text 605 - Done
Text 606

In [176]:
for keyword in gram_dict.keys():
    parts=keyword.split()
    parts="_".join(parts)
    keyword_processor.add_keyword(keyword,parts)

In [177]:
for keyword in text_rank_key_words.keys():
    parts=keyword.split()
    parts="_".join(parts)
    textrank_keyword_processor.add_keyword(keyword,parts)

In [178]:
len(keyword_processor.get_all_keywords())

1868

In [179]:
len(textrank_keyword_processor.get_all_keywords())

19754

In [180]:
len(text_rank_key_words)

19754

In [181]:
def extract_keywords(text,keyword_processor):
    sentences=[]
    if text==None:
        return None
    for i in range(0, len(text)):
        keywords_found = keyword_processor.extract_keywords(text[i])
        sentences.append(" ".join(keywords_found))
    return sentences
    

In [182]:
def merge_two_keywords_methods(sentences, text_rank_key_word_processor, frequent_key_words_processor):
    if sentences==None:
        return None
    text_rank_version = extract_keywords(sentences,text_rank_key_word_processor)
    frequent_key_words_version = extract_keywords(sentences,frequent_key_words_processor)
    intersect = set(frequent_key_words_version) - set(text_rank_version)

    merged_version = text_rank_version + list(intersect)
    return merged_version

In [183]:
table_to_process["abstract"]=table_to_process["abstract"].apply(lambda x: merge_two_keywords_methods(x, textrank_keyword_processor, keyword_processor))     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [184]:
for i in range(0, len(table_to_process)):
    metadata_table.loc[i,"title"] = table_to_process.loc[i,"title"]
    metadata_table.loc[i,"abstract"] = table_to_process.loc[i,"abstract"]

In [185]:
metadata_table

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,[clinical features culture proven mycoplasma p...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,[objective_retrospective_chart_review epidemio...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,[nitric oxide pro inflammatory mediator lung d...,10.1186/rr14,PMC59543,11667967,no-cc,[inflammatory_diseases respiratory_tract commo...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,[surfactant protein pulmonary host defense],10.1186/rr19,PMC59549,11667972,no-cc,[surfactant_protein_participates response micr...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,[role endothelin lung disease],10.1186/rr44,PMC59574,11686871,no-cc,[endothelin_amino_acid_peptide_diverse_biologi...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,[gene expression epithelial cells response pne...,10.1186/rr61,PMC59580,11686888,no-cc,[respiratory_syncytial_virus_rsv_pneumonia_vir...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,880nqc0f,ed2de3694f5580ea38f4adf24bd1c8b46862df1f,PMC,[mannose binding lectin deficiency acute exace...,10.2147/copd.s33714,PMC3514010,23226013,no-cc,[background_mannose_binding_lectin_collectin h...,2012-11-23,"Albert, Richard K; Connett, John; Curtis, Jeff...",Int J Chron Obstruct Pulmon Dis,,,,document_parses/pdf_json/ed2de3694f5580ea38f4a...,document_parses/pmc_json/PMC3514010.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
996,01b0vnnm,2a7c951e191425fd9fa5ac108f07a1f02eb75872,PMC,[changing phenotype microglia homeostasis dise...,10.1186/2047-9158-1-9,PMC3514090,23210447,cc-by,[century early_description_microglia_rio many_...,2012-04-24,"Luo, Xiao-Guang; Chen, Sheng-Di",Transl Neurodegener,,,,document_parses/pdf_json/2a7c951e191425fd9fa5a...,document_parses/pmc_json/PMC3514090.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
997,5b29wtim,854e623d1f875e4605b2ffd3f72599d063a56cc0,PMC,"[diversity salmonella spp, serovars isolated i...",10.1186/1746-6148-8-201,PMC3514206,23098237,cc-by,[background_salmonellosis_water_buffalo_bubalu...,2012-10-25,"Borriello, Giorgia; Lucibelli, Maria G; Pescia...",BMC Vet Res,,,,document_parses/pdf_json/854e623d1f875e4605b2f...,document_parses/pmc_json/PMC3514206.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
998,z65xxk1f,3695704554777889f8232a9ea086df70bf17ff58,PMC,[severe childhood malaria syndromes defined pl...,10.1371/journal.pone.0049778,PMC3514223,23226502,cc-by,[background_cerebral_malaria serious_life clin...,2012-12-04,"Burté, Florence; Brown, Biobele J.; Orimadegun...",PLoS One,,,,document_parses/pdf_json/3695704554777889f8232...,document_parses/pmc_json/PMC3514223.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,


In [186]:
dataset = []
for i in range(0, len(table_to_process["abstract"])):
    sentences = table_to_process.loc[i,"abstract"]
    if sentences!=None:
        sentences=" ".join(sentences)
        dataset.append(sentences)
        
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(dataset)

In [187]:
def retain_best_tf_idf_keywords(sentences, index):
    if sentences==None:
        return None
    tf_idf_keyword_processor = KeywordProcessor()
    df = pd.DataFrame(tfIdf[index].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF_IDF"])
    df = df.sort_values('TF_IDF', ascending=False)
    df = df[df.TF_IDF>0.09]
    tf_idf_dict=df.T.to_dict('list')    
    for keyword in tf_idf_dict.keys():
        parts = " ".join(keyword.split("_"))
        tf_idf_keyword_processor.add_keyword(keyword,parts)
    sentences = extract_keywords(sentences,tf_idf_keyword_processor)
    return sentences

In [188]:
index=0
for i in range(0,len(metadata_table)):
    if table_to_process.loc[i,"abstract"]==None:
        continue
    metadata_table.loc[i,"abstract"]=retain_best_tf_idf_keywords(table_to_process.loc[i,"abstract"], index)
    index+=1

## Stemming or Morphological Analysis (Lemmatisation) 

In [189]:
def lematise_text(sentences):
    if sentences==None:
        return None
    lemmatizer = WordNetLemmatizer()
    for i in range(0, len(sentences)):
        try:
            if sentences[i] == "":
                continue
            words=sentences[i].split()
            lematised_words = [lemmatizer.lemmatize(word) for word in words]
            lematised_words = ' '.join(lematised_words)
            sentences[i]=lematised_words
        except:
            print(sentences)
            print(sentences[i])
            break
    return sentences


In [190]:
metadata_table["abstract"]=metadata_table["abstract"].apply(lambda x: lematise_text(x))

## Indexing

In [191]:
for i in range(0,len(metadata_table)):
    metadata_table.iloc[i].to_json('covid.json')
    f = open('covid.json')
    docket_content = f.read()
    row=json.loads(docket_content)
    try:
        es.index(index='covid', id=i, body=row)
    except:
        traceback.print_exc() 
        print("Error:", "row #"+str(i))

## Searching

In [192]:
query={
  "query": {
    "match_phrase":{"publish_time":"2000-08-15"}
  }
}

In [193]:
res = es.search(index="covid", body=query)

In [194]:
documents=[]
for i in range(0, len(res['hits']['hits'])):
    doc=res['hits']['hits'][i]['_source']
    documents.append(doc)

In [195]:
pprint.pprint(documents)

[{'abstract': ['inflammatory disease respiratory tract commonly associated '
               'elevated production nitric oxide index dependent oxidative '
               'stress',
               'property various line evidence support contribution lung '
               'injury several disease model',
               'basis biochemical evidence dependent oxidation formation '
               'alternative mechanism phagocyte heme protein eosinophil '
               'peroxidase condition inflammation',
               'overwhelming literature generation activity respiratory tract '
               'scope commentary review area',
               'recent evidence concept presumed contribution inflammatory '
               'disease lung',
               'literature generation respiratory tract area',
               'inflammatory disease respiratory tract commonly nitric oxide '
               'dependent oxidative stress',
               'basis evidence dependent formation although alternative '
  

In [196]:
# es.indices.create(index = 'covid', body = request_body)

1. Зберегти рядок у формат джісон
2. Відкрити інший інший файл у форматі джісон
3. Присвоїти файл до елементу рядка
4. Додати рядок у список рядків
5. Зберегти все в еластіксерч

In [197]:
# i=0

# file_link=table.loc[i,link_collumn]
# try:
#     if file_link==None:
#         print("ERROR:", i, "-", file_link)
#         table.loc[i,link_collumn]="Undefined"     
#     else:
#         res=re.findall('document_parses/pdf_json/\w+\.json|document_parses/pmc_json/\w+\.xml.json',file_link)
#         if len(res)>0:
#             file_link=res[0]
#         else:
#             print("ERROR:", i, "-", file_link)
#             table.loc[i,link_collumn]="Undefined"

#     f = open(file_link)
#     file_content = f.read()

#     table.iloc[i].to_json('row.json')
#     f = open('row.json')
#     row_content = f.read()
#     row_json_content=json.loads(row_content)
#     row_json_content[link_collumn]=file_content

# except:
#     print("ERROR:", i, "-", file_link)
#     traceback.print_exc()
#     table.loc[i,link_collumn]="Undefined"

In [198]:
# def load_article_files(table, link_collumn='pdf_json_files', filename='covid.json', file_link_start_part='document_parses/pdf_json/'):
#     for i in range(0,len(table)):
#         file_link=table.loc[i,link_collumn]
#         try:
#             if file_link==None:
#                 print("ERROR:", i, "-", file_link)
#                 table.loc[i,link_collumn]="Undefined"
#                 continue
#             else:
#                 res=re.findall('document_parses/pdf_json/\w+\.json|document_parses/pmc_json/\w+\.xml.json',file_link)
#                 if len(res)>0:
#                     file_link=res[0]
#                 else:
#                     print("ERROR:", i, "-", file_link)
#                     table.loc[i,link_collumn]="Undefined"
#                     continue
                    
#             f = open(file_link)
#             file_content = f.read()
            
#             table.iloc[i].to_json('row.json')
#             f = open('row.json')
#             row_content = f.read()
#             row_json_content=json.loads(row_content)
#             table.loc[i,link_collumn]=file_content
            
#         except:
#             print("ERROR:", i, "-", file_link)
#             traceback.print_exc()
#             table.loc[i,link_collumn]="Undefined"
#     return table
    

In [199]:
# load_article_files(metadata_table,link_collumn='pdf_json_files', filename='covid.json').head(2)

In [200]:
# load_article_files(metadata_table,link_collumn='pmc_json_files', filename='covid.json',file_link_start_part='document_parses/pmc_json/')

In [201]:
# directory = 'covid.json'

In [202]:
# for i in range(0,1):
# #     pdf_link=metadata_table.loc[i,'pdf_json_files']
# #     pmc_json=metadata_table.loc[i,'pmc_json_files']

# #     pdf_json = json.loads(metadata_table.loc[i,'pdf_json_files']) if pdf_link!="Undefined" else pdf_link
# #     pmc_json = json.loads(metadata_table.loc[i,'pmc_json_files']) if pmc_json!="Undefined" else pmc_json
#     metadata_table.iloc[i].to_json('covid.json')
#     f = open(directory)
#     docket_content = f.read()
#     row=json.loads(docket_content)
# #     row['pdf_json_files']=pdf_json
# #     row['pmc_json_files']=pmc_json
#     try:
#         es.index(index='covid', doc_type='docket', id=i, body=row)
#     except:
#         traceback.print_exc() 
#         print("Error:", "row #"+str(i))
#         continue