## Migrate metadata from MongoDB to Elasticsearch

In [1]:
!pip install elasticsearch==7.6.0
!pip install pymongo



In [2]:
import os

from elasticsearch import helpers, Elasticsearch

# Initializing Elasticsearch instance.
# The endpoint (including its credentials) can be overridden through the
# ES_URL environment variable; the default is the address used so far.
# The port is already part of the URL, so no separate port argument is
# passed. The earlier capitalised `Port=80` keyword was dropped because the
# client's option is the lowercase `port`.
# NOTE(review): confirm `Port=80` was indeed being ignored by the client.
es = Elasticsearch(os.environ.get("ES_URL", "http://elastic:changeme@search.coronawhy.org:80/"))

In [3]:
# Importing required libraries
import json
import requests 
from tqdm import tqdm

In [4]:
# Connecting to Database
import os
from urllib.parse import quote_plus

from pymongo import MongoClient

# Credentials default to the values previously hard-coded here and can be
# overridden through the MONGO_USER / MONGO_PASS environment variables.
mongouser = os.environ.get('MONGO_USER', 'coronawhyguest')
mongopass = os.environ.get('MONGO_PASS', 'coro901na')
cordversion = 'v38'

# Percent-escape the credentials so that reserved characters such as '@',
# ':' or '/' in an overridden user name or password cannot break the URI.
client = MongoClient(
    "mongodb://%s:%s@mongodb.coronawhy.org" % (quote_plus(mongouser), quote_plus(mongopass))
)
db = client.get_database('cord19')

# Select the collection through `cordversion` so that changing the version
# string above is enough to point at another collection.
collection = db[cordversion]

In [5]:
# Number of documents in the collection (an estimate, as the method name says)
collection.estimated_document_count()

195711

In [12]:
# Use a single sample document to discover the field names of the collection
sample_doc = collection.find_one()
fields = [*sample_doc.keys()]

# '_id' is MongoDB's own identifier, so it is left out of the field list
fields.remove('_id')
fields

['cord_uid',
 'sha',
 'source_x',
 'title',
 'doi',
 'pmcid',
 'pubmed_id',
 'license',
 'abstract',
 'publish_time',
 'authors',
 'journal',
 'mag_id',
 'who_covidence_id',
 'arxiv_id',
 'pdf_json_files',
 'pmc_json_files',
 'url',
 's2_id']

In [13]:
# Display one sample document to inspect the values stored for each field
collection.find_one()

{'_id': ObjectId('5f19280ff73ec959ba2171ce'),
 'cord_uid': 'ug7v899j',
 'sha': 'd1aafb70c066a2068b02786f8929fd9c900897fb',
 'source_x': 'PMC',
 'title': 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia',
 'doi': '10.1186/1471-2334-1-6',
 'pmcid': 'PMC35282',
 'pubmed_id': 11472636.0,
 'license': 'no-cc',
 'abstract': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was 

In [14]:
# The metadata documents are flat (no nested structures), so every field is
# mapped as plain text.

## Preparing Elasticsearch request format
# Body for index creation: single shard, no replicas, and one "text" mapping
# per field. 'cord_uid' additionally carries an explicit "index": True flag.
request_format = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            name: ({"type": "text", "index": True} if name == 'cord_uid' else {"type": "text"})
            for name in fields
        },
    },
}

In [15]:
# Display the assembled index-creation body (settings and field mappings)
request_format

{'settings': {'number_of_shards': 1, 'number_of_replicas': 0},
 'mappings': {'properties': {'cord_uid': {'type': 'text', 'index': True},
   'sha': {'type': 'text'},
   'source_x': {'type': 'text'},
   'title': {'type': 'text'},
   'doi': {'type': 'text'},
   'pmcid': {'type': 'text'},
   'pubmed_id': {'type': 'text'},
   'license': {'type': 'text'},
   'abstract': {'type': 'text'},
   'publish_time': {'type': 'text'},
   'authors': {'type': 'text'},
   'journal': {'type': 'text'},
   'mag_id': {'type': 'text'},
   'who_covidence_id': {'type': 'text'},
   'arxiv_id': {'type': 'text'},
   'pdf_json_files': {'type': 'text'},
   'pmc_json_files': {'type': 'text'},
   'url': {'type': 'text'},
   's2_id': {'type': 'text'}}}}

In [None]:
########## Main pipeline ##########

# Name of the Elasticsearch index that receives the metadata documents
index_name = 'v38metadata'

# Delete existing index. A missing index (HTTP 404) is expected on a first run
# and is ignored; any other failure (connection, authentication, ...) is
# raised instead of being misreported as "Index does not exist".
es.indices.delete(index=index_name, ignore=[404])

# Creating a new index
es.indices.create(index=index_name, body=request_format, ignore=400)

# Pulling data from MongoDB collection
# NOTE(review): this sends one request per document; `helpers.streaming_bulk`
# (already imported) could reduce round trips if throughput becomes an issue.
failed = 0
for data in tqdm(collection.find(), total=collection.estimated_document_count()):
    # '_id' is MongoDB's identifier and is not part of the indexed metadata
    data.pop('_id')
    try:
        # Indexing document in Elasticsearch
        es.index(index=index_name, body=data)
    except Exception as e:
        # Keep going on a bad document, but report why it was rejected
        failed += 1
        print("Error occurred in this data : ", data)
        print("Reason : ", e)

# Refreshing the Elasticsearch index so the new documents become searchable
es.indices.refresh(index=index_name)

if failed:
    print("%d document(s) could not be indexed" % failed)
print("Indexing finished")

In [18]:
# List the indices on the cluster, e.g. to confirm the new index exists
header = "Available indices are : "
print(header)
index_listing = es.cat.indices()
print(index_listing)

Available indices are : 
green open .monitoring-kibana-7-2020.07.21 hZpexvaxSjC7f_9f9uhC_w 1 1     8639      0   3.8mb   1.8mb
green open .monitoring-es-7-2020.07.22     DUh81T84Qq-qQ7tXdHnyKA 1 1   242703  71380 264.5mb 132.2mb
green open .monitoring-kibana-7-2020.07.22 uVgbDmxsQamFyLCEZSTbUw 1 1     8640      0   3.8mb   1.9mb
green open .monitoring-kibana-7-2020.07.19 -pV1wDYpQ7ud8-bdqRLSSw 1 1     8639      0   3.8mb   1.9mb
green open .monitoring-es-7-2020.07.17     ZOc7oZnGSCGW5rhAaOmiMw 1 1   242729  72713 269.9mb 134.9mb
green open v9sentences                     _FmARaWMSGujONE8o3EldQ 1 1  4440895      0  42.1gb    21gb
green open mesh                            tfdzhY6OQ9W5Vx6kWwsLrQ 1 1  8775500      0   1.6gb 852.2mb
green open .monitoring-es-7-2020.07.20     kymdTKB2SyCzuOw9QzpM8Q 1 1   242730  71208 264.4mb 132.2mb
green open .monitoring-es-7-2020.07.21     nyBcoXBJQjqJux_m0OXikQ 1 1   242729  71337 265.6mb   133mb
green open grid                            mJ7sKFcHTMei2W

## Notebook as a script for metadata

The same steps as above, combined into a single script in case the whole migration needs to be executed in one go.

In [None]:
import os
import subprocess
import sys

# Install the dependencies with the pip of the interpreter running this
# script (a bare `pip` on PATH may belong to another environment).
# A failed install is reported but does not abort the script, because the
# packages may already be available.
for requirement in ('elasticsearch==7.6.0', 'pymongo'):
    if subprocess.call([sys.executable, '-m', 'pip', 'install', requirement]) != 0:
        print("Warning: could not install", requirement)

import os
from urllib.parse import quote_plus

from elasticsearch import helpers, Elasticsearch

# Initializing Elasticsearch instance.
# The endpoint (including its credentials) can be overridden through the
# ES_URL environment variable; the default is the address used so far.
# The port is already part of the URL, so no separate port argument is
# passed. The earlier capitalised `Port=80` keyword was dropped because the
# client's option is the lowercase `port`.
# NOTE(review): confirm `Port=80` was indeed being ignored by the client.
es = Elasticsearch(os.environ.get("ES_URL", "http://elastic:changeme@search.coronawhy.org:80/"))

# Importing required libraries
import json
import requests 
from tqdm import tqdm

# Connecting to Database
from pymongo import MongoClient

# Credentials default to the values previously hard-coded here and can be
# overridden through the MONGO_USER / MONGO_PASS environment variables.
mongouser = os.environ.get('MONGO_USER', 'coronawhyguest')
mongopass = os.environ.get('MONGO_PASS', 'coro901na')
cordversion = 'v38'

# Percent-escape the credentials so that reserved characters such as '@',
# ':' or '/' in an overridden user name or password cannot break the URI.
client = MongoClient(
    "mongodb://%s:%s@mongodb.coronawhy.org" % (quote_plus(mongouser), quote_plus(mongopass))
)
db = client.get_database('cord19')

# Select the collection through `cordversion` so that changing the version
# string above is enough to point at another collection.
collection = db[cordversion]

# Field names are taken from one sample document; '_id' is MongoDB's own
# identifier and is left out.
fields = list(collection.find_one().keys())
fields.remove('_id')

# The metadata documents are flat (no nested structures), so every field is
# mapped as plain text.

## Preparing Elasticsearch request format
# Body for index creation: single shard, no replicas, and one "text" mapping
# per field. 'cord_uid' additionally carries an explicit "index": True flag.
request_format = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            name: ({"type": "text", "index": True} if name == 'cord_uid' else {"type": "text"})
            for name in fields
        },
    },
}
        
########## Main pipeline ##########

# Name of the Elasticsearch index that receives the metadata documents
index_name = 'v38metadata'

# Delete existing index. A missing index (HTTP 404) is expected on a first run
# and is ignored; any other failure (connection, authentication, ...) is
# raised instead of being misreported as "Index does not exist".
es.indices.delete(index=index_name, ignore=[404])

# Creating a new index
es.indices.create(index=index_name, body=request_format, ignore=400)

# Pulling data from MongoDB collection
# NOTE(review): this sends one request per document; `helpers.streaming_bulk`
# (already imported) could reduce round trips if throughput becomes an issue.
failed = 0
for data in tqdm(collection.find(), total=collection.estimated_document_count()):
    # '_id' is MongoDB's identifier and is not part of the indexed metadata
    data.pop('_id')
    try:
        # Indexing document in Elasticsearch
        es.index(index=index_name, body=data)
    except Exception as e:
        # Keep going on a bad document, but report why it was rejected
        failed += 1
        print("Error occurred in this data : ", data)
        print("Reason : ", e)

# Refreshing the Elasticsearch index so the new documents become searchable
es.indices.refresh(index=index_name)

if failed:
    print("%d document(s) could not be indexed" % failed)
print("Indexing finished")