CSCI E-599 Management of DynamoDB database called `demographics`
=====


---


![xkcd_penguins](https://imgs.xkcd.com/comics/march_of_the_penguins.png)


<a id='back-to-top'></a>

### [*Creation of the database*](#creation)
- [Creation of the dynamodb table called `demographics`](#create_table)
- [Batch populate the table with `batch_write_item`](#batch_from_web)   
- [Populate the table from JSON](#populate_from_json)  



### [*Manage the database*](#management)
- [Inquire about the table](#describe_table)
- [Update with new attributes](#update_attributes)
- [Put item into the table](#put_item)
- [Query the total number of items](#query_number_items)
- [Update the secondary index](#update_table_index)
- [Delete the table](#delete_table)


### [*Metadata table for `demographics`*](#metadata)
- [Creation of the metadata table called `demographics_meta`](#create_meta)
- [Populate the table with new metadata items](#populate_meta)   
- [Query the attributes](#query_meta)
- [Delete the metadata table](#delete_meta)


-------

### Preliminaries - libraries to load

In [19]:
import os
import sys
os.environ["TZ"]="UTC"
import time
import boto3

from boto3 import resource
from boto3.dynamodb.conditions import Key

from datetime import datetime
import bs4 as bs
import xml.etree.ElementTree as ET
from lxml import etree
from lxml import html
from unidecode import unidecode
from lxml.etree import tostring
import json
import decimal
import glob
from collections import Counter

import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
## Markdown CSS
from IPython.core.display import HTML
HTML("""
<style>

div.cell { 
    margin-top:1em;
    margin-bottom:1em;
}

div.text_cell_render h1 {
    font-size: 1.8em;
    line-height:1.2em;
    text-align:center;
}

div.text_cell_render h2 {
margin-bottom: -0.2em;
}

table tbody tr td:first-child, 
table tbody tr th:first-child, 
table thead tr th:first-child, 
table tbody tr td:nth-child(4), 
table thead tr th:nth-child(4) {
    background-color: #edf4e8;
}

div.text_cell_render { 
    font-family: 'Garamond';
    font-size:1.4em;
    line-height:1.3em;
    padding-left:3em;
    padding-right:3em;
}

div#notebook-container    { width: 95%; }
div#menubar-container     { width: 65%; }
div#maintoolbar-container { width: 99%; }

</style>
""")

[back-to-top](#back-to-top)
<a id='creation'></a>

*Creation of the database*  
=====


<a id='create_table'></a>


Create the dynamodb table called `demographics`
----


In [9]:
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')

try:
    table = dynamodb_client.create_table(
        TableName='demographics_alt',
        KeySchema=[
            {
                'AttributeName': 'pmcid', 
                'KeyType': 'HASH'
            }
        ], 
        AttributeDefinitions=[
            {
                'AttributeName': 'pmcid', 
                'AttributeType': 'S'
            }
        ], 
        ProvisionedThroughput={
            'ReadCapacityUnits': 50, 
            'WriteCapacityUnits': 50
        },
        StreamSpecification={
        'StreamEnabled': True,
        'StreamViewType': 'NEW_AND_OLD_IMAGES'
        }
    )

    dynamodb_client.get_waiter('table_exists').wait(TableName='demographics')
    print("Table status:",  table['TableDescription']['TableStatus'])
    print("Item count:", table['TableDescription']['ItemCount'])
    
except dynamodb_client.exceptions.ResourceInUseException:
    print("Table in use error - do you really want to recreate the table?")
    pass
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

Table status: CREATING
Item count: 0


In [65]:
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')

try:
    table = dynamodb_client.create_table(
        TableName='demographics',
        KeySchema=[
            {
                'AttributeName': 'pmcid', 
                'KeyType': 'HASH'
            },
            {
                'AttributeName': 'pmid',
                'KeyType': 'RANGE' 
            }
        ], 
        AttributeDefinitions=[
            {
                'AttributeName': 'pmcid', 
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'pmid',
                'AttributeType': 'S'
            },

        ], 
        ProvisionedThroughput={
            'ReadCapacityUnits': 10, 
            'WriteCapacityUnits': 10
        },
        StreamSpecification={
        'StreamEnabled': True,
        'StreamViewType': 'NEW_AND_OLD_IMAGES'
        }
    )

    dynamodb_client.get_waiter('table_exists').wait(TableName='demographics')
    print("Table status:",  table['TableDescription']['TableStatus'])
    print("Item count:", table['TableDescription']['ItemCount'])
    
except dynamodb_client.exceptions.ResourceInUseException:
    print("Table in use error - do you really want to recreate the table?")
    pass
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

Table status: CREATING
Item count: 0


[back-to-top](#back-to-top)
<a id='batch_from_web'></a>


Batch populate the table with `batch_write_item`
------

----


### `batch_write_item` from the XML files in a folder

In [None]:
item_count = 0
itemset_array = []
now = datetime.now().strftime("%Y-%m-%d")
file_path = '/home/dave/datapubmed/targetarticles/'

client = boto3.client('dynamodb')

file_list = glob.glob(file_path + 'PMC*.nxml')
#choice = np.random.randint(10000, size=1000)
for f in range(len(file_list)): 
#for f in choice: 
    item_count += 1
    tree = read_xml(file_list[f])
    if tree is not None:
        dict_article_meta = parse_article_meta(tree)
        pmid = dict_article_meta['pmid'];# print (pmid)
        if (pmid == ''):
            pmid = 'PMID missing'
        pmc = dict_article_meta['pmc']; #print (pmc)
        
        tree_title = tree.find('//title-group/article-title')
        if tree_title is not None:
            title = [t for t in tree_title.itertext()]
            sub_title = tree.xpath('//title-group/subtitle/text()')
            title.extend(sub_title)
            title = [t.replace('\n', ' ').replace('\t', ' ') for t in title]
            if len(title):
                full_title = ' '.join(title)
            else:
                full_title = 'Title missing'
        else:
            full_title = 'Title missing'
            
        item_dict = {
            "PutRequest": {
                "Item": {
                    "pmcid": {"S": pmc},
                    "pmid": {"S": pmid},
                    "date_processed": {"S": now},
                    "title": {"S": full_title},
                }
            }
        }

        itemset_array.append(item_dict)
        
    if (item_count % 25 == 0):
        response = client.batch_write_item(RequestItems={ "demographics": itemset_array})
        itemset_array = []
    #if (item_count % 10000 == 0):
    #    print ("The item_count is: %s " % item_count)

### Create a JSON file with the same `batch_write_item` input

In [14]:
file_path = '/home/dave/datapubmed/targetarticles/'
file_list = []
#for pmcid in list_of_new_pmcids:
for pmcid in missing:
    file_list.append(*glob.glob(file_path + 'PMC'+ pmcid +'.nxml'))
file_list[:20]

['/home/dave/datapubmed/targetarticles/PMC4944317.nxml',
 '/home/dave/datapubmed/targetarticles/PMC4785788.nxml',
 '/home/dave/datapubmed/targetarticles/PMC4760072.nxml',
 '/home/dave/datapubmed/targetarticles/PMC4828621.nxml',
 '/home/dave/datapubmed/targetarticles/PMC4976526.nxml',
 '/home/dave/datapubmed/targetarticles/PMC2228184.nxml',
 '/home/dave/datapubmed/targetarticles/PMC3008007.nxml',
 '/home/dave/datapubmed/targetarticles/PMC2775678.nxml',
 '/home/dave/datapubmed/targetarticles/PMC5643429.nxml',
 '/home/dave/datapubmed/targetarticles/PMC4633648.nxml',
 '/home/dave/datapubmed/targetarticles/PMC4754267.nxml',
 '/home/dave/datapubmed/targetarticles/PMC5504832.nxml',
 '/home/dave/datapubmed/targetarticles/PMC3585216.nxml',
 '/home/dave/datapubmed/targetarticles/PMC5779605.nxml',
 '/home/dave/datapubmed/targetarticles/PMC5743172.nxml',
 '/home/dave/datapubmed/targetarticles/PMC5618110.nxml',
 '/home/dave/datapubmed/targetarticles/PMC4262386.nxml',
 '/home/dave/datapubmed/targeta

In [21]:
def read_xml(path):
    """
    Parse tree from given XML path
    """
    try:
        tree = etree.parse(path)
    except:
        try:
            tree = etree.fromstring(path)
        except Exception as e:
            print("Error: it was not able to read a path, a file-like object, or a string as an XML")
            raise
    if '.nxml' in path:
        remove_namespace(tree) # strip namespace for
    return tree

def remove_namespace(tree):
    """
    Strip namespace from parsed XML
    """
    for node in tree.iter():
        try:
            has_namespace = node.tag.startswith('{')
        except AttributeError:
            continue  # node.tag is not a string (node is a comment or similar)
        if has_namespace:
            node.tag = node.tag.split('}', 1)[1]
            
def parse_article_meta(tree):
    """
    Parse PMID, PMC and DOI from given article tree
    """
    article_meta = tree.find('.//article-meta')
    pmid_node = article_meta.find('article-id[@pub-id-type="pmid"]')
    pmc_node = article_meta.find('article-id[@pub-id-type="pmc"]')
    pub_id_node = article_meta.find('article-id[@pub-id-type="publisher-id"]')
    doi_node = article_meta.find('article-id[@pub-id-type="doi"]')

    pmid = pmid_node.text if pmid_node is not None else ''
    pmc = pmc_node.text if pmc_node is not None else ''
    pub_id = pub_id_node.text if pub_id_node is not None else ''
    doi = doi_node.text if doi_node is not None else ''

    dict_article_meta = {'pmid': pmid,
                         'pmc': pmc,
                         'doi': doi,
                         'publisher_id': pub_id}

    return dict_article_meta

item_count = 0
itemset_array = []
now = datetime.now().strftime("%Y-%m-%d")

#file_path = '/home/dave/datapubmed/targetarticles/'

client = boto3.client('dynamodb')

#file_list = glob.glob(file_path + 'PMC*.nxml')

for f in range(len(file_list)): 

    tree = read_xml(file_list[f])
    if tree is not None:
        dict_article_meta = parse_article_meta(tree)
        pmid = dict_article_meta['pmid'];# print (pmid)
        if pmid == '':
            pmid = 'PMID missing'
        pmc = dict_article_meta['pmc']; #print (pmc)
        
        tree_title = tree.find('//title-group/article-title')
        if tree_title is not None:
            title = [t for t in tree_title.itertext()]
            sub_title = tree.xpath('//title-group/subtitle/text()')
            title.extend(sub_title)
            title = [t.replace('\n', ' ').replace('\t', ' ') for t in title]
            if len(title):
                full_title = ' '.join(title)
            else:
                full_title = 'Title missing'
        else:
            full_title = 'Title missing'
            
        item_dict = {
            "PutRequest": {
                "Item": {
                    "pmcid": {"S": pmc},
                    "pmid": {"S": pmid},
                    "date_processed": {"S": now},
                    "title": {"S": full_title},
                }
            }
        }

        itemset_array.append(item_dict)
output_dict = { "demographics": itemset_array}

### Dump the JSON file to disk

In [None]:
jsondata = json.dumps(output_dict)
f = open("output_dict.json","w")
f.write(jsondata)
f.close()

### Create a JSON file with the list of PMCIDs in the demographics JSON file created above

In [None]:
#print([d['PutRequest']['Item']['pmcid']['S']  for d in output_dict['demographics']]) 

PMCIDs_in_demographics = [d['PutRequest']['Item']['pmcid']['S']  for d in output_dict['demographics']]
jsondata = json.dumps(pmcid_in_demographics)
f = open("PMCIDs_in_demographics.json","w")
f.write(jsondata)
f.close()

### Check the distribution of the contents of the `output_dict` dict

In [None]:
pprint(Counter( [d['PutRequest']['Item']['pmcid']['S']  for d in output_dict['demographics']]).most_common(40))
pprint(Counter( [d['PutRequest']['Item']['pmid']['S']  for d in output_dict['demographics']]).most_common(40))
pprint(Counter( [d['PutRequest']['Item']['date_processed']['S']  for d in output_dict['demographics']]).most_common(40))
pprint(Counter( [d['PutRequest']['Item']['title']['S']  for d in output_dict['demographics']]).most_common(40))

[back-to-top](#back-to-top)
<a id='populate_from_json'></a>


Populate the table from JSON
------


----


### Fix the missing PMIDs in the table - these are missing in the XML files

In [None]:
openaccess_df = pd.read_csv('oa_file_list.csv')
print(openaccess_df.shape)

openaccess_df['PMCID'] = openaccess_df['Accession ID'].str[3:]
print(openaccess_df.count(axis=0))
openaccess_df.head()

In [None]:
demographics_in_oa = openaccess_df[openaccess_df['PMCID'].isin(PMCIDs_in_demographics)]
print(len(demographics_in_oa))
print(demographics_in_oa[:10])
demographics_in_oa = demographics_in_oa.set_index('PMCID')
demographics_in_oa.head()

In [None]:
missing_pmid = []
for d in output_dict['demographics']:
    mykey = d['PutRequest']['Item']['pmcid']['S']
    if (d['PutRequest']['Item']['pmid']['S'] == ''):
        missing_pmid.append(mykey)
        d['PutRequest']['Item']['pmid']['S'] = str(int(demographics_in_oa['PMID'].loc[mykey]))
        
print(len(missing_pmid))
print(missing_pmid[:10])

In [None]:
for mykey in missing_pmid:
    print(int(demographics_in_oa['PMID'].loc[mykey]))

In [None]:
resource = boto3.resource('dynamodb')
table = resource.Table('demographics')

for i in range(len(missing_pmid)):
    mykey = missing_pmid[i]

    table.update_item(
        Key={'pmcid': mykey},
        UpdateExpression="set pmid = :s",
        ExpressionAttributeValues={
            ':s': str(int(demographics_in_oa['PMID'].loc[mykey]))
        },
    )

### Create a JSON file with the table items from the `output_dict.json` file 

In [27]:
# In PutRequest style
demographic_json = []
for d in output_dict['demographics']:
    itemdict = {
        "pmcid": {"S": str(d['PutRequest']['Item']['pmid']['S'])},
        "pmid":  {"S": str(d['PutRequest']['Item']['pmcid']['S'])},
        "date_processed":  {"S": str(d['PutRequest']['Item']['date_processed']['S'])},
        "title":  {"S": unidecode(d['PutRequest']['Item']['title']['S'])}
        }
    demographic_json.append(itemdict)

In [23]:
# In normal put style
now = datetime.now().strftime("%Y-%m-%d")
demographic_json = []
for d in output_dict['demographics']:
    itemdict = {
        "pmcid": str(d['PutRequest']['Item']['pmcid']['S']),
        "pmid":  str(d['PutRequest']['Item']['pmid']['S']),
        "date_processed":  str(now),
        "title":  unidecode(d['PutRequest']['Item']['title']['S'])
        }
    demographic_json.append(itemdict)

In [None]:
jsondata = json.dumps(demographic_json)
f = open("demographic_json.json","w")
f.write(jsondata)
f.close()

#f = open('demographic_json.json')
#request_items = json.loads(f.read())

### Batch put from the JSON object

#### *This is what has populated the current table*

In [24]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

with table.batch_writer() as batch:
    for i in range(len(demographic_json)):
        batch.put_item(
            Item=demographic_json[i]
        )
        if (i % 1000 == 0):
            print ("The item_count is: %s " % (i))

The item_count is: 0 


### Normal  put from the JSON object

In [None]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

for i in range(len(demographic_json)):
    table.put_item(
        Item=demographic_json[i]
    )
#    if (i % 10 == 0):
#        print ("The item_count is: %s " % (i))

### Normal  put from a JSON file

In [None]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
input_file = "demographic_json.json"

try:
    table = dynamodb_resource.Table('demographics')
    print("Instantiate a table: ",table.creation_date_time)
    print("Ready to load data\n")
    incr = 0
    with open(input_file) as json_file:
        itemset = json.load(json_file, parse_float = decimal.Decimal)
        for item in itemset:
            incr += 1
            #pmcid = item['pmcid']
            #title = item['title']
            #date_processed = item['date_processed']
            #print("Adding record # ", incr," pmcid: ",pmcid," title: ",title,)
            #if (incr % 10000 == 0):
            #    print("Adding record # ", incr)
            table.put_item(
               Item={
                   'pmcid': item,
                   #'pmcid': pmcid,
                   #'title': title,
                   #'date_processed': date_processed,
                }
            )
            
except dynamodb_client.exceptions.ResourceNotFoundException:
    print("Table does not exist - cannot delete it")
    pass            
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

[back-to-top](#back-to-top)
<a id='management'></a>

*Manage the database*  
=====

<a id='describe_table'></a>

Inquire about the table
------

----


In [None]:
dynamodb_client = boto3.client('dynamodb')

try:
    response = dynamodb_client.describe_table(TableName='demographics')
    pprint(response)
    
except dynamodb_client.exceptions.ResourceNotFoundException:
    print("Table does not exist - cannot describe it")
    pass

In [26]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

def get_table_metadata(table_name):
    """
    Get some metadata about chosen table.
    """
    table = dynamodb_resource.Table(table_name)

    return {
        'num_items': table.item_count,
        'primary_key_name': table.key_schema[0],
        'status': table.table_status,
        'bytes_size': table.table_size_bytes,
        'global_secondary_indices': table.global_secondary_indexes
    }

demographics_metata = get_table_metadata('demographics')
pprint(demographics_metata)

{'bytes_size': 677169553,
 'global_secondary_indices': None,
 'num_items': 297786,
 'primary_key_name': {'AttributeName': 'pmcid', 'KeyType': 'HASH'},
 'status': 'ACTIVE'}


[back-to-top](#back-to-top)
<a id='update_attributes'></a>


Update with new attributes
------

----


### Remove an attribute from an item


In [105]:
f = open('missing_update3004.json')
missing = json.loads(f.read())
f.close()
print(len(missing))

missing.remove('5885332')
missing.remove('5819069')
print(len(missing))

3326
3324


In [106]:
# Update from loaded JSON object of input attributes input_json

dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')

for pmcid in missing:

    dynamodb_client.update_item(
        TableName = 'demographics',
        Key = {'pmcid': {'S': pmcid}},
        AttributeUpdates={
        'errorStatus': {'Action': 'DELETE'}
        }
    )

 ### Update an entire dict item from loaded JSON object of input attributes input_json

In [None]:
# Update from loaded JSON object of input attributes input_json

responses = []
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

for i in range(len(input_json)):
    #mykey = input_json[i]['pmcid']

    table.update_item(Item=input_json[i])
       
#    if (i % 10000 == 0):
#        print ("The item_count is: %s " % (i))

### Update some titles in the dynamodb, from a list of PMCID in a JSON file
### First create an array containing the updated titles

In [25]:
f = open("trunk_tables.json","r")
input_json = json.loads(f.read())
f.close()

update_title = []
for d in input_json[:5]:
    pmcid = d['Item']['pmcid']
    i += 1
    fpath = glob.glob('/home/dave/datapubmed/targetarticles/PMC'+pmcid+'.nxml')
    fh = open(fpath[0],'r')
    soup = bs.BeautifulSoup(fh,'lxml')
    #print(soup)
    title = soup.find("article-title").get_text()
    
    print(pmcid)
    print(title)
    
    update_title.append({'Item': {'pmcid': pmcid, 'title': str(title)} })

5769429
Effects of three frequencies of self-monitored blood glucose on HbA1c and quality of life in patients with type 2 diabetes with once daily insulin and stable control: a randomized trial
2734346
Vitamin B12 status in patients of Turkish and Dutch descent with depression: a comparative cross-sectional study
5788687
Chemoradiotherapy in combination with radical surgery is associated with better outcome in cervical cancer patients
5667482
Antibiotic perturbation of mixed-strain Pseudomonas aeruginosa infection in patients with cystic fibrosis
3831254
Atovaquone-proguanil in the treatment of imported uncomplicated Plasmodium falciparum malaria: a prospective observational study of 553 cases


### Then update the item in the dynamodb table

In [26]:
# Update one attribute from loaded JSON object of input attributes input_json
from boto3.dynamodb.conditions import Key

responses = []
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

for i in range(len(update_title)):
    mykey = update_title[i]['Item']['pmcid']
    
    response = table.update_item(
        Key={'pmcid': mykey},
        UpdateExpression="set title = :s",
        ExpressionAttributeValues={
            ':s': update_title[i]['Item']['title']
        },
        ReturnValues="UPDATED_NEW"
    )
    responses.append(response)

### Update the item with the table1 string that comes from an external JSON file

In [32]:
# Update one attribute from loaded JSON object of input attributes input_json
from boto3.dynamodb.conditions import Key

responses = []
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

f = open("trunk_tables.json","r")
input_json = json.loads(f.read())
f.close()

for i in range(len(input_json)):
    mykey = input_json[i]['Item']['pmcid']
    
    response = table.update_item(
        Key={'pmcid': mykey},
        UpdateExpression="set table1 = :s",
        ExpressionAttributeValues={
            ':s': input_json[i]['Item']['table']
        },
        ReturnValues="UPDATED_NEW"
    )
    #responses.append(response)

In [19]:
'''
jsondata = json.dumps(responses)
f = open("update_tables_responses.json","w")
f.write(jsondata)
f.close()
'''

[back-to-top](#back-to-top)
<a id='put_item'></a>

Put item into the table
------

----

In [125]:
from boto3 import resource
from boto3.dynamodb.conditions import Key

# The boto3 dynamoDB resource
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

def read_table_item(table_name, pk_name, pk_value):
    """
    Return item read by primary key.
    """
    table = dynamodb_resource.Table(table_name)
    response = table.get_item(Key={pk_name: pk_value})

    return response

def add_item(table_name, col_dict):
    """
    Add one item (row) to table. col_dict is a dictionary {col_name: value}.
    """
    table = dynamodb_resource.Table(table_name)
    response = table.put_item(Item=col_dict)

    return response

def delete_item(table_name, pk_name, pk_value):
    """
    Delete an item (row) in table from its primary key.
    """
    table = dynamodb_resource.Table(table_name)
    response = table.delete_item(Key={pk_name: pk_value})

    return

In [141]:
response = read_table_item('demographics', 'pmcid', '2842548')["Item"]
print(response)

{'pmcid': '2842548', 'pmid': '20119873', 'date_processed': '2018-05-06'}


In [137]:
table = dynamodb_resource.Table('demographics')
table.delete_item(Key={'pmcid': '2842549'})

{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
   'content-length': '2',
   'content-type': 'application/x-amz-json-1.0',
   'date': 'Sun, 06 May 2018 13:15:58 GMT',
   'server': 'Server',
   'x-amz-crc32': '2745614147',
   'x-amzn-requestid': '4IOB7UJGQH6VVHF04EA7EKVRVNVV4KQNSO5AEMVJF66Q9ASUAAJG'},
  'HTTPStatusCode': 200,
  'RequestId': '4IOB7UJGQH6VVHF04EA7EKVRVNVV4KQNSO5AEMVJF66Q9ASUAAJG',
  'RetryAttempts': 0}}

In [146]:
try:
    table = dynamodb_resource.Table('demographics')
    response = table.get_item(Key={'pmcid': '2842548'})["Item"]
except KeyError:
    print("PMCID not in database - OK")
else: 
    print("Deleting unavailable PMCID from database")
    response = table.delete_item(Key={'pmcid': '2842548'})
    print(response)
finally:
    print("Delete PMCID finished")

PMCID not in database - OK
Delete PMCID finished


In [90]:
from boto3 import resource
from boto3.dynamodb.conditions import Key

dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

new_list = ["5921293", "5921393", "5921441", "5921537", "5921545", "5921551", "5921555", "5921754", "5921777", "5921784", "5921968", "5922463", "5922473", "5922475", "5922494", "5922512", "5922517", "5923187", "5923189", "5923190", "5923197", "5924454", "5924456", "5924465", "5924469", "5924472", "5924483", "5924485", "5924495", "5924497", "5924498", "5924499", "5924508"]

for dd in new_list:
    delete_item('demographics', 'pmcid', dd)

### Get an item from the table

In [10]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')
pmcidd_list = ['1513380']

for i in range(len(pmcidd_list)):

    mykey = pmcidd_list[i]
    response = table.get_item(Key={'pmcid': mykey})
    pprint(response)

{'Item': {'date_processed': '2018-04-07',
          'errorStatus': 'not randomized clinical trials',
          'pmcid': '1513380',
          'pmid': '16469109',
          'title': 'Evaluation and selection of tandem repeat loci for a  '
                   'Brucella  MLVA typing assay'},
 'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
                                      'content-length': '245',
                                      'content-type': 'application/x-amz-json-1.0',
                                      'date': 'Wed, 18 Apr 2018 16:24:17 GMT',
                                      'server': 'Server',
                                      'x-amz-crc32': '1511758879',
                                      'x-amzn-requestid': '6KP5NMKTIEAIKSSOQ977ISQETRVV4KQNSO5AEMVJF66Q9ASUAAJG'},
                      'HTTPStatusCode': 200,
                      'RequestId': '6KP5NMKTIEAIKSSOQ977ISQETRVV4KQNSO5AEMVJF66Q9ASUAAJG',
                      'RetryAttempts': 0}}


[back-to-top](#back-to-top)
<a id='query_number_items'></a>


Query the total number of items
----


In [None]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

response = table.scan()
pmcids_in_demographics = list(map(lambda d: d['pmcid'], response['Items']))

while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    pmcids_in_demographics.extend(list(map(lambda d: d['pmcid'], response['Items'])))
    

elapsed_time = (time.time() - start_time) / 60.

In [5]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

response = table.scan()

pmcids_list = []
for d in response['Items']:
    pmcids_list.append(d['pmcid'])

while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    for d in response['Items']:
        pmcids_list.append(d['pmcid'])

    
elapsed_time = (time.time() - start_time) / 60.

print(len(pmcids_list))
print(pmcids_list[:20])
print (elapsed_time)

297786
['5405840', '2891753', '3453493', '3463035', '5048196', '5073468', '3689686', '4463981', '5381770', '3101364', '3305974', '2621409', '4872957', '3976659', '4221004', '3806431', '5469286', '280705', '4283867', '3908486']
10.869550589720408


In [6]:
print(len(pmcids_list))
print (elapsed_time)

print(pmcids_list[:20])

297786
10.869550589720408
['5405840', '2891753', '3453493', '3463035', '5048196', '5073468', '3689686', '4463981', '5381770', '3101364', '3305974', '2621409', '4872957', '3976659', '4221004', '3806431', '5469286', '280705', '4283867', '3908486']


In [7]:
targetarticles_files_list = glob.glob('/home/dave/datapubmed/targetarticles/PMC*.nxml')
targetarticles_files_list = [i[40:] for i  in targetarticles_files_list  ]
targetarticles_files_list = [i[:-5] for i  in targetarticles_files_list  ]

print(targetarticles_files_list[:5])
print(len(targetarticles_files_list))

['3778263', '4473156', '5771285', '4264693', '4210730']
297876


In [72]:
overlap = list(set.intersection(set(targetarticles_files_list),set(pmcids_list)))
print(len(overlap))
missing = list(set(targetarticles_files_list)-set(pmcids_list))
print(missing)

297786
['4944317', '4785788', '4760072', '4828621', '4976526', '2228184', '3008007', '2775678', '5643429', '4633648', '4754267', '5504832', '3585216', '5779605', '5743172', '5618110', '4262386', '2604886', '5089533', '5850733', '4305571', '4210062', '5723000', '2719593', '3382468', '4482050', '4310883', '3809218', '2365166', '2010328', '4625716', '5714616', '4093809', '4223617', '5361878', '4928134', '5154172', '5459830', '5299488', '5064824', '1785373', '3893583', '5665161', '3977546', '5168601', '5629297', '5102273', '4906605', '2807149', '5807467', '3632158', '4508277', '3554429', '3282277', '3965122', '4902190', '2977546', '4018267', '5025218', '4413803', '5651857', '3241335', '5083722', '5602838', '3902330', '4548310', '4418041', '5597500', '3514223', '4763193', '3335638', '3921260', '4919485', '1552061', '5837366', '3876307', '3235297', '5733425', '3899261', '4061537', '3599633', '3794630', '5512524', '4883761', '5005760', '5735637', '4327595', '5579465', '5154137', '4444136']


In [73]:
from shutil import copyfile

source = 'targetarticles'
destination = 'missing'

for fh in missing:
    copyfile('./'+ source +'/PMC' + fh +'.nxml', './'+ destination +'/PMC' + fh +'.nxml')

Check if the item has the "errorstatus" attribute, and if so, save its PMCID to a list
----

In [8]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

response = table.scan()

errorstatus = []
for d in response['Items']:
    if 'errorStatus' in d.keys():
        if (d['errorStatus'] == 'not randomized clinical trials'):
            errorstatus.append(d['pmcid'])

while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    for d in response['Items']:
        if 'errorStatus' in d.keys():
            if (d['errorStatus'] == 'not randomized clinical trials'):
                errorstatus.append(d['pmcid'])

    

elapsed_time = (time.time() - start_time) / 60.

print(len(errorstatus))
print(errorstatus[:20])
print (elapsed_time)

'''
249291
9.458658456802368
'''

26229
['2977546', '3008007', '3977546', '5005760', '4444136', '5459830', '3965122', '5512524', '5807467', '2775678', '4872957', '3920516', '3648317', '5872926', '1199527', '3685044', '3275215', '5165073', '1513380', '3918149']
12.374004781246185


'\n249291\n9.458658456802368\n'

In [9]:
jsondata = json.dumps(missing)
f = open("missing.json","w")
f.write(jsondata)
f.close()

In [33]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

response = table.scan()

numwithsentences = 0 
for d in response['Items']:
    if 'table1' in d.keys():
        numwithsentences += 1

while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    for d in response['Items']:
        if 'table1' in d.keys():
            numwithsentences += 1

    

elapsed_time = (time.time() - start_time) / 60.

print(numwithsentences)
print (elapsed_time)

'''
249291
9.458658456802368
'''

72722
12.356586949030559


'\n249291\n9.458658456802368\n'

In [112]:
zika_pmcid = ['5870999', '5804079', '5812309', '5800699', '5778641', '5712121', '5712781', '5707826', '5741275', '5640845', '5636051', '5576130', '5718785', '5559635', '5552158', '5890528', '5645715', '5484518', '5490820', '5853302', '5506587', '5443526', '5436874', '5414005', '5405794', '5457678', '5357690', '5319961', '5225411', '5360567', '5322287', '5245776', '5240929', '5165716', '5165082', '5210549', '5091912', '5105876', '5072634', '5137455', '5055662', '5034338', '5078599', '4989439', '4988764', '4980036', '4947515', '5124348', '4962217', '4915778', '4794306']

In [61]:
from boto3.dynamodb.conditions import Key, Attr

dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

answer_zika = []
for dd in zika_pmcid:
    response = table.query(
        KeyConditionExpression=Key('pmcid').eq(dd)
    )
    items = response['Items']
    answer_zika.append(items)

elapsed_time = (time.time() - start_time) / 60.

print (elapsed_time)

'''
249291
9.458658456802368
'''

0.09643032153447469


'\n249291\n9.458658456802368\n'

In [None]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

numwithsentences = 0 
for dd in zika_pmcid[:20]:
    response = table.query(
        KeyConditionExpression=Key('pmcid').eq(dd)
    )
   # if 'sentences' in response['Items'].keys():
    print(response['Items'])
    numwithsentences += 1

In [66]:
jsondata = json.dumps(answer_zika)
f = open("zika_query.json","w")
f.write(jsondata)
f.close()

Scan the table for all items processed since a date
----

In [None]:
from boto3.dynamodb.conditions import Key, Attr

dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

response = table.scan(
    FilterExpression=Attr('date_processed').gt('2018-04-29')
)
items = response['Items']
pprint(items)

elapsed_time = (time.time() - start_time) / 60.

print (elapsed_time)

### Do it with the page return

In [69]:
from boto3.dynamodb.conditions import Key, Attr

now = datetime.now().strftime("%Y-%m-%d")
date_1 = datetime.strptime(now, "%Y-%m-%d")
now = str(now)
since_date = date_1 + timedelta(days=-1)
since_date = str(since_date.strftime("%Y-%m-%d"))

dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

response = table.scan(
    FilterExpression=Attr('date_processed').gt(since_date)
)

numtotal = 0 
numwithsentences = 0 
numwithtables = 0 
for d in response['Items']:
    numtotal += 1
    if 'table1' in d.keys():
        numwithtables += 1
    if 'sentences' in d.keys():
        numwithsentences += 1
        
while 'LastEvaluatedKey' in response:
    response = table.scan(
        FilterExpression=Attr('date_processed').gt(since_date),
        ExclusiveStartKey=response['LastEvaluatedKey']
    )
    for d in response['Items']:
        numtotal += 1
        if 'table1' in d.keys():
            numwithtables += 1
        if 'sentences' in d.keys():
            numwithsentences += 1

elapsed_time = (time.time() - start_time) / 60.

print (elapsed_time)

1.492504096031189


'\n249291\n9.458658456802368\n'

In [70]:
print(numtotal)
print(numwithsentences)
print(numwithtables)

3416
0
1018


In [None]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

def get_table_metadata(table_name):
    """
    Get some metadata about chosen table.
    """
    table = dynamodb_resource.Table(table_name)

    return {
        'num_items': table.item_count,
        'primary_key_name': table.key_schema[0],
        'status': table.table_status,
        'bytes_size': table.table_size_bytes,
        'global_secondary_indices': table.global_secondary_indexes
    }

demographics_metata = get_table_metadata('demographics')
pprint(demographics_metata)

In [81]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

jsondata = {
    "update": '2018-04-29', 
    "total_items": table.item_count,
    "total_updates": numtotal,
    "with_sentences": numwithsentences,
    "with_tables": numwithtables
}


s3bucketfile = "update_stats.json"
s3bucket = "pubminer-upload-test"
s3_resource = boto3.resource('s3')
obj = s3_resource.Object(s3bucket,s3bucketfile)
obj.put(Body=json.dumps(jsondata), ACL='public-read')

{'ETag': '"290af6e241e67db5c2600b2923b176ec"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Tue, 01 May 2018 17:12:52 GMT',
   'etag': '"290af6e241e67db5c2600b2923b176ec"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'XM1Jcku/gk9qRbEWtBYWWqLSEyg0mVqfCNsOVxkjJjrRqBzfPxOc8q1mOKrAS/qnhGQ6lWKPJdo=',
   'x-amz-request-id': '465AD98A30740A5D'},
  'HTTPStatusCode': 200,
  'HostId': 'XM1Jcku/gk9qRbEWtBYWWqLSEyg0mVqfCNsOVxkjJjrRqBzfPxOc8q1mOKrAS/qnhGQ6lWKPJdo=',
  'RequestId': '465AD98A30740A5D',
  'RetryAttempts': 0}}

In [54]:
from boto3.dynamodb.conditions import Key, Attr

dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

response = table.scan(
    FilterExpression=Key('pmcid').eq('5812309')
)
items = response['Items']
print(items)

    

elapsed_time = (time.time() - start_time) / 60.

print (elapsed_time)

'''
249291
9.458658456802368
'''

[]
0.008143361409505208


'\n249291\n9.458658456802368\n'

In [None]:
from boto3.dynamodb.conditions import Key, Attr

dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

zika_list = [{"S": d} for d in zika_pmcid]

response = table.query(
    IndexName='pmcid',
    KeyConditions={
        "pmcid":{
            "ComparisonOperator":'IN',
            "AttributeValueList": [{'S': '5870999'},  {'S': '5804079'}, {'S': '5812309'}]  #zika_list
        }
    }
)
items = response['Items']
print(items)

    

elapsed_time = (time.time() - start_time) / 60.

print (elapsed_time)

'''
249291
9.458658456802368
'''

In [None]:
from boto3.dynamodb.conditions import Key, Attr

dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

start_time = time.time()

zika_list = [{"S": d} for d in zika_pmcid]

response = table.query(
    IndexName='pmcid',
    ExpressionAttributeValues={
        "pmcid":{
            "ComparisonOperator":'IN',
            "AttributeValueList": [{'S': '5870999'},  {'S': '5804079'}, {'S': '5812309'}]  #zika_list
        }
    }
)
items = response['Items']
print(items)

    

elapsed_time = (time.time() - start_time) / 60.

print (elapsed_time)

'''
249291
9.458658456802368
'''

## Query items by PMCID key

In [None]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics')

request = {
    'ExpressionAttributeNames': {
        '#n0': 'hash_key',
        '#n1': 'range_key'
    },
    'ExpressionAttributeValues': {
        ':v0': {'S': MY_HASH_KEY},
        ':v1': {'N': GT_RANGE_KEY}
    },
    'KeyConditionExpression': '(#n0 = :v0) AND (#n1 > :v1)',
    'TableName': TABLE_NAME
}
response = table.query(**request)

[back-to-top](#back-to-top)
<a id='update_table_index'></a>


update the `demographics` table, like delete an index, modify throughput
----

In [None]:
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')

response = dynamodb_client.update_table(
    #ProvisionedThroughput={
    #    'ReadCapacityUnits': 10,
    #    'WriteCapacityUnits': 10,
    #},
    GlobalSecondaryIndexUpdates=[
        {
            'Delete': {
                'IndexName': 'pmid'
            }
        },
    ],
    TableName='demographics_alt',
)

[back-to-top](#back-to-top)
<a id='delete_table'></a>


Delete the `demographics` table
----


In [8]:
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')

try:
    dynamodb_client.delete_table(TableName='demographics_alt')
    print('Deleted table')
        
except dynamodb_client.exceptions.ResourceNotFoundException:
    print("Table does not exist - cannot delete it")
    pass
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

Deleted table


[back-to-top](#back-to-top)
<a id='metadata'></a>

*Metadata table  `demographics_meta`*
=====


<a id='create_meta'></a>


Creation of the metadata table called `demographics_meta`
----

-----

In [110]:
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')

try:
    table = dynamodb_client.create_table(
        TableName='demographics_meta',
        KeySchema=[
            {
                'AttributeName': 'source', 
                'KeyType': 'HASH'
            }
        ], 
        AttributeDefinitions=[
            {
                'AttributeName': 'source', 
                'AttributeType': 'S'
            },
        ], 
        ProvisionedThroughput={
            'ReadCapacityUnits': 5, 
            'WriteCapacityUnits': 5
        },
        StreamSpecification={
        'StreamEnabled': True,
        'StreamViewType': 'NEW_AND_OLD_IMAGES'
        }
    )

    dynamodb_client.get_waiter('table_exists').wait(TableName='demographics_meta')
    print("Table status:",  table['TableDescription']['TableStatus'])
    print("Item count:", table['TableDescription']['ItemCount'])
    
except dynamodb_client.exceptions.ResourceInUseException:
    print("Table in use error - do you really want to recreate the table?")
    pass
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

Table status: CREATING
Item count: 0


[back-to-top](#back-to-top)
<a id='populate_meta'></a>


Populate the table with new metadata items
------

----


## Populate from a list in memory

In [None]:
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
small_list_of_ids = [str(i) for i in range(20)]


class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            if o % 1 > 0:
                return float(o)
            else:
                return int(o)
        return super(DecimalEncoder, self).default(o)
    
response = dynamodb_client.put_item(
    Item={
        'source': {"S": "demographics"},
        'pmcids':{"NS": small_list_of_ids},
        'items': {"N": decimal.Decimal(len(small_list_of_ids))},
        'items_downloaded': {"N": str(len(small_list_of_ids))},
        'date_updated': {"S": now} ,
    },
    TableName='demographics_meta',
)

## Populate from a list in a JSON file

In [61]:
jsondata = json.dumps(small_list_of_ids)
f = open("small_list_of_ids.json","w")
f.write(jsondata)
f.close()

In [62]:
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
#file_path = '/home/dave/datapubmed/'
#input_file = file_path + "inS3BucketIDs.json"
input_file = "small_list_of_ids.json"
try:
    with open(input_file) as json_file:
        itemset = json.load(json_file)
        response = dynamodb_client.put_item(
            Item={
               'source': {"S": "saved_json"},
               'pmcids':{"NS": itemset},
               'items': {"N": str(len(itemset))},
               'date_updated': {"S": now} ,
            },
            TableName='demographics_meta',
        )
            
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise            

[back-to-top](#back-to-top)
<a id='query_meta'></a>


Query the metadata table on attributes
------

----


In [64]:
dynamodb_resource = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb_resource.Table('demographics_meta')
item_list = ['demographics']

for i in range(len(item_list)):

    mykey = item_list[i]
    response = table.get_item(Key={'source': mykey})
    pprint(response)

{'Item': {'date_updated': '2018-04-09 21:36:15',
          'items': Decimal('20'),
          'pmcids': {Decimal('1785373'),
                     Decimal('2977546'),
                     Decimal('3514223'),
                     Decimal('3554429'),
                     Decimal('3809218'),
                     Decimal('3893583'),
                     Decimal('3921260'),
                     Decimal('4061537'),
                     Decimal('4093809'),
                     Decimal('4210062'),
                     Decimal('4310883'),
                     Decimal('4763193'),
                     Decimal('4785788'),
                     Decimal('4906605'),
                     Decimal('5102273'),
                     Decimal('5405840'),
                     Decimal('5504832'),
                     Decimal('5651857'),
                     Decimal('5743172'),
                     Decimal('5837366')},
          'source': 'demographics'},
 'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-a

[back-to-top](#back-to-top)
<a id='delete_meta'></a>


Delete the metadata table
----


In [109]:
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1')

try:
    dynamodb_client.delete_table(TableName='demographics_meta')
    print('Deleted table')
        
except dynamodb_client.exceptions.ResourceNotFoundException:
    print("Table does not exist - cannot delete it")
    pass
except OSError as err:
    print("OS error: {0}".format(err))
except ValueError:
    print("Could not convert data to an integer.")
except:
    print("Unexpected error:", sys.exc_info()[0])
    raise

Deleted table
