##### Homework1

In [11]:
import pymongo
import json

client = pymongo.MongoClient('localhost', 27017)
db = client["Homeworks"]
collection = db["pubmed"]

with open('pubmed_cleaned.json', "r") as f:
    data = json.load(f)

for doc in data:
    if '_id' in doc:
        doc['_id'] = str(doc['_id']['$oid'])

try:
    collection.insert_many(data)
except pymongo.errors.BulkWriteError as e:
    print("Bulk write error occurred:", e)

In [80]:
''' Je choisit le key '_id' pour l'index car c'est une valeur unique pour chaque article et il est 
utilisé pour identifier les articles dans la base de données pubmed.'''

collection.create_index([('_id',1)])

'_id_1'

In [28]:
# 2) Delete every paper that was published prior 2019 
from datetime import datetime
docs = collection.find()
for doc in docs:
    date_string = doc['date']
    date_string = date_string.replace("year", "").replace("month", "").replace("day", "").replace("hour", "").replace("minute", "").replace(",","-").replace(" ","")
    date = datetime.strptime(date_string, "%Y-%m-%d-%H-%M")
    if date.year < 2019:
        collection.delete_one({'_id':doc['_id']})

In [32]:
# 3) How many paper have a single author ? Two authors ?
single_author_count = collection.count_documents({"$expr": {"$eq": [{"$size": {"$split": ["$authors", "\n"]},}, 1]}})
two_authors_count = collection.count_documents({"$expr": {"$eq": [{"$size": {"$split": ["$authors", "\n"]},}, 2]}})
print("Number of papers with a single author: ", single_author_count)
print("Number of papers with two authors: ", two_authors_count)

Number of papers with a single author:  5480
Number of papers with two authors:  10153


In [34]:
# 4) What's the last paper inserted in the db ?
last_paper = collection.find().sort([("_id", -1)]).limit(1)
print("Last paper inserted in the db: ", last_paper[0]['pmid'], last_paper[0]['title'])

Last paper inserted in the db:  31226374 Identification of a novel UDP-glycosyltransferase gene from Rhodiola rosea and its expression during biotransformation of upstream precursors in callus culture.


In [35]:
# 5) Find articles with null meshwords.
docs = collection.find({"meshwords": None})
for doc in docs:
    print(doc['pmid'], doc['title'])

30618194 Liberating Native Mass Spectrometry from Dependence on Volatile Salt Buffers by Use of Gabor Transform.
30620089 Palladium-Catalyzed Carbo-Oxygenation of Propargylic Amines using in Situ Tether Formation.
30620713 Factors associated with distant metastasis in pediatric thyroid cancer: evaluation of the SEER database.
30621200 Special Issue: Gut Bacteria-Mucus Interaction.
30625514 Introduction to 2019 World Federation for Medical Education World Conference.
30626152 Editorial for the Special Issue on Glassy Materials Based Microdevices.
30630289 Comparative Behavioral Correlation of High and Low-Performing Mice in the Forced Swim Test.
30631788 Correction: Comparison of school day eating behaviours of 8-11 year old children from Adelaide, South Australia, and London, England: Child eating behaviours in South Australia and England.
30635026 Correction to: RNA G-quadruplexes at upstream open reading frames cause DHX36- and DHX9-dependent translation of human mRNAs.
30636978 Corr

In [42]:
# 6) Choose a keyword you are interested in (machine learning, computer vision,...). 
# Find the number of articles with the choosen keyword in their meshwords, abstract or title.
keyword = "data science"
docs = collection.find({"$or": [{"meshwords": {"$regex": keyword, "$options": "i"}},
                                 {"abstract": {"$regex": keyword, "$options": "i"}}, 
                                 {"title": {"$regex": keyword, "$options": "i"}}]})
doc_list = list(docs)
doc_num = len(doc_list)
print(doc_num)

19


In [46]:
# 7) What's the number of articles that have at least one affiliation AND meshwords.
docs = collection.find({"$and": [{"affiliation": {"$ne": ""}}, 
                                 {"meshwords": {"$ne": ""}}]
                                 })
doc_list = list(docs)
doc_num = len(doc_list)
print(doc_num)

99786


In [79]:
# 8) How many articles have a publishing date after 2020 ?

from datetime import datetime

docs = collection.find({"date": {"$gt": datetime(2020, 1, 1)}})
doc_count = len(list(docs))
print(doc_count)
    

0


In [64]:
# 9) Find articles where there's atleast one affiliation from a choosen country (you decide which one).
country = "China"
docs = collection.find({"authors": {"$regex": country, "$options": "i"}})
for doc in docs:
    print(doc['pmid'], doc['title'])

30614629 A change of leadership at JIPB: A message to the plant sciences community.
30620713 Factors associated with distant metastasis in pediatric thyroid cancer: evaluation of the SEER database.
30626732 The microRNAs let-7 and miR-9 down-regulate the axon-guidance genes Ntn1 and Dcc during peripheral nerve regeneration.
30629332 Characterization of the novel HLA-DQB1*03:01:45 allele by sequencing-based typing.
30629810 Characterization of the novel HLA-DRB1*11:245 allele by sequencing-based typing.
30629812 Characterization of the novel HLA-B*51:228 allele in a Chinese individual.
30636978 Correction to: High-performance gene expression and knockout tools using sleeping beauty transposon system.
30638935 Leptin-elicited miRNA-342-3p potentiates gemcitabine resistance in pancreatic ductal adenocarcinoma.
30639237 Novel SASS6 compound heterozygous mutations in a Chinese family with primary autosomal recessive microcephaly.
30641153 The Xishuangbanna Declaration on Plant Conservation.

In [82]:
# 10) Check for any duplicates. (hint: look at the doi or the pmid)
pipline = [{"$group": {"_id": {"doi": "$doi"}, "count": {"$sum": 1}}}, 
           {"$match": {"count": {"$gt": 1}}}]

docs = collection.aggregate(pipline)
for doc in docs:
    print(doc)


{'_id': {'doi': '10.1093/jas/skz192'}, 'count': 2}
{'_id': {'doi': ''}, 'count': 5}
{'_id': {'doi': ' }, { name ml '}, 'count': 55}
{'_id': {'doi': '10.1093/cid/ciz518'}, 'count': 2}
{'_id': {'doi': ' } } }, from journal { title { iso-jta '}, 'count': 10}
{'_id': {'doi': ' }, authors { names std { { name ml '}, 'count': 73}
{'_id': {'doi': '10.1093/jnci/djz062'}, 'count': 2}
{'_id': {'doi': ', affil str '}, 'count': 40}
{'_id': {'doi': '10.23876/j.krcp.19.006'}, 'count': 2}


In [97]:
# 11) Remove every articles where the abstract starts with an "R".
docs = collection.find()

for doc in docs:
    abstract = str(doc["abstract"]).strip("\"") # ??? comment update the abstract field
       
collection.delete_many({"abstract": {"$regex": "^R"}})

DeleteResult({'n': 0, 'ok': 1.0}, acknowledged=True)

In [None]:
# 12) Return the list of papers (pmid) where there's atleast one affiliation per author


In [None]:
# 13) Create 500 random samples of the dataset, compute a statistics that you are interested in and 
# check how it behaves through the different samples

In [None]:
# 14) Sandbox exercise: think of a problematic and try to answer it.