### Setup Dependencies

In [37]:
import csv
import json
from pymongo import MongoClient

In [38]:
# Start from a clean slate: drop the 'nobel' database so that re-running the
# import cells below does not insert the same documents a second time.
client = MongoClient('mongodb://localhost:27017/')
client.drop_database('nobel')
# Cell output: the database names that remain on the server.
client.list_database_names()

['admin', 'config', 'local', 'test']

In [39]:
def import_json_array(filename, collection):
    """Load a JSON file holding an array of documents into the 'nobel' database.

    Args:
        filename: Path to a JSON file whose top-level value is a list of objects.
        collection: Name of the target collection inside the 'nobel' database.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale.
    with open(filename, 'r', encoding='utf-8') as data_file:
        documents = json.load(data_file)

    # insert_many rejects an empty list, so there is nothing to do in that case.
    if not documents:
        return

    # One bulk insert instead of one round trip per document; the context
    # manager closes the connection once the insert is done.
    with MongoClient('mongodb://localhost:27017/') as client:
        client.get_database('nobel')[collection].insert_many(documents)

# Load both datasets into collections named after the files.
import_json_array("../data/laureates.json","laureates")
import_json_array("../data/prizes.json","prizes")

In [40]:
# `client` and `db` are the notebook-wide handles that the cells below query.
client = MongoClient('mongodb://localhost:27017/')
db = client['nobel']
# Cell output: (database names on the server, collection names in 'nobel')
(client.list_database_names(), db.list_collection_names())

(['admin', 'config', 'local', 'nobel', 'test'], ['laureates', 'prizes'])

In [41]:
def import_csv(filename, collection, database='test'):
    """Insert every row of a CSV file as a document into a MongoDB collection.

    Each row is read as a dict keyed by the CSV header and gets an extra
    'newfield': 'newvalue' entry before it is inserted.

    Args:
        filename: Path to a CSV file with a header row.
        collection: Name of the target collection.
        database: Name of the target database (defaults to 'test').
    """
    # newline='' is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(filename, newline='') as csvfile:
        rows = [dict(row, newfield='newvalue') for row in csv.DictReader(csvfile)]

    # insert_many rejects an empty list, so skip the call for an empty file.
    if not rows:
        return

    # Bulk insert; the context manager closes the connection afterwards.
    with MongoClient('mongodb://localhost:27017/') as client:
        client.get_database(database)[collection].insert_many(rows)


import_csv("../data/test.csv","test_collection")

In [42]:
def read(collection, database='nobel'):
    """Print every document of a collection.

    Args:
        collection: Name of the collection to read.
        database: Name of the database that holds it (defaults to 'nobel').
    """
    # The context manager closes the connection once all rows are printed.
    with MongoClient('mongodb://localhost:27017/') as client:
        for row in client.get_database(database)[collection].find():
            print(row)


# import_csv() wrote "test_collection" into the 'test' database, so it has to be
# read from there; looking it up in 'nobel' finds nothing and prints no rows.
read("test_collection", database="test")

In [43]:
# Count client-side by pulling every document into a list; count_documents({})
# returns the same numbers without transferring the documents.
len(list(db.prizes.find())), len(list(db.laureates.find()))

(590, 934)

In [44]:
# Server-side counts; the empty filter {} matches every document.
(db.prizes.count_documents({}), db.laureates.count_documents({}))

(590, 934)

In [45]:
# Peek at one prize document to see its fields (year, category, laureates array, ...).
db["prizes"].find_one()

{'_id': ObjectId('60016c9d585e08fd9974875c'),
 'year': '2018',
 'category': 'physics',
 'overallMotivation': '“for groundbreaking inventions in the field of laser physics”',
 'laureates': [{'id': '960',
   'firstname': 'Arthur',
   'surname': 'Ashkin',
   'motivation': '"for the optical tweezers and their application to biological systems"',
   'share': '2'},
  {'id': '961',
   'firstname': 'Gérard',
   'surname': 'Mourou',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'},
  {'id': '962',
   'firstname': 'Donna',
   'surname': 'Strickland',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'}]}

In [46]:
# Category of every prize awarded in 2018; note that years are stored as strings.
cursor = db.prizes.find({"year": "2018"})
for prize in cursor:
    print(prize["category"])

physics
medicine
economics
chemistry
peace


In [47]:
# $exists: True matches any document that has a "year" field, whatever its value.
criteria = {"year": {"$exists": True}}
db["prizes"].find_one(criteria)

{'_id': ObjectId('60016c9d585e08fd9974875c'),
 'year': '2018',
 'category': 'physics',
 'overallMotivation': '“for groundbreaking inventions in the field of laser physics”',
 'laureates': [{'id': '960',
   'firstname': 'Arthur',
   'surname': 'Ashkin',
   'motivation': '"for the optical tweezers and their application to biological systems"',
   'share': '2'},
  {'id': '961',
   'firstname': 'Gérard',
   'surname': 'Mourou',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'},
  {'id': '962',
   'firstname': 'Donna',
   'surname': 'Strickland',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'}]}

In [48]:
# $ne: first prize whose "year" is anything other than "2018".
criteria = {"year": {"$ne": "2018"}}
db["prizes"].find_one(criteria)


{'_id': ObjectId('60016c9d585e08fd9974875f'),
 'year': '2017',
 'category': 'peace',
 'laureates': [{'id': '948',
   'firstname': 'International Campaign to Abolish Nuclear Weapons (ICAN)',
   'motivation': '"for its work to draw attention to the catastrophic humanitarian consequences of any use of nuclear weapons and for its ground-breaking efforts to achieve a treaty-based prohibition of such weapons"',
   'share': '1',
   'surname': ''}]}

In [49]:
# Distinct "category" values across all prize documents.
db["prizes"].distinct("category")

['chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics']

In [50]:
# "prizes.2" addresses index 2 of the prizes array, so the field only exists
# for laureates holding at least three prizes.
criteria = {"prizes.2": {"$exists": True}}

# One such laureate, plus how many of them there are in total.
(db.laureates.find_one(criteria), f"Count: {db.laureates.count_documents(criteria)}")

({'_id': ObjectId('60016c9c585e08fd997484eb'),
  'id': '482',
  'firstname': 'Comité international de la Croix Rouge (International Committee of the Red Cross)',
  'born': '0000-00-00',
  'died': '0000-00-00',
  'gender': 'org',
  'prizes': [{'year': '1917',
    'category': 'peace',
    'share': '1',
    'affiliations': [[]]},
   {'year': '1944', 'category': 'peace', 'share': '1', 'affiliations': [[]]},
   {'year': '1963', 'category': 'peace', 'share': '2', 'affiliations': [[]]}]},
 'Count: 1')

In [51]:
# Do the prize categories agree between the two collections?
isSameSet = set(db.prizes.distinct("category")) == set(db.laureates.distinct("prizes.category"))
# Countries that appear as a place of death but never as a place of birth.
countries = set(db.laureates.distinct("diedCountry")) - set(db.laureates.distinct("bornCountry"))
# The number of distinct countries of laureate affiliation for prizes
# (dot notation reaches through both the prizes and the affiliations arrays).
count = len(set(db.laureates.distinct("prizes.affiliations.country")))
isSameSet, countries, count

(True,
 {'Barbados',
  'Czechoslovakia',
  'East Germany',
  'Gabon',
  'Greece',
  'Israel',
  'Jamaica',
  'Northern Rhodesia (now Zambia)',
  'Philippines',
  'Puerto Rico',
  'Tunisia',
  'USSR',
  'Yugoslavia (now Serbia)'},
 29)

In [52]:
# distinct() takes an optional filter: affiliation countries of USA-born laureates only.
db.laureates.distinct("prizes.affiliations.country",{"bornCountry": "USA"})

['Australia', 'Denmark', 'USA', 'United Kingdom']

In [53]:
# Save a filter for organization laureates with prizes won before 1945.
# Years are stored as strings; for four-digit years the lexicographic
# comparison done by $lt / $gte agrees with the numeric one.
before = {
    "gender": "org",
    "prizes.year": {"$lt": "1945"},
    }

# Save a filter for organization laureates with prizes won in or after 1945
in_or_after = {
    "gender": "org",
    "prizes.year": {"$gte": "1945"},
    }

# NOTE: these count laureate documents, not prizes. An organization with prizes
# on both sides of 1945 matches both filters and is counted in both numbers.
n_before = db.laureates.count_documents(before)
n_in_or_after = db.laureates.count_documents(in_or_after)
# Share of the "in or after 1945" matches among all matches.
ratio = n_in_or_after / (n_in_or_after + n_before)
print(ratio)

0.84


In [54]:
# Distinct "gender" values; organizations are recorded with the value "org".
db.laureates.distinct("gender")

['female', 'male', 'org']

In [55]:
# Show the first three laureates. limit=3 makes the server return only those
# three instead of transferring the whole collection and slicing it afterwards.
list(db.laureates.find(limit=3))

[{'_id': ObjectId('60016c9b585e08fd997483b5'),
  'id': '2',
  'firstname': 'Hendrik Antoon',
  'surname': 'Lorentz',
  'born': '1853-07-18',
  'died': '1928-02-04',
  'bornCountry': 'the Netherlands',
  'bornCountryCode': 'NL',
  'bornCity': 'Arnhem',
  'diedCountry': 'the Netherlands',
  'diedCountryCode': 'NL',
  'gender': 'male',
  'prizes': [{'year': '1902',
    'category': 'physics',
    'share': '2',
    'motivation': '"in recognition of the extraordinary service they rendered by their researches into the influence of magnetism upon radiation phenomena"',
    'affiliations': [{'name': 'Leiden University',
      'city': 'Leiden',
      'country': 'the Netherlands'}]}]},
 {'_id': ObjectId('60016c9c585e08fd997483b6'),
  'id': '95',
  'firstname': 'Leon Neil',
  'surname': 'Cooper',
  'born': '1930-02-28',
  'died': '0000-00-00',
  'bornCountry': 'USA',
  'bornCountryCode': 'US',
  'bornCity': 'New York, NY',
  'gender': 'male',
  'prizes': [{'year': '1972',
    'category': 'physics'

In [56]:
from bson.regex import Regex
# Regular expressions. The first query is case-insensitive ("$options": "i");
# only the last expression of a cell is displayed, so its result is not shown.
db.laureates.distinct("bornCountry",{"bornCountry": {"$regex": "Poland","$options":"i"}})
# Raw string: "\(" is a regex escape for a literal parenthesis, not a Python
# string escape, and a non-raw "\(" triggers an invalid-escape warning.
db.laureates.distinct("bornCountry",{"bornCountry": Regex(r"^Poland \(now")})

['Poland (now Belarus)', 'Poland (now Lithuania)', 'Poland (now Ukraine)']

In [57]:
# Laureates whose first name starts with "G" and whose surname starts with "S"
# ("^" anchors the pattern at the beginning of the value).
db.laureates.count_documents({"firstname": Regex("^G"), "surname": Regex("^S")})

9

In [58]:
# Filter for laureates with "Germany" in their "bornCountry" value.
# The pattern is unanchored, so it matches "Germany" anywhere in the string.
criteria = {"bornCountry": Regex("Germany")}
print(set(db.laureates.distinct("bornCountry", criteria)))

{'Germany (now Russia)', 'Bavaria (now Germany)', 'W&uuml;rttemberg (now Germany)', 'Hesse-Kassel (now Germany)', 'East Friesland (now Germany)', 'Germany (now Poland)', 'Germany (now France)', 'Prussia (now Germany)', 'West Germany (now Germany)', 'Mecklenburg (now Germany)', 'Schleswig (now Germany)', 'Germany'}


In [59]:
# Filter for countries of birth that are part of today's Germany: the value must
# end ("$") with the literal text "now Germany)". A raw string keeps "\)" as a
# regex escape without Python warning about an invalid string escape.
criteria = {"bornCountry": Regex(r"now Germany\)$")}
print(set(db.laureates.distinct("bornCountry", criteria)))

{'Bavaria (now Germany)', 'W&uuml;rttemberg (now Germany)', 'Hesse-Kassel (now Germany)', 'East Friesland (now Germany)', 'West Germany (now Germany)', 'Prussia (now Germany)', 'Mecklenburg (now Germany)', 'Schleswig (now Germany)'}


In [60]:
# Save a filter for laureates with prize motivation values containing "transistor" as a substring
# (dot notation matches if any element of the prizes array has such a motivation).
criteria = {"prizes.motivation": Regex("transistor")}

# Save the field names corresponding to a laureate's first name and last name
first, last = "firstname", "surname"
# One (first name, surname) tuple per matching laureate.
print([(laureate[first], laureate[last]) for laureate in db.laureates.find(criteria)])

[('William Bradford', 'Shockley'), ('John', 'Bardeen'), ('Walter Houser', 'Brattain')]


In [61]:
# projection: keep only the affiliations nested inside each prize and drop _id.
# limit=3 fetches just the three documents shown instead of reading the whole
# collection and slicing the list afterwards.
docs = db.laureates.find(filter={}, projection={"prizes.affiliations":1, "_id":0}, limit=3)
list(docs)

[{'prizes': [{'affiliations': [{'name': 'Leiden University',
      'city': 'Leiden',
      'country': 'the Netherlands'}]}]},
 {'prizes': [{'affiliations': [{'name': 'Brown University',
      'city': 'Providence, RI',
      'country': 'USA'}]}]},
 {'prizes': [{'affiliations': [{'name': 'University of Pennsylvania',
      'city': 'Philadelphia, PA',
      'country': 'USA'}]}]}]

In [62]:
# $elemMatch requires a single element of the prizes array to satisfy both
# conditions (category "physics" and year "1903") at once.
db.laureates.find_one({"prizes": {"$elemMatch": {"category": "physics", "year": "1903"}}})

{'_id': ObjectId('60016c9c585e08fd997483f7'),
 'id': '4',
 'firstname': 'Antoine Henri',
 'surname': 'Becquerel',
 'born': '1852-12-15',
 'died': '1908-08-25',
 'bornCountry': 'France',
 'bornCountryCode': 'FR',
 'bornCity': 'Paris',
 'diedCountry': 'France',
 'diedCountryCode': 'FR',
 'gender': 'male',
 'prizes': [{'year': '1903',
   'category': 'physics',
   'share': '2',
   'motivation': '"in recognition of the extraordinary services he has rendered by his discovery of spontaneous radioactivity"',
   'affiliations': [{'name': 'École Polytechnique',
     'city': 'Paris',
     'country': 'France'}]}]}

In [63]:
# Names plus each prize's share for the first three laureates; limit=3 avoids
# fetching every laureate just to keep three of them.
list(db.laureates.find(projection={"firstname": 1, "surname": 1, "prizes.share": 1, "_id": 0}, limit=3))

[{'firstname': 'Hendrik Antoon',
  'surname': 'Lorentz',
  'prizes': [{'share': '2'}]},
 {'firstname': 'Leon Neil', 'surname': 'Cooper', 'prizes': [{'share': '3'}]},
 {'firstname': 'John Robert',
  'surname': 'Schrieffer',
  'prizes': [{'share': '3'}]}]

In [64]:
# Laureates whose first name starts with "G" and surname with "S"; the list
# form of projection names the fields to return.
docs = db.laureates.find(
    filter={"firstname": {"$regex": "^G"}, "surname": {"$regex": "^S"}},
    projection=["firstname", "surname"],
)

# Build "<firstname> <surname>" for every match
full_names = [" ".join((doc["firstname"], doc["surname"])) for doc in docs]

# Show the result
print(full_names)

['George D. Snell', 'Gustav Stresemann', 'Glenn Theodore Seaborg', 'George J. Stigler', 'George F. Smoot', 'George E. Smith', 'George P. Smith', 'George Bernard Shaw', 'Giorgos Seferis']


In [71]:
# Fetch every prize, projecting only each laureate's share
prizes = db.prizes.find({}, ["laureates.share"])

# A share of "2" stands for half a prize, "4" for a quarter, and so on; print
# the sum of those fractions for each prize.
for prize in prizes:
    total_share = sum(1 / float(laureate["share"]) for laureate in prize["laureates"])
    print(total_share)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [76]:
from operator import itemgetter
# Five most recent physics prize years. The server does both the sorting
# (descending by year) and the truncation, so nothing is sorted or sliced here.
docs = list(db.prizes.find({"category": "physics"}, ["year"], sort=[("year", -1)], limit=5))
print([doc["year"] for doc in docs])

['2018', '2017', '2016', '2015', '2014']


In [79]:
# Prizes awarded strictly between "1966" and "1970", ordered by year ascending
# and, within a year, by category descending.
for doc in db.prizes.find(
        {"year": {"$gt": "1966", "$lt": "1970"}},
        projection=["category", "year"],
        sort=[("year", 1), ("category", -1)]):
    print("{year} {category}".format(**doc))

1967 physics
1967 medicine
1967 literature
1967 chemistry
1968 physics
1968 peace
1968 medicine
1968 literature
1968 chemistry
1969 physics
1969 peace
1969 medicine
1969 literature
1969 economics
1969 chemistry


In [81]:
# Laureates born in or after 1900 with a prize in or after 1954 (both compared
# as strings), returning only the birth date and the prize years.
# Sorted by prize year ascending, then by birth date descending.
docs = list(db.laureates.find(
    {"born": {"$gte": "1900"}, "prizes.year": {"$gte": "1954"}},
    {"born": 1, "prizes.year": 1, "_id": 0},
    sort=[("prizes.year", 1), ("born", -1)]))
# Show the first five results only.
for doc in docs[:5]:
    print(doc)

{'born': '1916-08-25', 'prizes': [{'year': '1954'}]}
{'born': '1915-06-15', 'prizes': [{'year': '1954'}]}
{'born': '1901-02-28', 'prizes': [{'year': '1954'}, {'year': '1962'}]}
{'born': '1913-07-12', 'prizes': [{'year': '1955'}]}
{'born': '1911-01-26', 'prizes': [{'year': '1955'}]}


In [93]:
from operator import itemgetter

def all_laureates(prize):
  """Return the surnames of a prize's laureates, sorted and joined by " and ".

  Args:
    prize: A prize document whose "laureates" list holds dicts with a "surname" key.

  Returns:
    A single string such as "Lorentz and Zeeman".
  """
  # collect the surnames first, then order them alphabetically in place
  surnames = [laureate["surname"] for laureate in prize["laureates"]]
  surnames.sort()

  return " and ".join(surnames)

# find physics prizes, project year and name, and sort by year
docs = db.prizes.find(
    filter={"category": "physics"},
    projection=["year", "laureates.firstname", "laureates.surname"],
    sort=[("year", 1)],
)

# one line per prize: the year, then the laureate names (from all_laureates)
for doc in docs:
  print(f"{doc['year']}: {all_laureates(doc)}")

1901: Röntgen
1902: Lorentz and Zeeman
1903: Becquerel and Curie and Curie, née Sklodowska
1904: (John William Strutt)
1905: von Lenard
1906: Thomson
1907: Michelson
1908: Lippmann
1909: Braun and Marconi
1910: van der Waals
1911: Wien
1912: Dalén
1913: Kamerlingh Onnes
1914: von Laue
1915: Bragg and Bragg
1917: Barkla
1918: Planck
1919: Stark
1920: Guillaume
1921: Einstein
1922: Bohr
1923: Millikan
1924: Siegbahn
1925: Franck and Hertz
1926: Perrin
1927: Compton and Wilson
1928: Richardson
1929: de Broglie
1930: Raman
1932: Heisenberg
1933: Dirac and Schrödinger
1935: Chadwick
1936: Anderson and Hess
1937: Davisson and Thomson
1938: Fermi
1939: Lawrence
1943: Stern
1944: Rabi
1945: Pauli
1946: Bridgman
1947: Appleton
1948: Blackett
1949: Yukawa
1950: Powell
1951: Cockcroft and Walton
1952: Bloch and Purcell
1953: Zernike
1954: Born and Bothe
1955: Kusch and Lamb
1956: Bardeen and Brattain and Shockley
1957: Lee and Yang
1958: Cherenkov and Frank and Tamm
1959: Chamberlain and Segrè
19

In [105]:
# original categories from 1901
original_categories = db.prizes.distinct("category", {"year": "1901"})
print(original_categories)

# project year and category (dropping _id), newest year first and categories
# alphabetical within each year
docs = db.prizes.find(
        filter={},
        projection = {"year":1,"category":1,"_id":0},
        sort=[("year",-1),("category",1)]# -1 = descending, 1 = ascending
)
# print the documents (one line per prize, so the output is long)
for doc in docs:
  print(doc)

gory': 'chemistry'}
{'year': '2001', 'category': 'economics'}
{'year': '2001', 'category': 'literature'}
{'year': '2001', 'category': 'medicine'}
{'year': '2001', 'category': 'peace'}
{'year': '2001', 'category': 'physics'}
{'year': '2000', 'category': 'chemistry'}
{'year': '2000', 'category': 'economics'}
{'year': '2000', 'category': 'literature'}
{'year': '2000', 'category': 'medicine'}
{'year': '2000', 'category': 'peace'}
{'year': '2000', 'category': 'physics'}
{'year': '1999', 'category': 'chemistry'}
{'year': '1999', 'category': 'economics'}
{'year': '1999', 'category': 'literature'}
{'year': '1999', 'category': 'medicine'}
{'year': '1999', 'category': 'peace'}
{'year': '1999', 'category': 'physics'}
{'year': '1998', 'category': 'chemistry'}
{'year': '1998', 'category': 'economics'}
{'year': '1998', 'category': 'literature'}
{'year': '1998', 'category': 'medicine'}
{'year': '1998', 'category': 'peace'}
{'year': '1998', 'category': 'physics'}
{'year': '1997', 'category': 'chemistr

# Indexes

When to use?
- Queries with high specificity (filters that match only a small fraction of the documents)
- large documents
- large collections

In [106]:
# Compound index on category, then year (both ascending); the call returns the index name.
db.prizes.create_index([("category",1),("year",1)])

'category_1_year_1'

In [109]:
# List the collection's indexes: the default _id index plus any created above.
db.prizes.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]},
 'category_1_year_1': {'v': 2, 'key': [('category', 1), ('year', 1)]}}

In [110]:
# explain() returns the query plan instead of the documents. In the output
# below the winning plan is an index scan (IXSCAN) on category_1_year_1 with a
# covered projection, and totalDocsExamined is 0.
db.prizes.find({"category": "economics"}, {"year": 1, "_id": 0}).explain()

{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'nobel.prizes',
  'indexFilterSet': False,
  'parsedQuery': {'category': {'$eq': 'economics'}},
  'winningPlan': {'stage': 'PROJECTION_COVERED',
   'transformBy': {'year': 1, '_id': 0},
   'inputStage': {'stage': 'IXSCAN',
    'keyPattern': {'category': 1, 'year': 1},
    'indexName': 'category_1_year_1',
    'isMultiKey': False,
    'multiKeyPaths': {'category': [], 'year': []},
    'isUnique': False,
    'isSparse': False,
    'isPartial': False,
    'indexVersion': 2,
    'direction': 'forward',
    'indexBounds': {'category': ['["economics", "economics"]'],
     'year': ['[MinKey, MaxKey]']}}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 50,
  'executionTimeMillis': 0,
  'totalKeysExamined': 50,
  'totalDocsExamined': 0,
  'executionStages': {'stage': 'PROJECTION_COVERED',
   'nReturned': 50,
   'executionTimeMillisEstimate': 0,
   'works': 51,
   'advanced': 50,
   'needTime': 0,
   'nee

In [107]:
# The query explained above, executed: the year of every economics prize.
list(db.prizes.find({"category": "economics"}, {"year": 1, "_id": 0}))

[{'year': '1969'},
 {'year': '1970'},
 {'year': '1971'},
 {'year': '1972'},
 {'year': '1973'},
 {'year': '1974'},
 {'year': '1975'},
 {'year': '1976'},
 {'year': '1977'},
 {'year': '1978'},
 {'year': '1979'},
 {'year': '1980'},
 {'year': '1981'},
 {'year': '1982'},
 {'year': '1983'},
 {'year': '1984'},
 {'year': '1985'},
 {'year': '1986'},
 {'year': '1987'},
 {'year': '1988'},
 {'year': '1989'},
 {'year': '1990'},
 {'year': '1991'},
 {'year': '1992'},
 {'year': '1993'},
 {'year': '1994'},
 {'year': '1995'},
 {'year': '1996'},
 {'year': '1997'},
 {'year': '1998'},
 {'year': '1999'},
 {'year': '2000'},
 {'year': '2001'},
 {'year': '2002'},
 {'year': '2003'},
 {'year': '2004'},
 {'year': '2005'},
 {'year': '2006'},
 {'year': '2007'},
 {'year': '2008'},
 {'year': '2009'},
 {'year': '2010'},
 {'year': '2011'},
 {'year': '2012'},
 {'year': '2013'},
 {'year': '2014'},
 {'year': '2015'},
 {'year': '2016'},
 {'year': '2017'},
 {'year': '2018'}]

In [117]:
 db.prizes.find_one(
    {"year": 1, "laureates.share": 1},
    sort=[("year", -1)]
)

In [119]:
# Compound index for the query below: equality on category, then year
# descending to serve the sort
index_model = [("category", 1), ("year", -1)]
db.prizes.create_index(index_model)

# For each category (alphabetical), take the most recent prize that has a
# laureate with share "1" and render it as "<category>: <year>\n"
report = "".join(
    "{category}: {year}\n".format(
        **db.prizes.find_one(
            {"category": category, "laureates.share": "1"},
            sort=[("year", -1)],
        )
    )
    for category in sorted(db.prizes.distinct("category"))
)

print(report)

chemistry: 2011
economics: 2017
literature: 2017
medicine: 2016
peace: 2017
physics: 1992



In [120]:
from collections import Counter

# Ensure an index on country of birth
db.laureates.create_index([("bornCountry", 1)])

# For every country of birth, count the laureates born there who also have a
# prize affiliation in that same country (one count query per country)
born_countries = db.laureates.distinct("bornCountry")
n_born_and_affiliated = {}
for country in born_countries:
    n_born_and_affiliated[country] = db.laureates.count_documents(
        {"bornCountry": country, "prizes.affiliations.country": country}
    )

# Keep the five countries with the highest counts
five_most_common = Counter(n_born_and_affiliated).most_common(5)
print(five_most_common)

[('USA', 241), ('United Kingdom', 56), ('France', 26), ('Germany', 19), ('Japan', 17)]


In [121]:
# Pagination: among prizes with a one-third-share laureate, skip the first 6
# matches and return the next 3. No sort is given, so which prizes land on
# this "page" depends on the order in which the server returns them.
for doc in db.prizes.find({"laureates.share":"3"}, skip=6,limit=3):
    print("{year} {category}".format(**doc))

2011 peace
2010 chemistry
2008 chemistry


In [122]:
# Chaining limit() twice: the later call replaces the earlier one, which is
# why the output below has five documents rather than three.
list(db.prizes.find({"category": "economics"},
                    {"year": 1, "_id": 0})
     .sort("year")
     .limit(3)
     .limit(5))

[{'year': '1969'},
 {'year': '1970'},
 {'year': '1971'},
 {'year': '1972'},
 {'year': '1973'}]

In [123]:
# Reminder of the prize document structure before projecting nested laureate fields.
db.prizes.find_one()

{'_id': ObjectId('60016c9d585e08fd9974875c'),
 'year': '2018',
 'category': 'physics',
 'overallMotivation': '“for groundbreaking inventions in the field of laser physics”',
 'laureates': [{'id': '960',
   'firstname': 'Arthur',
   'surname': 'Ashkin',
   'motivation': '"for the optical tweezers and their application to biological systems"',
   'share': '2'},
  {'id': '961',
   'firstname': 'Gérard',
   'surname': 'Mourou',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'},
  {'id': '962',
   'firstname': 'Donna',
   'surname': 'Strickland',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'}]}

In [125]:
from pprint import pprint

# Prizes that have at least one laureate with a quarter share
filter_ = {"laureates.share": "4"}

# Fields to return for each prize
projection = ["category", "year", "laureates.motivation"]

# The five earliest such prizes, with sort and limit passed straight to find()
cursor = db.prizes.find(filter_, projection, sort=[("year", 1)], limit=5)
pprint(list(cursor))

[{'_id': ObjectId('60016c9d585e08fd99748951'),
  'category': 'physics',
  'laureates': [{'motivation': '"in recognition of the extraordinary services '
                               'he has rendered by his discovery of '
                               'spontaneous radioactivity"'},
                {'motivation': '"in recognition of the extraordinary services '
                               'they have rendered by their joint researches '
                               'on the radiation phenomena discovered by '
                               'Professor Henri Becquerel"'},
                {'motivation': '"in recognition of the extraordinary services '
                               'they have rendered by their joint researches '
                               'on the radiation phenomena discovered by '
                               'Professor Henri Becquerel"'}],
  'year': '1903'},
 {'_id': ObjectId('60016c9d585e08fd997488f8'),
  'category': 'chemistry',
  'laureates': [{'motivation':

In [126]:
# Peek at one laureate document: personal fields plus a "prizes" array whose
# elements carry year, category, share, motivation and affiliations.
db.laureates.find_one()

{'_id': ObjectId('60016c9b585e08fd997483b5'),
 'id': '2',
 'firstname': 'Hendrik Antoon',
 'surname': 'Lorentz',
 'born': '1853-07-18',
 'died': '1928-02-04',
 'bornCountry': 'the Netherlands',
 'bornCountryCode': 'NL',
 'bornCity': 'Arnhem',
 'diedCountry': 'the Netherlands',
 'diedCountryCode': 'NL',
 'gender': 'male',
 'prizes': [{'year': '1902',
   'category': 'physics',
   'share': '2',
   'motivation': '"in recognition of the extraordinary service they rendered by their researches into the influence of magnetism upon radiation phenomena"',
   'affiliations': [{'name': 'Leiden University',
     'city': 'Leiden',
     'country': 'the Netherlands'}]}]}

In [128]:
from pprint import pprint

# Write a function to retrieve a page of data
def get_particle_laureates(page_number=1, page_size=3):
    """Return one page of laureates whose prize motivation mentions "particle".

    Results are ordered by prize year and then surname, both ascending.

    Args:
        page_number: 1-based page index.
        page_size: Number of laureates per page.

    Returns:
        A list of laureate documents holding _id, firstname, surname and prizes.

    Raises:
        ValueError: If page_number is not an integer greater than or equal to 1.
    """
    # Check the type first: comparing a non-number (e.g. "1" or None) with 1
    # would raise TypeError before the intended ValueError could be raised.
    if not isinstance(page_number, int) or page_number < 1:
        raise ValueError("Pages are natural numbers (starting from 1).")
    particle_laureates = list(
        db.laureates.find(
            {"prizes.motivation": {"$regex": "particle"}},
            ["firstname", "surname", "prizes"])
        .sort([("prizes.year", 1), ("surname", 1)])
        # skip the documents that belong to the preceding pages
        .skip(page_size * (page_number - 1))
        .limit(page_size))
    return particle_laureates

# Collect and save the first nine pages (range's stop value is exclusive, so
# pages 1..9 need range(1, 10))
pages = [get_particle_laureates(page_number=page) for page in range(1, 10)]
pprint(pages[0])

[{'_id': ObjectId('60016c9c585e08fd99748414'),
  'firstname': 'Charles Thomson Rees',
  'prizes': [{'affiliations': [{'city': 'Cambridge',
                                'country': 'United Kingdom',
                                'name': 'University of Cambridge'}],
              'category': 'physics',
              'motivation': '"for his method of making the paths of '
                            'electrically charged particles visible by '
                            'condensation of vapour"',
              'share': '2',
              'year': '1927'}],
  'surname': 'Wilson'},
 {'_id': ObjectId('60016c9c585e08fd9974842a'),
  'firstname': 'Sir John Douglas',
  'prizes': [{'affiliations': [{'city': 'Harwell, Berkshire',
                                'country': 'United Kingdom',
                                'name': 'Atomic Energy Research '
                                        'Establishment'}],
              'category': 'physics',
              'motivation': '"for their pione

In [133]:
# The query written as a cursor. It is only built here, never iterated; it is
# kept for comparison with the equivalent aggregation pipeline below.
cursor = (db.laureates.find(
    projection={"firstname": 1, "prizes.year": 1, "_id": 0},
    filter={"gender": "org"})
 .limit(3).sort("prizes.year", -1))

# The same query as aggregation stages. Stages run in the order they are
# listed, so $sort is placed before $limit to get the three latest entries.
project_stage = {"$project": {"firstname": 1, "prizes.year": 1, "_id": 0}}
match_stage = {"$match": {"gender": "org"}}
limit_stage = {"$limit": 3}
sort_stage = {"$sort": {"prizes.year": -1}}
pipeline=[match_stage, project_stage, sort_stage, limit_stage]
list(db.laureates.aggregate(pipeline))

[{'firstname': 'International Campaign to Abolish Nuclear Weapons (ICAN)',
  'prizes': [{'year': '2017'}]},
 {'firstname': 'National Dialogue Quartet', 'prizes': [{'year': '2015'}]},
 {'firstname': 'Organisation for the Prohibition of Chemical Weapons (OPCW)',
  'prizes': [{'year': '2013'}]}]

In [134]:
# Translate cursor to aggregation pipeline:
# filter -> $match, projection -> $project, limit -> $limit
pipeline = [
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$limit": 3}
]

# Print each laureate's country of birth next to the projected prizes.
for doc in db.laureates.aggregate(pipeline):
    print("{bornCountry}: {prizes}".format(**doc))

the Netherlands: [{'affiliations': [{'country': 'the Netherlands'}]}]
USA: [{'affiliations': [{'country': 'USA'}]}]
USA: [{'affiliations': [{'country': 'USA'}]}]


In [135]:
from collections import OrderedDict
from itertools import groupby
from operator import itemgetter

# The categories that were awarded in 1901.
original_categories = set(db.prizes.distinct("category", {"year": "1901"}))

# Save a pipeline to collect original-category prizes, newest year first
pipeline = [
    {"$match": {"category": {"$in": list(original_categories)}}},
    {"$project": {"category": 1, "year": 1}},
    {"$sort": OrderedDict([("year", -1)])}
]
cursor = db.prizes.aggregate(pipeline)
# groupby only merges adjacent items with the same key, which is why the
# pipeline sorts by year first.
for key, group in groupby(cursor, key=itemgetter("year")):
    # original categories that have no prize document for this year
    missing = original_categories - {doc["category"] for doc in group}
    if missing:
        print("{year}: {missing}".format(year=key, missing=", ".join(sorted(missing))))

2018: literature
1972: peace
1967: peace
1966: peace
1956: peace
1955: peace
1948: peace
1943: literature, peace
1939: peace
1935: literature
1934: physics
1933: chemistry
1932: peace
1931: physics
1928: peace
1925: medicine
1924: chemistry, peace
1923: peace
1921: medicine
1919: chemistry
1918: literature, medicine, peace
1917: chemistry, medicine
1916: chemistry, medicine, peace, physics
1915: medicine, peace
1914: literature, peace


# Field paths
Use field paths (a field name prefixed with `$`, e.g. `"$prizes"`) inside aggregation expressions to create new fields.

In [136]:
# "$prizes" is a field path: $size computes the length of each laureate's
# prizes array into a new field n_prizes. next() fetches only the first result.
db.laureates.aggregate([{"$project": {"n_prizes":{"$size":"$prizes"}}}]).next()

{'_id': ObjectId('60016c9b585e08fd997483b5'), 'n_prizes': 1}

In [139]:
# Total number of laureate-prize pairs: compute each laureate's prize count,
# then $group with _id None puts all documents in one group and sums the counts.
list(db.laureates.aggregate([
    {"$project": {"n_prizes": {"$size": "$prizes"}}},
    {"$group": {"_id": None, "n_prizes_total": {"$sum": "$n_prizes"}}}
]))

[{'_id': None, 'n_prizes_total': 941}]

In [142]:
# Consistency check on shares. "allThree" is true when the set of a prize's
# shares is exactly {"3"}; "noneThree" is true when "3" is not among its shares.
# $nor keeps the prizes where neither holds, i.e. where a one-third share is
# mixed with other shares. An empty result means no such prize exists.
list(db.prizes.aggregate([
    {"$project": {"allThree": {"$setEquals": ["$laureates.share", ["3"]]},
                  "noneThree": {"$not": {"$setIsSubset": [["3"], "$laureates.share"]}}}},
    {"$match": {"$nor": [{"allThree": True}, {"noneThree": True}]}}]))

[]

In [143]:
# Count prizes awarded (at least partly) to organizations as a sum over sizes of "prizes" arrays.
pipeline = [
    # keep organization laureates only
    {"$match": {"gender": "org"}},
    # number of prizes held by each organization
    {"$project": {"n_prizes": {"$size": "$prizes"}}},
    # a single group (_id None) summing the per-organization counts
    {"$group": {"_id": None, "n_prizes_total": {"$sum": "$n_prizes"}}}
]

print(list(db.laureates.aggregate(pipeline)))

[{'_id': None, 'n_prizes_total': 27}]


In [146]:
from collections import OrderedDict

# Same report as the groupby version, computed entirely inside the pipeline.
# The categories are kept as a sorted list because they are embedded in the
# pipeline document below.
original_categories = sorted(set(db.prizes.distinct("category", {"year": "1901"})))
pipeline = [
    {"$match": {"category": {"$in": original_categories}}},
    {"$project": {"category": 1, "year": 1}},
    
    # Collect the set of category values for each prize year.
    {"$group": {"_id": "$year", "categories": {"$addToSet": "$category"}}},
    
    # Project categories *not* awarded (i.e., that are missing this year).
    {"$project": {"missing": {"$setDifference": [original_categories, "$categories"]}}},
    
    # Only include years with at least one missing category
    # ("missing.0" exists only when the array is non-empty).
    {"$match": {"missing.0": {"$exists": True}}},
    
    # Sort in reverse chronological order. Note that "_id" is a distinct year at this stage.
    {"$sort": OrderedDict([("_id", -1)])},
]
for doc in db.prizes.aggregate(pipeline):
    print("{year}: {missing}".format(year=doc["_id"],missing=", ".join(sorted(doc["missing"]))))

2018: literature
1972: peace
1967: peace
1966: peace
1956: peace
1955: peace
1948: peace
1943: literature, peace
1939: peace
1935: literature
1934: physics
1933: chemistry
1932: peace
1931: physics
1928: peace
1925: medicine
1924: chemistry, peace
1923: peace
1921: medicine
1919: chemistry
1918: literature, medicine, peace
1917: chemistry, medicine
1916: chemistry, medicine, peace, physics
1915: medicine, peace
1914: literature, peace


## Get element of array

In [147]:
# $unwind emits one document per element of the laureates array, so after it
# "laureates" is a single embedded document rather than a list (see output).
list(db.prizes.aggregate([
    {"$unwind":"$laureates"},
    {"$project": {
        "_id":0, "year":1, "category":1,
        "laureates.surname": 1, "laureates.share": 1
    }},
    {"$limit": 3}
]))

[{'year': '2018',
  'category': 'physics',
  'laureates': {'surname': 'Ashkin', 'share': '2'}},
 {'year': '2018',
  'category': 'physics',
  'laureates': {'surname': 'Mourou', 'share': '4'}},
 {'year': '2018',
  'category': 'physics',
  'laureates': {'surname': 'Strickland', 'share': '4'}}]

## $unwind

In [148]:
# Number of distinct countries of birth among the laureates of each category.
pipeline = [
    # Unwind the laureates array
    {"$unwind": "$laureates"},
    # Join each prize laureate with the laureates collection on the "id" field
    {"$lookup": {
        "from": "laureates", "foreignField": "id",
        "localField": "laureates.id", "as": "laureate_bios"}},

    # Unwind the new laureate_bios array
    {"$unwind": "$laureate_bios"},
    {"$project": {"category": 1,
                  "bornCountry": "$laureate_bios.bornCountry"}},

    # Collect bornCountry values associated with each prize category
    {"$group": {"_id": "$category",
                "bornCountries": {"$addToSet": "$bornCountry"}}},

    # Project out the size of each category's (set of) bornCountries.
    # NOTE(review): after $group the category is held in "_id", so "category": 1
    # selects nothing here -- the output below has only _id and nBornCountries.
    {"$project": {"category": 1,
                  "nBornCountries": {"$size": "$bornCountries"}}},
    {"$sort": {"nBornCountries": -1}},
]
for doc in db.prizes.aggregate(pipeline): print(doc)

{'_id': 'literature', 'nBornCountries': 55}
{'_id': 'peace', 'nBornCountries': 50}
{'_id': 'chemistry', 'nBornCountries': 48}
{'_id': 'medicine', 'nBornCountries': 44}
{'_id': 'physics', 'nBornCountries': 44}
{'_id': 'economics', 'nBornCountries': 21}


## $bucket

In [150]:
# How many prizes were awarded to immigrants?
pipeline = [
    # Limit results to people; project needed fields; unwind prizes
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
  
    # Count prizes with no country-of-birth affiliation: flag whether the
    # country of birth is among the prize's affiliation countries, keep the
    # prizes where it is not, and count them.
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"},
]

print(list(db.laureates.aggregate(pipeline)))

[{'awardedElsewhere': 478}]


In [151]:
# Same pipeline as in the previous cell, rebuilt so an extra stage can be inserted.
pipeline = [
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"},
]

# Construct the additional filter stage: keep only prizes whose affiliation
# country is one of the known affiliation-country values, which leaves out
# prizes that have no affiliation country recorded.
added_stage = {"$match": {"prizes.affiliations.country": {"$in": db.laureates.distinct("prizes.affiliations.country")}}}

# Insert this stage into the pipeline at index 3, i.e. right after $unwind and
# before the $addFields comparison.
pipeline.insert(3, added_stage)
print(list(db.laureates.aggregate(pipeline)))

[{'awardedElsewhere': 252}]
