# connect to mongo and get data

In [1]:
import pymongo
from pymongo import MongoClient
import datetime
from bson.objectid import ObjectId
from bson.json_util import dumps
from hashlib import md5
import pprint

In [63]:
# Create a MongoClient to the running mongod instance with the default host
# and port. The three forms below are equivalent alternatives; every
# MongoClient holds its own connections, so a client that is about to be
# replaced is closed first instead of being silently abandoned.
client = MongoClient()
client.close()

# we can also specify the host and port explicitly
client = MongoClient('localhost', 27017)
client.close()

# ... or use the MongoDB URI format; this is the client used from here on
client = MongoClient('mongodb://localhost:27017/')

In [64]:
# access the database (MongoDB creates it lazily, on the first write, if it does not exist yet)
db = client['test-database']

In [65]:
# access a collection (MongoDB creates it lazily, on the first insert, if it does not exist yet)
collection = db['posts']
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'test-database'), 'posts')

In [66]:
db.list_collection_names() # print all the collections in the db

['posts']

# insert and query data

## single document

In [67]:
# drop previous collection
db.posts.drop()
posts = db.posts

In [68]:
# insert a manufactured social media post, using the insert_one() command
post = {"author": "Mike",
        "text": "My first blog post!",
        "tags": ["mongodb", "python", "pymongo"],
        "date": datetime.datetime.now(datetime.UTC)}

post_id = posts.insert_one(post).inserted_id
post_id

ObjectId('682845b63b7073c4346369eb')

In [69]:
db.list_collection_names() # print all the collections in the db

['posts']

In [70]:
# query a document, using the find_one() command
pprint.pprint(posts.find_one({"author": "Mike"}))

{'_id': ObjectId('682845b63b7073c4346369eb'),
 'author': 'Mike',
 'date': datetime.datetime(2025, 5, 17, 8, 15, 50, 74000),
 'tags': ['mongodb', 'python', 'pymongo'],
 'text': 'My first blog post!'}


In [71]:
posts.find_one({"author": "Eliot"})

In [72]:
# querying by ObjectId
pprint.pprint(posts.find_one({"_id": post_id}))

{'_id': ObjectId('682845b63b7073c4346369eb'),
 'author': 'Mike',
 'date': datetime.datetime(2025, 5, 17, 8, 15, 50, 74000),
 'tags': ['mongodb', 'python', 'pymongo'],
 'text': 'My first blog post!'}


In [73]:
# an ObjectId is an object, not a string: querying `_id` with its string form finds nothing (returns None)
post_id_as_str = str(post_id)
posts.find_one({"_id": post_id_as_str})

In [74]:
# to convert a string to ObjectId
def get_ObjId(id):
    """Return `id` as a bson ObjectId.

    The value is rendered with str() before conversion, so an existing
    ObjectId and its string form are both accepted.
    """
    id_text = str(id)
    return ObjectId(id_text)

def get(str_id):
    """Look up one document in `db.posts` by the string form of its `_id`.

    Returns whatever `find_one` yields for the converted `_id`.
    """
    # `_id` values are ObjectIds, so the string must be converted before querying.
    return db.posts.find_one({"_id": get_ObjId(str_id)})

pprint.pprint(get(post_id))

{'_id': ObjectId('682845b63b7073c4346369eb'),
 'author': 'Mike',
 'date': datetime.datetime(2025, 5, 17, 8, 15, 50, 74000),
 'tags': ['mongodb', 'python', 'pymongo'],
 'text': 'My first blog post!'}


## bulk insert

In [81]:
# insert many manufactured social media posts at once, using the insert_many() command
new_posts = [{"author": "Mike",
              "text": "Another post!",
              "tags": ["bulk", "insert"],
              "date": datetime.datetime(2009, 11, 12, 11, 14)},
             {"author": "Eliot",
              "title": "MongoDB is fun",
              "text": "and pretty easy too!",
              "date": datetime.datetime(2009, 11, 10, 10, 45)}]
result = posts.insert_many(new_posts)
result.inserted_ids

[ObjectId('682846083b7073c4346369f0'), ObjectId('682846083b7073c4346369f1')]

In [82]:
for post in posts.find():
    pprint.pprint(post)

{'_id': ObjectId('682845b63b7073c4346369eb'),
 'author': 'Mike',
 'date': datetime.datetime(2025, 5, 17, 8, 15, 50, 74000),
 'tags': ['mongodb', 'python', 'pymongo'],
 'text': 'My first blog post!'}
{'_id': ObjectId('682845bd3b7073c4346369ec'),
 'author': 'Mike',
 'date': datetime.datetime(2009, 11, 12, 11, 14),
 'tags': ['bulk', 'insert'],
 'text': 'Another post!'}
{'_id': ObjectId('682845bd3b7073c4346369ed'),
 'author': 'Eliot',
 'date': datetime.datetime(2009, 11, 10, 10, 45),
 'text': 'and pretty easy too!',
 'title': 'MongoDB is fun'}
{'_id': ObjectId('682846083b7073c4346369f0'),
 'author': 'Mike',
 'date': datetime.datetime(2009, 11, 12, 11, 14),
 'tags': ['bulk', 'insert'],
 'text': 'Another post!'}
{'_id': ObjectId('682846083b7073c4346369f1'),
 'author': 'Eliot',
 'date': datetime.datetime(2009, 11, 10, 10, 45),
 'text': 'and pretty easy too!',
 'title': 'MongoDB is fun'}


In [85]:
# Remove all duplicated documents (using a hash): two documents count as
# duplicates when they are identical once `_id` is ignored. The first
# occurrence seen by the cursor is kept.
record_hashes = set()
duplicate_ids = []

for record in db.posts.find():
    # remove the id from the record, so that the hash is evaluated on the document without id
    record_id = record.pop('_id')
    # sort_keys=True makes the hash independent of field order, so the same
    # fields stored in a different order are still recognised as a duplicate
    record_hash = md5(dumps(record, sort_keys=True).encode("utf-8")).hexdigest()

    if record_hash in record_hashes:
        duplicate_ids.append(record_id)
    else:
        record_hashes.add(record_hash)

# Delete after the scan, in a single round trip, instead of issuing one
# delete per duplicate while the cursor over the same collection is open.
if duplicate_ids:
    db.posts.delete_many({'_id': {'$in': duplicate_ids}})

for post in db.posts.find():
    pprint.pprint(post)

{'_id': ObjectId('682845b63b7073c4346369eb'),
 'author': 'Mike',
 'date': datetime.datetime(2025, 5, 17, 8, 15, 50, 74000),
 'tags': ['mongodb', 'python', 'pymongo'],
 'text': 'My first blog post!'}
{'_id': ObjectId('682845bd3b7073c4346369ec'),
 'author': 'Mike',
 'date': datetime.datetime(2009, 11, 12, 11, 14),
 'tags': ['bulk', 'insert'],
 'text': 'Another post!'}
{'_id': ObjectId('682845bd3b7073c4346369ed'),
 'author': 'Eliot',
 'date': datetime.datetime(2009, 11, 10, 10, 45),
 'text': 'and pretty easy too!',
 'title': 'MongoDB is fun'}


In [86]:
# Remove duplicates based on a key: keep one document per (author, text)
# pair and write the result to the `deduplicated` collection.
pipeline = [
    # $first is only meaningful on ordered input; sorting by _id makes the
    # kept document deterministic (the smallest _id within each group).
    {"$sort": {"_id": 1}},
    {"$group": {
        # the group key becomes the new _id of each output document
        "_id": {'author': "$author", 'text': "$text"},
        "date": {"$first": '$date'},
        "tags": {"$first": '$tags'},
        # the original _id is preserved under "ID"
        "ID": {"$first": '$_id'},
    }},
    # NOTE: only the fields listed in $group survive; anything else
    # (e.g. `title`) is not carried over to the output documents.
    # $out writes the result to `deduplicated`, replacing its previous content.
    {"$out": "deduplicated"},
]

db.posts.aggregate(pipeline)

for post in db.deduplicated.find():
    pprint.pprint(post)

{'ID': ObjectId('682845bd3b7073c4346369ed'),
 '_id': {'author': 'Eliot', 'text': 'and pretty easy too!'},
 'date': datetime.datetime(2009, 11, 10, 10, 45),
 'tags': None}
{'ID': ObjectId('682845b63b7073c4346369eb'),
 '_id': {'author': 'Mike', 'text': 'My first blog post!'},
 'date': datetime.datetime(2025, 5, 17, 8, 15, 50, 74000),
 'tags': ['mongodb', 'python', 'pymongo']}
{'ID': ObjectId('682845bd3b7073c4346369ec'),
 '_id': {'author': 'Mike', 'text': 'Another post!'},
 'date': datetime.datetime(2009, 11, 12, 11, 14),
 'tags': ['bulk', 'insert']}


In [87]:
for post in db.deduplicated.find({"_id.author": "Mike"}):
    pprint.pprint(post)

{'ID': ObjectId('682845b63b7073c4346369eb'),
 '_id': {'author': 'Mike', 'text': 'My first blog post!'},
 'date': datetime.datetime(2025, 5, 17, 8, 15, 50, 74000),
 'tags': ['mongodb', 'python', 'pymongo']}
{'ID': ObjectId('682845bd3b7073c4346369ec'),
 '_id': {'author': 'Mike', 'text': 'Another post!'},
 'date': datetime.datetime(2009, 11, 12, 11, 14),
 'tags': ['bulk', 'insert']}


## counting

In [88]:
db.posts.count_documents({})

3

In [89]:
posts.count_documents({"author": "Mike"})

2

## range queries

In [90]:
d = datetime.datetime(2009, 11, 12, 12)
for post in posts.find({"date": {"$lt": d}}).sort("author"):
    pprint.pprint(post)

{'_id': ObjectId('682845bd3b7073c4346369ed'),
 'author': 'Eliot',
 'date': datetime.datetime(2009, 11, 10, 10, 45),
 'text': 'and pretty easy too!',
 'title': 'MongoDB is fun'}
{'_id': ObjectId('682845bd3b7073c4346369ec'),
 'author': 'Mike',
 'date': datetime.datetime(2009, 11, 12, 11, 14),
 'tags': ['bulk', 'insert'],
 'text': 'Another post!'}
