# connect to mongo and get data

In [None]:
import pymongo
from pymongo import MongoClient
import datetime
from bson.objectid import ObjectId
from bson.json_util import dumps
from hashlib import md5
import pprint

In [None]:
# Create one MongoClient for the mongod instance running on the default host
# and port. The following forms are equivalent ways to reach it:
#   MongoClient()                               (default host and port)
#   MongoClient('localhost', 27017)             (explicit host and port)
#   MongoClient('mongodb://localhost:27017/')   (MongoDB URI format)
# Only one client is instantiated: rebinding `client` to a fresh MongoClient
# each time would leave the earlier instances open without ever closing them.
client = MongoClient('mongodb://localhost:27017/')

In [None]:
# get a handle to the "test-database" database (or create it if it does not exist)
db = client["test-database"]

In [None]:
# get a handle to the "posts" collection (or create it if it does not exist)
collection = db.posts
collection

In [None]:
# show the names of all the collections in the db
db.list_collection_names()

# insert and query data

## single document

In [None]:
# start from a clean state: bind `posts` to the collection and drop its previous content
posts = db.posts
posts.drop()

In [None]:
# insert a manufactured social media post, using the insert_one() command
post = {
    "author": "Mike",
    "text": "My first blog post!",
    "tags": ["mongodb", "python", "pymongo"],
    # timezone-aware timestamp of the moment the post is built
    "date": datetime.datetime.now(datetime.UTC),
}

# keep the id of the inserted document for the lookups below
post_id = posts.insert_one(post).inserted_id
post_id

In [None]:
# list the collections in the db again, now that a document has been inserted
db.list_collection_names()

In [None]:
# fetch a single document matching the author, using the find_one() command
pprint.pprint(
    posts.find_one({"author": "Mike"})
)

In [None]:
# look up an author with no post inserted so far; find_one() returns None when nothing matches
posts.find_one({"author": "Eliot"})

In [None]:
# look the document up through the ObjectId returned by insert_one()
pprint.pprint(
    posts.find_one({"_id": post_id})
)

In [None]:
# an ObjectId is an object, not a string: here the query is made with
# the string form of the id instead of the ObjectId itself
post_id_as_str = str(post_id)
posts.find_one({"_id": post_id_as_str})

In [None]:
# helpers to go from the string form of an id back to an ObjectId
def get_ObjId(id):
    """Return the ObjectId built from the string representation of *id*."""
    return ObjectId(str(id))


def get(str_id):
    """Return the document of ``db.posts`` whose ``_id`` corresponds to *str_id*.

    The id is converted to an ObjectId before querying; the result is
    whatever ``find_one()`` returns for that ``_id``.
    """
    return db.posts.find_one({"_id": get_ObjId(str_id)})


pprint.pprint(get(post_id))

## bulk insert

In [None]:
# insert several manufactured social media posts at once, using the insert_many() command
new_posts = [
    {
        "author": "Mike",
        "text": "Another post!",
        "tags": ["bulk", "insert"],
        "date": datetime.datetime(2009, 11, 12, 11, 14),
    },
    {
        "author": "Eliot",
        "title": "MongoDB is fun",
        "text": "and pretty easy too!",
        "date": datetime.datetime(2009, 11, 10, 10, 45),
    },
]
result = posts.insert_many(new_posts)
# ids of the inserted documents
result.inserted_ids

In [None]:
# print every document of the posts collection (empty filter = no restriction)
for post in posts.find({}):
    pprint.pprint(post)

In [None]:
# Remove all duplicated documents: two documents are duplicates when the hash
# of their content (everything except _id) is the same.
record_hashes = set()

for record in db.posts.find():
    # take the _id out of the record, so that the hash is evaluated on the document without id
    record_id = record.pop('_id')
    # sort_keys=True makes the serialization independent of the order of the
    # fields, so the same content stored with a different field order is still
    # recognised as a duplicate. md5 is only used as a content fingerprint,
    # hence usedforsecurity=False.
    record_hash = md5(
        dumps(record, sort_keys=True).encode("utf-8"),
        usedforsecurity=False,
    ).hexdigest()

    if record_hash in record_hashes:
        # content already seen: delete this copy, the first occurrence is kept
        db.posts.delete_one({'_id': record_id})
    else:
        record_hashes.add(record_hash)

# show what is left in the collection
for post in db.posts.find():
    pprint.pprint(post)

In [None]:
# remove duplicates based on a key: here the (author, text) pair
pipeline = [
    # one output document per distinct (author, text) pair, carrying the
    # date, tags and original _id of the first document of each group
    {
        "$group": {
            "_id": {"author": "$author", "text": "$text"},
            "date": {"$first": "$date"},
            "tags": {"$first": "$tags"},
            "ID": {"$first": "$_id"},
        },
    },
    # write the grouped documents to the "deduplicated" collection
    {"$out": "deduplicated"},
]

db.posts.aggregate(pipeline)

# print the content of the collection produced by the pipeline
for post in db.deduplicated.find():
    pprint.pprint(post)

In [None]:
# the author now lives inside the compound _id, so it is queried with dot notation
for post in db.deduplicated.find({"_id.author": "Mike"}):
    pprint.pprint(post)

## counting

In [None]:
# count all the documents of the posts collection (empty filter)
db.posts.count_documents({})

In [None]:
# count only the documents whose author is Mike
posts.count_documents({"author": "Mike"})

## range queries

In [None]:
# posts dated strictly before 2009-11-12 12:00, sorted by author in ascending order
d = datetime.datetime(year=2009, month=11, day=12, hour=12)
for post in posts.find({"date": {"$lt": d}}).sort("author", pymongo.ASCENDING):
    pprint.pprint(post)