## MongoDB

In [None]:
# import statements
import os
from datetime import datetime, timedelta

import bson
from bson import json_util
from pymongo import MongoClient

### Connection establishment

In [None]:
# Connect to the MongoDB server on localhost (port 27017) and select the
# database used by the first part of the notebook.
client = MongoClient(host='mongodb://localhost:27017/')
db = client['sample_training']

In [None]:
# Directory where the JSON files are stored (relative to the notebook).
json_dir = 'sample_training'
# os.listdir() returns entries in arbitrary order; sorting makes the load
# order reproducible from run to run.
json_files = sorted(f for f in os.listdir(json_dir) if f.endswith(".json"))
# Drop only the trailing extension: str.replace() would also remove a
# ".json" that happens to occur in the middle of a file name.
collections = [os.path.splitext(f)[0] for f in json_files]
collections

In [None]:
# Load every JSON file (one document per line) into the collection that
# shares its base name.
batch_size = 1000  # documents sent per insert_many() round trip

for json_file, collection_name in zip(json_files, collections):
    collection = db[collection_name]
    pending = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(os.path.join(json_dir, json_file), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not documents.
                continue
            # json_util decodes MongoDB Extended JSON ($oid, $date, ...)
            # into the matching BSON/Python types.
            pending.append(json_util.loads(line))
            if len(pending) >= batch_size:
                collection.insert_many(pending)
                pending = []
    if pending:
        # insert_many() rejects an empty list, so flush only real leftovers.
        collection.insert_many(pending)

    print(f"Loaded {json_file} into the '{collection_name}' collection.")

In [None]:
# Show the names of the collections that exist in the current database.
db.list_collection_names()

#### Q: Find all inspections that either occurred in Manhattan or Brooklyn.

- equivalent to:
```
    SELECT * FROM <TABLE NAME>
    WHERE <SOME COLUMN> = <SOME VALUE 1> OR
        <SOME COLUMN> = <SOME VALUE 2>
```

In [None]:
# $or matches a document when at least one of the listed clauses is true.
# City names are compared as exact strings, so the spelling must be right.
cursor = db.inspections.find({
    "$or": [
        {"address.city": "MANHATTAN"},
        {"address.city": "BROOKLYN"}
    ]
})

# NOTE(review): the variable name is kept as-is in case later cells use it;
# it holds the Manhattan/Brooklyn inspections.
ny_la_inspections = list(cursor)
ny_la_inspections

### MongoDB comparison operators

- `$eq`: Matches values that are equal to a specified value.
- `$gt`: Matches values that are greater than a specified value.
- `$gte`: Matches values that are greater than or equal to a specified value.
- `$in`: Matches any of the values specified in an array.
- `$lt`: Matches values that are less than a specified value.
- `$lte`: Matches values that are less than or equal to a specified value.
- `$ne`: Matches all values that are not equal to a specified value.
- `$nin`: Matches none of the values specified in an array.

Documentation: https://www.mongodb.com/docs/manual/reference/operator/query-comparison/

### `limit()` method

- specify the maximum number of documents the cursor will return
- documentation: https://www.mongodb.com/docs/manual/reference/method/cursor.limit/#mongodb-method-cursor.limit

#### Q: Find the first five trips.

- equivalent to: `SELECT * FROM <TABLE NAME> LIMIT <N>`

In [None]:
# Cap the cursor at five documents before materialising it into a list.
trips_cursor = db.trips.find()
five_trips = list(trips_cursor.limit(5))
five_trips

### Sorting using `sort` method

### `sort()` method

- Specify in the sort parameter the field or fields to sort by and a value of 1 or -1 to specify an ascending or descending sort respectively.
- documentation: https://www.mongodb.com/docs/manual/reference/method/cursor.sort/#mongodb-method-cursor.sort

### `$regex`
- documentation: https://www.mongodb.com/docs/manual/reference/operator/query/regex/

#### Q: Find all inspections that occurred in 2015 and sort them by ascending order of `id`.

- equivalent to: `SELECT * FROM <TABLE NAME> WHERE <SOME COL> LIKE <SOME SEARCH TERM> ORDER BY <SOME COL> ASC`

In [None]:
# $regex matches string values, so any date text containing "2015" qualifies.
year_filter = {"date": {"$regex": "2015"}}
inspections_2015 = db.inspections.find(year_filter).sort("id", 1)  # 1 = ascending
list(inspections_2015)

Sort the same results in descending order.

In [None]:
# Same filter as the ascending query; only the sort direction changes.
inspections_2015 = db.inspections.find({
    "date": {"$regex": "2015"}
}).sort("id", -1)  # -1 for descending order
list(inspections_2015)

#### Q: Find all inspections on all incorporated businesses.

In [None]:
# Business names ending in "INC" or "INC." are treated as incorporated.
inc_filter = {"business_name": {"$regex": "INC$|INC\\.$"}}
inc_inspections = db.inspections.find(inc_filter)
list(inc_inspections)[-3:]  # display only the last three matches

### `findOne(query, projection, options)`

- Fetches the first document that matches the query
- documentation: https://www.mongodb.com/docs/manual/reference/method/db.collection.findOne/
- **IMPORTANT**: PyMongo uses snake_case method names instead of the shell's camelCase, so `findOne` becomes `find_one`.

#### Q: Find the first trip.

In [None]:
# With no filter, find_one() returns a single document from the collection
# (or None when the collection is empty).
trip = db.trips.find_one()
trip

### MongoDB shell `mongosh`

```
docker exec -it <container name> mongosh
show dbs
use sample_training
show collections
db.trips.find().limit(5).pretty()
```

### `db.collection.countDocuments(query, options)`

- Returns an integer for the number of documents that match the query of the collection or view.
- documentation: https://www.mongodb.com/docs/manual/reference/method/db.collection.countDocuments/

#### Q: How many trips are in the trips collection?

In [None]:
# An empty filter matches every document, so this counts the whole collection.
db.trips.count_documents({})

#### Q: How many trips were taken by people born after the year 1988?

In [None]:
# Strictly greater than 1988: riders born in 1988 itself are not counted.
born_after_1988 = {"birth year": {"$gt": 1988}}
db.trips.count_documents(born_after_1988)

### AirBnb Listings and Reviews

Source: https://www.mongodb.com/docs/atlas/sample-data/sample-airbnb/

In [None]:
# Point the working database handle at the sample_airbnb database.
db = client.sample_airbnb

In [None]:
# Directory where the JSON files are stored (relative to the notebook).
json_dir = 'sample_airbnb'
# os.listdir() returns entries in arbitrary order; sorting makes the load
# order reproducible from run to run.
json_files = sorted(f for f in os.listdir(json_dir) if f.endswith(".json"))
# Drop only the trailing extension: str.replace() would also remove a
# ".json" that happens to occur in the middle of a file name.
collections = [os.path.splitext(f)[0] for f in json_files]
collections

In [None]:
# Load every JSON file (one document per line) into the collection that
# shares its base name.
batch_size = 1000  # documents sent per insert_many() round trip

for json_file, collection_name in zip(json_files, collections):
    collection = db[collection_name]
    pending = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(os.path.join(json_dir, json_file), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not documents.
                continue
            # json_util decodes MongoDB Extended JSON ($oid, $date, ...)
            # into the matching BSON/Python types.
            pending.append(json_util.loads(line))
            if len(pending) >= batch_size:
                collection.insert_many(pending)
                pending = []
    if pending:
        # insert_many() rejects an empty list, so flush only real leftovers.
        collection.insert_many(pending)

    print(f"Loaded {json_file} into the '{collection_name}' collection.")

In [None]:
# Show the names of the collections that exist in the current database.
db.list_collection_names()

In [None]:
#db.listingsAndReviews.find_one({}, {"_id": 0})

### MongoDB logical operators

- `$and`: Joins query clauses with a logical AND returns all documents that match the conditions of both clauses.
- `$not`: Inverts the effect of a query predicate and returns documents that do not match the query predicate.
- `$nor`: Joins query clauses with a logical NOR returns all documents that fail to match both clauses.
- `$or`: Joins query clauses with a logical OR returns all documents that match the conditions of either clause.

Documentation: https://www.mongodb.com/docs/manual/reference/operator/query-logical/

#### Q: Find listings where the price is 100 and the number of bedrooms is 2.

Default logical operator is `$and`.

In [None]:
# Several fields in one filter document are combined with an implicit AND.
criteria = {"price": 100, "bedrooms": 2}
# Projection: return only these fields and suppress _id.
fields = {"name": 1, "price": 1, "bedrooms": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:5]

You could also explicitly make it clear that `$and` operator is being applied.

In [None]:
# Same conditions, with the AND written out as an explicit list of clauses.
criteria = {"$and": [{"price": 100}, {"bedrooms": 2}]}
fields = {"name": 1, "price": 1, "bedrooms": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:5]

#### Q: Find listings where the price is 100 or the number of bedrooms is 2.

In [None]:
# A listing qualifies when either clause holds (or both).
criteria = {"$or": [{"price": 100}, {"bedrooms": 2}]}
fields = {"name": 1, "price": 1, "bedrooms": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:5]

#### Q: Find listings where the price is not greater than 100.

In [None]:
# $not inverts the inner predicate. Per the MongoDB docs it also matches
# documents that lack the field entirely.
criteria = {"price": {"$not": {"$gt": 100}}}
fields = {"name": 1, "price": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:5]

Of course, you can solve this without using the `$not` operator too.

In [None]:
# "Not greater than 100" expressed directly as "less than or equal to 100".
criteria = {"price": {"$lte": 100}}
fields = {"name": 1, "price": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:5]

#### Q: Find listings where the price is neither 100 nor 200.

In [None]:
# $nor keeps only the documents for which every listed clause is false.
criteria = {"$nor": [{"price": 100}, {"price": 200}]}
fields = {"name": 1, "price": 1, "bedrooms": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:5]

### MongoDB element query operators

- `$exists` (`True` or `False`): Matches documents that have the specified field.
- `$type`: Selects documents if a field is of the specified type.

Documentation: https://www.mongodb.com/docs/manual/reference/operator/query-element/

#### Q: Find listings where the `cleaning_fee` field exists.

In [None]:
# Count the listings whose documents contain a cleaning_fee field.
criteria = {"cleaning_fee": {"$exists": True}}
fields = {"name": 1, "cleaning_fee": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
len(listings)

In [None]:
# The complement: listings whose documents have no cleaning_fee field.
criteria = {"cleaning_fee": {"$exists": False}}
fields = {"name": 1, "cleaning_fee": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
len(listings)

List of types documentation: https://www.mongodb.com/docs/manual/reference/operator/query/type/#mongodb-query-op.-type

#### Q: Find listings where the price is stored as decimal.

In [None]:
# $type filters on the stored BSON type of the field, not on its value.
criteria = {"price": {"$type": "decimal"}}
fields = {"name": 1, "price": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
len(listings)

### MongoDB evaluation query operators

- `$regex`: Selects documents where values match a specified regular expression.
- `$expr`: Allows use of aggregation expressions within the query language.
- `$mod`: Performs a modulo operation on the value of a field and selects documents with a specified result.

Documentation: https://www.mongodb.com/docs/manual/reference/operator/query-evaluation/

#### Q: Find all listings where `extra_people` is more than twice `guests_included`.

In [None]:
# $expr allows two fields of the same document to be compared with each other.
twice_guests = {"$multiply": ["$guests_included", 2]}
criteria = {"$expr": {"$gt": ["$extra_people", twice_guests]}}
fields = {"name": 1, "extra_people": 1, "guests_included": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:1]

#### Q: Find listings where the last_review date is within the last 30 days.
For the purpose of this question, let's assume the current day is March 11th, 2019, which is the most recent `last_review` date in the dataset.

**Self-assessment**: Try writing code to figure this out!

In [None]:
# Reference "today" for the exercise and the start of the 30-day window.
march_11_2019 = datetime(2019, 3, 11, 0, 0)
thirty_days_ago = march_11_2019 - timedelta(days=30)

# Keep listings whose last review is later than the start of the window.
criteria = {"$expr": {"$gt": ["$last_review", thirty_days_ago]}}
fields = {"name": 1, "last_review": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:5]

#### Q: Find listings where the price is a multiple of 5.

In [None]:
# $mod takes [divisor, remainder]: price divided by 5 must leave remainder 0.
criteria = {"price": {"$mod": [5, 0]}}
fields = {"name": 1, "price": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:3]

### Array Query Operators

- `$all`: Matches arrays that contain all elements specified in the query.
- `$elemMatch`: Selects documents if at least one element in the array field matches all the specified $elemMatch conditions.
- `$size`: Selects documents if the array field is a specified size.

- documentation: https://www.mongodb.com/docs/manual/reference/operator/query-array/

#### Q: Find the name and amenities of listings where the number of amenities is exactly 5.

In [None]:
# $size matches arrays with exactly the given number of elements.
criteria = {"amenities": {"$size": 5}}
fields = {"name": 1, "amenities": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:3]

#### Q: Find the name and amenities of all listings that have "Pack ’n Play/travel crib".

In [None]:
# On an array field, $in matches when any element equals a listed value.
criteria = {"amenities": {"$in": ["Pack ’n Play/travel crib"]}}
fields = {"name": 1, "amenities": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:1]

#### Q: Find the name and amenities of all listings that have "Pack ’n Play/travel crib" and "High chair".

In [None]:
# $all requires the array to contain every one of the listed values.
criteria = {"amenities": {"$all": ["Pack ’n Play/travel crib", "High chair"]}}
fields = {"name": 1, "amenities": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:1]

#### Q: Find the name and amenities of listings that have at least one of: "Pack ’n Play/travel crib", "High chair".

In [None]:
# $elemMatch succeeds when at least one array element satisfies the inner
# condition; here, being one of the two listed amenities.
either_amenity = {"$in": ["Pack ’n Play/travel crib", "High chair"]}
criteria = {"amenities": {"$elemMatch": either_amenity}}
fields = {"name": 1, "amenities": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:1]

#### Q: Find all listings with at least 10 amenities.

In [None]:
# The query operator $size only tests for an exact length, so the array
# length is computed with the aggregation $size inside $expr and compared.
amenity_count = {"$size": "$amenities"}
criteria = {"$expr": {"$gte": [amenity_count, 10]}}
fields = {"name": 1, "amenities": 1, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:2]

### MongoDB projection operators

- `$slice`: Limits the number of elements in an array that appear in the query results.
    - Positive \<N\>: Slices first N elements.
    - Negative \<N\>: Slices last N elements.
    - **IMPORTANT NOTE:** Slicing is applied inside projection and not inside the query aka selection.

#### Q: Find the first 3 amenities for each listing.

In [None]:
# Empty filter = every listing; the positive $slice keeps the first three
# elements of the amenities array in each returned document.
criteria = {}
fields = {"name": 1, "amenities": {"$slice": 3}, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:1]

#### Q: Find the last 3 amenities for each listing.

In [None]:
# Empty filter = every listing; the negative $slice keeps the last three
# elements of the amenities array in each returned document.
criteria = {}
fields = {"name": 1, "amenities": {"$slice": -3}, "_id": 0}
cursor = db.listingsAndReviews.find(criteria, fields)
listings = list(cursor)
listings[:1]

### Analytics dataset

Source: https://www.mongodb.com/docs/atlas/sample-data/sample-analytics/

In [None]:
# Point the working database handle at the sample_analytics database.
db = client.sample_analytics

In [None]:
# Directory where the JSON files are stored (relative to the notebook).
json_dir = 'sample_analytics'
# os.listdir() returns entries in arbitrary order; sorting makes the load
# order reproducible from run to run.
json_files = sorted(f for f in os.listdir(json_dir) if f.endswith(".json"))
# Drop only the trailing extension: str.replace() would also remove a
# ".json" that happens to occur in the middle of a file name.
collections = [os.path.splitext(f)[0] for f in json_files]
collections

In [None]:
# Load every JSON file (one document per line) into the collection that
# shares its base name.
batch_size = 1000  # documents sent per insert_many() round trip

for json_file, collection_name in zip(json_files, collections):
    collection = db[collection_name]
    pending = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(os.path.join(json_dir, json_file), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not documents.
                continue
            # json_util decodes MongoDB Extended JSON ($oid, $date, ...)
            # into the matching BSON/Python types.
            pending.append(json_util.loads(line))
            if len(pending) >= batch_size:
                collection.insert_many(pending)
                pending = []
    if pending:
        # insert_many() rejects an empty list, so flush only real leftovers.
        collection.insert_many(pending)

    print(f"Loaded {json_file} into the '{collection_name}' collection.")

### Combining information from multiple collections

#### Q: Find all transactions made by customers born in 1988.

Let's first find relevant information from customers collection.

In [None]:
# Half-open interval [1988-01-01, 1989-01-01) covers the whole of 1988.
start_date = datetime(1988, 1, 1)
end_date = datetime(1989, 1, 1)

birth_window = {"birthdate": {"$gte": start_date, "$lt": end_date}}
wanted_fields = {"accounts": 1, "birthdate": 1, "_id": 1}
# Materialise the cursor so the result can be iterated more than once.
born_in_1988 = list(db.customers.find(birth_window, wanted_fields))

In [None]:
# Flatten every customer's "accounts" list into one combined list.
all_accounts = [
    account
    for customer in born_in_1988
    for account in customer["accounts"]
]

In [None]:
# Fetch the transaction documents whose account_id is one of the accounts
# collected from the selected customers.
account_filter = {"account_id": {"$in": all_accounts}}
transactions = list(db.transactions.find(account_filter))
transactions[:1]