In [1]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pymongo
from pymongo import MongoClient
import pandas as pd

In [3]:
payments = pd.read_csv("Project 2 Launch  - Payments.csv")
rooms = pd.read_csv("Project 2 Launch  - Rooms.csv")
reserve = pd.read_csv("Project 2 Launch  - Reservations.csv")
guests = pd.read_csv("Project 2 Launch  - Guests.csv")

In [4]:
rooms.columns = rooms.columns.str.strip()
reserve.columns = reserve.columns.str.strip()
payments.columns = payments.columns.str.strip()
guests.columns = guests.columns.str.strip()

In [5]:
launch = MongoClient('mongodb://localhost:27017/')

In [6]:
db = launch.project

In [7]:
collection = db.payments
collection1 = db.rooms
collection2 = db.reserve
collection3 = db.guests

In [8]:
data = payments.to_dict(orient='records')
data1 = rooms.to_dict(orient='records')
data2 = reserve.to_dict(orient='records')
data3 = guests.to_dict(orient='records')

In [9]:
collection.insert_many(data)
collection1.insert_many(data1)
collection2.insert_many(data2)
collection3.insert_many(data3)

InsertManyResult([ObjectId('6659cbed50daff0c7009108d'), ObjectId('6659cbed50daff0c7009108e'), ObjectId('6659cbed50daff0c7009108f'), ObjectId('6659cbed50daff0c70091090'), ObjectId('6659cbed50daff0c70091091'), ObjectId('6659cbed50daff0c70091092'), ObjectId('6659cbed50daff0c70091093'), ObjectId('6659cbed50daff0c70091094'), ObjectId('6659cbed50daff0c70091095'), ObjectId('6659cbed50daff0c70091096'), ObjectId('6659cbed50daff0c70091097'), ObjectId('6659cbed50daff0c70091098'), ObjectId('6659cbed50daff0c70091099'), ObjectId('6659cbed50daff0c7009109a'), ObjectId('6659cbed50daff0c7009109b'), ObjectId('6659cbed50daff0c7009109c'), ObjectId('6659cbed50daff0c7009109d'), ObjectId('6659cbed50daff0c7009109e'), ObjectId('6659cbed50daff0c7009109f'), ObjectId('6659cbed50daff0c700910a0'), ObjectId('6659cbed50daff0c700910a1'), ObjectId('6659cbed50daff0c700910a2'), ObjectId('6659cbed50daff0c700910a3'), ObjectId('6659cbed50daff0c700910a4'), ObjectId('6659cbed50daff0c700910a5'), ObjectId('6659cbed50daff0c700910

In [10]:
initial_pipeline = [
    {
        "$match": {
            # Add criteria to filter your primary collection before the lookup
            "Amount": {"$gte": 1000}  # Example filter
        }
    },
    {
        "$lookup": {
            "from": "reserve",  # The target collection to join
            "localField": "Reservation_ID",  # Field from the input documents (payments collection)
            "foreignField": "Reservation_ID",  # Field from the documents of the 'reserve' collection
            "as": "reserve"  # Output array field to add to each input document
        }
    },
    {
        "$unwind": "$reserve"  # Unwind the 'reserve' array
    },
    {
        "$lookup": {
            "from": "guests",  # The target collection to join
            "localField": "reserve.Guest_ID",  # Field from the 'reserve' documents
            "foreignField": "Guest_ID",  # Field from the documents of the 'guests' collection
            "as": "guests"  # Output array field to add to each input document
        }
    },
    {
        "$unwind": "$guests"  # Unwind the 'guests' array
    },
    {
        "$project": {
            "Amount": 1,  # Include the 'Amount' field from payments collection
            "Guest_ID": "$reserve.Guest_ID",  # Include the 'Guest_ID' from reserve
            "Email": "$guests.Email",  # Include the 'Email' from guests
            "Name": "$guests.Name"  # Include the 'Name' from guests
        }
    },
    {
        "$sort": {
            "Amount": -1  # Sort by 'Amount' field in descending order
        }
    }
]

# Add the duplicate removal stage
deduplication_stage = [
    {
        "$group": {
            "_id": {
                "Amount": "$Amount",
                "Guest_ID": "$Guest_ID",
                "Email": "$Email",
                "Name": "$Name"
            },
            "doc": { "$first": "$$ROOT" }
        }
    },
    {
        "$replaceRoot": { "newRoot": "$doc" }
    }
]

# Add a match stage to filter out documents without all required fields
cleanup_stage = [
    {
        "$match": {
            "Guest_ID": { "$exists": True },
            "Email": { "$exists": True },
            "Name": { "$exists": True }
        }
    },
    {
        "$sort": {
            "Amount": -1  # Sort by 'Amount' field in descending order
        }
    }
]

# Combine all pipeline stages
pipeline = initial_pipeline + deduplication_stage + cleanup_stage

# Execute the pipeline
result = list(collection.aggregate(pipeline))

# Print the results
for doc in result:
    print(doc)

{'_id': ObjectId('66573e20002bea8b8345f4fd'), 'Amount': 3300, 'Guest_ID': 975786, 'Email': 'j9b2@example.com', 'Name': 'Zoe Watson'}
{'_id': ObjectId('66573e20002bea8b8345f4ef'), 'Amount': 2400, 'Guest_ID': 547839, 'Email': 'd7k8h2@example.com', 'Name': 'Isabella Hayes'}
{'_id': ObjectId('66573e20002bea8b8345f4eb'), 'Amount': 1950, 'Guest_ID': 834726, 'Email': 'q6a5b3@example.com', 'Name': 'Sophia Bennett'}
{'_id': ObjectId('66573e20002bea8b8345f4f9'), 'Amount': 1500, 'Guest_ID': 527038, 'Email': 'h2d4@example.com', 'Name': 'Lily Edwards'}
{'_id': ObjectId('66573e20002bea8b8345f4f1'), 'Amount': 1350, 'Guest_ID': 276385, 'Email': 'c6b7t3@example.com', 'Name': 'Charlotte Jenkins'}
{'_id': ObjectId('66573e20002bea8b8345f4f7'), 'Amount': 1350, 'Guest_ID': 672839, 'Email': 't9x2d4@example.com', 'Name': 'Grace Henderson'}
{'_id': ObjectId('66573e20002bea8b8345f4e9'), 'Amount': 1200, 'Guest_ID': 384920, 'Email': 'j1l4d8@example.com', 'Name': 'Emma Sullivan'}
{'_id': ObjectId('66573e20002bea8b