In [2]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.13.0-cp39-cp39-win_amd64.whl (747 kB)
Collecting dnspython<3.0.0,>=1.16.0
  Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.13.0


In [16]:
# Data Exploration - Airbnb Paris Dataset
# Run this notebook locally while MongoDB runs in Docker

import pandas as pd
import os
from pymongo import MongoClient
from datetime import datetime

print("=== Airbnb Data Explorer ===")

# Connection string for the MongoDB instance. It can be overridden through the
# MONGO_URI environment variable so real credentials never have to be written
# into the notebook; the fallback is the URI this notebook originally used.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://admin:admin@localhost:27017/")

client = MongoClient(MONGO_URI)
db = client.airbnb            # database "airbnb"
collection = db.listings      # collection "listings"

# Counting with an empty filter matches every document in the collection.
print(f"Total documents: {collection.count_documents({})}")

=== Airbnb Data Explorer ===
Total documents: 95885


In [5]:
print("=== QUERY 1: Listings per property type ===")

# Count the documents for each property_type value, most common type first.
group_by_type = {"$group": {"_id": "$property_type", "count": {"$sum": 1}}}
most_common_first = {"$sort": {"count": -1}}

result1 = collection.aggregate([group_by_type, most_common_first])

for doc in result1:
    property_type, listing_count = doc["_id"], doc["count"]
    print(f"{property_type}: {listing_count} listings")

=== QUERY 1: Listings per property type ===
Entire rental unit: 80516 listings
Private room in rental unit: 5980 listings
Entire condo: 2679 listings
Room in boutique hotel: 1256 listings
Entire loft: 1007 listings
Room in hotel: 1003 listings
Entire home: 696 listings
Private room in bed and breakfast: 590 listings
Private room in condo: 344 listings
Entire serviced apartment: 277 listings
Shared room in rental unit: 274 listings
Entire townhouse: 268 listings
Private room in home: 122 listings
Private room in townhouse: 86 listings
Private room in loft: 73 listings
Room in aparthotel: 73 listings
Shared room in hostel: 54 listings
Private room in hostel: 53 listings
Entire place: 46 listings
Private room in guesthouse: 45 listings
Private room in guest suite: 38 listings
Private room: 37 listings
Entire vacation home: 37 listings
Private room in casa particular: 36 listings
Entire guesthouse: 28 listings
Tiny home: 26 listings
Shared room in condo: 23 listings
Entire guest suite: 22 

In [None]:
print("=== QUERY 2: Listings on June 12, 2024 ===")

# Exact string match on last_scraped, so this relies on the field holding
# "YYYY-MM-DD" text rather than a BSON date (assumption - confirm against the loader).
result2 = collection.count_documents({"last_scraped": "2024-06-12"})

print(f"Listings on June 12, 2024: {result2}")

=== QUERY 2: Listings on June 12, 2024 ===
Listings on June 12, 2024: 41323


In [7]:
print("=== QUERY 3: Top 5 listings by reviews ===")

# Order every listing by its embedded review count (descending) and keep the first five.
by_review_count = collection.find().sort("reviews.count", -1)
result3 = by_review_count.limit(5)

for i, doc in enumerate(result3, start=1):
    listing_id = doc["listing_id"]
    review_count = doc["reviews"]["count"]
    print(f"{i}. Listing {listing_id}: {review_count} reviews")

=== QUERY 3: Top 5 listings by reviews ===
1. Listing 17222007: 3067 reviews
2. Listing 26244787: 2620 reviews
3. Listing 41020735: 2294 reviews
4. Listing 40194697: 2105 reviews
5. Listing 35145338: 2048 reviews


In [8]:
print("=== QUERY 4: Total unique hosts ===")

# distinct() yields one entry per different host.host_id value in the collection.
unique_hosts = collection.distinct("host.host_id")
host_total = len(unique_hosts)
print(f"Total unique hosts: {host_total}")

=== QUERY 4: Total unique hosts ===
Total unique hosts: 71979


In [9]:
print("=== QUERY 5: Instant bookable rentals ===")

# Listings whose instant_bookable flag is the boolean True, against all listings.
instant_count = collection.count_documents({"instant_bookable": True})
total_count = collection.count_documents({})
# An empty collection would otherwise raise ZeroDivisionError; report 0% instead.
percentage = (instant_count / total_count) * 100 if total_count else 0.0

print(f"Instant bookable: {instant_count}")
print(f"Total listings: {total_count}")
print(f"Percentage: {percentage:.1f}%")

=== QUERY 5: Instant bookable rentals ===
Instant bookable: 22094
Total listings: 95885
Percentage: 23.0%


In [10]:
print("=== QUERY 6: Hosts with 100+ listings ===")

pipeline6 = [
    # One group per host id: keep the first host_name seen and count its listings.
    {"$group": {
        "_id": "$host.host_id",
        "host_name": {"$first": "$host.host_name"},
        "count": {"$sum": 1}
    }},
    # Keep only hosts with at least 100 listings.
    {"$match": {"count": {"$gte": 100}}},
    # Largest hosts first.
    {"$sort": {"count": -1}}
]

result6 = list(collection.aggregate(pipeline6))
total_hosts = len(collection.distinct("host.host_id"))
# With no hosts at all the ratio is undefined; report 0% instead of raising
# ZeroDivisionError.
share_of_hosts = len(result6) / total_hosts * 100 if total_hosts else 0.0

print(f"Hosts with 100+ listings: {len(result6)}")
print(f"Percentage of total hosts: {share_of_hosts:.2f}%")

for doc in result6:
    print(f"Host {doc['_id']} ({doc['host_name']}): {doc['count']} listings")

=== QUERY 6: Hosts with 100+ listings ===
Hosts with 100+ listings: 24
Percentage of total hosts: 0.03%
Host 314994947 (Blueground): 730 listings
Host 33889201 (Veeve): 497 listings
Host 50502817 (Pierre De WeHost): 426 listings
Host 50978178 (Sébastien): 307 listings
Host 26981054 (Cédric De ClickYourFlat): 274 listings
Host 460047164 (FlexLiving): 232 listings
Host 7642792 (Ludovic): 211 listings
Host 436103373 (David Et Warren): 210 listings
Host 528015349 (Checkmyguest): 189 listings
Host 125797498 (Welkeys): 170 listings
Host 335998296 (Studioprestige): 154 listings
Host 51567288 (Sweet Inn): 149 listings
Host 1112584 (IntoParis): 145 listings
Host 564251645 (Rusard): 137 listings
Host 440295601 (Parisian Home): 132 listings
Host 517515174 (Barnes): 122 listings
Host 99040006 (Jérémy): 120 listings
Host 28313443 (Michael & Johanna): 119 listings
Host 24495283 (Giacomo): 111 listings
Host 506389460 (Check My Guest): 109 listings
Host 374552379 (Checkmyguest): 104 listings
Host 5127

In [11]:
print("=== QUERY 7: Superhosts ===")

# Distinct host ids among documents flagged host.is_superhost == True,
# compared with the distinct host ids across the whole collection.
unique_superhosts = len(collection.distinct("host.host_id", {"host.is_superhost": True}))
total_hosts = len(collection.distinct("host.host_id"))
# With no hosts at all the ratio is undefined; report 0% instead of raising
# ZeroDivisionError.
percentage = (unique_superhosts / total_hosts) * 100 if total_hosts else 0.0

print(f"Unique superhosts: {unique_superhosts}")
print(f"Total hosts: {total_hosts}")
print(f"Percentage: {percentage:.1f}%")

=== QUERY 7: Superhosts ===
Unique superhosts: 10088
Total hosts: 71979
Percentage: 14.0%
