# Task 4 — MongoDB (Compass) + JSON Export + Benchmark

This notebook helps you complete Task 4:

1. Convert the Task 1 CSV (`../data/processed/books.csv`) into JSON for MongoDB.
2. Import the JSON using **MongoDB Compass**.
3. Run sample MongoDB queries (filters + sorts).
4. Run a simple **execution time comparison**: MySQL vs MongoDB.

> Assumption: your notebook working directory is `notebooks/`.


In [2]:
from pathlib import Path
import pandas as pd
import json


## 1) Paths (relative to notebooks/)


In [3]:
# Anchor every path on the current working directory, which is expected to be notebooks/.
notebook_dir = Path.cwd()

# Input: the processed CSV produced by Task 1.
CSV_PATH = notebook_dir.joinpath("../data/processed/books.csv").resolve()

# Output folder for the MongoDB-ready exports; created if absent, reused otherwise.
EXPORT_DIR = notebook_dir.joinpath("../data/exports").resolve()
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# The same records are exported in two layouts: one JSON array and one document per line.
JSON_ARRAY_PATH = EXPORT_DIR.joinpath("books.json")
NDJSON_PATH = EXPORT_DIR.joinpath("books.ndjson")

# Report the resolved locations so a wrong working directory is visible immediately.
print("CSV:", CSV_PATH, "exists:", CSV_PATH.exists())
print("Export dir:", EXPORT_DIR)


CSV: C:\Users\Lenovo\Documents\Program Projects\EDUCLASS\MODULE 15 Data Engineering\COURSEWORK\CW1\Project\data\processed\books.csv exists: True
Export dir: C:\Users\Lenovo\Documents\Program Projects\EDUCLASS\MODULE 15 Data Engineering\COURSEWORK\CW1\Project\data\exports


## 2) Convert CSV → JSON (array + NDJSON)


In [4]:
# Load the Task 1 CSV and fail fast when the schema is not what the export expects.
df = pd.read_csv(CSV_PATH)

expected = ["title", "authors", "year", "star_rating", "price", "source_url"]
missing = [c for c in expected if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}. Found: {list(df.columns)}")

# Coerce numeric columns; unparseable values become missing instead of raising.
# "Int64" (nullable integer) keeps years as integers even when some are missing.
df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
df["star_rating"] = pd.to_numeric(df["star_rating"], errors="coerce")
df["price"] = pd.to_numeric(df["price"], errors="coerce")

# Tag every record with its origin when the CSV does not already carry one.
if "source" not in df.columns:
    df["source"] = "manning"

# Cast to object BEFORE masking. On float columns `where(..., None)` is coerced
# back to NaN (json.dumps would then emit the non-JSON token `NaN`), and on the
# nullable Int64 column missing values stay `pd.NA`, which json.dumps cannot
# serialize at all. With object dtype the `None` survives and is written as
# JSON `null`.
records = df.astype(object).where(df.notna(), None).to_dict(orient="records")

# Export 1: a single pretty-printed JSON array.
JSON_ARRAY_PATH.write_text(json.dumps(records, ensure_ascii=False, indent=2), encoding="utf-8")

# Export 2: newline-delimited JSON, one document per line.
with NDJSON_PATH.open("w", encoding="utf-8") as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("✅ Wrote:", JSON_ARRAY_PATH)
print("✅ Wrote:", NDJSON_PATH)
print("Rows:", len(records))


✅ Wrote: C:\Users\Lenovo\Documents\Program Projects\EDUCLASS\MODULE 15 Data Engineering\COURSEWORK\CW1\Project\data\exports\books.json
✅ Wrote: C:\Users\Lenovo\Documents\Program Projects\EDUCLASS\MODULE 15 Data Engineering\COURSEWORK\CW1\Project\data\exports\books.ndjson
Rows: 15


## 3) Import into MongoDB using Compass (manual step)

### Steps in Compass

1. Open **MongoDB Compass**
2. Connect to: `mongodb://localhost:27017`
3. Create database:
    - Database name: `cw1_de`
    - Collection name: `books`
4. Click `cw1_de.books` → **Import Data**
5. Choose file: `data/exports/books.json` (or `books.ndjson`)
6. Import


## 4) Sample MongoDB queries to run in Compass


In [None]:
# Paste each Filter / Sort pair below into the Compass query bar (Sort is under "Options"); omit the leading "# Filter:" / "# Sort:" labels.

# A) Price >= 30, sort by price descending
# Filter: { "price": { "$gte": 30 } }
# Sort:   { "price": -1 }

# B) Year >= 2020, sort by year descending
# Filter: { "year": { "$gte": 2020 } }
# Sort:   { "year": -1 }

# C) Title contains 'data' (case-insensitive)
# Filter: { "title": { "$regex": "data", "$options": "i" } }


## 5) Benchmark query execution time (MySQL vs MongoDB)

This part uses Python.

### Install one extra dependency (once)

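In your venv:
`pip install pymongo`

The benchmark also imports `mysql.connector`; if it is not already installed from the earlier MySQL task, add it with `pip install mysql-connector-python`.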


In [None]:
import os
import time
import mysql.connector
from pymongo import MongoClient

# Load ../.env because the notebook runs in notebooks/, one level below the project root.
def parse_env_line(line: str) -> "tuple[str, str] | None":
    """Parse one line of a .env file into a ``(key, value)`` pair.

    Returns ``None`` for blank lines, ``#`` comment lines, lines without ``=``,
    and lines whose key is empty. An optional leading ``export `` is dropped,
    and one pair of matching quotes around the value is removed, so
    ``DB_PASSWORD="secret"`` yields ``secret`` rather than ``"secret"``.
    Only the first ``=`` splits the line, so values may themselves contain ``=``.
    """
    line = line.strip()
    if not line or line.startswith("#") or "=" not in line:
        return None
    if line.startswith("export "):
        line = line[len("export "):].lstrip()
    key, value = line.split("=", 1)
    key, value = key.strip(), value.strip()
    if not key:
        # An empty variable name cannot be placed into os.environ.
        return None
    if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
        value = value[1:-1]
    return key, value


ENV_PATH = (Path.cwd() / "../.env").resolve()
if ENV_PATH.exists():
    # "utf-8-sig" reads plain UTF-8 as well as files saved with a byte-order mark,
    # which would otherwise become part of the first key.
    for raw_line in ENV_PATH.read_text(encoding="utf-8-sig").splitlines():
        parsed = parse_env_line(raw_line)
        if parsed is not None:
            # setdefault: variables already set in the real environment win over the file.
            os.environ.setdefault(*parsed)

# MySQL connection settings, overridable through the environment / .env file.
DB_HOST = os.getenv("DB_HOST", "127.0.0.1")
DB_PORT = int(os.getenv("DB_PORT", "3306"))
DB_USER = os.getenv("DB_USER", "root")
DB_PASSWORD = os.getenv("DB_PASSWORD", "")

# MySQL schema and table queried by the benchmark.
MYSQL_DB = "module15_cw1_de_py"
MYSQL_TABLE = "books_import_py"

# MongoDB target; matches the database/collection created in the Compass import step.
MONGO_URI = "mongodb://localhost:27017"
MONGO_DB = "cw1_de"
MONGO_COLLECTION = "books"

# Price threshold used by both benchmark queries (price >= MIN_PRICE).
MIN_PRICE = 30.0


In [None]:
def mysql_bench(min_price: float) -> tuple[float, int]:
    """Time one MySQL query for books priced at or above ``min_price``.

    The timed span covers opening the connection, executing the query,
    fetching all rows, and closing the cursor and connection.

    Args:
        min_price: Lower bound (inclusive) for the ``price`` column.

    Returns:
        ``(elapsed_seconds, row_count)``.

    Raises:
        Whatever ``mysql.connector`` raises on connection or query failure;
        the cursor and connection are closed before the error propagates.
    """
    t0 = time.perf_counter()
    conn = mysql.connector.connect(
        host=DB_HOST, port=DB_PORT, user=DB_USER, password=DB_PASSWORD, database=MYSQL_DB
    )
    try:
        cur = conn.cursor()
        try:
            # The table name comes from a module constant; the user-facing value
            # (min_price) is bound as a query parameter.
            sql = f"""
    SELECT title, year, price
    FROM {MYSQL_TABLE}
    WHERE price >= %s
    ORDER BY price DESC;
    """
            cur.execute(sql, (min_price,))
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    t1 = time.perf_counter()
    return (t1 - t0), len(rows)

def mongo_bench(min_price: float) -> tuple[float, int]:
    """Time one MongoDB query for books priced at or above ``min_price``.

    The timed span covers creating the client, running the find/sort query,
    materializing all documents, and closing the client.

    Args:
        min_price: Lower bound (inclusive) for the ``price`` field.

    Returns:
        ``(elapsed_seconds, document_count)``.

    Raises:
        Whatever ``pymongo`` raises on connection or query failure; the client
        is closed before the error propagates.
    """
    t0 = time.perf_counter()
    client = MongoClient(MONGO_URI)
    try:
        coll = client[MONGO_DB][MONGO_COLLECTION]
        cursor = coll.find(
            {"price": {"$gte": min_price}},
            # Project the same three fields the MySQL query selects.
            {"_id": 0, "title": 1, "year": 1, "price": 1}
        ).sort("price", -1)
        # list() drains the cursor so fetching is part of the measured time.
        rows = list(cursor)
    finally:
        # Always release the client, even when the query fails.
        client.close()
    t1 = time.perf_counter()
    return (t1 - t0), len(rows)

# Run each benchmark once with the shared price threshold.
mysql_time, mysql_count = mysql_bench(MIN_PRICE)
mongo_time, mongo_count = mongo_bench(MIN_PRICE)

# Report elapsed time and row count for both databases.
print("=== Results ===")
print(f"MySQL  -> {mysql_time:.6f} sec | rows: {mysql_count}")
print(f"MongoDB-> {mongo_time:.6f} sec | rows: {mongo_count}")

# The timings are only comparable when both queries returned the same number of rows.
counts_agree = mysql_count == mongo_count
verdict = (
    "✅ Row counts match (fair comparison)."
    if counts_agree
    else "⚠️ Row counts differ. Double-check that you imported the same dataset into MongoDB."
)
print(verdict)
