### Exercise 1: Check the Lab Environment
##### Task: Verify mongoimport and mongoexport, download catalog.json.
##### NOTE: I have already downloaded the catalog.json into the dataset subfolder to ensure local execution of the project

In [4]:
import os
import subprocess

# Change to Module_2 directory
os.chdir('D:/Personal/Data Science/Projects/IBM Data Engineering Capstone Project/Repository/Module_2')

# Verify catalog.json exists
dataset_path = 'dataset/catalog.json'
if not os.path.exists(dataset_path):
    print(f"Error: {dataset_path} not found. Please ensure the file is in Module_2/dataset/")
    raise FileNotFoundError
print(f"Found {dataset_path}")

# Copy catalog.json to the MongoDB container
copy_result = subprocess.run(['docker', 'cp', dataset_path, 'repository-mongodb-1:/data/db/catalog.json'], check=False, capture_output=True, text=True)
if copy_result.returncode != 0:
    print(f"Copy failed: {copy_result.stderr}")
    raise RuntimeError("File copy to container failed")
print("catalog.json copied to container")

# Check container contents
list_result = subprocess.run(['docker', 'exec', 'repository-mongodb-1', 'ls', '-l', '/data/db/'], capture_output=True, text=True)
print("Container /data/db/ contents:", list_result.stdout)
if list_result.returncode != 0:
    print("Error listing container files:", list_result.stderr)

Found dataset/catalog.json
catalog.json copied to container
Container /data/db/ contents: total 540
-rw------- 1 mongodb mongodb    50 Jul 18 06:03 WiredTiger
-rw------- 1 mongodb mongodb    21 Jul 18 06:03 WiredTiger.lock
-rw------- 1 mongodb mongodb  1471 Jul 18 16:01 WiredTiger.turtle
-rw------- 1 mongodb mongodb 94208 Jul 18 16:01 WiredTiger.wt
-rw------- 1 mongodb mongodb  4096 Jul 18 15:57 WiredTigerHS.wt
-rw------- 1 mongodb mongodb 36864 Jul 18 15:57 _mdb_catalog.wt
-rwxr-xr-x 1 root    root    37503 Jul 15 06:49 catalog.json
-rw------- 1 mongodb mongodb  4096 Jul 18 06:47 collection-0-2408367730910524175.wt
-rw------- 1 mongodb mongodb 20480 Jul 18 15:57 collection-0-2887538976524757469.wt
-rw------- 1 mongodb mongodb 36864 Jul 18 15:58 collection-2-2887538976524757469.wt
-rw------- 1 mongodb mongodb 12288 Jul 18 16:00 collection-4-2887538976524757469.wt
-rw------- 1 mongodb mongodb 53248 Jul 18 15:58 collection-6-2408367730910524175.wt
-rw------- 1 mongodb mongodb 20480 Jul 1

### Exercise 2, Task 1: Import catalog.json
##### Task: Import into catalog database, electronics collection.

In [18]:
# Import data with authentication
import_result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongoimport',
    '--db', 'catalog', '--collection', 'electronics',
    '--file', '/data/db/catalog.json',
    '-u', 'root', '-p', 'root_100411', '--authenticationDatabase', 'admin'
], check=False, capture_output=True, text=True)
if import_result.returncode != 0:
    print(f"Import failed: {import_result.stderr}")
    raise RuntimeError("mongoimport failed")
print("Import completed.")
print("Import output:", import_result.stdout)
if not import_result.stdout:
    print("Note: Import progress details may not be captured in this output. Verify in Mongo Express at http://localhost:8081.")

Import completed.
Import output: 
Note: Import progress details may not be captured in this output. Verify in Mongo Express at http://localhost:8081.


** NOTE: ADD HERE A SCREENSHOT VERIFYING THE IMPORT 

### Task 2: List All Databases
##### Description: List all databases in the MongoDB server.

In [5]:
import subprocess

# List all databases with authentication
result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongo', 'admin', '-u', 'root', '-p', 'root_100411',
    '--eval', 'db.getMongo().getDBNames().forEach(function(db) { print(db); })'
], capture_output=True, text=True)
print("Databases:")
if result.returncode == 0:
    print(result.stdout)
else:
    print(f"Error: {result.stderr}")

Databases:
MongoDB shell version v4.4.29
connecting to: mongodb://127.0.0.1:27017/admin?compressors=disabled&gssapiServiceName=mongodb
Implicit session: session { "id" : UUID("0848acfa-4e62-4de8-9d91-7019c163ff0b") }
MongoDB server version: 4.4.29
admin
catalog
config
local



### Task 3: List Collections in database:"catalog"

In [23]:
# List all collections in the 'catalog' database using docker exec and mongo shell
import subprocess

result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongo', 'catalog',
    '-u', 'root', '-p', 'root_100411',
    '--authenticationDatabase', 'admin',
    '--quiet',
    '--eval', 'db.getCollectionNames().forEach(function(coll) { print(coll); })'
], capture_output=True, text=True)

print("Collections in catalog:")
if result.returncode == 0:
    print(result.stdout)
else:
    print(f"Error: {result.stderr}")

Collections in catalog:
delete_me
electronics



### Task 4: Create an index on the field "type"

In [24]:
# Create an index on the "type" field in the electronics collection
import subprocess

result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongo', 'catalog',
    '-u', 'root', '-p', 'root_100411',
    '--authenticationDatabase', 'admin',
    '--quiet',
    '--eval', 'db.electronics.createIndex({ type: 1 }); printjson(db.electronics.getIndexes());'
], capture_output=True, text=True)

if result.returncode == 0:
    print("Index created on 'type' field.")
    print("Indexes in electronics collection:")
    print(result.stdout)
else:
    print(f"Error creating index: {result.stderr}")

Index created on 'type' field.
Indexes in electronics collection:
[
	{
		"v" : 2,
		"key" : {
			"_id" : 1
		},
		"name" : "_id_"
	},
	{
		"v" : 2,
		"key" : {
			"type" : 1
		},
		"name" : "type_1"
	}
]



### Task 5: Find count of Laptops

In [25]:
import subprocess

# Count laptops in the electronics collection with authentication
result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongo', 'catalog',
    '-u', 'root', '-p', 'root_100411',
    '--authenticationDatabase', 'admin',
    '--quiet',
    '--eval', 'print(db.electronics.countDocuments({ type: "laptop" }));'
], capture_output=True, text=True)
print("Count of laptops:")
if result.returncode == 0:
    print(result.stdout)
else:
    print(f"Error: {result.stderr}")

Count of laptops:
389



### Task 6: Find Number of 6-inch Smartphones

In [27]:
# Show count and up to 10 rows of 6-inch smartphones in the electronics collection
import subprocess

query = '{ type: "smart phone", "screen size": 6 }'

# Count matching documents
count_result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongo', 'catalog',
    '-u', 'root', '-p', 'root_100411',
    '--authenticationDatabase', 'admin',
    '--quiet',
    '--eval', f'print("Count: " + db.electronics.countDocuments({query}));'
], capture_output=True, text=True)

print(count_result.stdout.strip())

# Show up to 10 matching documents
show_result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongo', 'catalog',
    '-u', 'root', '-p', 'root_100411',
    '--authenticationDatabase', 'admin',
    '--quiet',
    '--eval', f'printjson(db.electronics.find({query}).limit(10).toArray());'
], capture_output=True, text=True)

print("Sample documents:")
print(show_result.stdout.strip())

Count: 8
Sample documents:
[
	{
		"_id" : ObjectId("6879f446607b1384f7f2d84b"),
		"type" : "smart phone",
		"model" : "c3",
		"screen size" : 6
	},
	{
		"_id" : ObjectId("6879f446607b1384f7f2d84c"),
		"type" : "smart phone",
		"model" : "bn20",
		"screen size" : 6
	},
	{
		"_id" : ObjectId("6879f446607b1384f7f2d84e"),
		"type" : "smart phone",
		"model" : "c12",
		"screen size" : 6
	},
	{
		"_id" : ObjectId("6879f446607b1384f7f2d851"),
		"type" : "smart phone",
		"model" : "xm23",
		"screen size" : 6
	},
	{
		"_id" : ObjectId("6879f446607b1384f7f2d855"),
		"type" : "smart phone",
		"model" : "cz",
		"screen size" : 6
	},
	{
		"_id" : ObjectId("6879f446607b1384f7f2d864"),
		"type" : "smart phone",
		"model" : "platina",
		"screen size" : 6
	},
	{
		"_id" : ObjectId("6879f446607b1384f7f2d87a"),
		"type" : "smart phone",
		"model" : "k9",
		"screen size" : 6
	},
	{
		"_id" : ObjectId("6879f446607b1384f7f2d892"),
		"type" : "smart phone",
		"model" : "pk2",
		"screen size" : 6
	}
]


### Task 7: Find average screen size of Smartphones

In [28]:
import subprocess

# Calculate average screen size of smartphones with authentication
result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongo', 'catalog',
    '-u', 'root', '-p', 'root_100411',
    '--authenticationDatabase', 'admin',
    '--quiet',
    '--eval', 'var avg = db.electronics.aggregate([{ $match: { type: "smart phone" } }, { $group: { _id: null, average_screen_size: { $avg: "$screen size" } } }]).toArray()[0].average_screen_size; print("Average screen size: " + avg);'
], capture_output=True, text=True)
print("Average screen size of smartphones:")
if result.returncode == 0:
    print(result.stdout.strip())
else:
    print(f"Error: {result.stderr}")

Average screen size of smartphones:
Average screen size: 6


### Task 8: Export Fields to CSV

In [31]:
# Export fields to CSV with authentication
import subprocess
import os

host_output_path = 'dataset/electronics.csv'  # Save to Module_2/dataset

export_result = subprocess.run([
    'docker', 'exec', 'repository-mongodb-1', 'mongoexport',
    '-u', 'root', '-p', 'root_100411', '--authenticationDatabase', 'admin',
    '--db', 'catalog', '--collection', 'electronics',
    '--out', '/data/db/electronics.csv', '--type', 'csv', '--fields', '_id,type,model'
], check=False, capture_output=True, text=True)

if export_result.returncode == 0:
    print("Export completed.")
    # Copy the file back to the host's dataset directory
    subprocess.run([
        'docker', 'cp',
        'repository-mongodb-1:/data/db/electronics.csv',
        host_output_path
    ], check=True)
    print(f"File copied to Module_2/{host_output_path}")
else:
    print(f"Export failed: {export_result.stderr}")

Export completed.
File copied to Module_2/dataset/electronics.csv
