<a href="https://colab.research.google.com/github/ArunaTebel/zarr-to-mongodb/blob/main/Zarr_to_MongoDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install zarr pymongo



In [None]:
import zarr
import xarray as xr
import numpy as np
from datetime import datetime, timedelta
import pymongo
import itertools

In [None]:
def analyze_zarr_structure(store_path):
    """
    Open a Zarr group read-only and print a summary of the arrays it holds.

    For every array directly under the root group the name, shape, chunk
    shape and dtype are printed, followed by the attribute keys when the
    array has any attributes.

    Args:
        store_path: Whatever ``zarr.open_group`` accepts as a store
            (the demo cells pass both a path string and a store object).

    Returns:
        The opened root group, or ``None`` if anything inside the
        open/inspect sequence raised (the error is printed, not re-raised).
    """
    print(f"--- Connecting to: {store_path} ---")

    try:
        group = zarr.open_group(store_path, mode='r')

        print("\n[Step 2] Zarr Structure Analyzed:")
        print(f"Root Group Info: {group.info}")

        # Walk the arrays of the root group and report their layout.
        print("\n--- Variables and Attributes ---")
        for array_name, array in group.arrays():
            print(f"\nVariable Name: {array_name}")
            print(f"  Shape: {array.shape}")
            print(f"  Chunks: {array.chunks}")
            print(f"  Data Type: {array.dtype}")

            # Only mention metadata when the array actually carries some.
            metadata = array.attrs
            if metadata:
                print(f"  Attributes: {list(metadata.keys())}")
    except Exception as err:
        # Best-effort helper: report the failure and signal it with None.
        print(f"Error connecting to Zarr: {err}")
        return None
    else:
        return group

In [None]:
# Creating a dummy Zarr dataset and analysing it for demo purpose
# The group lives in a MemoryStore and holds a single float32 array named
# "temperature" with one attribute, so the analysis output below is small.
store = zarr.storage.MemoryStore()
root = zarr.group(store=store)
dset = root.create_array("temperature", shape=(100, 100, 10), chunks=(10, 10, 5), dtype='f4')
dset.attrs['unit'] = 'kelvin'

# Last expression of the cell: the returned group is echoed as the cell output.
analyze_zarr_structure(store)

--- Connecting to: memory://133691275452288 ---

[Step 2] Zarr Structure Analyzed:
Root Group Info: Name        : 
Type        : Group
Zarr format : 3
Read-only   : True
Store type  : MemoryStore

--- Variables and Attributes ---

Variable Name: temperature
  Shape: (100, 100, 10)
  Chunks: (10, 10, 5)
  Data Type: float32
  Attributes: ['unit']


<Group memory://133691275452288>

In [None]:
def create_dummy_zarr(store_path='weather_data.zarr', seed=None):
    """
    Write a small synthetic weather dataset to a Zarr group.

    The group is opened with ``mode='w'`` and receives three 1-D coordinate
    arrays (``time``, ``lat``, ``lon``) plus a 3-D float32 ``temperature``
    array filled with random values in the range [200, 300).

    Args:
        store_path: Where the Zarr group is written.
        seed: Optional seed for the random temperature values. ``None``
            (the default) gives different data on every call. The time
            axis starts at ``datetime.now()`` and is therefore never
            reproducible, regardless of ``seed``.

    Returns:
        None. A short summary is printed instead.
    """
    root = zarr.open_group(store_path, mode='w')

    # Dimension sizes and chunk shape, in (time, lat, lon) order. The chunk
    # shape is defined once here so the summary below cannot drift from it.
    t_size, y_size, x_size = 40, 100, 100
    chunk_shape = (5, 10, 10)

    # Coordinate arrays: each is stored as a single chunk and tagged with
    # its dimension name under the `_ARRAY_DIMENSIONS` attribute.
    times = np.array([datetime.now() + timedelta(hours=i) for i in range(t_size)], dtype='M8[ns]')
    time_ds = root.create_array('time', data=times, chunks=(t_size,))
    time_ds.attrs['_ARRAY_DIMENSIONS'] = ['time']

    lats = np.linspace(-90, 90, y_size)
    lat_ds = root.create_array('lat', data=lats, chunks=(y_size,))
    lat_ds.attrs['_ARRAY_DIMENSIONS'] = ['lat']

    lons = np.linspace(-180, 180, x_size)
    lon_ds = root.create_array('lon', data=lons, chunks=(x_size,))
    lon_ds.attrs['_ARRAY_DIMENSIONS'] = ['lon']

    # Main variable.
    temp_ds = root.create_array(
        'temperature',
        shape=(t_size, y_size, x_size),
        chunks=chunk_shape,
        dtype='f4'
    )

    # Fill with dummy random data; a dedicated generator makes the values
    # reproducible when a seed is supplied.
    rng = np.random.default_rng(seed)
    temp_ds[:] = 200 + rng.random((t_size, y_size, x_size)) * 100

    # Metadata
    temp_ds.attrs['_ARRAY_DIMENSIONS'] = ['time', 'lat', 'lon']
    temp_ds.attrs['units'] = 'kelvin'
    temp_ds.attrs['description'] = 'Simulated surface temperature'

    print(f"Created Zarr dataset at {store_path}")
    print(f"Shape: {temp_ds.shape}")
    print(f"Chunks: {temp_ds.chunks}")

    # Count chunks from the array's own shape/chunk layout. Ceiling division
    # (-(-a // b)) counts a trailing partial chunk as a chunk too.
    total_chunks = 1
    for dim_size, chunk_size in zip(temp_ds.shape, temp_ds.chunks):
        total_chunks *= -(-dim_size // chunk_size)
    print(f"Total expected chunks: {total_chunks}")

In [None]:
# Write the demo dataset to the function's default path, weather_data.zarr.
create_dummy_zarr()

Created Zarr dataset at weather_data.zarr
Shape: (40, 100, 100)
Chunks: (5, 10, 10)
Total expected chunks: 800


In [None]:
def convert_zarr_to_mongo(zarr_path, db_name='climate_db', collection_name='weather_data',
                          mongo_uri=None, batch_size=100):
    """
    Copy the ``temperature`` array of a Zarr group into MongoDB, one document per chunk.

    Each document stores the chunk's grid index, the first and last
    coordinate values the chunk covers, the chunk's shape, and the chunk
    values flattened to a 1-D list. The target collection is emptied before
    anything is inserted.

    Args:
        zarr_path: Location of the Zarr group. It must contain the arrays
            ``temperature`` (3-D), ``time``, ``lat`` and ``lon``.
        db_name: Name of the MongoDB database to write to.
        collection_name: Name of the collection to write to.
        mongo_uri: MongoDB connection string. When omitted, the
            ``MONGODB_URI`` environment variable is used. Credentials must
            never be written into the notebook itself.
        batch_size: Number of documents sent per ``insert_many`` call.

    Returns:
        None. Progress is printed.

    Raises:
        ValueError: If no connection string was passed and ``MONGODB_URI``
            is not set.
    """
    # Function-scope import so this cell works without touching the imports cell.
    import os

    # Resolve the connection string first so a missing secret fails fast,
    # before any data is read.
    uri = mongo_uri or os.environ.get("MONGODB_URI")
    if not uri:
        raise ValueError(
            "No MongoDB connection string: pass mongo_uri=... or set the "
            "MONGODB_URI environment variable."
        )

    # 1. Connect to Zarr
    print(f"--- Opening Zarr: {zarr_path} ---")
    root = zarr.open_group(zarr_path, mode='r')

    # Main variable plus its coordinates. The coordinate arrays are 1-D, so
    # they are read once here instead of once per chunk.
    temp_ds = root['temperature']
    time_all = root['time'][:]
    lat_all = root['lat'][:]
    lon_all = root['lon'][:]

    n_time, n_lat, n_lon = temp_ds.shape
    c_time, c_lat, c_lon = temp_ds.chunks

    # Number of chunks along each dimension. Ceiling division (-(-a // b))
    # keeps the trailing partial chunk when the shape is not an exact
    # multiple of the chunk size; floor division would silently drop it.
    n_chunks_time = -(-n_time // c_time)
    n_chunks_lat = -(-n_lat // c_lat)
    n_chunks_lon = -(-n_lon // c_lon)

    print(f"Grid Layout: {n_chunks_time} x {n_chunks_lat} x {n_chunks_lon} chunks")

    # 2. Connect to MongoDB
    client = pymongo.MongoClient(uri)
    try:
        collection = client[db_name][collection_name]

        # Clear old data for testing
        collection.delete_many({})
        print("Connected to MongoDB and cleared old data.")

        # 3. Iterate through every chunk position of the grid
        chunk_indices = itertools.product(
            range(n_chunks_time),
            range(n_chunks_lat),
            range(n_chunks_lon)
        )

        count = 0
        pending = []  # documents built but not yet sent
        print("Starting conversion...")

        for t_idx, y_idx, x_idx in chunk_indices:
            # A. Slicing offsets: chunk index 2 with chunk size 5 starts at 10.
            t_start = t_idx * c_time
            y_start = y_idx * c_lat
            x_start = x_idx * c_lon

            # B. Slice the data. An end offset past the array edge is
            # clipped by slicing, so edge chunks simply come back smaller.
            data_chunk = temp_ds[
                t_start : t_start + c_time,
                y_start : y_start + c_lat,
                x_start : x_start + c_lon
            ]

            # C. Coordinate values covered by this chunk
            time_vals = time_all[t_start : t_start + c_time]
            lat_vals = lat_all[y_start : y_start + c_lat]
            lon_vals = lon_all[x_start : x_start + c_lon]

            # D. Build the document
            doc = {
                "_id": f"temp_c{t_idx}_{y_idx}_{x_idx}",
                "chunk_index": {"t": t_idx, "y": y_idx, "x": x_idx},
                "bounds": {
                    # First/last coordinate of the slice, converted from
                    # numpy types to native Python types for MongoDB.
                    "time_min": str(time_vals[0]),
                    "time_max": str(time_vals[-1]),
                    "lat_min": float(lat_vals[0]),
                    "lat_max": float(lat_vals[-1]),
                    "lon_min": float(lon_vals[0]),
                    "lon_max": float(lon_vals[-1])
                },
                # Needed to reshape "data"; edge chunks may be smaller
                # than the nominal chunk shape.
                "shape": [int(extent) for extent in data_chunk.shape],
                # Flatten the 3D chunk into a 1D list
                "data": data_chunk.flatten().tolist()
            }

            # E. Queue the document and send full batches in one round trip
            pending.append(doc)
            count += 1

            if len(pending) >= batch_size:
                collection.insert_many(pending)
                pending = []

            if count % 100 == 0:
                print(f"Processed {count} chunks...")

        # Send whatever is left; insert_many rejects an empty list.
        if pending:
            collection.insert_many(pending)

        print(f"Done! Inserted {count} documents into MongoDB.")
    finally:
        # Always release the connection pool, even if the export failed.
        client.close()

In [None]:
# Export the demo dataset at weather_data.zarr to MongoDB.
# Note: the target collection is emptied before the chunks are inserted.
convert_zarr_to_mongo('weather_data.zarr')

--- Opening Zarr: weather_data.zarr ---
Grid Layout: 8 x 10 x 10 chunks
Connected to MongoDB and cleared old data.
Starting conversion...
Processed 100 chunks...
Processed 200 chunks...
Processed 300 chunks...
Processed 400 chunks...
Processed 500 chunks...
Processed 600 chunks...
Processed 700 chunks...
Processed 800 chunks...
Done! Inserted 800 documents into MongoDB.
