# Pipeline 3: Mapping

This is a prototype and test for the third pipeline, an ETL step from ListingDetails to ListingRecord.

## Prerequisites

In [None]:
import sys
from pathlib import Path
import os

# The project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to be launched from a direct sub-folder).
project_root = Path.cwd().parent
print(f"Project root: {project_root}")

# Put the root itself (not only its `sources` package) first on the import
# path so that `sources.*` modules can be imported below.
sys.path.insert(0, str(project_root))
print(f"Added to Python path: {project_root}")

# Environment variables set for this notebook session
os.environ.update(
    {
        "QE_ENV": "dev",
        "QE_CONF_FOLDER": "sources/resources",
    }
)
print(f"Added environment variables: QE_ENV={os.environ['QE_ENV']}, QE_CONF_FOLDER={os.environ['QE_CONF_FOLDER']}")

In [None]:
from sources.datamodel.listing_details import ListingDetails
from sources.datamodel.listing_record import ListingRecord
from sources.logging import logging_utils
from sources.storage.abstract_storage import Storage
from sources.config.config_manager import ConfigManager
from sources.mappers.immobiliare_listing_mapper import ListingDataTransformer

## Configuration

In [None]:
# Configure logging from the project's YAML file and get a logger named after
# this module/notebook.
logging_utils.setup_logging(config_path="sources/resources/logging.yaml")
logger = logging_utils.get_logger(__name__)

# NOTE(review): invalidate_caches() presumably forces the configuration to be
# re-read, so re-running this cell picks up edited settings — confirm against
# ConfigManager.
config_manager = ConfigManager()
config_manager.invalidate_caches()

# Storage settings are reused below for the storage factory and for the direct
# MongoDB connection.
storage_settings = config_manager.get_storage_config()

In [None]:
# Create the storage backend for ListingRecord objects from the loaded storage
# settings; the ETL cell appends the mapped records to it.
storage: Storage = Storage.create_storage(
    data_type=ListingRecord, 
    config=storage_settings
)

## Extract unprocessed ListingDetails from MongoDB

In [None]:
from pymongo import MongoClient
from contextlib import contextmanager

from sources.config.model.storage_settings import MongoStorageSettings
from sources.storage.mongo_storage import MongoDBStorage


# MongoDB settings (connection string, database and collection names) taken
# from the same storage settings that were passed to the storage factory.
mongo_config: MongoStorageSettings = storage_settings.mongodb_settings  # expected to be a MongoStorageSettings instance


# Connect to MongoDB using the same configuration as the storage
@contextmanager
def get_mongo_client():
    """Yield a ``MongoClient`` and close it once the caller's block ends.

    The connection string is read from ``mongo_config``. The client is closed
    when the ``with`` body exits, whether it completes normally or raises.
    """
    uri = mongo_config.connection_string.get_secret_value()
    # MongoClient is itself a context manager; leaving it closes the client.
    with MongoClient(uri) as client:
        yield client


# Fetch a random batch of ListingDetails documents that have no ListingRecord
# with the same "id" yet.
batch_size = 3000

# The aggregation is assembled from named stages before connecting.
# Stage 1: left outer join listings -> records on "id"; matches land in the
# "listing_records" array of each listing document.
join_with_records = {
    "$lookup": {
        "from": mongo_config.collection_records,
        "localField": "id",
        "foreignField": "id",
        "as": "listing_records",
    }
}
# Stage 2: keep only listings for which the join found nothing.
keep_unmatched = {"$match": {"listing_records": {"$size": 0}}}
# Stage 3: take a random sample of at most `batch_size` of those listings.
random_batch = {"$sample": {"size": batch_size}}
# Stage 4: drop the (empty) helper array again before returning documents.
drop_join_field = {"$project": {"listing_records": 0}}

pipeline = [join_with_records, keep_unmatched, random_batch, drop_join_field]

with get_mongo_client() as client:
    db = client[mongo_config.database]
    listings_collection = db[mongo_config.collection_listings]
    # Handle on the records collection; the aggregation itself only refers to
    # that collection by name inside the $lookup stage.
    records_collection = db[mongo_config.collection_records]

    # Run the aggregation and materialise the cursor while the client is open
    unprocessed_docs = list(listings_collection.aggregate(pipeline))

    # Rebuild ListingDetails objects from the raw documents
    listingDetails = list(map(ListingDetails.from_dict, unprocessed_docs))

    print(f"Found {len(listingDetails)} ListingDetails without corresponding ListingRecords")


# Notebook preview of the first few extracted items
listingDetails[:5]

## ETL

In [None]:
# Transform: map every extracted ListingDetails through the mapper.
mapper = ListingDataTransformer()
records = list(map(mapper.map, listingDetails))

# Load: append the whole mapped batch to the configured storage in one call.
storage.append_data(records)