In [1]:
import os
import hashlib
import logging
import gc
from datetime import datetime
from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.core.exceptions import ResourceExistsError
import tracemalloc
import sys
from concurrent_log_handler import ConcurrentRotatingFileHandler  # Updated log handler

In [2]:
# Bring any .env-defined settings into the process environment before reading them
load_dotenv()

# Azure Storage settings and the local folder to mirror.
# Each value is None when the corresponding environment variable is not set.
AZURE_ACCOUNT_NAME = os.environ.get('AZURE_STORAGE_ACCOUNT_NAME')
AZURE_ACCOUNT_KEY = os.environ.get('AZURE_STORAGE_ACCOUNT_KEY')
AZURE_CONTAINER_NAME = os.environ.get('AZURE_CONTAINER_NAME')
LOCAL_FOLDER = os.environ.get('LOCAL_FOLDER_PATH')

In [3]:
# Service-level client for the storage account, authenticated with the account key
blob_service_client = BlobServiceClient(
    account_url="https://{}.blob.core.windows.net".format(AZURE_ACCOUNT_NAME),
    credential=AZURE_ACCOUNT_KEY,
)

In [4]:
# Create or get container
# The client is obtained first; creation is then attempted unconditionally.
container_client = blob_service_client.get_container_client(AZURE_CONTAINER_NAME)
try:
    container_client.create_container()
except ResourceExistsError:
    # The container already exists, so there is nothing to create; keep using the client.
    pass


In [5]:
# Initialize logger with file handler
log_file = "upload_sync.log"
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)  # Only WARNING and above are recorded, to reduce verbosity

# Re-running this cell must not stack a second handler on the same logger
# (every message would then be written twice), so attach one only if none exists.
if not logger.handlers:
    # Explicit UTF-8: error messages in this file embed blob names, which may contain
    # characters that the platform's default text encoding cannot represent.
    handler = logging.FileHandler(log_file, encoding="utf-8")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [6]:
def list_local_files_with_relative_paths(local_folder):
    """Recursively list all files in a local folder with their relative paths.

    Args:
        local_folder: Path of the directory to scan.

    Returns:
        set[str]: One entry per file found, expressed relative to ``local_folder``
        and using "/" as the separator.

    Raises:
        OSError: If ``local_folder`` or any directory below it cannot be listed
            (missing, not a directory, permission denied, ...). By default
            ``os.walk`` ignores such errors, which would yield an empty or
            partial set; a caller that treats "not in the set" as "deleted
            locally" would then act on files that were merely unreadable.
    """

    def _abort_walk(walk_error):
        # os.walk hands listing failures to this callback; re-raise to stop the walk.
        raise walk_error

    # NOTE(review): every backslash is rewritten, including a literal one in a POSIX
    # file name; kept as-is because blob names presumably use the same normalisation.
    return {
        os.path.relpath(os.path.join(root, name), local_folder).replace("\\", "/")
        for root, _dirs, names in os.walk(local_folder, onerror=_abort_walk)
        for name in names
    }

# Snapshot of the local relative paths, taken once when this cell runs.
# sync_blob_storage_with_local reads this module-level set, so re-run this cell
# if the folder contents change before syncing.
local_files = list_local_files_with_relative_paths(LOCAL_FOLDER)

In [7]:
def sync_blob_storage_with_local(batch_size_in_bytes=1024 * 1024):
    """Delete every blob in the container whose name is not in ``local_files``.

    Works in two passes: first the container listing is scanned and the names of
    blobs with no local counterpart are collected, then those blobs are deleted
    one by one. A failed deletion is logged and does not stop the remaining ones.
    A summary (number of blobs actually deleted, number of blobs left) is logged
    at the end.

    Reads the module-level ``container_client``, ``local_files`` and ``logger``.

    Args:
        batch_size_in_bytes: Threshold on the accumulated size of scanned blobs;
            each time it is reached a garbage collection is triggered and the
            accumulator is reset. It has no influence on which blobs are deleted.

    Returns:
        None
    """
    blobs_to_delete = []
    current_batch_size = 0

    # Pass 1: scan the container and remember blobs that have no local counterpart.
    for blob in container_client.list_blobs():
        # blob.name is the full name, including any "folder" prefix
        if blob.name not in local_files:
            blobs_to_delete.append(blob.name)

        # Accumulate the size of every scanned blob and collect garbage per batch
        current_batch_size += blob.size
        if current_batch_size >= batch_size_in_bytes:
            current_batch_size = 0
            gc.collect()

    # Pass 2: delete. Count only deletions that succeeded so the summary is accurate
    # even when some requests fail.
    total_deleted_count = 0
    for blob_name in blobs_to_delete:
        try:
            container_client.delete_blob(blob_name)
        except Exception as e:
            # Best effort: record the failure and continue with the remaining blobs
            logger.error(f"Error deleting blob {blob_name}: {e}")
        else:
            total_deleted_count += 1

    # Log the summary only once after the deletion process.
    # Logged at WARNING level; the file's logger is configured to drop lower levels.
    logger.warning(f"Total deleted files: {total_deleted_count}")
    # Count by iterating instead of building a list of every blob object
    final_blob_count = sum(1 for _ in container_client.list_blobs())
    logger.warning(f"Final number of blobs in storage: {final_blob_count}")

# Run the sync process
# The call is made only when this code runs as the main program, not on import.
if __name__ == "__main__":
    sync_blob_storage_with_local(batch_size_in_bytes=1024 * 1024)  # 1 MiB, same value as the default