Mirroring code

In [52]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import ClientSecretCredential
import requests
import json
import os

class OpenMirroringClient:
    """
    Small helper around the OneLake (ADLS Gen2 compatible) endpoint of a mirrored database.

    It creates/removes table folders under the ``LandingZone`` file system, uploads
    sequentially numbered parquet files into them, and prints the replication status
    files found under the ``Monitoring`` file system.
    """

    def __init__(self, client_id: str, client_secret: str, client_tenant: str, host: str):
        """
        Stores the service principal credentials and builds the service client.

        :param client_id: Service principal (app registration) client id.
        :param client_secret: Service principal client secret.
        :param client_tenant: Tenant id the service principal belongs to.
        :param host: URL of the mirrored database ``Files`` folder; a trailing
                     ``/LandingZone`` segment is accepted and stripped.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.client_tenant = client_tenant
        self.host = self._normalize_path(host)
        self.service_client = self._create_service_client()

    def _normalize_path(self, path: str) -> str:
        """
        Normalizes the given path by removing the 'LandingZone' segment if it ends with it.

        Only a whole trailing ``/LandingZone`` segment (optionally followed by slashes)
        is removed; a path that merely ends with the text ``LandingZone`` (for example
        ``.../MyLandingZone``) is returned unchanged.

        :param path: The original path.
        :return: The normalized path.
        """
        suffix = "/LandingZone"
        trimmed = path.rstrip("/")
        if trimmed.endswith(suffix):
            return trimmed[:-len(suffix)]
        return path

    def _create_service_client(self):
        """
        Creates and returns a DataLakeServiceClient.

        :raises Exception: If the credential or the client cannot be constructed.
        """
        try:
            credential = ClientSecretCredential(self.client_tenant, self.client_id, self.client_secret)
            return DataLakeServiceClient(account_url=self.host, credential=credential)
        except Exception as e:
            raise Exception(f"Failed to create DataLakeServiceClient: {e}") from e

    def create_table(self, schema_name: str = None, table_name: str = "", key_cols: list = None):
        """
        Creates a folder in OneLake storage and a _metadata.json file inside it.

        :param schema_name: Optional schema name.
        :param table_name: Name of the table.
        :param key_cols: List of key column names (defaults to an empty list).
        :raises ValueError: If table_name is empty.
        :raises Exception: If the folder or the metadata file cannot be created.
        """
        if not table_name:
            raise ValueError("table_name cannot be empty.")

        # A None default avoids sharing one list object between calls.
        key_cols = [] if key_cols is None else key_cols

        # Construct the folder path
        folder_path = f"{schema_name}.schema/{table_name}" if schema_name else f"{table_name}"

        try:
            # Create the folder
            file_system_client = self.service_client.get_file_system_client(file_system="LandingZone")  # Replace with your file system name
            directory_client = file_system_client.get_directory_client(folder_path)
            directory_client.create_directory()

            # Create the _metadata.json file; serialise once and upload the bytes
            metadata_content = {"keyColumns": [f"{col}" for col in key_cols]}
            payload = json.dumps(metadata_content).encode("utf-8")
            file_client = directory_client.create_file("_metadata.json")
            file_client.append_data(data=payload, offset=0, length=len(payload))
            file_client.flush_data(len(payload))

            print(f"Folder and _metadata.json created successfully at: {folder_path}")
        except Exception as e:
            raise Exception(f"Failed to create table: {e}") from e

    def remove_table(self, schema_name: str = None, table_name: str = "", remove_schema_folder: bool = False):
        """
        Deletes a folder in the OneLake storage.

        If the table folder does not exist a warning is printed and nothing else is
        done (the schema folder is not touched in that case).

        :param schema_name: Optional schema name.
        :param table_name: Name of the table.
        :param remove_schema_folder: If True, removes the schema folder as well.
        :raises ValueError: If table_name is empty.
        :raises Exception: If a delete operation fails.
        """
        if not table_name:
            raise ValueError("table_name cannot be empty.")

        # Construct the folder path
        folder_path = f"{schema_name}.schema/{table_name}" if schema_name else f"{table_name}"

        try:
            # Get the directory client
            file_system_client = self.service_client.get_file_system_client(file_system="LandingZone")  # Replace with your file system name
            directory_client = file_system_client.get_directory_client(folder_path)

            # Check if the folder exists
            if not directory_client.exists():
                print(f"Warning: Folder '{folder_path}' not found.")
                return

            # Delete the folder
            directory_client.delete_directory()
            print(f"Folder '{folder_path}' deleted successfully.")

            # Optionally remove the enclosing schema folder
            if remove_schema_folder and schema_name:
                schema_folder_path = f"{schema_name}.schema"
                schema_directory_client = file_system_client.get_directory_client(schema_folder_path)
                if schema_directory_client.exists():
                    schema_directory_client.delete_directory()
                    print(f"Schema folder '{schema_folder_path}' deleted successfully.")
                else:
                    print(f"Warning: Schema folder '{schema_folder_path}' not found.")
        except Exception as e:
            raise Exception(f"Failed to delete table: {e}") from e

    def get_next_file_name(self, schema_name: str = None, table_name: str = "") -> str:
        """
        Finds the next file name for a folder in OneLake storage.

        Existing ``<20 digits>.parquet`` files are scanned and the highest number plus
        one is used; files starting with ``_`` (temporary uploads) are ignored.

        :param schema_name: Optional schema name.
        :param table_name: Name of the table.
        :return: The next file name padded to 20 digits.
        :raises ValueError: If table_name is empty.
        :raises Exception: If listing fails or a parquet file has an unexpected name.
        """
        if not table_name:
            raise ValueError("table_name cannot be empty.")

        # Construct the folder path
        folder_path = f"LandingZone/{schema_name}.schema/{table_name}" if schema_name else f"LandingZone/{table_name}"

        extension = ".parquet"

        try:
            # The full table folder is passed as the "file system" so that the
            # listing below is rooted at the table folder itself.
            file_system_client = self.service_client.get_file_system_client(file_system=folder_path)

            # List all files in the folder
            file_list = file_system_client.get_paths(recursive=False)
            parquet_files = []

            for file in file_list:
                file_name = os.path.basename(file.name)
                if not file.is_directory and file_name.endswith(extension) and not file_name.startswith("_"):
                    # Validate the file name pattern
                    stem = file_name[:-len(extension)]
                    if not stem.isdigit() or len(stem) != 20:
                        raise ValueError(f"Invalid file name pattern: {file_name}")
                    parquet_files.append(int(stem))

            # Determine the next file name
            next_file_number = max(parquet_files) + 1 if parquet_files else 1

            # Return the next file name padded to 20 digits
            return f"{next_file_number:020}.parquet"

        except Exception as e:
            raise Exception(f"Failed to get next file name: {e}") from e

    def upload_data_file(self, schema_name: str = None, table_name: str = "", local_file_path: str = ""):
        """
        Uploads a file to OneLake storage.

        The file is first written under a temporary ``_``-prefixed name and then
        renamed to the next sequential file name of the table folder.

        :param schema_name: Optional schema name.
        :param table_name: Name of the table.
        :param local_file_path: Path to the local file to be uploaded.
        :raises ValueError: If table_name is empty or local_file_path is not a file.
        :raises Exception: If the table folder is missing, or upload/rename fails.
        """
        if not table_name:
            raise ValueError("table_name cannot be empty.")
        if not local_file_path or not os.path.isfile(local_file_path):
            raise ValueError("Invalid local file path.")

        # Construct the folder path
        folder_path = f"{schema_name}.schema/{table_name}" if schema_name else f"{table_name}"

        try:
            # Get the directory client
            file_system_client = self.service_client.get_file_system_client(file_system="LandingZone")  # Replace with your file system name
            directory_client = file_system_client.get_directory_client(folder_path)

            # Check if the folder exists
            if not directory_client.exists():
                raise FileNotFoundError(f"Folder '{folder_path}' not found.")

            # Get the next file name
            next_file_name = self.get_next_file_name(schema_name, table_name)

            # Add an underscore to the file name for temporary upload
            temp_file_name = f"_{next_file_name}"

            # Upload the file
            file_client = directory_client.create_file(temp_file_name)
            with open(local_file_path, "rb") as file_data:
                file_contents = file_data.read()
            file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
            file_client.flush_data(len(file_contents))

            print(f"File uploaded successfully as '{temp_file_name}'.")

            # Python SDK doesn't handle rename properly for onelake, using REST API to rename the file instead
            self.rename_file_via_rest_api(f"LandingZone/{folder_path}", temp_file_name, next_file_name)
            print(f"File renamed successfully to '{next_file_name}'.")

        except Exception as e:
            raise Exception(f"Failed to upload data file: {e}") from e

    def rename_file_via_rest_api(self, folder_path: str, old_file_name: str, new_file_name: str):
        """
        Renames a file inside folder_path by issuing a REST ``PUT`` with a rename source header.

        :param folder_path: Folder (relative to the host) that contains the file.
        :param old_file_name: Current file name.
        :param new_file_name: Target file name.
        :raises Exception: If the service does not answer with status 200 or 201.
        """
        # Create a ClientSecretCredential
        credential = ClientSecretCredential(self.client_tenant, self.client_id, self.client_secret)
        # Get a token
        token = credential.get_token("https://storage.azure.com/.default").token

        # Construct the rename URL
        rename_url = f"{self.host}/{folder_path}/{new_file_name}"

        # Construct the source path
        source_path = f"{self.host}/{folder_path}/{old_file_name}"

        # Set the headers
        headers = {
            "Authorization": f"Bearer {token}",
            "x-ms-rename-source": source_path,
            "x-ms-version": "2020-06-12"
        }

        # Send the rename request
        response = requests.put(rename_url, headers=headers)

        # A failed rename must not be reported as success by the caller, so raise
        # instead of only printing.
        if response.status_code not in (200, 201):
            raise Exception(f"Failed to rename file. Status code: {response.status_code}, Error: {response.text}")

        print(f"File renamed from {old_file_name} to {new_file_name} successfully.")

    def get_mirrored_database_status(self):
        """
        Retrieves and displays the status of the mirrored database from Monitoring/replicator.json.

        :raises Exception: If the status file or path does not exist.
        """
        file_system_client = self.service_client.get_file_system_client(file_system="Monitoring")
        try:
            file_client = file_system_client.get_file_client("replicator.json")
            if not file_client.exists():
                raise Exception("No status of mirrored database has been found. Please check whether the mirrored database has been started properly.")

            download = file_client.download_file()
            content = download.readall()
            status_json = json.loads(content)
            print(json.dumps(status_json, indent=4))
        except Exception as e:
            # Same user-facing message for every failure; the cause is chained for debugging.
            raise Exception("No status of mirrored database has been found. Please check whether the mirrored database has been started properly.") from e

    def get_table_status(self, schema_name: str = None, table_name: str = None):
        """
        Retrieves and displays the status of tables from Monitoring/tables.json.

        With no filter the whole document is printed. Otherwise only entries whose
        ``sourceSchemaName`` and ``sourceTableName`` both equal the given values are
        printed; a filter that is None is compared as the empty string.

        :param schema_name: Optional schema name to filter.
        :param table_name: Optional table name to filter.
        :raises Exception: If the status file or path does not exist.
        """
        file_system_client = self.service_client.get_file_system_client(file_system="Monitoring")
        try:
            file_client = file_system_client.get_file_client("tables.json")
            if not file_client.exists():
                raise Exception("No status of mirrored database has been found. Please check whether the mirrored database has been started properly.")

            download = file_client.download_file()
            content = download.readall()
            status_json = json.loads(content)

            # Treat None as empty string for filtering
            schema_name = schema_name or ""
            table_name = table_name or ""

            if not schema_name and not table_name:
                # Show the whole JSON content
                print(json.dumps(status_json, indent=4))
            else:
                # Filter tables array
                filtered_tables = [
                    t for t in status_json.get("tables", [])
                    if t.get("sourceSchemaName", "") == schema_name and t.get("sourceTableName", "") == table_name
                ]
                print(json.dumps({"tables": filtered_tables}, indent=4))
        except Exception as e:
            # Same user-facing message for every failure; the cause is chained for debugging.
            raise Exception("No status of mirrored database has been found. Please check whether the mirrored database has been started properly.") from e

Code to connect to SharePoint and download the Excel files from a folder

Variables

In [53]:
import requests
import os

def download_excel_files_from_sharepoint_folder(
    tenant_id: str,
    client_id: str,
    client_secret: str,
    sharepoint_domain: str,
    site_name: str,
    folder_name: str,
    lakehouse_subfolder: str = "excelfiles"
):
    """
    Downloads all .xlsx files from a specified SharePoint folder and saves them to a OneLake location.

    The function authenticates against Microsoft Graph with the client-credentials
    flow, resolves the site id, lists the children of the folder (following result
    pages), prints the listing and downloads every file whose name ends in ``.xlsx``.

    Args:
        tenant_id (str): Azure AD tenant ID.
        client_id (str): App registration client ID.
        client_secret (str): App registration client secret.
        sharepoint_domain (str): e.g. 'contoso.sharepoint.com'
        site_name (str): e.g. 'demosite'
        folder_name (str): The name of the subfolder (e.g. 'exceldocs') inside Shared Documents
        lakehouse_subfolder (str): OneLake path to save the Excel files (under /lakehouse/default/Files)

    Raises:
        requests.HTTPError: If the token, site, listing or download request fails.
    """

    # Create download directory if it doesn't exist
    download_dir = f"/lakehouse/default/Files/{lakehouse_subfolder}"
    os.makedirs(download_dir, exist_ok=True)

    # === 1. Get Access Token ===
    token_url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"
    token_data = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
        'scope': 'https://graph.microsoft.com/.default'
    }
    token_response = requests.post(token_url, data=token_data)
    # Fail with the HTTP error rather than a KeyError on a missing 'access_token'.
    token_response.raise_for_status()
    access_token = token_response.json()['access_token']
    headers = {
        'Authorization': f'Bearer {access_token}',
        'Accept': 'application/json'
    }

    # === 2. Get Site ID ===
    site_url = f"https://graph.microsoft.com/v1.0/sites/{sharepoint_domain}:/sites/{site_name}"
    site_response = requests.get(site_url, headers=headers)
    site_response.raise_for_status()
    site_id = site_response.json()['id']

    # === 3. Get contents of the target folder ===
    folder_path = f"/{folder_name}"
    list_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:{folder_path}:/children"

    # Graph returns large folders in pages; keep following '@odata.nextLink'
    # until it is absent so no file is silently skipped.
    items = []
    while list_url:
        response = requests.get(list_url, headers=headers)
        response.raise_for_status()
        page = response.json()
        items.extend(page.get('value', []))
        list_url = page.get('@odata.nextLink')

    print(f"\n📂 Contents of '{folder_name}' folder:\n")
    for item in items:
        icon = "📁" if "folder" in item else "📄"
        print(f"{icon} {item['name']}")

    # === 4. Download Excel files ===
    for item in items:
        if "file" in item and item["name"].endswith(".xlsx"):
            file_name = item["name"]
            file_id = item["id"]

            # Download URL
            download_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/items/{file_id}/content"
            file_response = requests.get(download_url, headers=headers)
            file_response.raise_for_status()

            save_path = os.path.join(download_dir, file_name)
            with open(save_path, "wb") as f:
                f.write(file_response.content)

            print(f"✅ Downloaded: {file_name} → {save_path}")


In [62]:
import pandas as pd
import os
def Mirror_Excel_File(folder_path,  tmp_location, clean):
    """
    Mirror every sheet of every .xlsx workbook in a folder as a table.

    For each workbook the file name (without extension) is used as the schema name
    and each sheet name as the table name. Every sheet is written to a local parquet
    file under ``<tmp_location>/mirroring/<schema>.schema/`` and uploaded through the
    module-level ``client`` object.

    Any error is reported with ``print`` and stops the processing of the remaining
    workbooks; nothing is raised to the caller.

    :param folder_path: Folder where the Excel files are.
    :param tmp_location: Temp storage folder (with or without a trailing slash).
    :param clean: ``"true"`` (any case) or ``True`` to delete each table from the
                  mirrored database before re-creating it; anything else skips that.
    """
    # Accept both the historical string flag and a real boolean.
    do_clean = clean is True or str(clean).strip().lower() == "true"

    try:
        for filename in os.listdir(folder_path):
            if not filename.endswith('.xlsx'):
                continue

            excel_file = os.path.join(folder_path, filename)
            file_base = os.path.splitext(filename)[0]
            schema_dir = os.path.join(tmp_location, "mirroring", f"{file_base}.schema")

            # The context manager releases the workbook handle once all sheets are read.
            with pd.ExcelFile(excel_file) as xls:
                # Convert each sheet to a separate parquet file
                for sheet in xls.sheet_names:
                    if do_clean:
                        client.remove_table(schema_name=file_base, table_name=sheet)
                    client.create_table(schema_name=file_base, table_name=sheet, key_cols=["__rowid__"])

                    df = pd.read_excel(xls, sheet_name=sheet)
                    # Synthetic key first, row marker last.
                    # NOTE(review): the open mirroring landing-zone format is understood to
                    # expect __rowMarker__ as the final column - confirm against the docs.
                    df['__rowid__'] = range(1, len(df) + 1)
                    df['__rowMarker__'] = '1'

                    os.makedirs(schema_dir, exist_ok=True)
                    parquet_file = os.path.join(schema_dir, f"{sheet}.parquet")
                    if os.path.exists(parquet_file):
                        os.remove(parquet_file)
                    df.to_parquet(parquet_file, index=False)
                    print(f"Saved '{sheet}' to '{parquet_file}'")

                    client.upload_data_file(schema_name=file_base, table_name=sheet, local_file_path=parquet_file)
                    print(f"Uploaded schema:'{file_base}' to tablename:'{sheet}'")
    except Exception as e:
        print(f"An error occurred: {e}")


In [63]:
# --- Configuration -----------------------------------------------------------
# The "{...}" values are placeholders to be replaced before running.
# NOTE(review): prefer loading the secret from a secret store instead of
# hard-coding it in the notebook.
tenant_id="{entra_tenant_id}"
client_id="{service_principal_id}"
client_secret="{service_principal_secret}"

sharepoint_domain="{sharepoint_domain}.sharepoint.com"
site_name="{sharepoint_site_name}"
sharepoint_folder = "exceldocs"  # The SharePoint subfolder to read the Excel files from
lakehouse_subfolder="excelfiles" # Lakehouse subfolder where the downloaded files are stored

excel_path = f"/lakehouse/default/Files/{lakehouse_subfolder}/"  # Folder containing the downloaded Excel files
temp_file = "/lakehouse/default/Files/temp/"      # Temp folder for the generated parquet files

# URL of the mirrored database landing zone (replace the workspace and database ids)
landing_zone = "https://onelake.dfs.fabric.microsoft.com/{workspaceid}/{mirrored database id}/Files/LandingZone"

# Setup the OpenMirroring Client
# NOTE: Mirror_Excel_File reads this module-level name `client`, so it must be
# defined before Mirror_Excel_File is called.
client = OpenMirroringClient(
    client_id=client_id,
    client_secret=client_secret,
    client_tenant=tenant_id,
    host=landing_zone
)


# Step 1: copy the .xlsx files from SharePoint into the lakehouse folder
download_excel_files_from_sharepoint_folder(
    tenant_id,
    client_id,
    client_secret,
    sharepoint_domain,
    site_name,
    sharepoint_folder,
    lakehouse_subfolder
)


# Step 2: mirror every sheet of every downloaded workbook.
# Do this for the first time you run it, or when you want to reset mirroring
#Mirror_Excel_File( excel_path,  temp_file ,"true")

# Do this for the 2nd / 3rd runs.
Mirror_Excel_File( excel_path,  temp_file ,"false")




📂 Contents of 'exceldocs' folder:

📄 Book2.xlsx
✅ Downloaded: Book2.xlsx → /lakehouse/default/Files/excelfiles/Book2.xlsx
Folder and _metadata.json created successfully at: Book.schema/demo
Saved 'demo' to '/lakehouse/default/Files/temp/mirroring/Book.schema/demo.parquet'
File uploaded successfully as '_00000000000000000004.parquet'.
File renamed from _00000000000000000004.parquet to 00000000000000000004.parquet successfully.
File renamed successfully to '00000000000000000004.parquet'.
Uploaded schema:'Book' to tablename:'demo'
Folder and _metadata.json created successfully at: Book.schema/friends
Saved 'friends' to '/lakehouse/default/Files/temp/mirroring/Book.schema/friends.parquet'
File uploaded successfully as '_00000000000000000004.parquet'.
File renamed from _00000000000000000004.parquet to 00000000000000000004.parquet successfully.
File renamed successfully to '00000000000000000004.parquet'.
Uploaded schema:'Book' to tablename:'friends'
Folder and _metadata.json created success