# Load data into Azure Data Lake Storage Gen2

In [1]:
import os
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv 

In [2]:
# Setting up our functions

# Function to connect to ADLS Gen2
def create_blob_service_client(connection_string):
    """Build a ``BlobServiceClient`` from a connection string.

    Args:
        connection_string: Connection string handed to
            ``BlobServiceClient.from_connection_string``.

    Returns:
        Whatever ``BlobServiceClient.from_connection_string`` returns.

    Raises:
        ValueError: If ``connection_string`` is ``None`` or blank, for example
            because the environment variable it was read from is not set.
    """
    # Reject a missing value here instead of passing it on to the SDK.
    if connection_string is None or not str(connection_string).strip():
        raise ValueError("connection_string must be a non-empty string")
    return BlobServiceClient.from_connection_string(connection_string)

# Function to upload files to the respective directories in Azure
def upload_file_to_adls(file_path, container_name, azure_directory, blob_service_client):
    """Upload one local file to a container, keeping its file name.

    The blob is written as ``<azure_directory>/<file name>`` and the upload is
    requested with ``overwrite=True``.

    Args:
        file_path: Path of the local file to upload.
        container_name: Name of the target container.
        azure_directory: Directory prefix inside the container. Leading and
            trailing ``/`` are ignored; ``None`` or an empty value puts the
            blob directly under the container with no prefix.
        blob_service_client: Object exposing
            ``get_blob_client(container=..., blob=...)``.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    file_name = os.path.basename(file_path)  # Preserving file name
    # Normalise the prefix so "dir/" or "" never yields "dir//name" or "/name".
    directory = (azure_directory or "").strip("/")
    blob_path = f"{directory}/{file_name}" if directory else file_name
    print(f"Uploading {file_path} to {blob_path}...")

    # Upload the file to Azure
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_path)
    with open(file_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)
    print(f"Upload complete for {file_path} to {blob_path}")

# Function to iterate over the files and upload to corresponding directories
def upload_files_to_adls(local_directory, container_name, directory_mapping, blob_service_client):
    """Upload the files of each mapped local subdirectory to its Azure directory.

    Only regular files directly inside each subdirectory are uploaded; nested
    directories are not descended into. A mapped subdirectory that does not
    exist is reported and skipped.

    Args:
        local_directory: Base local directory that contains the subdirectories.
        container_name: Container name forwarded to ``upload_file_to_adls``.
        directory_mapping: Dict mapping a local subdirectory name to the
            target directory in Azure.
        blob_service_client: Client forwarded to ``upload_file_to_adls``.
    """
    for local_subdirectory, azure_directory in directory_mapping.items():
        # Full local path to the subdirectory
        full_local_path = os.path.join(local_directory, local_subdirectory)

        if not os.path.isdir(full_local_path):
            # Report the skip so a typo in the mapping is not mistaken for success.
            print(f"Skipping {full_local_path}: not a directory")
            continue

        # Sorted so the upload order does not depend on the file system.
        for file_name in sorted(os.listdir(full_local_path)):
            file_path_on_local = os.path.join(full_local_path, file_name)
            if os.path.isfile(file_path_on_local):
                # Upload each file to the corresponding Azure directory
                upload_file_to_adls(file_path_on_local, container_name, azure_directory, blob_service_client)

In [3]:
# Set the variables for the local directory, Azure container and connection string

# Load variables from the .env file into the process environment
load_dotenv()

local_directory = os.getenv('DATASET_PATH')  # DATASET_PATH stored in .env file
connection_string = os.getenv('AZURE_CONNECTION_STRING')  # AZURE_CONNECTION_STRING stored in .env file
container_name = os.getenv('ADLS_CONTAINER_NAME')  # ADLS_CONTAINER_NAME stored in .env file

# os.getenv returns None for an unset variable; stop here with the variable
# names (never the values) instead of failing later in the upload cells.
_missing_settings = [
    name
    for name, value in (
        ('DATASET_PATH', local_directory),
        ('AZURE_CONNECTION_STRING', connection_string),
        ('ADLS_CONTAINER_NAME', container_name),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )


In [4]:
# Build the client that the upload cells pass to upload_files_to_adls
blob_service_client = create_blob_service_client(connection_string=connection_string)

In [5]:
# Loading our data from each data folder

# Source subfolder (inside local_directory) and destination directory in ADLS
local_data_folder_1 = 'counsel_chat_data'
adls_directory_1 = 'counsel_chat_data'

# Files found in local_data_folder_1 are uploaded under adls_directory_1
directory_mapping_1 = {local_data_folder_1: adls_directory_1}

# Upload every file in the mapped subfolder
upload_files_to_adls(
    local_directory=local_directory,
    container_name=container_name,
    directory_mapping=directory_mapping_1,
    blob_service_client=blob_service_client,
)

Uploading C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\counsel_chat_data\counsel_chat_data.json to counsel_chat_data/counsel_chat_data.json...
Upload complete for C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\counsel_chat_data\counsel_chat_data.json to counsel_chat_data/counsel_chat_data.json


In [6]:
# Loading our data from each data folder

# Source subfolder (inside local_directory) and destination directory in ADLS
local_data_folder_2 = 'mentalhealth_data'
adls_directory_2 = 'mentalhealth_data'

# Files found in local_data_folder_2 are uploaded under adls_directory_2
directory_mapping_2 = {local_data_folder_2: adls_directory_2}

# Upload every file in the mapped subfolder
upload_files_to_adls(
    local_directory=local_directory,
    container_name=container_name,
    directory_mapping=directory_mapping_2,
    blob_service_client=blob_service_client,
)

Uploading C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data\mentalhealth_data_part1.json to mentalhealth_data/mentalhealth_data_part1.json...
Upload complete for C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data\mentalhealth_data_part1.json to mentalhealth_data/mentalhealth_data_part1.json
Uploading C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data\mentalhealth_data_part10.json to mentalhealth_data/mentalhealth_data_part10.json...
Upload complete for C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data\mentalhealth_data_part10.json to mentalhealth_data/mentalhealth_data_part10.json
Uploading C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data\mentalhealth_data_part11.json to mentalhealth_data/mentalhealth_data_part11.json...
Upload complete for C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data\mentalhe

In [None]:
# Template cell for uploading additional data folders in the future; replace the placeholder folder names below before running

# Loading our data from each data folder

# Placeholder source subfolder (inside local_directory) and destination directory in ADLS
local_data_folder_x = 'my_folder'
adls_directory_x = 'my_adls_dir'

# Files found in local_data_folder_x are uploaded under adls_directory_x
directory_mapping_x = {local_data_folder_x: adls_directory_x}

# Upload every file in the mapped subfolder
upload_files_to_adls(
    local_directory=local_directory,
    container_name=container_name,
    directory_mapping=directory_mapping_x,
    blob_service_client=blob_service_client,
)