In [1]:
import os
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient

# Azure Storage account and container the assignment files are read from.
ACCOUNT_URL = "https://saqdiveassignments.blob.core.windows.net/"
CONTAINER_NAME = "dataengineerfiles"

# Local download directory: "<current working directory>/data".
DATA_DIR_PATH = os.path.join(os.getcwd(), "data")
# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(DATA_DIR_PATH, exist_ok=True)

In [2]:
class AzureBlobOperations:
    """Small wrapper around ``BlobServiceClient`` for downloading blobs to disk.

    Downloads are written below ``download_dir`` when it is given, otherwise
    below the module-level ``DATA_DIR_PATH``.

    Attributes:
        blob_service_client: The ``BlobServiceClient`` created in ``__init__``.
    """

    def __init__(self, storage_url: str, credentials=None) -> None:
        """Creates the underlying blob service client.

        Args:
            storage_url: URL of the storage account.
            credentials: Passed through unchanged as the ``credential``
                argument of ``BlobServiceClient``. Defaults to ``None``.
        """
        self.blob_service_client = BlobServiceClient(storage_url, credential=credentials)

    def download_blob_to_file(self, container_name: str, blob_name: str, download_dir: "str | None" = None):
        """Downloads a single blob into the download directory.

        The local file is named after the blob. Any ``/``-separated prefix in
        the blob name becomes a sub-directory, which is created if missing.

        Args:
            container_name: Name of the storage container containing the blob.
            blob_name: Name of the blob to download.
            download_dir: Directory to download into. Defaults to
                ``DATA_DIR_PATH`` when ``None``.

        Raises:
            ValueError: If ``blob_name`` would resolve to a path outside the
                download directory (e.g. it contains ``..`` or is absolute).
        """
        # Validate the destination first so no request is made for a name we refuse to write.
        target_path = self._resolve_target_path(blob_name, download_dir)
        blob_client = self.blob_service_client.get_blob_client(container=container_name, blob=blob_name)
        # Request the blob before touching the filesystem, so an error raised
        # by download_blob() does not leave an empty local file behind.
        download_stream = blob_client.download_blob()
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        with open(file=target_path, mode="wb") as local_file:
            # readinto() writes into the open file instead of first building
            # the whole blob as one bytes object the way readall() does.
            download_stream.readinto(local_file)

    def download_all_blobs_to_dir(self, container_name: str, download_dir: "str | None" = None):
        """Downloads all blobs contained in a container on Azure Storage.

        Based on the download blob function from Azure Documentation.

        Args:
            container_name: Name of the storage container containing the blobs.
            download_dir: Directory to download into. Defaults to
                ``DATA_DIR_PATH`` when ``None``.

        Raises:
            ValueError: If any blob name would resolve to a path outside the
                download directory.
        """
        container_client = self.blob_service_client.get_container_client(container=container_name)
        for blob_name in container_client.list_blob_names():
            self.download_blob_to_file(container_name, blob_name, download_dir)

    @staticmethod
    def _resolve_target_path(blob_name: str, download_dir: "str | None" = None) -> str:
        """Returns the local path for ``blob_name``, rejecting paths that escape the download directory."""
        base_dir = os.path.realpath(DATA_DIR_PATH if download_dir is None else download_dir)
        target_path = os.path.realpath(os.path.join(base_dir, blob_name))
        # Blob names come from remote storage; "../x" or "/abs/x" must not
        # be allowed to write outside the chosen directory.
        if target_path == base_dir or os.path.commonpath([base_dir, target_path]) != base_dir:
            raise ValueError(f"Blob name {blob_name!r} resolves outside of download directory {base_dir!r}")
        return target_path

In [None]:
# Build the wrapper without a credential (credentials=None) and get a client
# for the target container.
azure_op = AzureBlobOperations(
    storage_url=ACCOUNT_URL,
    credentials=None,
)
container_client = azure_op.blob_service_client.get_container_client(
    container=CONTAINER_NAME,
)

# Print the name and size of every blob the container reports.
for blob in container_client.list_blobs():
    print(f"Name: {blob.name}, Size: {blob.size}")

In [None]:
# Download only "titles.tsv".
# To fetch every blob in the container instead, call:
#     azure_op.download_all_blobs_to_dir(CONTAINER_NAME)
azure_op.download_blob_to_file(container_name=CONTAINER_NAME, blob_name="titles.tsv")

In [None]:
# Load as polars/pandas dataframe


## 2. Data Engineering
To scale the data engineering steps, we first need to identify which steps are required. To do this, the following needs to be investigated:

### 1. Visualize raw data

In [None]:
# plot dataframe info

In [None]:
# describe dataframe

### 2. Clean data
Bring the data into a best-practice format so that data scientists can work with it more easily down the line.

In [None]:
# Rename columns
# Treat null values sensibly
# One-hot encode categorical columns
# Set consistent schema by converting types

### 3. Aggregate/Transform data
Reduce redundant information and transform data into the most useful and expressive representation considering analysis and ML.

In [None]:
# 

### 4. Write results
After refining the input data, it needs to be made accessible to the data scientists. The optimal method depends strongly on their tools and use case.
For this exercise I will assume they want to generate a dashboard for monitoring (repeating the same analysis), create estimators based on the data (ML), or both (MLOps).
Therefore it makes sense to host the data in a readily available service like Google BigQuery.

In [None]:
# Connect to BigQuery

# Upload to project QDive