# Download Batch Data

In [None]:
import os
from azure.storage.filedatalake import DataLakeServiceClient

In [None]:
# Access values for the Azure Data Lake container that holds the source files.

# Dataset folder (inside the container) that the rest of the notebook works on.
# Set this to "amazon_metadata" to process the metadata dataset instead.
FILE_PATH = "amazon_reviews"

STORAGE_ACCOUNT = "safactoreddatathon"
CONTAINER_NAME = "source-files"

# The embedded SAS token carries an expiry date (`se=2023-08-13...`), so a
# fresh token can be supplied through the AZURE_STORAGE_SAS_TOKEN environment
# variable without editing the notebook. The literal is kept only as a
# backward-compatible fallback.
SAS_TOKEN = os.environ.get(
    "AZURE_STORAGE_SAS_TOKEN",
    "sp=rle&st=2023-07-25T18:12:36Z&se=2023-08-13T02:12:36Z&sv=2022-11-02&sr=c&sig=l2TCTwPWN8LSM922lR%2Fw78mZWQK2ErEOQDUaCJosIaw%3D",
)

# DFS (Data Lake Gen2) endpoint of the storage account.
ACCOUNT_URL = f"https://{STORAGE_ACCOUNT}.dfs.core.windows.net"

In [None]:
# Build the clients: the account-level service client first, then the client
# for the file system named by CONTAINER_NAME.

service_client = DataLakeServiceClient(account_url=ACCOUNT_URL, credential=SAS_TOKEN)
file_system_client = service_client.get_file_system_client(file_system=CONTAINER_NAME)

## Data source exploration

In [None]:
# Utils

def get_partitions():
    """Return the names of all directory entries listed under FILE_PATH."""
    partition_names = []
    for entry in file_system_client.get_paths(FILE_PATH):
        if entry.is_directory:
            partition_names.append(entry.name)
    return partition_names

def ls_dir(path):
    """Print the name of every entry the file system lists under `path`."""
    for entry in file_system_client.get_paths(path):
        print(entry.name)

amazon_reviews: 2500 partitions

amazon_metadata: 1503 partitions

In [None]:
# List the partition directories of the selected dataset and report how many
# there are; `partitions` is reused by the following cell.
partitions = get_partitions()
num_partitions = len(partitions)
print(f"Number of partitions: {num_partitions}")

In [None]:
# Peek at the contents of the first partition. Guard against an empty listing
# so a dataset without partitions does not raise IndexError.
if partitions:
    ls_dir(partitions[0])
else:
    print("No partitions found")

In [None]:
# Sanity check: partitions should contain files only.
# Print the first entry whose name has more than three '/'-separated
# segments (i.e. more than two slashes); no output means none was found.

paths = file_system_client.get_paths(FILE_PATH)
nested = next((entry.name for entry in paths if entry.name.count('/') > 2), None)
if nested is not None:
    print(nested)

In [None]:
# Collect the distinct file extensions found under FILE_PATH

paths = file_system_client.get_paths(FILE_PATH)
extensions = {os.path.splitext(entry.name)[1] for entry in paths}
extensions.discard('')  # names without an extension yield an empty string
print(extensions)

## Data source selection

amazon_reviews: 5230 `.gz` files, 2738 non-empty

amazon_metadata: 1503 `.gz` files, 0 empty

In [None]:
# Collect the paths of the .gz files worth downloading

paths = file_system_client.get_paths(FILE_PATH)
gz_entries = [entry for entry in paths if entry.name.endswith('.gz')]

total_gz_files = len(gz_entries)
# Only files larger than 20 bytes count as valid
# (presumably 20 bytes is the size of an empty gzip archive - TODO confirm).
gz_files_path = [entry.name for entry in gz_entries if entry.content_length > 20]

print(f'Total .gz files: {total_gz_files}')
print(f'Valid .gz files: {len(gz_files_path)}')

## Data source acquisition

amazon_reviews: 22.4 GB

amazon_metadata: 11.8 GB

In [None]:
# Download .gz files

# 1-based, inclusive range of entries in gz_files_path to fetch in this run.
files_range = (1, 500)
first, last = files_range

# Files are written into a local folder named after the dataset; create it up
# front so open() does not fail with FileNotFoundError when it is missing.
os.makedirs(FILE_PATH, exist_ok=True)

for file_path in gz_files_path[first - 1:last]:
    if not file_path.endswith('.gz'):
        continue

    file_client = file_system_client.get_file_client(file_path)
    # NOTE(review): only the base name is kept, so files from different
    # partitions that share a name would overwrite each other locally -
    # confirm that file names are unique across partitions.
    file_name = os.path.basename(file_path)
    local_file_path = os.path.join(FILE_PATH, file_name)

    # Write the remote file's content straight into the local binary file.
    with open(local_file_path, 'wb') as stream:
        download = file_client.download_file()
        download.readinto(stream)