In [1]:
import io
import os
import json
from pathlib import Path

import boto3
import requests
from requests import Session

In [2]:
# --- Notebook configuration: everything a reader is expected to edit ---

# Environment name; it is interpolated into the API hostname.
env = "devpolly"

# Refresh token sent as the `refreshToken` cookie on API calls.
# Read from the POLLY_REFRESH_TOKEN environment variable so the secret never
# has to be saved in the notebook. Falls back to "" when the variable is unset;
# an empty token makes Polly.get_session() raise UnauthorizedException.
token = os.environ.get("POLLY_REFRESH_TOKEN", "")

# Repository whose files endpoint receives the uploads.
repo_id = "1644896537390"
# Local directory whose files are uploaded.
local_path_to_datasets = "testing/TXT"
# Folder sent as the `folder` attribute when requesting upload URLs.
subfolder_path_in_datalake = "testing/TXT"

In [3]:
class UnauthorizedException(Exception):
    """Raised when a session is requested but no token or default session exists."""

    def __str__(self):
        # Plain literal: the message has no placeholders, so no f-string is needed.
        return "Authorization failed"


class PollySession(Session):
    """A ``requests.Session`` that sends the notebook's API headers on every request.

    Args:
        token: Value placed in the ``refreshToken`` cookie of the ``Cookie`` header.
    """

    def __init__(self, token):
        super().__init__()
        # Update the existing header mapping instead of replacing it with a
        # plain dict: replacing discards the session's case-insensitive
        # mapping and the library's default headers.
        self.headers.update({
            "Content-Type": "application/vnd.api+json",
            "Cookie": f"refreshToken={token}",
            "User-Agent": "jupyter-notebook",
        })


class Polly:
    """Access point for sessions, with an optional class-wide default."""

    # Shared session installed by auth(); None until auth() has been called.
    default_session = None

    @classmethod
    def auth(cls, token):
        """Create a PollySession for ``token`` and store it as the default."""
        cls.default_session = PollySession(token)

    @classmethod
    def get_session(cls, token=None):
        """Return a session for ``token``, or the default when no token is given.

        A truthy ``token`` always produces a new PollySession. Otherwise the
        session stored by auth() is returned.

        Raises:
            UnauthorizedException: if ``token`` is falsy and no default
                session has been set.
        """
        if token:
            return PollySession(token)
        if cls.default_session:
            return cls.default_session
        raise UnauthorizedException

In [4]:
# Session used by generate_upload_urls(). With an empty `token` and no prior
# Polly.auth() call, get_session() raises UnauthorizedException here.
session = Polly.get_session(token)

In [5]:
# API base URL; the hostname is derived from `env`.
url_discover = f"https://api.discover.{env}.elucidata.io"
# Files endpoint of repository `repo_id`; generate_upload_urls() POSTs to it.
files_api_endpoint = f"{url_discover}/repositories/{repo_id}/files"

In [6]:
def generate_upload_urls(timeout=30):
    """Ask the files endpoint for upload details for data and metadata.

    POSTs a ``files`` resource carrying ``subfolder_path_in_datalake`` as its
    ``folder`` attribute to ``files_api_endpoint`` using the notebook-level
    ``session``.

    Args:
        timeout: Seconds to wait on the HTTP request before giving up. Without
            a timeout, requests can block indefinitely on an unresponsive server.

    Returns:
        A ``(data_upload_details, metadata_upload_details)`` tuple read from
        the ``data_upload_url`` and ``metadata_upload_url`` attributes of the
        response. Either element is ``None`` when the response lacks it.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the request exceeds ``timeout``.
    """
    payload = {
        "data": {
            "type": "files",
            "attributes": {
                "folder": subfolder_path_in_datalake
            }
        }
    }
    response = session.post(files_api_endpoint, json=payload, timeout=timeout)
    response.raise_for_status()

    # Both values live under data.attributes; look that mapping up once.
    attributes = response.json().get("data", {}).get("attributes", {})
    data_upload_details = attributes.get("data_upload_url")
    metadata_upload_details = attributes.get("metadata_upload_url")
    return data_upload_details, metadata_upload_details

In [28]:
# Fetch the upload details once; the upload cells below read the 'url' and
# 'fields' entries of each.
data_upload, metadata_upload = generate_upload_urls()

In [40]:
# Upload every regular file found directly inside local_path_to_datasets.
dataset_dir = Path(local_path_to_datasets)
# is_file() skips sub-directories, which open() cannot read;
# sorted() makes the upload order (and the printed log) deterministic.
dataset_paths = sorted(entry for entry in dataset_dir.iterdir() if entry.is_file())

for dataset_path in dataset_paths:
    # The path string doubles as the multipart filename, as before.
    filename = str(dataset_path)
    with open(filename, 'rb') as file_to_upload:
        files = {'file': (filename, file_to_upload)}
        upload_response = requests.post(
            data_upload['url'],
            data=data_upload['fields'],
            files=files,
            timeout=60,  # seconds; avoids a cell that hangs forever on a stalled connection
        )
    print(f"Upload response for {filename}: {upload_response.status_code}")

Upload response for testing/TXT/test1.txt: 204
Upload response for testing/TXT/test2.txt: 204
Upload response for testing/TXT/test3.txt: 204


In [41]:
# (file id, dataset_id) for each uploaded file that gets a metadata record.
_metadata_entries = [
    ("testing/TXT/test1.txt", "test_text1"),
    ("testing/TXT/test2.txt", "test_text2"),
    ("testing/TXT/test3.txt", "test_text3"),
]

# One `file_metadata` record per entry; all records share the same
# prop_1 / prop_2 values and differ only in id and dataset_id.
metadata_file_content = {
    "data": [
        {
            "id": file_id,
            "type": "file_metadata",
            "attributes": {
                "dataset_id": dataset_id,
                "prop_1": "value_1",
                "prop_2": "value_2",
            },
        }
        for file_id, dataset_id in _metadata_entries
    ]
}

In [42]:
# Serialise the metadata straight to bytes; json.dumps() yields the same text
# that json.dump() would write into an intermediate StringIO.
metadata_buffer = io.BytesIO(json.dumps(metadata_file_content).encode())

metadata_filename = "metadata.json"
metadata_files = {'file': (metadata_filename, metadata_buffer)}
upload_response = requests.post(
    metadata_upload['url'],
    data=metadata_upload['fields'],
    files=metadata_files,
    timeout=60,  # seconds; avoids a cell that hangs forever on a stalled connection
)
print(f"Upload response for {metadata_filename}: {upload_response.status_code}")

Upload response for metadata.json: 204
