In [8]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil
from azureml.core import Workspace, Dataset, Datastore
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.data.dataset_factory import FileDatasetFactory

In [9]:
# Step 1: Authenticate interactively, then open the Azure ML workspace.
# The workspace coordinates are collected in one mapping so they can be
# read (and changed) in a single place.
auth = InteractiveLoginAuthentication()
workspace_location = {
    "subscription_id": "0a94de80-6d3b-49f2-b3e9-ec5818862801",
    "resource_group": "buas-y2",
    "workspace_name": "NLP8-2025",
}
ws = Workspace(auth=auth, **workspace_location)

In [10]:
# Step 2: Look up the blob datastore used for uploads and fetch version '1'
# of the registered source dataset.
datastore = Datastore.get(workspace=ws, datastore_name='workspaceblobstore')
dataset = Dataset.get_by_name(ws, 'RO_Dataset_sample', version='1')

In [11]:
# Step 3: Download the dataset files into a local working directory.
# The directory is created if missing; files already present with the same
# name are overwritten by the download.
download_dir = "temp_download"
os.makedirs(download_dir, exist_ok=True)
# Kept as the last expression so the cell displays the downloaded paths.
dataset.download(target_path=download_dir, overwrite=True)

{'infer_column_types': 'False', 'activity': 'download'}
{'infer_column_types': 'False', 'activity': 'download', 'activityApp': 'FileDataset'}


['c:\\Users\\artjo\\Documents\\BUAS\\Year 2\\2024-25d-fai2-adsai-group-nlp8\\data\\temp_download\\sample_transcript.csv']

In [12]:
# Step 4: Load CSV and split
# os.listdir returns entries in arbitrary order, so the candidates are sorted
# to make the choice deterministic: if several CSVs are present in
# temp_download, the alphabetically first one is used.
csv_names = sorted(f for f in os.listdir("temp_download") if f.endswith('.csv'))
if not csv_names:
    # Fail with a message that points at the cause instead of a bare IndexError.
    raise FileNotFoundError(
        "No .csv file found in 'temp_download'; run the download step first."
    )
csv_path = os.path.join("temp_download", csv_names[0])
df = pd.read_csv(csv_path)

# First hold out 30% of the rows, then cut that hold-out in half:
# 70% train / 15% validation / 15% test. The fixed random_state keeps the
# split reproducible across runs.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [13]:
# Step 5: Write each split to its own CSV inside the upload staging folder.
# The DataFrame index is not written to the files.
os.makedirs("upload_temp", exist_ok=True)
split_outputs = {
    "upload_temp/train.csv": train_df,
    "upload_temp/val.csv": val_df,
    "upload_temp/test.csv": test_df,
}
for split_path, split_frame in split_outputs.items():
    split_frame.to_csv(split_path, index=False)

In [14]:
# Step 6: Upload everything in the staging folder to the root of the datastore.
# overwrite=True replaces files that already exist at the target.
blob_root = (datastore, "/")
# Kept as the last expression so the cell displays the resulting dataset.
Dataset.File.upload_directory(
    src_dir="upload_temp", target=blob_root, overwrite=True, show_progress=True
)

Message: [NOT_SUPPORTED_API_USE_ATTEMPT] The [_get_steps] API has been deprecated and is no longer supported
Payload: {"pid": 13328, "rslex_version": "2.24.5", "api_name": "_get_steps", "version": "5.3.3"}


Validating arguments.
Arguments validated.
'overwrite' is set to True. Any file already present in the target will be overwritten.
Uploading files from 'c:/Users/artjo/Documents/BUAS/Year 2/2024-25d-fai2-adsai-group-nlp8/data/upload_temp' to '/'
Creating new dataset


{
  "definition": "EnginelessDataflow:\n---\ntype: mltable\npaths:\n  - pattern: \"azureml://subscriptions/0a94de80-6d3b-49f2-b3e9-ec5818862801/resourcegroups/buas-y2/workspaces/NLP8-2025/datastores/workspaceblobstore/paths/\"\nmetadata:\n  infer_column_types: \"False\"\n"
}

In [15]:
# Step 7: Register one file dataset per split, each pointing at its CSV in the
# datastore root. create_new_version=True is passed on every registration.
split_assets = [
    # (file in datastore, registered name, description)
    ('train.csv', 'dataset_train', 'Training split'),
    ('val.csv', 'dataset_val', 'Validation split'),
    ('test.csv', 'dataset_test', 'Test split'),
]
for blob_name, asset_name, asset_description in split_assets:
    split_dataset = Dataset.File.from_files((datastore, blob_name))
    split_dataset.register(
        ws, name=asset_name, description=asset_description, create_new_version=True)

print("Data splits uploaded and registered successfully")

Data splits uploaded and registered successfully
