<a href="https://colab.research.google.com/github/DarthCoder501/GAAP/blob/main/OneDrive_Dataset_Download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install rclone to enable syncing and mounting of OneDrive.
# The install script is fetched from rclone.org with curl and piped directly into
# a root shell (`sudo bash`), so it runs with full privileges on this machine.
!curl https://rclone.org/install.sh | sudo bash

In [None]:
# Launch rclone configuration interface.
# NOTE(review): the mount command later in this notebook refers to a remote named
# "MyOneDrive", so the remote created here presumably has to use that exact name — confirm.
!rclone config

In [None]:
# Update packages and install FUSE3.
# Because of `&&`, the install only runs if the package index refresh succeeds;
# `-y` answers the confirmation prompt automatically.
# NOTE(review): fuse3 is presumably what `rclone mount` needs to expose the remote
# as a local filesystem — confirm.
!apt-get update && apt-get install -y fuse3

In [None]:
# Create a local directory where OneDrive will be mounted.
# `-p` creates missing parent directories and does not fail if the directory already exists.
!mkdir -p /content/MyOneDrive

In [None]:
# Mount the remote OneDrive directory to the local path using rclone.
# "MyOneDrive:" is the rclone remote being mounted and /content/MyOneDrive is the
# local mount point created in the previous cell.
# NOTE(review): the flags are expected to (a) keep a full local cache of file data
# (--vfs-cache-mode full), (b) let other users access the mount (--allow-other) and
# (c) run the mount in the background (--daemon) — confirm against the rclone docs
# for the installed version.
!rclone mount MyOneDrive: /content/MyOneDrive --vfs-cache-mode full --allow-other --daemon

In [None]:
!pip install azure-storage-blob

In [None]:
# Import necessary libraries
import getpass
import os
from itertools import islice

import pandas as pd
import requests
from azure.storage.blob import BlobServiceClient, ContainerClient

In [None]:
# Download configuration.
#
# The container SAS URL is a credential: its query string carries the access
# signature, and it is only valid for a limited time window. It is therefore not
# stored in the notebook. Provide it through the INSPECT_SAS_URL environment
# variable, or paste it into the hidden prompt when this cell runs.
sas_url = (os.environ.get("INSPECT_SAS_URL") or getpass.getpass("Container SAS URL: ")).strip()
if not sas_url:
    raise ValueError("A container SAS URL is required to access the dataset.")

# CSV listing the impression_ids to download.
csv_path = "/content/Progression Dataset.csv"

# Destination folder inside the mounted OneDrive.
download_dir = "/content/MyOneDrive/GAAP Research Resources/Imaging & Impressions Dataset"
os.makedirs(download_dir, exist_ok=True)

In [None]:
# Load CSV with desired impression_ids.
# The column is read as text so IDs keep exactly the spelling used in the CSV:
# if pandas inferred a numeric dtype (e.g. float because of a blank cell), the IDs
# would stringify as "123.0" and never match a blob name.
df = pd.read_csv(csv_path, dtype={'impression_id': str})

# Drop missing IDs instead of turning them into the literal string "nan".
valid_ids = set(df['impression_id'].dropna())

# Blob names in the container carry a "CTPA/" prefix, so include it here.
expected_files = {f"CTPA/{impression_id}.nii.gz" for impression_id in valid_ids}

# Connect to blob container using SAS URL
container_client = ContainerClient.from_container_url(container_url=sas_url)

In [None]:
# Verify inputs: show where the IDs come from, how many there are, and a small
# sample of both the IDs and the blob names so their formats can be compared by eye.
print(f"CSV Path: {csv_path}")
print(f"Found {len(valid_ids)} impression IDs in CSV")
print("First 5 IDs:", list(islice(valid_ids, 5)))
print("------------------------------------------------")

# islice stops the (paged, remote) listing after five blobs; filtering an
# enumerate() with `i < 5` would still walk every blob in the container.
sample_blobs = [blob.name for blob in islice(container_client.list_blobs(), 5)]
print("Blob Names")
print(sample_blobs)

In [None]:
# Matching logic

# Recap of the IDs that will be matched against the container listing.
print(f"Found {len(valid_ids)} impression IDs in CSV")
print("First 5 IDs:", list(valid_ids)[:5])

# Blob names are expected under the CTPA/ prefix, one .nii.gz per impression ID.
expected_files = set(f"CTPA/{imp_id}.nii.gz" for imp_id in valid_ids)

# Counters kept for debugging the download cells.
downloaded_count, skipped_count = 0, 0

In [None]:
# Names of the .nii.gz files currently present in the download directory
existing_files = set(name for name in os.listdir(download_dir) if name.endswith('.nii.gz'))

In [None]:
# Number of .nii.gz files found in the download directory by the previous cell
print(f"{len(existing_files)}")

In [None]:
# Download files
downloaded_count = 0
for blob in container_client.list_blobs():
    if blob.name in expected_files:
        # Extract just the filename part (removes CTPA/ prefix)
        filename = blob.name.split('/')[-1]
        dest_path = os.path.join(download_dir, filename)

        print(f"Downloading: {blob.name} -> {dest_path}")
        blob_client = container_client.get_blob_client(blob)

        with open(dest_path, "wb") as f:
            f.write(blob_client.download_blob().readall())
        downloaded_count += 1

print(f"\nDownload complete! {downloaded_count} files saved to {download_dir}")

# Verify
if downloaded_count > 0:
    print("\nFirst 5 downloaded files:")
    !ls -lh "{download_dir}" | head -5

# To download additional files

In [None]:
for blob in container_client.list_blobs():
    if blob.name in expected_files:
        # Extract just the filename part (removes CTPA/ prefix)
        filename = blob.name.split('/')[-1]
        dest_path = os.path.join(download_dir, filename)

        # Skip if file already exists
        if filename in existing_files:
            skipped_count += 1
            continue

        print(f"Downloading: {blob.name} -> {dest_path}")
        blob_client = container_client.get_blob_client(blob)

        with open(dest_path, "wb") as f:
            f.write(blob_client.download_blob().readall())
        downloaded_count += 1

print(f"\nDownload complete! {downloaded_count} new files saved to {download_dir}")
print(f"Skipped {skipped_count} files that already existed")

# Verify
if downloaded_count > 0:
    print("\nFirst 5 downloaded files:")
    !ls -lh "{download_dir}" | head -5