In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Segger Dataset Creation\n",
    "\n",
    "This notebook demonstrates how to create a dataset using Segger, a tool for processing and analyzing spatial transcriptomics data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1: Setup and Environment\n",
    "\n",
    "First, we set up the environment by importing necessary libraries and ensuring that required directories exist."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "from urllib import request\n",
    "\n",
    "# Ensure PyGEOS is not used. The flag is set BEFORE the project import below\n",
    "# so that it is already in the environment when that import runs\n",
    "# (presumably it pulls in geopandas/shapely, which read it at import time).\n",
    "os.environ[\"USE_PYGEOS\"] = \"0\"\n",
    "\n",
    "# Add the src directory to the Python path before importing project code.\n",
    "# The membership check keeps sys.path free of duplicates when the cell is re-run.\n",
    "src_dir = os.path.abspath('../../src')\n",
    "if src_dir not in sys.path:\n",
    "    sys.path.insert(0, src_dir)\n",
    "\n",
    "# NOTE(review): with '../../src' on sys.path the package would normally be\n",
    "# importable as `segger...`; the `src.` prefix relies on the repository root\n",
    "# being importable as well -- confirm which form is intended.\n",
    "from src.segger.data.utils import XeniumSample  # noqa: E402\n",
    "\n",
    "# Define the data directory paths\n",
    "raw_data_dir = Path('data_raw/pancreatic')\n",
    "processed_data_dir = Path('data_tidy/pyg_datasets/pancreatic')\n",
    "\n",
    "# Create directories if they don't exist\n",
    "raw_data_dir.mkdir(parents=True, exist_ok=True)\n",
    "processed_data_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: Download Data\n",
    "\n",
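    "Download the transcript table and the nucleus-boundary file for the Xenium human pancreas sample from 10x Genomics. Files that already exist locally are not downloaded again."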
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transcript table: remote source and local destination\n",
    "transcripts_url = \"https://cf.10xgenomics.com/samples/xenium/1.3.0/xenium_human_pancreas/analysis/transcripts.csv.gz\"\n",
    "transcripts_path = raw_data_dir.joinpath(\"transcripts.csv.gz\")\n",
    "\n",
    "# Nucleus boundaries: remote source and local destination\n",
    "nuclei_url = \"https://cf.10xgenomics.com/samples/xenium/1.3.0/xenium_human_pancreas/analysis/nucleus_boundaries.csv.gz\"\n",
    "nuclei_path = raw_data_dir.joinpath(\"nucleus_boundaries.csv.gz\")\n",
    "\n",
    "# Function to download files\n",
    "def download_file(url, dest):\n",
    "    \"\"\"Download `url` to the path `dest` unless `dest` already exists.\n",
    "\n",
    "    The payload is first written to a sibling `<name>.part` file and renamed\n",
    "    onto `dest` only after the transfer has finished, so an interrupted\n",
    "    download is not mistaken for a complete file on the next run.\n",
    "\n",
    "    Args:\n",
    "        url: Address handed to `urllib.request.urlretrieve`.\n",
    "        dest: `pathlib.Path` of the final file.\n",
    "    \"\"\"\n",
    "    if dest.exists():\n",
    "        return\n",
    "    partial = dest.with_name(dest.name + '.part')\n",
    "    print(f\"Downloading {url} to {dest}...\")\n",
    "    try:\n",
    "        request.urlretrieve(url, partial)\n",
    "        partial.replace(dest)\n",
    "    finally:\n",
    "        # Only present here if the transfer or the rename failed\n",
    "        if partial.exists():\n",
    "            partial.unlink()\n",
    "    print(\"Download completed.\")\n",
    "\n",
    "# Fetch each input file in turn (transcripts first, then nuclei)\n",
    "for src_url, target_path in ((transcripts_url, transcripts_path), (nuclei_url, nuclei_path)):\n",
    "    download_file(src_url, target_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3: Load and Process Data\n",
    "\n",
    "Load the downloaded transcript and nuclei data into the `XeniumSample` object and save the processed dataset for Segger."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keyword settings forwarded unchanged to save_dataset_for_segger\n",
    "dataset_options = dict(\n",
    "    d_x=180, d_y=180,\n",
    "    x_size=200, y_size=200,\n",
    "    margin_x=None, margin_y=None,\n",
    "    r=3,\n",
    "    val_prob=0.1,\n",
    "    test_prob=0.1,\n",
    "    k_nc=3,\n",
    "    dist_nc=10,\n",
    "    k_tx=5,\n",
    "    dist_tx=3,\n",
    "    compute_labels=True,\n",
    "    sampling_rate=1,\n",
    ")\n",
    "\n",
    "# Read the transcripts (min_qv=30), attach the nuclei, then write the dataset\n",
    "xs = XeniumSample().load_transcripts(path=transcripts_path, min_qv=30)\n",
    "xs.load_nuclei(path=nuclei_path)\n",
    "xs.save_dataset_for_segger(processed_data_dir, **dataset_options)\n",
    "\n",
    "print(\"Dataset creation completed.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 4: Verify Data\n",
    "\n",
    "Ensure that the data has been saved correctly and verify the structure of the processed dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify the processed dataset\n",
    "import torch\n",
    "\n",
    "# Sort the matches so the inspected sample is the same file on every run\n",
    "processed_files = sorted(processed_data_dir.glob('**/*.pt'))\n",
    "print(f\"Number of processed files: {len(processed_files)}\")\n",
    "\n",
    "# Fail with a clear message instead of an IndexError when nothing was written\n",
    "if not processed_files:\n",
    "    raise FileNotFoundError(f\"No .pt files found under {processed_data_dir}\")\n",
    "\n",
    "# Load a sample file to inspect\n",
    "# NOTE(review): torch.load unpickles the file, so only point it at files this\n",
    "# notebook produced; recent PyTorch releases may also need weights_only=False\n",
    "# for non-tensor objects -- confirm against the installed version.\n",
    "sample_data = torch.load(processed_files[0])\n",
    "print(sample_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
