In [None]:
import pandas as pd
import scanpy as sc

import time

import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
from datetime import datetime
import os
import hashlib
import json
from pathlib import Path
import re


class WebScraper:
    """
    Fetch web pages through one HTTP session, with an on-disk HTML cache.

    Every downloaded page is stored in ``cache_dir`` under the MD5 hash of its
    URL, so a later request for the same URL is answered from disk.
    """

    def __init__(self, base_url: str, cache_dir: str = ".cache", timeout: float = 30.0):
        """
        Prepare the cache directory and the HTTP session.

        Args:
            base_url (str): Base URL of the scraped site (stored on the instance)
            cache_dir (str): Directory holding the cached HTML files
            timeout (float): Seconds to wait on the server before giving up
        """
        self.base_url = base_url
        self.timeout = timeout
        self.cache_dir = Path(cache_dir)
        # parents=True lets a nested cache directory be created in one call.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        self.session = requests.Session()
        # Header sent with every request made through this session.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def _get_cache_path(self, url: str) -> Path:
        """Generate a cache file path for a given URL."""
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return self.cache_dir / f"{url_hash}.html"

    def fetch_page(self, url: str) -> BeautifulSoup:
        """
        Fetch and parse a webpage with caching.

        Args:
            url (str): The URL to fetch

        Returns:
            BeautifulSoup: Parsed HTML content

        Raises:
            requests.exceptions.RequestException: If the download fails, times
                out, or the server answers with an error status.
        """
        cache_path = self._get_cache_path(url)

        # Try to load from cache first
        if cache_path.exists():
            print(f"Loading from cache: {url}")
            with open(cache_path, 'r', encoding='utf-8') as f:
                return BeautifulSoup(f.read(), 'html.parser')

        # If not in cache, fetch and store
        try:
            print(f"Downloading: {url}")
            # Without a timeout a stalled server would block the caller forever.
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {str(e)}")
            raise

        # Write to a temporary file and rename it afterwards, so an interrupted
        # write never leaves a truncated page behind that later counts as a hit.
        tmp_path = cache_path.with_name(cache_path.name + ".tmp")
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        tmp_path.replace(cache_path)

        return BeautifulSoup(response.text, 'html.parser')

    def extract_data(self, soup: BeautifulSoup) -> List[Dict]:
        """
        Extract desired data from the parsed HTML.
        Override this method based on your specific needs.

        Args:
            soup (BeautifulSoup): Parsed HTML content

        Returns:
            List[Dict]: Extracted data as a list of dictionaries
                (this placeholder implementation always returns an empty list)
        """
        # Example extraction - modify according to your needs
        data = []
        # Add your extraction logic here
        return data

    def save_to_csv(self, data: List[Dict], filename: str):
        """
        Save extracted data to CSV file.

        Args:
            data (List[Dict]): Data to save
            filename (str): Output filename
        """
        df = pd.DataFrame(data)
        # index=False: the positional row index is not written to the file.
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")


def extract_data(soup: BeautifulSoup) -> Dict:
    """
    Extract publication and sample metadata from one sample-details page.

    Args:
        soup (BeautifulSoup): Parsed HTML content

    Returns:
        Dict: A single flat dictionary for the page. The sample metadata keys
        are always present (value None when the table row is missing). The
        publication keys ('authors', 'author_last_name', 'url', 'details',
        'year', 'full_citation') are only present when a linked publication
        was found.
    """

    def extract_field(field_name: str):
        """Return the stripped text of the cell after '<field_name>:', or None."""
        field_cell = soup.find('td', string=f"{field_name}:")
        if not field_cell:
            return None
        value_cell = field_cell.find_next('td')
        return value_cell.text.strip() if value_cell else None

    case_data = {}

    # Publication block: the cell that follows the 'Publications:' label.
    pub_header = soup.find('td', string='Publications:')
    pub_cell = pub_header.find_next('td', class_='text-justify') if pub_header else None
    if pub_cell:
        for div in pub_cell.find_all('div'):
            link = div.find('a')
            if not link:
                continue

            authors_text = link.text.strip()
            last_name_match = re.search(r'\s*(\w+) et al\.', authors_text)

            span = div.find('span')
            details = span.text.strip() if span else ''
            # First comma-preceded four-digit number starting with 19 or 20.
            year_match = re.search(r',\s*((?:19|20)\d{2})', details)

            # NOTE: every publication overwrites the previous one, so only the
            # last linked publication on the page is kept.
            case_data.update({
                'authors': authors_text,
                'author_last_name': last_name_match.group(1) if last_name_match else None,
                'url': link.get('href', ''),
                'details': details,
                'year': year_match.group(1) if year_match else None,
                'full_citation': authors_text + details,
            })

    # (output key, row label on the page) for every metadata field.
    field_labels = (
        # Cancer and tumor details
        ('cancer_type', 'Cancer type'),
        ('cancer_type_abbr', 'Cancer type abbreviation'),
        ('primary_site', 'Primary site'),
        ('sampling_site', 'Sampling site'),
        ('tumor_grade', 'Tumor grade'),
        ('tumor_status', 'Tumor status'),
        ('strategy', 'Strategy'),
        ('protocol', 'Protocol'),
        ('instrument', 'Instrument'),
        ('GEO accession', 'GEO accession'),
        # Patient details
        ('age', 'Donor age'),
        ('gender', 'Donor gender'),
        # Treatment and additional info
        ('treatment', 'Treatment'),
        ('other_metadata', 'Other metadata'),
    )
    case_data.update({key: extract_field(label) for key, label in field_labels})
    return case_data


IndentationError: expected an indented block (1617220460.py, line 173)

In [None]:
# Load the browse table (its last three columns are dropped) and attach, to
# every row, the number of samples that share its project.
metadata = pd.read_csv("CancerSCEM-Browse-Table.csv").iloc[:, :-3]
project_counts = metadata["Project ID"].value_counts().to_frame()
project_counts.columns = ["n_samples"]

metadata = metadata.merge(project_counts, how="left", left_on="Project ID", right_index=True)
# Keep one representative row (the first sample) per project, and only
# projects that have at least two samples.
metadata = metadata[~metadata["Project ID"].duplicated(keep="first")]
metadata = metadata[metadata["n_samples"] >= 2]

# One scraper, and therefore one HTTP session and cache handle, for all pages.
sample_details_url = "https://ngdc.cncb.ac.cn/cancerscem/sample-details/"
scraper = WebScraper(sample_details_url, cache_dir=".cache")

results = []
# Walk the rows directly instead of re-filtering the whole table per sample.
for sample_id, n_samples, accession_no, project_id in zip(
    metadata["Sample ID"],
    metadata["n_samples"],
    metadata["Accession No."],
    metadata["Project ID"],
):
    print(sample_id)
    target_url = f"https://ngdc.cncb.ac.cn/cancerscem/sample-details/{sample_id}"
    try:
        # Fetch and parse the webpage (will use cache if available)
        soup = scraper.fetch_page(target_url)

        # Extract data
        data = extract_data(soup)
    except Exception as e:
        # Best effort: a page that cannot be scraped is reported and skipped.
        print(f"Scraping failed: {str(e)}")
        continue

    if data is None:
        print("Edge case", sample_id)
        continue

    data["sample ID"] = sample_id
    data["n_samples"] = n_samples
    data["Accession No."] = accession_no
    data["Project ID"] = project_id
    results.append(data)

# One row per project; per-sample fields are dropped from the overview.
results = pd.DataFrame(results)
results = results.drop(["age", "gender", "treatment", "sample ID"], axis=1)
results = results.set_index("Project ID")
results.to_csv("study_overview.csv")

In [2]:
# Cell-type label that is looked up in the cell-type value counts to find
# malignant cells.
_malignant_cell = "Malignant cells"
# Name of the cell-type column (the same name is used for the
# `adata.obs["cell_type"]` lookup when malignant cells are tallied).
_cell_type = "cell_type"

In [3]:
# Load the study overview table; the first CSV column becomes the index.
study_overview = pd.read_csv("study_overview.csv", index_col=0)

In [4]:
# Keep only studies whose source is CancerSCEM and that are flagged for inclusion.
# NOTE(review): "Source" and "Include" are not written by the scraping cell —
# presumably added to the CSV by hand; confirm.
study_overview = study_overview[(study_overview["Source"] == "CancerSCEM") & (study_overview["Include"] == "Yes")]

In [5]:
# Load the browse table again, dropping its last three columns.
metadata = pd.read_csv("CancerSCEM-Browse-Table.csv").iloc[:, :-3]


In [6]:
# Restrict the samples to projects that are listed in the filtered study overview.
metadata = metadata[metadata["Project ID"].isin(study_overview["Project ID"])]

In [7]:
# Display the filtered study overview table.
study_overview

Unnamed: 0_level_0,Tissue,Article,Disease,Technology,N_samples,Source,author_last_name,year,Project ID,Include,Reason
Spalte 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alchahin_2022_kidney,kidney,https://pubmed.ncbi.nlm.nih.gov/36180422\n3828...,RCC,10x,15,CancerSCEM,Alchahin,2022,ccRCC-061,Yes,
Aubin_2022_brain,brain,https://pubmed.ncbi.nlm.nih.gov/35803925/,Pediatric ependymoma,Drop-seq,8,CancerSCEM,Aubin,2022,EPN-102,Yes,
Barrett_2024_brain,brain,https://pubmed.ncbi.nlm.nih.gov/38216553/,Sporadic Vestibular Schwannomas,10x,15,CancerSCEM,Barrett,2024,VS-105,Yes,
Becker_2022_colorectal,colorectal,https://pubmed.ncbi.nlm.nih.gov/35726067/,Colorectal cancer,10x,4,CancerSCEM,Becker,2022,CRC-067,Yes,
Becker_2022_colorectal,colorectal,https://pubmed.ncbi.nlm.nih.gov/35726067/,Colorectal cancer,10x,5,CancerSCEM,Becker,2022,CRC-101,Yes,
...,...,...,...,...,...,...,...,...,...,...,...
Zhang_2022_skin,skin,https://pubmed.ncbi.nlm.nih.gov/36433984/,Acral Melanoma,10x,7,CancerSCEM,Zhang,2022,AM-036,Yes,
Zhang_2023_pancreas,pancreas,https://pubmed.ncbi.nlm.nih.gov/37612267/,PDAC,10x,7,CancerSCEM,Zhang,2023,PDAC-106,Yes,
Zhou_2023_head-and-neck,head-and-neck,https://pubmed.ncbi.nlm.nih.gov/36879115/,Salivary Adenoid Cystic Carcinoma,10x,4,CancerSCEM,Zhou,2023,SACC-103,Yes,
Zhu_2023_liverbiliary,liverbiliary,https://pubmed.ncbi.nlm.nih.gov/36878933/,HCC,10x,6,CancerSCEM,Zhu,2023,HCC-059,Yes,


In [18]:

# One scraper, and therefore one HTTP session and cache handle, for all pages.
sample_details_url = "https://ngdc.cncb.ac.cn/cancerscem/sample-details/"
scraper = WebScraper(sample_details_url, cache_dir=".cache")

results = []
# Walk the rows directly instead of re-filtering the whole table per sample.
for sample_id, accession_no, project_id in zip(
    metadata["Sample ID"],
    metadata["Accession No."],
    metadata["Project ID"],
):
    print(sample_id)
    target_url = f"https://ngdc.cncb.ac.cn/cancerscem/sample-details/{sample_id}"
    try:
        # Fetch and parse the webpage (will use cache if available)
        soup = scraper.fetch_page(target_url)

        # Extract data
        data = extract_data(soup)
    except Exception as e:
        # Best effort: a page that cannot be scraped is reported and skipped.
        print(f"Scraping failed: {str(e)}")
        continue

    if data is None:
        print("Edge case", sample_id)
        continue

    data["sample ID"] = sample_id
    data["Accession No."] = accession_no
    data["Project ID"] = project_id
    results.append(data)

# One row per successfully scraped sample.
results = pd.DataFrame(results)


LUAD-004-01-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUAD-004-01-1A
LUAD-004-02-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUAD-004-02-1A
LUAD-004-03-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUAD-004-03-1A
LUAD-004-06-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUAD-004-06-1A
LUAD-004-07-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUAD-004-07-1A
LUAD-004-08-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUAD-004-08-1A
LUSC-005-01-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUSC-005-01-1A
LUSC-005-02-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUSC-005-02-1A
LUSC-005-03-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUSC-005-03-1A
LUSC-005-04-1A
Loading from cache: https://ngdc.cncb.ac.cn/cancerscem/sample-details/LUSC-005-04-1A


In [21]:
# Persist the per-sample metadata table (the row index is written as well).
results.to_csv("sample_metadata.csv")

In [8]:
# Reload the per-sample metadata and keep only the rows marked as included.
# NOTE(review): the scraping cell does not produce an "included" column —
# presumably it was added to the CSV by hand or by an earlier run; confirm.
results = pd.read_csv("sample_metadata.csv")
results = results[results["included"]=="yes"]

In [None]:
from io import BytesIO
from typing import Optional

import requests
from scipy import sparse


def get_dataframe(url, compression = None, timeout: float = 60.0) -> Optional[pd.DataFrame]:
    """
    Download a tab-separated table and load it into a DataFrame.

    Args:
        url: Address of the table to download
        compression: Passed through to ``pandas.read_csv`` (e.g. "gzip")
        timeout: Seconds to wait on the server before giving up

    Returns:
        The parsed table, with its first column as the index, or None when the
        server does not answer with status 200.

    Raises:
        requests.exceptions.RequestException: If the connection fails or times out.
    """
    # The body is read in full right away, so no streaming is requested; this
    # also hands the connection back regardless of the status code.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    content = BytesIO(response.content)
    return pd.read_csv(content, compression=compression, sep='\t', index_col=0)


def get_anndata_from_sample_id(sample_id: str) -> Optional[sc.AnnData]:
    """
    Build an AnnData object for one sample from its downloadable files.

    The cell-type annotation is fetched first; the (much larger) count matrix
    is only downloaded when the sample has at least 50 malignant cells.

    Args:
        sample_id: Sample identifier used in the download URLs

    Returns:
        AnnData with cells as observations, a sparse count matrix, the
        cell-type annotation in ``obs`` and a ``sample_id`` column; or None
        when a download is unavailable or the sample has fewer than 50
        malignant cells.
    """
    count_url = f'https://ngdc.cncb.ac.cn/cancerscem/downloads/expMatrixes/{sample_id}.counts.matrix.tsv.gz'
    celltype_url = f'https://ngdc.cncb.ac.cn/cancerscem/downloads/cell_annotation/{sample_id}.cell.type.txt'

    cell_type = get_dataframe(celltype_url)
    # Check before use: a missing annotation file must not raise here.
    if cell_type is None:
        return None

    # Count labels of the cell-type column only; .get covers an absent label.
    cell_type_counts = cell_type[_cell_type].value_counts()
    if cell_type_counts.get(_malignant_cell, 0) < 50:
        return None

    count_data = get_dataframe(count_url, compression="gzip")
    if count_data is None:
        return None

    # The matrix is transposed so that its columns become the observations.
    adata = sc.AnnData(count_data.transpose())
    adata.X = sparse.csr_matrix(adata.X)
    # Align the annotation rows to the observation order of the matrix.
    adata.obs = cell_type.loc[adata.obs_names].copy()
    adata.obs["sample_id"] = sample_id
    return adata



In [11]:
from genericpath import isfile
import pathlib

import tqdm

# Download every included sample into data/CancerSCEM/<project>/<sample>.h5ad.
# Samples for which no AnnData could be built are collected in skipped_samples.
results_dir = pathlib.Path("data/CancerSCEM/")
skipped_samples = []
sample_project_pairs = zip(results["sample ID"], results["Project ID"])
for sample_id, project_id in tqdm.tqdm(sample_project_pairs, total=results.shape[0]):
    project_dir = results_dir / project_id
    project_dir.mkdir(parents=True, exist_ok=True)

    # A file that is already on disk is not downloaded again.
    sample_path = project_dir / f"{sample_id}.h5ad"
    if sample_path.is_file():
        continue

    adata = get_anndata_from_sample_id(sample_id)
    if adata is not None:
        adata.write_h5ad(sample_path)
        continue

    # Nothing to write: remember the sample and report it.
    skipped_samples.append(sample_id)
    print(sample_id)

 31%|███▏      | 232/737 [11:22<2:17:32, 16.34s/it]

WT-070-01-1A


 32%|███▏      | 233/737 [11:24<1:54:45, 13.66s/it]

WT-070-02-1A


 32%|███▏      | 234/737 [11:25<1:33:02, 11.10s/it]

WT-070-03-1A


 32%|███▏      | 235/737 [11:26<1:14:09,  8.86s/it]

WT-070-04-1A


 32%|███▏      | 236/737 [11:28<58:36,  7.02s/it]  

WT-070-05-1A


 32%|███▏      | 237/737 [11:29<46:08,  5.54s/it]

WT-070-06-1A


 32%|███▏      | 238/737 [11:31<36:39,  4.41s/it]

WT-070-07-1A


 32%|███▏      | 239/737 [11:32<29:32,  3.56s/it]

WT-070-08-1A


 33%|███▎      | 240/737 [11:33<24:15,  2.93s/it]

WT-070-09-1A


 33%|███▎      | 241/737 [11:35<21:02,  2.54s/it]

WT-070-10-1A


 33%|███▎      | 242/737 [11:36<17:33,  2.13s/it]

WT-070-11-1A


 33%|███▎      | 244/737 [12:21<1:27:15, 10.62s/it]

WT-070-13-1A


 33%|███▎      | 246/737 [13:27<2:36:57, 19.18s/it]

WT-070-15-1A


 34%|███▎      | 248/737 [14:16<2:41:43, 19.84s/it]

WT-070-17-1A


 34%|███▍      | 249/737 [14:18<1:56:20, 14.30s/it]

WT-070-18-1A


 34%|███▍      | 250/737 [14:19<1:24:34, 10.42s/it]

WT-070-19-1A


 34%|███▍      | 251/737 [14:20<1:02:23,  7.70s/it]

WT-070-20-1A


 34%|███▍      | 252/737 [14:22<46:52,  5.80s/it]  

WT-070-21-1A


 34%|███▍      | 253/737 [14:23<37:03,  4.59s/it]

WT-070-22-1A


 34%|███▍      | 254/737 [14:24<28:35,  3.55s/it]

WT-070-23-1A


 35%|███▍      | 257/737 [17:03<3:41:32, 27.69s/it]

WT-070-26-1A


 35%|███▌      | 260/737 [18:29<3:20:18, 25.20s/it]

WT-070-29-1A


 35%|███▌      | 261/737 [18:30<2:23:09, 18.05s/it]

WT-070-30-1A


 36%|███▌      | 262/737 [18:31<1:43:12, 13.04s/it]

WT-070-31-1A


 36%|███▌      | 263/737 [18:33<1:15:48,  9.60s/it]

WT-070-32-1A


 36%|███▌      | 266/737 [19:18<1:33:02, 11.85s/it]

WT-070-35-1A


 36%|███▌      | 267/737 [19:20<1:08:42,  8.77s/it]

WT-070-36-1A


 36%|███▋      | 268/737 [19:21<51:49,  6.63s/it]  

WT-070-37-1A


 36%|███▋      | 269/737 [19:23<39:22,  5.05s/it]

WT-070-38-1A


 37%|███▋      | 270/737 [19:24<31:07,  4.00s/it]

WT-070-39-1A


 37%|███▋      | 271/737 [19:26<25:54,  3.34s/it]

WT-070-40-1A


 38%|███▊      | 279/737 [28:17<6:57:02, 54.63s/it]

GBM-077-01-1A


 38%|███▊      | 280/737 [28:19<4:55:26, 38.79s/it]

GBM-077-02-1A


 38%|███▊      | 281/737 [28:20<3:29:58, 27.63s/it]

GBM-077-03-1A


 38%|███▊      | 282/737 [28:22<2:30:16, 19.82s/it]

GBM-077-04-1A


 38%|███▊      | 283/737 [28:24<1:48:34, 14.35s/it]

GBM-077-05-1A


 39%|███▊      | 284/737 [28:25<1:19:26, 10.52s/it]

GBM-077-06-1A


 39%|███▊      | 285/737 [28:27<59:04,  7.84s/it]  

GBM-077-07-1A


 39%|███▉      | 286/737 [28:28<44:20,  5.90s/it]

GBM-077-08-1A


 39%|███▉      | 287/737 [28:29<34:01,  4.54s/it]

GBM-077-09-1A


 39%|███▉      | 288/737 [28:31<27:23,  3.66s/it]

GBM-077-10-1A


 39%|███▉      | 289/737 [28:32<22:12,  2.97s/it]

GBM-077-11-1A


 39%|███▉      | 290/737 [28:34<19:33,  2.62s/it]

GBM-077-12-1A


 39%|███▉      | 291/737 [28:36<17:14,  2.32s/it]

GBM-077-13-1A


 40%|███▉      | 292/737 [28:37<15:35,  2.10s/it]

GBM-077-14-1A


 40%|███▉      | 293/737 [28:39<14:26,  1.95s/it]

GBM-077-15-1A


 40%|███▉      | 294/737 [28:41<13:37,  1.85s/it]

GBM-077-16-1A


 40%|████      | 295/737 [28:42<13:27,  1.83s/it]

GBM-077-17-1A


 40%|████      | 296/737 [28:44<12:49,  1.75s/it]

GBM-077-18-1A


 43%|████▎     | 315/737 [40:19<2:33:29, 21.82s/it]

GBM-080-01-1F


 43%|████▎     | 316/737 [40:21<1:49:59, 15.68s/it]

GBM-080-02-1F


 43%|████▎     | 317/737 [40:22<1:20:09, 11.45s/it]

MB-081-01-1A


 43%|████▎     | 318/737 [40:24<59:14,  8.48s/it]  

MB-081-02-1A


 43%|████▎     | 319/737 [40:26<44:41,  6.41s/it]

MB-081-03-1A


 43%|████▎     | 320/737 [40:27<34:54,  5.02s/it]

MB-081-04-1A


 44%|████▎     | 321/737 [40:29<28:42,  4.14s/it]

MB-081-05-1A


 44%|████▎     | 322/737 [40:32<25:19,  3.66s/it]

MB-081-06-1A


 44%|████▍     | 323/737 [40:34<21:25,  3.11s/it]

MB-081-07-1A


 44%|████▍     | 324/737 [40:35<18:14,  2.65s/it]

MB-081-08-1A


 44%|████▍     | 325/737 [40:38<18:33,  2.70s/it]

MB-081-09-1A


 44%|████▍     | 326/737 [40:42<19:55,  2.91s/it]

MB-081-10-1A


 44%|████▍     | 327/737 [40:44<19:32,  2.86s/it]

MB-081-11-1A


 45%|████▍     | 328/737 [40:46<17:16,  2.53s/it]

MB-081-12-1A


 45%|████▍     | 329/737 [40:48<15:45,  2.32s/it]

MB-081-13-1A


 45%|████▌     | 333/737 [41:55<1:16:18, 11.33s/it]

PRAD-082-04-1A


 45%|████▌     | 334/737 [41:56<55:34,  8.28s/it]  

PRAD-082-05-1A


 46%|████▋     | 341/737 [46:32<3:43:11, 33.82s/it]

PRAD-082-12-1A


 47%|████▋     | 344/737 [47:04<1:50:40, 16.90s/it]

PRAD-082-15-1A


 48%|████▊     | 356/737 [55:27<2:55:57, 27.71s/it]

PRAD-082-27-1A


 49%|████▊     | 359/737 [56:31<2:13:17, 21.16s/it]

MCC-084-01-1A


 49%|████▉     | 362/737 [57:01<1:20:07, 12.82s/it]

MCC-084-04-1A


 49%|████▉     | 364/737 [57:14<57:35,  9.26s/it]  

MCC-084-06-1A


 50%|████▉     | 365/737 [57:16<43:09,  6.96s/it]

MCC-084-07-1A


 50%|████▉     | 366/737 [57:17<32:37,  5.28s/it]

MCC-084-08-1A


 50%|████▉     | 367/737 [57:19<25:18,  4.10s/it]

MCC-084-09-1A


 50%|████▉     | 368/737 [57:20<20:09,  3.28s/it]

MCC-084-10-1A


 52%|█████▏    | 380/737 [1:03:08<2:37:09, 26.41s/it]

GBM-085-12-1A


 52%|█████▏    | 381/737 [1:03:09<1:52:09, 18.90s/it]

GBM-085-13-1A


 53%|█████▎    | 387/737 [1:05:50<2:09:25, 22.19s/it]

CRC-086-01-1A


 53%|█████▎    | 391/737 [1:06:28<1:09:46, 12.10s/it]

HNSCC-087-01-1A


 53%|█████▎    | 394/737 [1:07:01<56:16,  9.85s/it]  

PDAC-091-02-1A


 54%|█████▎    | 395/737 [1:07:03<41:42,  7.32s/it]

PDAC-091-03-1A


 54%|█████▎    | 396/737 [1:07:04<31:24,  5.53s/it]

PDAC-091-04-1A


 54%|█████▍    | 397/737 [1:07:05<24:12,  4.27s/it]

PDAC-091-05-1A


 55%|█████▍    | 402/737 [1:08:11<1:07:59, 12.18s/it]

PDAC-091-10-1A


 56%|█████▌    | 414/737 [1:10:28<47:01,  8.73s/it]  

PDAC-091-22-1A


 56%|█████▋    | 416/737 [1:11:42<1:47:05, 20.02s/it]

LUSC-092-02-1A


 58%|█████▊    | 430/737 [1:26:32<3:30:01, 41.05s/it]

LUSC-092-16-1A


 59%|█████▊    | 432/737 [1:27:04<2:17:23, 27.03s/it]

LUSC-092-18-1A


 60%|█████▉    | 441/737 [1:33:01<2:19:06, 28.20s/it]

SKCM-095-01-1A


 60%|█████▉    | 442/737 [1:33:03<1:40:03, 20.35s/it]

SKCM-095-02-1A


 60%|██████    | 443/737 [1:33:06<1:13:21, 14.97s/it]

SKCM-095-03-1A


 61%|██████    | 446/737 [1:34:25<1:34:22, 19.46s/it]

LUAD-096-03-1A


 61%|██████    | 451/737 [1:37:11<2:01:20, 25.46s/it]

LUAD-096-08-1A


 61%|██████▏   | 452/737 [1:37:12<1:26:51, 18.29s/it]

LUAD-096-09-1A


 61%|██████▏   | 453/737 [1:37:13<1:02:31, 13.21s/it]

LUAD-096-10-1A


 62%|██████▏   | 454/737 [1:37:15<45:48,  9.71s/it]  

LUAD-096-11-1A


 62%|██████▏   | 456/737 [1:37:40<47:40, 10.18s/it]  

LUAD-096-13-1A


 62%|██████▏   | 457/737 [1:37:42<35:29,  7.61s/it]

LUAD-096-14-1A


 62%|██████▏   | 458/737 [1:37:43<26:58,  5.80s/it]

LUAD-096-15-1A


 64%|██████▍   | 473/737 [1:45:30<1:34:45, 21.54s/it]

LUAD-096-30-1A


 64%|██████▍   | 474/737 [1:45:33<1:08:59, 15.74s/it]

LUAD-096-31-1A


 65%|██████▍   | 476/737 [1:46:05<1:04:02, 14.72s/it]

LUAD-096-33-1A


 65%|██████▍   | 478/737 [1:47:03<1:24:41, 19.62s/it]

LUAD-096-35-1A


 65%|██████▍   | 479/737 [1:47:05<1:01:04, 14.20s/it]

LUAD-096-36-1A


 65%|██████▌   | 480/737 [1:47:07<45:24, 10.60s/it]  

LUAD-096-37-1A


 65%|██████▌   | 481/737 [1:47:08<33:23,  7.83s/it]

LUAD-096-38-1A


 65%|██████▌   | 482/737 [1:47:10<25:00,  5.89s/it]

LUAD-096-39-1A


 66%|██████▌   | 483/737 [1:47:11<19:11,  4.54s/it]

LUAD-096-40-1A


 66%|██████▌   | 484/737 [1:47:13<15:23,  3.65s/it]

LUAD-096-41-1A


 68%|██████▊   | 501/737 [1:56:47<2:07:11, 32.33s/it]

RB-098-02-1A


 69%|██████▉   | 510/737 [2:02:47<1:38:19, 25.99s/it]

EPN-102-01-1C


 69%|██████▉   | 511/737 [2:02:51<1:13:25, 19.49s/it]

EPN-102-02-1C


 69%|██████▉   | 512/737 [2:02:52<52:29, 14.00s/it]  

EPN-102-03-1C


 70%|██████▉   | 513/737 [2:02:54<38:06, 10.21s/it]

EPN-102-04-1C


 70%|██████▉   | 514/737 [2:02:55<27:48,  7.48s/it]

EPN-102-05-1C


 70%|██████▉   | 515/737 [2:02:56<21:06,  5.70s/it]

EPN-102-06-1C


 70%|███████   | 516/737 [2:02:58<17:05,  4.64s/it]

EPN-102-07-1C


 70%|███████   | 517/737 [2:03:00<13:49,  3.77s/it]

EPN-102-08-1C


 71%|███████   | 522/737 [2:10:45<3:27:57, 58.03s/it]

VS-105-01-1A


 71%|███████   | 523/737 [2:10:46<2:26:39, 41.12s/it]

VS-105-02-1A


 71%|███████▏  | 526/737 [2:17:16<5:30:27, 93.97s/it] 

VS-105-05-1A


 72%|███████▏  | 527/737 [2:17:17<3:52:04, 66.31s/it]

VS-105-06-1A


 72%|███████▏  | 529/737 [2:18:25<2:43:23, 47.13s/it]

VS-105-08-1A


 72%|███████▏  | 531/737 [2:19:38<2:12:38, 38.63s/it]

VS-105-10-1A


 72%|███████▏  | 532/737 [2:19:40<1:34:14, 27.58s/it]

VS-105-11-1A


 72%|███████▏  | 533/737 [2:19:42<1:07:40, 19.90s/it]

VS-105-12-1A


 72%|███████▏  | 534/737 [2:19:44<49:22, 14.59s/it]  

VS-105-13-1A


 73%|███████▎  | 535/737 [2:19:46<36:13, 10.76s/it]

VS-105-14-1A


 73%|███████▎  | 536/737 [2:19:48<27:02,  8.07s/it]

VS-105-15-1A


 75%|███████▌  | 554/737 [2:43:01<3:32:59, 69.84s/it] 

BRCA-107-11-1A


 76%|███████▌  | 560/737 [2:58:02<6:28:12, 131.59s/it]

BCC-109-03-1A


 76%|███████▋  | 562/737 [2:59:39<4:07:56, 85.01s/it] 

BCC-109-05-1A


 76%|███████▋  | 563/737 [2:59:41<2:53:57, 59.98s/it]

BCC-109-06-1A


 77%|███████▋  | 564/737 [2:59:43<2:02:47, 42.59s/it]

BCC-109-07-1A


 77%|███████▋  | 571/737 [3:07:05<2:24:54, 52.37s/it]

ESCC-111-06-1A


 81%|████████▏ | 600/737 [3:31:28<1:14:48, 32.76s/it]

ESCC-111-35-1A


 83%|████████▎ | 614/737 [3:40:44<1:04:12, 31.32s/it]

ESCC-111-49-1A


 85%|████████▌ | 630/737 [3:57:37<2:31:25, 84.91s/it] 

SCE-112-07-1A


 86%|████████▌ | 631/737 [3:57:39<1:46:06, 60.06s/it]

SCE-112-08-1A


 86%|████████▌ | 632/737 [3:57:41<1:14:38, 42.65s/it]

SCE-112-09-1A


 86%|████████▌ | 633/737 [3:57:43<52:41, 30.40s/it]  

SCE-112-10-1A


 86%|████████▌ | 634/737 [3:57:45<37:33, 21.88s/it]

SCE-112-11-1A


 86%|████████▌ | 635/737 [3:57:47<27:04, 15.93s/it]

SCE-112-12-1A


 86%|████████▋ | 636/737 [3:57:49<19:47, 11.76s/it]

SCE-112-13-1A


 87%|████████▋ | 639/737 [4:00:25<57:35, 35.26s/it]  

MPNST-114-01-1A


 87%|████████▋ | 640/737 [4:00:27<40:41, 25.17s/it]

MPNST-114-02-1A


 87%|████████▋ | 641/737 [4:00:28<28:50, 18.03s/it]

MPNST-114-03-1A


 87%|████████▋ | 642/737 [4:00:30<20:43, 13.09s/it]

MPNST-114-04-1A


 88%|████████▊ | 646/737 [4:01:26<17:29, 11.53s/it]

EAC-116-01-1A


 88%|████████▊ | 648/737 [4:02:52<35:12, 23.74s/it]

EAC-116-03-1A


 88%|████████▊ | 649/737 [4:02:53<25:03, 17.09s/it]

EAC-116-04-1A


 88%|████████▊ | 650/737 [4:02:55<18:01, 12.43s/it]

EAC-116-05-1A


 88%|████████▊ | 651/737 [4:02:56<13:09,  9.18s/it]

EAC-116-06-1A


 88%|████████▊ | 652/737 [4:02:58<09:52,  6.97s/it]

EAC-116-07-1A


 89%|████████▊ | 653/737 [4:03:00<07:39,  5.47s/it]

EAC-116-08-1A


 89%|████████▊ | 654/737 [4:03:01<05:51,  4.24s/it]

GBM-117-01-1A


 89%|████████▉ | 655/737 [4:03:03<04:30,  3.30s/it]

GBM-117-02-1A


 89%|████████▉ | 656/737 [4:03:04<03:40,  2.72s/it]

ccRCC-118-01-1A


 89%|████████▉ | 659/737 [4:05:28<31:22, 24.14s/it]

ccRCC-118-04-1A


 90%|████████▉ | 661/737 [4:05:51<21:05, 16.65s/it]

ccRCC-118-06-1A


 90%|████████▉ | 663/737 [4:07:03<29:00, 23.52s/it]

ccRCC-118-08-1A


 93%|█████████▎| 687/737 [4:25:36<27:01, 32.43s/it]  

MCC-122-06-1A


 93%|█████████▎| 689/737 [4:26:18<19:48, 24.77s/it]

MCC-122-08-1A


 96%|█████████▋| 710/737 [4:39:02<13:40, 30.40s/it]

HNSCC-127-03-1A


 96%|█████████▋| 711/737 [4:39:04<09:29, 21.89s/it]

BRCA-128-01-1A


 97%|█████████▋| 712/737 [4:39:05<06:33, 15.72s/it]

BRCA-128-02-1A


 97%|█████████▋| 714/737 [4:40:12<08:23, 21.89s/it]

BRCA-128-04-1A


 97%|█████████▋| 716/737 [4:41:44<10:32, 30.11s/it]

BRCA-128-06-1A


 98%|█████████▊| 721/737 [4:45:14<08:45, 32.85s/it]

HSCC-130-03-1H


 99%|█████████▉| 728/737 [4:50:03<04:47, 31.89s/it]

HSCC-130-10-1H


 99%|█████████▉| 731/737 [4:52:06<03:16, 32.71s/it]

CRC-133-01-1A


100%|██████████| 737/737 [4:54:15<00:00, 23.96s/it]


In [12]:
# Flag every sample that was skipped during download as excluded, then save.
# NOTE(review): skipped_samples also holds samples whose files could not be
# downloaded, yet all of them get the same reason text.
skipped_mask = results["sample ID"].isin(skipped_samples)
results.loc[skipped_mask, ["included"]] = "no"
results.loc[skipped_mask, ["reason"]] = "Too few malignant cells"
# Drop the index column that came in with the CSV before writing it back.
results = results.drop(columns="Unnamed: 0")
results.to_csv("sample_metadata.csv")

In [47]:
# Inspect the variable names of the AnnData object currently bound to `adata`.
adata.var_names

Index(['AL627309.1', 'AL627309.5', 'AP006222.2', 'LINC01409', 'LINC01128',
       'LINC00115', 'FAM41C', 'AL645608.2', 'LINC02593', 'SAMD11',
       ...
       'MT-ND6', 'MT-CYB', 'BX004987.1', 'MAFIP', 'AC011043.1', 'AL354822.1',
       'AL592183.1', 'AC240274.1', 'AC233755.2', 'AC007325.4'],
      dtype='object', length=20506)

In [13]:
# Count how many samples are marked as included versus excluded.
results["included"].value_counts()

included
yes    565
no     172
Name: count, dtype: int64

In [63]:
# List the column names of the sample metadata table.
results.columns

Index(['Unnamed: 0', 'authors', 'author_last_name', 'url', 'details', 'year',
       'full_citation', 'cancer_type', 'cancer_type_abbr', 'primary_site',
       'sampling_site', 'tumor_grade', 'tumor_status', 'strategy', 'protocol',
       'instrument', 'GEO accession', 'age', 'gender', 'treatment',
       'other_metadata', 'sample ID', 'Accession No.', 'Project ID',
       'included', 'reason'],
      dtype='object')

In [3]:
import pathlib
import scanpy as sc

# Total number of malignant cells over all downloaded samples.
m_cells = 0
for project in pathlib.Path("data/CancerSCEM/").iterdir():
    # Skip stray files in the data root; only project directories hold samples.
    if not project.is_dir():
        continue
    print(project.name)
    # Only .h5ad files are samples; anything else in the directory is ignored.
    for dataset_path in project.glob("*.h5ad"):
        adata = sc.read_h5ad(dataset_path)
        # Comparing the column directly yields 0 instead of a KeyError when a
        # sample contains no malignant cells.
        m_cells += (adata.obs[_cell_type] == _malignant_cell).sum()

TS-039
MPAL-022
TGCT-052
CM-036
LSCC-034
PDAC-044
UCEC-024
LUSC-092
BRCA-071
CRC-133
GBC-066
LUAD-096
GBM-077
PDAC-106
PDAC-132
RB-098
ESCC-079
OV-020
PAAD-038
ATC-026
PDAC-046
MB-078
UL-050
iCCA-066
CRC-086
SKCM-095
BLCA-115
PRAD-063
WT-070
CRC-030
ccRCC-061
ESCC-126
ccRCC-060
MPNST-114
MCC-122
GCTB-045
HCC-059
BRCA-135
ESCC-111
CaCx-120
SCE-112
HNSCC-087
HNSCC-127
SACC-103
HNSCC-121
MB-081
MPLC-068
HSCC-130
LUSC-005
EPN-102
RMC-125
VS-105
BRCA-107
EAC-116
BRCA-134
PDAC-064
GBM-117
PDAC-091
cSCC-099
BCC-109
NSCLC-006
MCC-084
AM-036
HNSCC-043
NSCLC-007
HGSOC-042
HGSOC-049
PDAC-051
PTC-069
MESO-065
GBM-085
LUAD-004
CRC-101
ccRCC-118
TNBC-075
PRAD-082
BRCA-128
BLCA-031
GCTB-052
CRC-067
ESCC-032
ESCC-048
GBM-080


In [4]:
# Show the accumulated number of malignant cells.
m_cells

818940