# Melanoma Single-Cell Landscape
## 01 - Data Ingestion and Quality Control (QC)

### Objective
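Load melanoma scRNA-seq data and perform quality control on it: compute per-cell QC metrics, filter out low-quality cells and rarely detected genes, and save the filtered dataset.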

### Dataset
- Study: Jerby-Arnon et al. (Cell, 2018)
- GEO: GSE115978

In [None]:
# Library Imports
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Silence warnings so the notebook output stays readable.
# NOTE(review): this suppresses every warning category, including
# deprecation notices - consider narrowing it if results look odd.
warnings.filterwarnings('ignore')

# Scanpy logging and figure defaults.
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, facecolor='white')

# The plotting cells write PNGs into this directory with plt.savefig(),
# which fails if the directory is missing, so create it up front.
FIG_DIR = '../results/figures/'
os.makedirs(FIG_DIR, exist_ok=True)
sc.settings.figdir = FIG_DIR

# Fixed seed for NumPy's global random number generator.
np.random.seed(42)

print(f'Scanpy version: {sc.__version__}')

In [None]:
# Data Loading
# Read the melanoma counts file when it is present; otherwise fall back to
# the PBMC3K dataset, which is only a placeholder for the real data.
MELANOMA_PATH = '../data/raw/melanoma_counts.h5ad'

if not os.path.exists(MELANOMA_PATH):
    print('Loading PBMC3K as placeholder...')
    adata = sc.datasets.pbmc3k()
else:
    adata = sc.read_h5ad(MELANOMA_PATH)
    print('Melanoma data loaded!')

# Make the variable (gene) names unique before anything indexes by them.
adata.var_names_make_unique()

n_cells, n_genes = adata.shape
print(f'Dimensions: {n_cells:,} cells x {n_genes:,} genes')

In [None]:
# Calculate QC Metrics
# Mitochondrial genes start with MT- (human) or mt- (mouse), so the prefix
# is matched case-insensitively to flag both naming conventions.
adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')
n_mt_genes = int(adata.var['mt'].sum())
print(f'MT genes found: {n_mt_genes}')
if n_mt_genes == 0:
    # With no flagged genes the mitochondrial percentage carries no signal
    # and the later MAX_MT filter removes nothing. Say so explicitly
    # instead of failing silently (e.g. when var_names are not gene symbols).
    print('WARNING: no genes matched the MT- prefix; '
          'the mitochondrial QC metric will not be informative.')

# Adds per-cell metrics to adata.obs (n_genes_by_counts, total_counts and
# pct_counts_mt are the columns read by the following cells).
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True
)
print('QC metrics calculated!')

In [None]:
# QC Violin Plots
# One panel per QC metric; each entry lists the obs column, the panel
# title and the y-values at which dashed red threshold lines are drawn.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))

violin_panels = (
    ('n_genes_by_counts', 'Genes per Cell', (200, 5000)),
    ('total_counts', 'UMIs per Cell', ()),
    ('pct_counts_mt', '% Mitochondrial', (20,)),
)

for panel, (metric, title, cutoffs) in zip(axes, violin_panels):
    sc.pl.violin(adata, metric, ax=panel, show=False)
    panel.set_title(title, fontweight='bold')
    for cutoff in cutoffs:
        panel.axhline(y=cutoff, color='red', linestyle='--')

plt.tight_layout()
plt.savefig('../results/figures/01_qc_violin.png', dpi=150)
plt.show()

In [None]:
# Scatter Plot: Complexity vs Depth
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_depth, ax_mito = axes
pct_mito = adata.obs['pct_counts_mt']

# Left panel: genes detected against total counts, one point per cell,
# coloured by mitochondrial percentage.
mito_points = ax_depth.scatter(
    adata.obs['total_counts'],
    adata.obs['n_genes_by_counts'],
    c=pct_mito,
    cmap='RdYlBu_r',
    s=1,
    alpha=0.5,
)
ax_depth.set_xlabel('Total Counts')
ax_depth.set_ylabel('Genes Detected')
ax_depth.set_title('Complexity vs Depth')
plt.colorbar(mito_points, ax=ax_depth, label='% Mito')

# Right panel: distribution of mitochondrial percentage with the 20% line.
ax_mito.hist(pct_mito, bins=50, color='steelblue')
ax_mito.axvline(x=20, color='red', linestyle='--', label='20% threshold')
ax_mito.set_xlabel('% Mitochondrial')
ax_mito.set_ylabel('Cells')
ax_mito.legend()

plt.tight_layout()
plt.savefig('../results/figures/01_qc_scatter.png', dpi=150)
plt.show()

In [None]:
# Apply Filtering
# Cell-level QC thresholds: keep cells with MIN_GENES..MAX_GENES detected
# genes (inclusive) and at most MAX_MT percent mitochondrial counts.
MIN_GENES = 200
MAX_GENES = 5000
MAX_MT = 20

n_before = adata.n_obs
print(f'Cells before: {n_before:,}')

# Filter cells
# Combine all criteria into one boolean mask so the AnnData object is
# subset and copied once instead of once per threshold.
keep_cell = (
    adata.obs['n_genes_by_counts'].between(MIN_GENES, MAX_GENES)
    & (adata.obs['pct_counts_mt'] <= MAX_MT)
)
adata = adata[keep_cell, :].copy()

n_after = adata.n_obs
n_removed = n_before - n_after
# Avoid a ZeroDivisionError when the input contained no cells at all.
pct_removed = 100 * n_removed / n_before if n_before else 0.0
print(f'Cells after: {n_after:,}')
print(f'Removed: {n_removed:,} ({pct_removed:.1f}%)')

In [None]:
# Filter genes (min 3 cells)
genes_before = adata.n_vars
print(f'Genes before: {genes_before:,}')

# Modifies adata in place; the gene count is re-read afterwards.
sc.pp.filter_genes(adata, min_cells=3)

genes_after = adata.n_vars
print(f'Genes after: {genes_after:,}')

In [None]:
# Save processed data
output_path = '../data/processed/melanoma_qc.h5ad'

# Store the thresholds and cell counts alongside the data so the saved
# file documents how it was filtered.
adata.uns['qc_params'] = {
    'min_genes': MIN_GENES,
    'max_genes': MAX_GENES,
    'max_mt': MAX_MT,
    'cells_before': n_before,
    'cells_after': n_after
}

# Make sure the output directory exists rather than relying on the writer
# to create it; a fresh checkout may not have ../data/processed/ yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

adata.write_h5ad(output_path)
print(f'Saved to: {output_path}')
print(f'Final: {adata.n_obs:,} cells x {adata.n_vars:,} genes')

In [None]:
# Summary
# Report the final dataset size and the per-cell medians of the QC metrics.
banner = '=' * 50
cell_metrics = adata.obs

median_genes = cell_metrics['n_genes_by_counts'].median()
median_umis = cell_metrics['total_counts'].median()
median_mito = cell_metrics['pct_counts_mt'].median()

print(banner)
print('PHASE 1 COMPLETED - QC CHECKLIST')
print(banner)
print(f'Final cells: {adata.n_obs:,}')
print(f'Final genes: {adata.n_vars:,}')
print(f'Median genes/cell: {median_genes:.0f}')
print(f'Median UMIs/cell: {median_umis:.0f}')
print(f'Median % mito: {median_mito:.2f}%')