- title: "Workflow"
- author: "Margaret Paiva"
- output: Python notebook

In [30]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path
from statistics import median
import gzip
import scipy.io
import scanpy as sc

# Part 1. Load and filter data

In [None]:
######
# Source data: Amazon S3/Buckets/championsoncology.integration/omics/
######
# The dataset is very large - please do not commit it to GitHub
######
filename = "../data/S0001_astrocytoma_GSE89567_RAW.txt.gz"

# Tab-separated, gzip-compressed table; the first column becomes the row index.
raw = pd.read_csv(
    filename,
    sep='\t',
    index_col=0,
    compression='gzip',
)

# Strip leading/trailing single quotes from every row label (gene name).
raw.index = list(map(lambda gene: gene.strip("'"), raw.index))
raw

In [None]:
# start = raw.columns.str.find('_') + 1
# names = [s[i:(i+3)].replace('_', '') for s, i in zip(raw.columns, start)]

In [None]:
# `adata` is only created further down (Part 2), so referencing it here fails
# with NameError on a fresh kernel (Restart & Run All). Build a temporary
# AnnData from `raw` instead so this cell depends only on the data loaded above.
# NOTE(review): this makes a second in-memory copy of a large matrix for the
# duration of the call - move the cell below the AnnData conversion if that is
# a concern.
sc.pl.highest_expr_genes(sc.AnnData(raw.transpose()), n_top=20)

## Library size

In [None]:
# Library size: the total of each column of `raw` (one value per column).
ls = raw.sum(axis=0)
ls.shape

In [None]:
sc.settings.set_figure_params(dpi=80)
sns.set(color_codes=True)

# sns.displot is a figure-level function: it opens its own figure, so a
# preceding plt.figure(figsize=...) only produces an extra empty figure and the
# requested size is ignored. Draw the histogram on an explicit Axes instead so
# the 8x8 size, title and label all apply to the plot that is shown.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(np.log10(ls), ax=ax)
ax.set_title('Histogram of log10 of library size')
ax.set_ylabel('Frequency');

## Number of cells per gene

In [None]:
# For every row (gene) of `raw`, count the columns (cells) whose value is > 0.
# Rows with a count of 0 are raised to 1 so that the log10 taken for the
# histogram stays finite.
num_cells = (raw > 0).sum(axis=1).clip(lower=1)
num_cells.shape

In [None]:
# sns.displot is figure-level and ignores a figure opened with plt.figure(),
# leaving an empty extra figure behind. Plot on an explicit Axes so the 8x8
# size is honoured.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(np.log10(num_cells), ax=ax)
ax.set_title('Histogram of log10 of the number of cells per gene')
# The bars show counts (the default histogram statistic), not a density, so
# label the axis the same way as the library-size histogram.
ax.set_ylabel('Frequency');

# Part 2. Normalization

In [None]:
# Wrap the count table in an AnnData object. `raw` is transposed first, so its
# columns become the rows of `adata` and its rows become the columns.
adata = sc.AnnData(raw.T)

# Optional (disabled): store the library size as an `n_counts` column
# adata.obs['n_counts'] = np.nansum(raw, axis=0)

adata

In [None]:
# Filtering thresholds
min_counts = 10 ** 3.8  # a cell is kept only if it has at least this many counts
min_cells = 1           # a gene is kept only if it is seen in at least this many cells

# Both calls filter `adata` in place.
sc.pp.filter_cells(adata, min_counts=min_counts)
sc.pp.filter_genes(adata, min_cells=min_cells)

adata

In [None]:
# Normalize every cell to the same total count. target_sum=None is scanpy's
# default: the target is the median of the per-cell totals before normalization.
sc.pp.normalize_total(adata, target_sum=None)

In [None]:
# Sanity check on the normalization: the per-cell totals (row sums of adata.X)
# should all be equal to the median library size.
np.sum(adata.X, axis=1)

In [None]:
# True if any entry of the original `raw` table is NaN.
raw.isna().any(axis=None)

# Part 3. Dimensionality reduction

In [None]:
sc.settings.set_figure_params(dpi=100)

# sc.pp.log1p transforms adata.X in place, so simply re-running this cell would
# log-transform the data a second time. Recent scanpy versions record the
# transform under adata.uns['log1p']; use that marker to apply it only once.
# (On versions that do not set the marker this behaves exactly as before.)
if 'log1p' not in adata.uns:
    sc.pp.log1p(adata)

# Run PCA and plot the variance ratio per component to decide how many
# principal components to feed into UMAP.
sc.tl.pca(adata)
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
# PCA scatter plot of the cells, coloured by the EGFR gene.
sc.pl.pca(adata, color='EGFR')

In [None]:
# Based on the figures above, choose the number of principal components.
# Build the neighbourhood graph from the first 30 PCs with 50 neighbours per cell,
# then compute and plot the UMAP embedding.
sc.pp.neighbors(adata, n_neighbors=50, n_pcs=30)
sc.tl.umap(adata)
sc.pl.umap(adata)

# Part 4. Cluster cells based on marker genes

In [None]:
# Leiden clustering at resolution 0.5; the later cells read the cluster labels
# from adata.obs['leiden'].
sc.tl.leiden(adata, resolution=0.5)
# Show the per-cell annotation table.
adata.obs

In [None]:
# UMAP embedding coloured by Leiden cluster.
sc.pl.umap(adata, color='leiden')

In [None]:
# Alternative (disabled): derive the label from the characters that follow the
# first underscore in each cell name.
# start = adata.obs.index.str.find('_') + 1
# adata.obs['batch'] = [s[i:(i+3)].replace('_', '') for s, i in zip(adata.obs.index, start)]

# Use the first six characters of each cell name as its batch label.
adata.obs['batch'] = adata.obs_names.str.slice(0, 6)
adata.obs

In [None]:
# sc.pl.umap(adata, color='batch')

In [None]:
# UMAP embedding drawn once per gene, coloured by EGFR and by CD74
# (listed below as a tumor cell marker and a macrophage marker, respectively).
sc.pl.umap(adata, color=['EGFR', 'CD74'])

In [None]:
# Rank genes for each Leiden cluster with the Wilcoxon method ...
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
# ... and plot the top 25 genes per cluster, each panel with its own y-axis.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# Marker genes used to tell the clusters apart.
sc.settings.set_figure_params(dpi=150)

marker_genes = [
    # tumor cell markers
    "EGFR", "NOVA1", "UCHL1",
    # macrophage markers
    "CD14", "CD68", "CD74",
]

# Dot plot of the marker genes, one row per Leiden cluster.
sc.pl.dotplot(adata, var_names=marker_genes, groupby='leiden')

In [None]:
# Human-readable label for each Leiden cluster; the entry at position i names
# cluster i. The last macrophage cluster was labelled plain 'Macrophage', which
# broke the numbering used for the other two - it is now 'Macrophage3'.
cluster_names = [
    'Transformed1',   # 0
    'Transformed2',   # 1
    'Macrophage1',    # 2
    'Transformed3',   # 3
    'Transformed4',   # 4
    'Transformed5',   # 5
    'Transformed6',   # 6
    'Transformed7',   # 7
    'Transformed8',   # 8
    'Macrophage2',    # 9
    'Transformed9',   # 10
    'Transformed10',  # 11
    'Transformed11',  # 12
    'Transformed12',  # 13
    'Macrophage3',    # 14
]

# rename_categories needs exactly one new name per existing category. The
# number of Leiden clusters depends on the data and on the clustering
# parameters, so fail early with an explicit message when the hand-written
# list no longer matches.
n_clusters = len(adata.obs['leiden'].cat.categories)
if len(cluster_names) != n_clusters:
    raise ValueError(
        f"cluster_names has {len(cluster_names)} entries but "
        f"adata.obs['leiden'] has {n_clusters} categories; "
        "update cluster_names to match the clustering."
    )

adata.rename_categories('leiden', cluster_names)

# Final UMAP with the cluster labels drawn on top of the data.
sc.settings.set_figure_params(dpi=200)
sc.pl.umap(adata, color='leiden', legend_loc='on data',
           title='', legend_fontsize=5)