# Run with scib-pipeline-R4.0 conda environment

In [None]:
# Import dependencies
%matplotlib inline
import os
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scib
import anndata

import matplotlib.pyplot as plt
from typing import List

# Print date and time (records when this notebook run started):
import datetime
e = datetime.datetime.now()
print(f"Current date and time = {e}")

# Set a working directory; every relative path below is resolved against it.
wdir = "/ceph/project/tendonhca/akurjan/analysis/"
os.chdir(wdir)

# Folder structure (relative to wdir)
QC_FOLDERNAME = "foetal/results/SingleCellQC/"
RESULTS_FOLDERNAME = "foetal/results/Embryo Normalisation/"
FIGURES_FOLDERNAME = "foetal/figures/Embryo Normalisation/"

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)


# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

In [None]:
def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write a figure to disk in SVG (vector) format.

    Parameters:
    -----------
    fname: str
        File name of the output image; joined onto `folder`.
    fig:
        Object exposing a `savefig(path, format=...)` method.
    folder: str
        Directory the file is written into (defaults to FIGURES_FOLDERNAME).
    """
    target_path = os.path.join(folder, fname)
    fig.savefig(target_path, format='svg')

# DIMENSIONALITY REDUCTION

In [None]:
# Load 'sc_adata_concat_mfiltered.h5ad' from the QC results folder into `adata`.
adata = sc.read_h5ad(os.path.join(QC_FOLDERNAME, 'sc_adata_concat_mfiltered.h5ad'))
# Bare expression: displays the object summary as the cell output.
adata

In [None]:
# Remove the 'ambiguous' and 'matrix' layers from the object
# (presumably to reduce memory; they are not referenced again in this section).
del adata.layers['ambiguous']
del adata.layers['matrix']

In [None]:
# Print a small slice of X (rows 1-4, columns 1-4) to inspect its current values.
print(adata.X[1:5, 1:5])

In [None]:
# Print the same slice of the 'log1p_norm' layer for comparison with X.
print(adata.layers['log1p_norm'][1:5, 1:5])

In [None]:
# Make the 'log1p_norm' layer the active matrix; .copy() so X is an independent
# object rather than a reference to the layer.
adata.X = adata.layers['log1p_norm'].copy()

# Batch-aware Feature Selection

### scIB:
Batch-aware highly variable gene selection

Method to select HVGs based on the mean dispersions of genes that are highly variable in all batches, using the top `target_genes` per batch by average normalized dispersion. If the target number of genes still hasn't been reached, HVGs found in all but one batch are used to fill up. This continues until HVGs found in only a single batch are considered.

Parameters:
- adata – anndata object
- batch_key – adata.obs column holding the batch labels
- target_genes – maximum number of genes (intersection reduces the number of genes)
- flavor – parameter for scanpy.pp.highly_variable_genes
- n_bins – parameter for scanpy.pp.highly_variable_genes
- adataOut – whether to return an anndata object or a list of highly variable genes

In [None]:
# Batch-aware HVG selection with 'samplename' as the batch key, targeting 3000 genes.
# NOTE(review): the return value is discarded. With adataOut=True this cell
# presumably relies on hvg_batch annotating adata.var in place (the
# 'highly_variable*' columns used below) - confirm against the installed scib version.
scib.preprocessing.hvg_batch(adata, 
                             batch_key="samplename",
                             target_genes=3000, 
                             flavor='cell_ranger', 
                             n_bins=20, 
                             adataOut=True
                            )

# Plot the highly-variable-gene diagnostics stored on adata.
sc.pl.highly_variable_genes(adata)

In [None]:
# Tally how many genes share each value of 'highly_variable_nbatches'
# and show the tally as a bar chart.
n_batches = adata.var["highly_variable_nbatches"].value_counts()
ax = n_batches.plot(kind="bar")
# Bare expression: displays the tally as the cell output.
n_batches

In [None]:
# Sanity check: every variable (gene) name must occur exactly once.
assert adata.var_names.is_unique

In [None]:
def split_and_scale(anndata_obj: anndata.AnnData, obs_var: str) -> anndata.AnnData:
    """
    Scale the data separately within each group of an observation variable.

    The object is split by the values of `obs_var`, each subset is scaled with
    `sc.pp.scale`, and the subsets are merged back with `anndata.concat`.
    The merged object is then re-ordered so that its rows follow the same
    order as the cells in `anndata_obj`; callers can therefore copy the
    returned `.X` positionally into a layer of the input object.

    The input object is not modified.

    Parameters:
    -----------
    anndata_obj: anndata.AnnData
        Annotated data matrix with normalized, log-transformed counts.
    obs_var: str
        Observation variable to split the data on.

    Returns:
    --------
    anndata.AnnData
        New object holding the per-group scaled values. Cells whose `obs_var`
        value matches no group (e.g. missing values) are not included.

    Raises:
    -------
    ValueError
        If the value range of `.X` fails the heuristic checks below.
    """

    # Heuristic sanity checks on the value range of X: a minimum of 1 or more
    # is treated as "not log-transformed", a maximum above 10 as "not normalised".
    if np.min(anndata_obj.X) >= 1:
        raise ValueError("Anndata object X is not log-transformed.")
    if np.max(anndata_obj.X) > 10:
        raise ValueError("Anndata object X is not normalised.")

    labels = anndata_obj.obs[obs_var]

    row_positions = []  # original row indices of each subset, in concat order
    split_data = []
    for group in labels.unique():
        rows = np.flatnonzero((labels == group).to_numpy())
        if rows.size == 0:
            # e.g. a missing-value "group": equality never matches, nothing to scale
            continue
        # .copy() turns the view into an independent object, so scaling
        # never touches the caller's data.
        subset = anndata_obj[rows].copy()
        sc.pp.scale(subset)
        row_positions.append(rows)
        split_data.append(subset)

    # Merge the split data back together using `anndata.concat`
    merged_data = anndata.concat(split_data, join='outer', index_unique=None)

    # concat stacks the subsets group by group; undo that permutation so that
    # output row k corresponds to the k-th (retained) input cell.
    concat_order = np.concatenate(row_positions)
    return merged_data[np.argsort(concat_order)].copy()

In [None]:
# Scale the data separately for each sample ('samplename' groups).
scaled_adata = split_and_scale(adata, 'samplename')
# NOTE(review): a bare expression that is not the last line of a cell produces no output.
scaled_adata
# Print a small slice of the scaled matrix as a quick sanity check.
print(scaled_adata.X[1:10,1:10])

In [None]:
# Store the per-sample scaled matrix as the 'scaled' layer of adata.
# NOTE(review): this is a positional copy - it assumes the rows of scaled_adata
# are in the same cell order as adata; verify before relying on the layer.
adata.layers['scaled'] = scaled_adata.X.copy()
# Bare expression: displays the object summary as the cell output.
adata

In [None]:
# The scaled values now live in adata.layers['scaled']; drop the temporary object.
del scaled_adata

In [None]:
# PCA (50 components, ARPACK solver) on the 'scaled' layer restricted to the
# genes flagged in adata.var.highly_variable; the result is stored in obsm['X_pca'].
adata.obsm["X_pca"] = sc.pp.pca(adata[:,adata.var.highly_variable].layers["scaled"], n_comps=50, svd_solver="arpack")

In [None]:
# Display the per-gene annotation table.
adata.var

In [None]:
# Query biomart for human gene annotations and index the result by Ensembl gene ID.
annot = sc.queries.biomart_annotations(
    "hsapiens",
    ["ensembl_gene_id", "external_gene_name", "start_position", "end_position", "chromosome_name"],
).set_index("ensembl_gene_id")

# Add the annotation columns to adata.var
# (assumes adata.var is indexed by Ensembl gene ID at this point - TODO confirm).
adata.var[annot.columns] = annot

adata.var.rename(columns={"external_gene_name": "Gene"}, inplace=True)
# Keep the current index as a column before the index is replaced below.
adata.var['ensembl_gene_id'] = adata.var.index
# Genes without a gene name fall back to their current index value.
adata.var['Gene'] = adata.var['Gene'].fillna(adata.var['ensembl_gene_id'])
adata.obs.index.name = 'CellID'
# From here on var_names are the 'Gene' values, not the previous index.
adata.var.index = adata.var["Gene"]
# Make var_names unique, since the same gene name can occur more than once.
adata.var_names_make_unique()
# Bare expression: displays the updated table as the cell output.
adata.var

In [None]:
# mitochondrial genes: names starting with "MT-"
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes: names starting with "RPS" or "RPL"
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes (currently disabled).
# adata.var["hb"] = adata.var_names.str.contains(("^HB[^(P)]"))

# QC metric calculation (currently disabled):
# sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=False)

In [None]:
# Filter out mitochondrial, ribosomal and MALAT1 genes
print(f"Before filtering: {adata.n_vars} genes")
mt_genes = adata.var_names[adata.var['mt']]  # names of mitochondrial genes
ribo_genes = adata.var_names[adata.var['ribo']]  # names of ribosomal genes
malat1 = adata.var_names.str.startswith('MALAT1')  # boolean mask over all genes
# Turn the mask into gene NAMES: concatenating the raw boolean mask with the
# name arrays would put True/False values in the removal list and leave
# MALAT1 in the data.
malat1_genes = adata.var_names[malat1]
genes_to_remove = np.concatenate([mt_genes, ribo_genes, malat1_genes])
# .copy() so the result is a real object rather than a view; the .obs
# assignments below would otherwise trigger an implicit copy.
adata = adata[:, ~adata.var_names.isin(genes_to_remove)].copy()
print(f"After filtering: {adata.n_vars} genes")

# Calculate n_counts and n_genes per cell from the current X.
# NOTE(review): X was set to the 'log1p_norm' layer earlier in this notebook,
# so 'n_counts' sums those values rather than raw counts - confirm this is intended.
# np.ravel flattens the (n_cells, 1) matrix a sparse row-sum returns into 1-D.
adata.obs['n_counts'] = np.ravel(adata.X.sum(axis=1))
adata.obs['n_genes'] = np.ravel((adata.X > 0).sum(axis=1))

## SAMPLE SEX DETERMINATION

In [None]:
# Compute per-cell sex markers, but only if an XIST gene is present.
xist_mask = adata.var_names.str.match('XIST')  # names beginning with "XIST"
if xist_mask.any():
    # var_names are gene names at this point, while `annot` is indexed by
    # Ensembl gene ID, so chrY genes are looked up through the
    # 'ensembl_gene_id' column of adata.var rather than through var_names.
    chrY_ids = annot.index[annot.chromosome_name == "Y"]
    chrY_genes = adata.var_names[adata.var['ensembl_gene_id'].isin(chrY_ids)]

    # Percentage of each cell's total X signal that comes from chrY genes
    # (.A1 flattens the matrix returned by summing a sparse X - assumes X is sparse).
    adata.obs['percent_chrY'] = np.sum(
        adata[:, chrY_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1 * 100

    # color inputs must be from either .obs or .var, so add in XIST expression to obs.
    # ravel() turns the (n_cells, 1) column into the 1-D vector an obs column expects.
    adata.obs["XIST-counts"] = adata.X[:, xist_mask].toarray().ravel()

    sc.pl.violin(adata, ["XIST-counts", "percent_chrY"], jitter=0.4, groupby = 'samplename', rotation= 90, save='_XIST.svg')

In [None]:
# Calculate median XIST-counts and percent_chrY values for each sample.
# Columns are selected with a list: tuple-style selection
# (['a', 'b'] without the inner brackets) is rejected by pandas 2.x.
sample_medians = adata.obs.groupby('samplename')[['XIST-counts', 'percent_chrY']].median()

# Define female and male criteria from the median XIST-counts only
# (percent_chrY is computed above for inspection but not used in the call).
female_criteria = (sample_medians['XIST-counts'] > 0.1)
male_criteria = (sample_medians['XIST-counts'] < 0.1)

# Create a new variable 'sex'; samples matching neither criterion
# (e.g. a median of exactly 0.1 or a missing median) stay 'unknown'.
adata.obs['sex'] = 'unknown'

# Update 'sex' based on the female and male criteria
for sample in sample_medians.index:
    if female_criteria[sample]:
        adata.obs.loc[adata.obs['samplename'] == sample, 'sex'] = 'female'
    elif male_criteria[sample]:
        adata.obs.loc[adata.obs['samplename'] == sample, 'sex'] = 'male'

# Print the names of female and male samples
female_samples = adata.obs.loc[adata.obs['sex'] == 'female', 'samplename'].unique()
male_samples = adata.obs.loc[adata.obs['sex'] == 'male', 'samplename'].unique()

print(f"Female samples: {', '.join(female_samples)}")
print(f"Male samples: {', '.join(male_samples)}")

In [None]:
# Number of cells for each (sex, samplename) combination.
adata.obs[['sex', 'samplename']].value_counts()

## CELL CYCLE PHASE DETERMINATION

In [None]:
# Number of genes before filtering
print(adata.shape[1])
# Filter genes with min_counts=5 and cells with min_genes=200
# (note: the gene threshold is 5, not merely non-zero).
sc.pp.filter_genes(adata, min_counts=5, inplace=True)
sc.pp.filter_cells(adata, min_genes=200)
# Number of genes after filtering
print(adata.shape[1])

In [None]:
# Snapshot the current object into adata.raw.
adata.raw = adata.copy()

In [None]:
#adata.obs['libbatch'] = adata.obs['libbatch'].astype('category')
# Store sample names as a categorical column.
adata.obs['samplename'] = adata.obs['samplename'].astype('category')

# Score cell-cycle phase for human genes; the 'S_score' and 'G2M_score'
# columns plotted below are expected to be added by this call.
scib.preprocessing.score_cell_cycle(adata, organism='human')
# Per-sample violin plots of both scores, saved with the '_cell_cycle.svg' suffix.
sc.pl.violin(adata, ['S_score', 'G2M_score'],
             jitter=0.4, groupby = 'samplename', rotation=90, 
             save='_cell_cycle.svg'
            )

In [None]:
def plot_pca(anndata, parameters: list, components: list, filename: str):
    """
    Draw one PCA scatter plot per parameter, stacked vertically, and save as SVG.

    Parameters:
    -----------
    anndata:
        Object passed to `sc.pl.pca` as the data to plot.
    parameters: list
        Keys to colour the plots by; one subplot is drawn per entry and the
        entry is used as the subplot title.
    components: list
        Forwarded to `sc.pl.pca` as `components` (e.g. ['1,2']).
    filename: str
        Output file name, handed to `savesvg`.
    """
    n_plots = len(parameters)
    # squeeze=False keeps `axs` a 2-D array even when there is a single
    # parameter; otherwise plt.subplots returns a bare Axes and indexing fails.
    fig, axs = plt.subplots(n_plots, 1, figsize=(4, 5*n_plots), squeeze=False)
    for ax, param in zip(axs[:, 0], parameters):
        sc.pl.pca(anndata, color=param, ax=ax, show=False, components = components)
        ax.set_title(param)
    plt.tight_layout()
    savesvg(filename, fig)
    plt.show()

In [None]:
# PC1 vs PC2, coloured by each listed annotation in turn
# (each key is assumed to be available for colouring on adata).
plot_pca(adata, ['samplename', 'Clusters', 'sample_stage', 'norm_sample_stage', 'hospital_id', 'phase', 'sex', 'kit', 'seq_protocol'], 
         components = ['1,2'], filename = 'PC1vs2_plots.svg')

In [None]:
# PC3 vs PC4, coloured by the same annotations as the PC1 vs PC2 figure.
plot_pca(adata, ['samplename', 'Clusters', 'sample_stage', 'norm_sample_stage', 'hospital_id', 'phase', 'sex', 'kit', 'seq_protocol'], 
         components = ['3,4'], filename='PC3vs4_plots.svg')

In [None]:
# Reload the normalised object from the results folder, replacing the in-memory adata.
# NOTE(review): within this notebook 'sc_normalized_adata.h5ad' is only written by
# the final cell, so a fresh top-to-bottom run either fails here or loads a file
# from an earlier run and discards the object built above - confirm the intended
# cell order.
adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'sc_normalized_adata.h5ad'))
# Bare expression: displays the object summary as the cell output.
adata

In [None]:
# Build the neighbourhood graph (30 neighbours, first 15 PCs), then compute the UMAP embedding.
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=15)
sc.tl.umap(adata)

In [None]:
def plot_umaps(anndata, parameters: list, filename: str):
    """
    Draw one UMAP plot per parameter, stacked vertically, and save as SVG.

    The figure height grows with the number of parameters (5 units per plot).

    Parameters:
    -----------
    anndata:
        Object passed to `sc.pl.umap` as the data to plot.
    parameters: list
        Keys to colour the plots by; one subplot is drawn per entry and the
        entry is used as the subplot title.
    filename: str
        Output file name, handed to `savesvg`.
    """
    n_plots = len(parameters)
    # squeeze=False keeps `axs` a 2-D array even when there is a single
    # parameter; otherwise plt.subplots returns a bare Axes and indexing fails.
    fig, axs = plt.subplots(n_plots, 1, figsize=(9, 5*n_plots), squeeze=False)
    for ax, param in zip(axs[:, 0], parameters):
        sc.pl.umap(anndata, color=param, ax=ax, show=False, frameon=False)
        ax.set_title(param)
    plt.tight_layout()
    savesvg(filename, fig)
    plt.show()

In [None]:
# UMAPs coloured by each listed annotation in turn
# (each key is assumed to be available for colouring on adata).
plot_umaps(adata, ['samplename', 'Clusters', 'sample_stage', 'norm_sample_stage', 'hospital_id', 'phase', 'sex', 'kit', 'seq_protocol'], 
          filename = 'UMAP_plots.svg')

In [None]:
def plot_umaps2(anndata, parameters: list, filename: str):
    """
    Draw one UMAP plot per parameter in a fixed-size (4 x 10) figure and save as SVG.

    Unlike `plot_umaps`, the figure size does not grow with the number of
    parameters, so it suits short parameter lists.

    Parameters:
    -----------
    anndata:
        Object passed to `sc.pl.umap` as the data to plot.
    parameters: list
        Keys to colour the plots by; one subplot is drawn per entry and the
        entry is used as the subplot title.
    filename: str
        Output file name, handed to `savesvg`.
    """
    n_plots = len(parameters)
    # squeeze=False keeps `axs` a 2-D array even when there is a single
    # parameter; otherwise plt.subplots returns a bare Axes and indexing fails.
    fig, axs = plt.subplots(n_plots, 1, figsize=(4, 10), squeeze=False)
    for ax, param in zip(axs[:, 0], parameters):
        sc.pl.umap(anndata, color=param, ax=ax, show=False, frameon=False)
        ax.set_title(param)
    plt.tight_layout()
    savesvg(filename, fig)
    plt.show()

In [None]:
# UMAPs coloured by per-cell QC metrics
# (assumes 'n_counts', 'n_genes' and 'pct_counts_mt' exist on the loaded object - TODO confirm).
plot_umaps2(adata, ["n_counts", "n_genes", "pct_counts_mt"], 
            filename = 'UMAPparameter_plots.svg')

In [None]:
# Make gene names unique so the name-based lookup in the next cell is unambiguous.
adata.var_names_make_unique()

In [None]:
# check if MALAT1 gene is in the gene list
if 'MALAT1' in adata.var_names:
    # boolean mask over genes: True for every gene except MALAT1
    gene_list = adata.var_names != 'MALAT1'
    # slice the anndata object to select all genes except for MALAT1
    adata = adata[:, gene_list]

# Printed whether or not MALAT1 was present.
print(f"After MALAT1 filtering: {adata.n_vars} genes")

In [None]:
# Write the object to 'sc_normalized_adata.h5ad' in the results folder
# (the same path the reload cell above reads from).
adata.write(os.path.join(RESULTS_FOLDERNAME, 'sc_normalized_adata.h5ad'))