<a href="https://colab.research.google.com/github/Droslj/scATAC-seq-complete-/blob/Google-colab/scATAC_seq_(2)_DA_diffxpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

scATAC-seq analysis, based on the Galaxy scATAC-seq tutorials (scATAC-seq preprocessing (1), standard scATAC-seq processing pipeline (2)).
AnnData objects were created in Galaxy using a customized Galaxy workflow with SnapATAC2 and then imported here.
(1) https://usegalaxy.eu/training-material/topics/single-cell/tutorials/scatac-preprocessing-tenx/tutorial.html#mapping-reads-to-a-reference-genome, (2) https://usegalaxy.eu/training-material/topics/single-cell/tutorials/scatac-standard-processing-snapatac2/tutorial.html
Data taken from the following NCBI study:
Metabolic adaptation pilots the differentiation of human hematopoietic cells (https://www.ncbi.nlm.nih.gov/bioproject/PRJNA1015713)
Import Anndata objects for two biological replicates, SRR26046013 (cells treated with AOA inhibitor) and SRR26046019 (untreated cells)
Perform the following steps:
(1) Import matrices
(2) Compute fragment size distribution
(3) Compute TSS enrichment
(4) Filter cell counts based on TSSe
(5) Create cell by bin matrix based on 500 bp wide bins across the whole genome
(6) Perform feature selection
(7) Perform Doublet removal
(8) Perform Dim reduction (spectral)
(9) Perform Clustering (neighborhood, UMAP, leiden)
(10) Create a cell by gene matrix
(11) Concatenate matrices using Inner join
(12) Remove batch effects

In [None]:
!pip install -q condacolab

In [None]:
import condacolab

In [None]:
condacolab.install()

In [None]:
!conda --version

In [None]:
!which conda

In [None]:
!conda config --add channels conda-forge

In [None]:
!conda config --add channels bioconda

In [None]:
!pip install snapatac2 -q

In [None]:
!pip show snapatac2

In [None]:
import snapatac2 as snap

In [None]:
!pip install umap-learn



In [None]:
import umap.umap_ as umap


In [None]:
from umap import UMAP

In [None]:
!pip install scanpy -q

In [None]:
import scanpy as sc

In [None]:
pip show scanpy

In [None]:
import numpy as np

In [None]:
import anndata as ad

In [None]:
!pip install diffxpy -q

In [None]:
import diffxpy.api as de

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
import plotly.subplots as sp
import plotly.graph_objects as go

In [None]:
from scipy import stats

In [None]:
import pandas as pd

# Import the data from Google Drive: three samples treated with energy metabolism inhibitors and one untreated sample

In [None]:
# Mount Google Drive at /content/drive so files stored under it can be read
# from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the merged AnnData matrix (PCA and batch correction already applied)
# from the mounted Google Drive.
adata_concat = sc.read_h5ad(
    '/content/drive/MyDrive/Colab Notebooks/MTXmerged_PCA_BC.h5ad'
)

# Make observation names unique



In [None]:
# Display the AnnData summary as the cell output to inspect what was loaded.
adata_concat

In [None]:
# Report whether any name in var_names occurs more than once.
has_duplicates = adata_concat.var_names.duplicated().any()

print("var_names are not unique." if has_duplicates else "var_names are unique.")

In [None]:
# Cross-check: var_names are unique when the count of distinct names equals
# the total number of names.
is_unique = pd.Series(adata_concat.var_names).nunique() == len(adata_concat.var_names)

if not is_unique:
    print("var_names are not unique.")
else:
    print("var_names are unique.")

In [None]:
# Report whether any observation name (obs_names) occurs more than once.
has_duplicates = adata_concat.obs_names.duplicated().any()

print("Observations are not unique." if has_duplicates else "Observations are unique.")

In [None]:
# Cross-check: observation names are unique when the count of distinct names
# equals the total number of observations.
is_unique = pd.Series(adata_concat.obs_names).nunique() == len(adata_concat.obs_names)

if not is_unique:
    print("Observations are not unique.")
else:
    print("Observations are unique.")

In [None]:
# Display the variable (feature) names as the cell output.
adata_concat.var_names

In [None]:
# Build observation names that stay unique across treatments by appending a
# per-treatment numeric suffix ("_1" .. "_4") to every barcode.
experiment_names = ["Treated w/AOA", "Treated w/DON", "Treated w/DG", "Untreated"]

# Start from the current observation names; the suffix is appended below.
adata_concat.obs['barcode_treatment'] = adata_concat.obs_names.astype(str)

for i, treatment in enumerate(experiment_names, start=1):
    # Select rows with a boolean mask rather than by index label: when the
    # same barcode occurs under several treatments, a label-based `.loc`
    # lookup would also match the rows of the other treatments.
    in_treatment = (adata_concat.obs["Treatment"] == treatment).to_numpy()
    suffixed = adata_concat.obs.loc[in_treatment, 'barcode_treatment'] + "_" + str(i)
    # Assign a plain array so pandas does not try to re-align on the
    # (possibly duplicated) index labels.
    adata_concat.obs.loc[in_treatment, 'barcode_treatment'] = suffixed.to_numpy()

In [None]:
# Use the values of the 'barcode_treatment' column as the observation names.
adata_concat.obs_names = pd.Index(
    adata_concat.obs['barcode_treatment']
)

In [None]:
# Display the observation (per-cell) annotation table as the cell output.
adata_concat.obs

# Differential accessibility analysis using diffxpy

In [None]:
# NumPy type aliases.
# Newer NumPy releases no longer provide the builtin-type aliases, which
# produces errors such as "module 'numpy' has no attribute 'int'"
# (presumably raised from diffxpy or its dependencies - confirm).
# Restore only the aliases that NumPy itself does not define, so an attribute
# NumPy provides on its own (e.g. `np.bool` in NumPy >= 2.0, where it is the
# boolean scalar type) is not overwritten.
for alias, builtin_type in (
    ("float", float),
    ("int", int),
    ("object", object),
    ("bool", bool),
):
    # Look in the module namespace directly: this avoids triggering NumPy's
    # deprecation machinery for aliases it serves lazily.
    if alias not in vars(np):
        setattr(np, alias, builtin_type)

In [None]:
import dask.array as da

In [None]:
import scipy.sparse as sparse

In [None]:
# Display the AnnData summary again before running the differential test.
adata_concat

# Run diffxpy Wald test

In [None]:
# Restrict the test to at most 1000 features flagged as highly variable.
# Select on the flag's value, by position: slicing the `highly_variable`
# column itself would simply keep the first 1000 features, flagged or not.
# NOTE(review): assumes `highly_variable` is a boolean column. These are the
# first 1000 flagged features in `var` order, not a ranking by variability.
hv_positions = np.flatnonzero(adata_concat.var['highly_variable'].to_numpy())[:1000]
adata_subset = adata_concat[:, hv_positions]

# Wald test on the `Treatment` factor of the location model, run on the subset.
test = de.test.wald(
    data=adata_subset,
    formula_loc="~ 1 + Treatment",
    factor_loc_totest="Treatment",
)