In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import squidpy as sq
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import cell2location
import scvi

from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text for PDFs

# Print date and time of this run (provenance for the rendered notebook):
import datetime
e = datetime.datetime.now()
print("Current date and time = %s" % e)

# --- File paths to read from and write to ---

# Working directory; every relative path below is resolved against it.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks/embryonic ScAndSp/"
os.chdir(wdir)

# Output folder structure (relative to wdir)
RESULTS_FOLDERNAME = "Spatial/results"
FIGURES_FOLDERNAME = "Spatial/figures"

# exist_ok=True makes folder creation idempotent and avoids the check-then-create race
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

# Folder that is listed to discover the samples; each sample is read from
# <sp_data_folder>/<sample>/outs/ (relative to wdir)
sp_data_folder = "../../files/Teichmann Group Spatial/Spatial/"


def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Save a figure as a vector-based SVG image.

    :param fname: file name (including extension) to create inside ``folder``
    :param fig: figure object exposing ``tight_layout()`` and ``savefig()``
    :param folder: destination directory (defaults to FIGURES_FOLDERNAME);
        it is created when it does not exist yet
    """
    # A caller-supplied folder may not exist yet; create it so savefig does not fail.
    # An empty folder means "current directory" and needs no creation.
    if folder:
        os.makedirs(folder, exist_ok=True)
    fig.tight_layout()
    fig.savefig(os.path.join(folder, fname), format='svg')

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of the packages in use into the notebook output
sc.logging.print_versions()
# dpi: resolution of figures shown in the notebook; dpi_save: resolution of saved figures
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

# File Preparation

In [None]:
def read_and_qc(sample_name, path=sp_data_folder):
    """
    Read one 10X Visium sample into an AnnData object and compute basic QC metrics.
    Modify this function if required by your workflow.

    :param sample_name: name of the sample; the data is read from ``<path>/<sample_name>/outs``
    :param path: folder holding the per-sample folders
    :return: AnnData where
        - ``var`` holds 'SYMBOL', 'ENSEMBL' and the boolean flags 'mt', 'ribo', 'mtrnr'
        - ``obs`` holds 'sample', the scanpy QC metrics and 'mt_frac'
        - ``obs_names`` are '<sample_name>_<original obs name>' (index named 'spot_id')
    """
    # os.path.join works whether or not `path` ends with a separator
    adata = sc.read_visium(os.path.join(path, str(sample_name), 'outs'),
                           count_file='filtered_feature_bc_matrix.h5', load_images=True)
    adata.obs['sample'] = str(sample_name)

    # Keep both identifiers in var; the (de-duplicated) gene symbols become var_names
    adata.var['SYMBOL'] = adata.var_names
    adata.var.rename(columns={'gene_ids': 'ENSEMBL'}, inplace=True)
    adata.var['Gene'] = adata.var['SYMBOL'].fillna(adata.var['ENSEMBL'])
    adata.var_names = adata.var['Gene']
    adata.var.drop(columns='Gene', inplace=True)
    adata.var_names_make_unique()

    # Calculate QC metrics on the matrix as loaded; a dense round trip of the whole
    # spots x genes matrix is not needed for this call
    sc.pp.calculate_qc_metrics(adata, inplace=True)

    # Gene-class flags are derived from the original symbols, not the de-duplicated var_names
    symbols = adata.var['SYMBOL'].astype(str)
    adata.var['mt'] = symbols.str.startswith('MT-')
    adata.var['ribo'] = symbols.str.startswith(("RPS", "RPL"))
    adata.var['mtrnr'] = symbols.str.startswith("MTRNR")

    # Fraction of each spot's counts coming from 'MT-' genes.
    # np.asarray(...).ravel() yields a 1-D vector for sparse and dense X alike,
    # including the single-spot case.
    mt_counts = np.asarray(adata[:, adata.var['mt'].to_numpy()].X.sum(axis=1)).ravel()
    adata.obs['mt_frac'] = mt_counts / adata.obs['total_counts']

    # add sample name to obs names
    adata.obs_names = str(sample_name) + '_' + adata.obs_names
    adata.obs.index.name = 'spot_id'

    return adata

In [None]:
# One entry per sample folder. os.listdir() returns entries in arbitrary order and also
# lists plain files and hidden folders (e.g. .ipynb_checkpoints), so keep only visible
# directories and sort them to make the slide order reproducible.
# NOTE(review): the per-slide QC thresholds further down address slides by position —
# confirm that they match this (alphabetical) order.
sample_name = sorted(
    entry
    for entry in os.listdir(sp_data_folder)
    if not entry.startswith('.') and os.path.isdir(os.path.join(sp_data_folder, entry))
)

In [None]:
# Load every sample into its own AnnData object, in the order given by sample_name
slides = [read_and_qc(sample_id, path=sp_data_folder) for sample_id in sample_name]
slides

In [None]:
slides[0].var

In [None]:
# obs is indexed by spot id (strings), so take the first row by position explicitly
slides[0].obs['sample'].iloc[0]

In [None]:
for adata in slides:
    # read_and_qc() already calls this; repeated here before recomputing the metrics
    adata.var_names_make_unique()
    # Recompute the QC metrics, this time passing the boolean var columns 'mt' and 'ribo'
    # as qc_vars (the 'mtrnr' flag is not included)
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo"], inplace=True)

In [None]:
# Per-slide QC histograms: total counts and number of detected genes, each shown over
# the full range and restricted to the low range (< 10000 counts / < 4000 genes).
for adata in slides:
    sample = adata.obs['sample'].iloc[0]
    total_counts = adata.obs["total_counts"]
    n_genes = adata.obs["n_genes_by_counts"]

    fig, axs = plt.subplots(1, 4, figsize=(15, 4))
    # sns.distplot is deprecated; histplot is the replacement for plain histograms
    sns.histplot(total_counts, kde=False, ax=axs[0])
    sns.histplot(total_counts[total_counts < 10000], kde=False, bins=40, ax=axs[1])
    sns.histplot(n_genes, kde=False, bins=60, ax=axs[2])
    sns.histplot(n_genes[n_genes < 4000], kde=False, bins=60, ax=axs[3])
    # Title the figure so it can be matched to its sample when skimming the output
    fig.suptitle(sample)
    print(sample)

In [None]:
# Per-slide spot filters as (min_counts, max_counts), listed in the same order as `slides`.
# Every slide additionally drops genes detected in fewer than 10 spots.
count_limits = [
    (500, 10000),
    (700, 10000),
    (1000, 25000),
    (1600, 30000),
    (1600, 20000),
    (800, 10000),
    (500, 8000),
    (1600, 30000),
]
for slide_idx, (lower, upper) in enumerate(count_limits):
    sc.pp.filter_cells(slides[slide_idx], min_counts=lower)
    sc.pp.filter_cells(slides[slide_idx], max_counts=upper)
    sc.pp.filter_genes(slides[slide_idx], min_cells=10)

In [None]:
# Violin plot of the per-spot 'mt_frac' QC value, one plot per slide
for adata in slides:
    sc.pl.violin(adata, 'mt_frac')

In [None]:
for idx, adata in enumerate(slides):
    print(f"Sample: {adata.obs['sample'].iloc[0]}")
    print(f"#genes before MT filter: {adata.n_vars}")

    # remove MT genes for spatial mapping (keeping their counts in the object)
    is_mt = adata.var['mt'].values
    adata.obsm['MT'] = adata[:, is_mt].X.toarray()
    # .copy() stores a standalone AnnData instead of a view of `adata`, so later
    # in-place edits do not depend on (or implicitly copy) the parent object
    slides[idx] = adata[:, ~is_mt].copy()
    print(f"#genes after MT filter: {slides[idx].n_vars}")

In [None]:
for idx, adata in enumerate(slides):
    print(f"Sample: {adata.obs['sample'].iloc[0]}")
    print(f"#genes before MTrnr filter: {adata.n_vars}")

    # remove MTRNR genes, keeping their counts in obsm
    is_mtrnr = adata.var['mtrnr'].values
    adata.obsm['MTRNR'] = adata[:, is_mtrnr].X.toarray()
    # .copy() stores a standalone AnnData instead of a view of `adata`
    slides[idx] = adata[:, ~is_mtrnr].copy()
    print(f"#genes after MTrnr filter: {slides[idx].n_vars}")

In [None]:
for idx, adata in enumerate(slides):
    print(f"Sample: {adata.obs['sample'].iloc[0]}")
    print(f"#genes before RIBO filter: {adata.n_vars}")

    # remove ribosomal (RPS*/RPL*) genes, keeping their counts in obsm
    is_ribo = adata.var['ribo'].values
    adata.obsm['ribo'] = adata[:, is_ribo].X.toarray()
    # .copy() stores a standalone AnnData instead of a view of `adata`
    slides[idx] = adata[:, ~is_ribo].copy()
    print(f"#genes after RIBO filter: {slides[idx].n_vars}")

In [None]:
slides[0].var

In [None]:
print(slides[0].X[1:10,1:10])

In [None]:
for adata in slides:
    # Keep the current X in layers['counts'] before normalisation overwrites it
    adata.layers['counts'] = adata.X.copy()
    sc.pp.normalize_total(adata, inplace=True)
    sc.pp.log1p(adata)
    # Normalised + log1p-transformed values, kept as a separate layer
    adata.layers['normcounts'] = adata.X.copy()
    sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000)

In [None]:
print(slides[0].X[1:10,1:10])

In [None]:
# Per-slide dimensionality reduction and clustering.
for adata in slides:
    # X is scaled here; the pre-scaling values were saved earlier in
    # layers['counts'] and layers['normcounts']
    sc.pp.scale(adata)
    sc.pp.pca(adata)
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
    # key_added: the cluster labels are written to adata.obs['clusters']
    sc.tl.leiden(adata, key_added="clusters")

In [None]:
plt.rcParams["figure.figsize"] = (4, 4)
for adata in slides:
    # obs is indexed by spot id, so read the sample name by position with .iloc
    name = adata.obs['sample'].iloc[0]
    sc.pl.umap(adata, color=["total_counts", "n_genes_by_counts", "clusters"], wspace=0.4,
               save=f'_{name}_countsAndClusters_umaps.svg')

In [None]:
plt.rcParams["figure.figsize"] = (8, 8)
for adata in slides:
    # obs is indexed by spot id, so read the sample name by position with .iloc
    name = adata.obs['sample'].iloc[0]
    sc.pl.spatial(adata, img_key="hires", color=["total_counts", "n_genes_by_counts"],
                  save=f'_{name}_counts.svg')

## BASIC FEATURE CLUSTERING

In [None]:
for adata in slides:
    # obs is indexed by spot id, so read the sample name by position with .iloc
    name = adata.obs['sample'].iloc[0]
    # NOTE(review): with coord_type='generic' the radius is presumably expressed in the
    # units of adata.obsm['spatial'] — confirm that 3.0 actually connects neighbouring spots.
    sq.gr.spatial_neighbors(adata, coord_type='generic', radius=3.0)
    sq.pl.spatial_scatter(
        adata, shape='circle', color='clusters', img_alpha=0.8,
        frameon=False, figsize=(7, 3.5),
        size=1.5, connectivity_key='spatial_connectivities', edges_width=2,
        save=f'_{name}_connectivities_clusters1_spatialmap.svg',
    )

In [None]:
# Spatial expression maps (layer 'normcounts') for the first marker panel
for adata in slides:
    # obs is indexed by spot id, so read the sample name by position with .iloc
    sample = adata.obs['sample'].iloc[0]
    sc.pl.spatial(
        adata,
        color=['SCX', 'MKX', 'TNMD', 'EGR1',
               'BGN', 'DCN', 'POSTN', 'FMOD', 'KERA', 'LUM'],
        layer='normcounts',
        size=1.25,
        vmin=0,
        vmax="p99",
        frameon=False,
        cmap="plasma",
        save=f'{sample}_spatial_markersANDclusters1.png',
    )

In [None]:
# Spatial expression maps (layer 'normcounts') for the collagen gene panel
for adata in slides:
    # obs is indexed by spot id, so read the sample name by position with .iloc
    sample = adata.obs['sample'].iloc[0]
    sc.pl.spatial(
        adata,
        color=['COL1A1', 'COL1A2', 'COL3A1', 'COL6A1', 'COL6A6',
               'COL22A1', 'COL4A1', 'COL4A2', 'COL11A1', 'COL12A1'],
        layer='normcounts',
        size=1.25,
        vmin=0,
        vmax="p99",
        frameon=False,
        cmap="plasma",
        save=f'{sample}_spatial_markersANDclusters2.png',
    )

In [None]:
# Spatial expression maps (layer 'normcounts') for the third marker panel plus the clusters
for adata in slides:
    # obs is indexed by spot id, so read the sample name by position with .iloc
    sample = adata.obs['sample'].iloc[0]
    sc.pl.spatial(
        adata,
        color=['ABI3BP', 'GAS2', 'NEGR1', 'THBS4', 'PRG4', 'CREB5',
               'NOTCH3', 'ACTA2', 'clusters'],
        layer='normcounts',
        size=1.25,
        vmin=0,
        vmax="p99",
        frameon=False,
        cmap="plasma",
        save=f'{sample}_spatial_markersANDclusters3.png',
    )

In [None]:
# Write each processed slide to RESULTS_FOLDERNAME as filtered_<sample>.h5ad
for adata in slides:
    # obs is indexed by spot id, so read the sample name by position with .iloc
    samplename = adata.obs['sample'].iloc[0]
    adata.write(os.path.join(RESULTS_FOLDERNAME, f'filtered_{samplename}.h5ad'))

In [None]:
# Index genes by Ensembl ID and put the saved 'counts' layer back into X.
# NOTE: this mutates the objects in `slides` in place and is not idempotent — running
# the cell a second time would copy the Ensembl IDs into the 'Gene' column.
for adata in slides:
    adata.var['Gene'] = adata.var.index
    adata.var.index = adata.var['ENSEMBL']
    adata.X = adata.layers['counts'].copy()
    print(adata.X[1:10, 1:10]) 

In [None]:
import anndata as ad

# Combine anndata objects together
# - join='outer': keep the union of genes across slides
# - label/keys: record the entry of sample_name for each slide's spots in obs['sample']
# - index_unique=None: obs_names are left as they are (read_and_qc already prefixed
#   them with the sample name)
adata = ad.concat(
    slides,
    label="sample",
    uns_merge="unique",
    join='outer',
    keys=sample_name,
    index_unique=None
)
adata

In [None]:
# Add gene names to the concatenated object, whose var index holds the Ensembl IDs.
adata.var['ensembl_gene_id'] = adata.var.index
# Query BioMart for Ensembl ID -> gene name (presumably needs network access — confirm)
annot = sc.queries.biomart_annotations(
    "hsapiens",
    ["ensembl_gene_id", "external_gene_name"],
).set_index("ensembl_gene_id")

# Add the annotation columns to var; both frames are indexed by Ensembl ID
adata.var[annot.columns] = annot

adata.var.rename(columns={"external_gene_name": "Gene"}, inplace=True)
# Genes without a name in the annotation fall back to their Ensembl ID
adata.var['Gene'] = adata.var['Gene'].fillna(adata.var['ensembl_gene_id'])
adata.var = adata.var.drop(columns='ensembl_gene_id')
adata.var

In [None]:
adata.write(os.path.join(RESULTS_FOLDERNAME, 'concatenated_adata.h5ad'))

# To load:

In [None]:
# Reload the per-sample objects from RESULTS_FOLDERNAME.
# NOTE: from here on `slides` is a dict {slide name: AnnData}, not a list.
slides = {}
prefix, suffix = 'filtered_', '.h5ad'
# sorted(): os.listdir() order is arbitrary; sorting makes the dict order reproducible
for filename in sorted(os.listdir(RESULTS_FOLDERNAME)):
    if not (filename.startswith(prefix) and filename.endswith(suffix)):
        continue
    file_path = os.path.join(RESULTS_FOLDERNAME, filename)
    # Slide name is the part between prefix and extension ("filtered_<slide_name>.h5ad")
    slide_name = filename[len(prefix):-len(suffix)]
    try:
        # Only the read is guarded, so unrelated errors are not swallowed
        slides[slide_name] = sc.read_h5ad(file_path)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the other slides
        print(f"Error processing {filename}: {e}")
slides

#### Saving for 10X Loupe Browser Exploration

In [None]:
# save maps for each sample separately
clusterings = ['clusters']
# `slides` is a dict here, so iterate over items to get both the sample name and the
# AnnData object (iterating the dict directly would only yield the keys)
for name, adata in slides.items():
    for clusternames in clusterings:
        # .copy(): edit the index of an independent frame, never of adata.obs itself
        s1 = adata.obs[[clusternames]].copy()
        # obs names look like '<sample>_<barcode>'; keep only the barcode part
        s1.index = s1.index.str.rsplit('_', n=1).str[-1]
        s1.index.name = 'Barcode'
        s1.to_csv(os.path.join(RESULTS_FOLDERNAME, f'{name}_{clusternames}_clusters.csv'))

In [None]:
slides['WSSS_THYst9383362'].obs

# Reverse Cell2Location

In [None]:
import glob

# Find all files ending with "_TendonAnnotations.csv" in the results folder
annotation_suffix = '_TendonAnnotations.csv'
file_pattern = '*' + annotation_suffix
file_list = glob.glob(os.path.join(RESULTS_FOLDERNAME, file_pattern))

annotations_dict = {}
for file_path in sorted(file_list):
    # Sample name = file name without the suffix. A dedicated variable is used so the
    # `sample_name` list built when the slides were read is not overwritten.
    annotated_sample = os.path.basename(file_path)[:-len(annotation_suffix)]
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    # Rebuild the '<sample>_<barcode>' ids that the AnnData objects use as obs names
    df['spot_id'] = annotated_sample + '_' + df['Barcode'].astype('str')
    # Add the DataFrame to the dictionary with sample name as the key
    annotations_dict[annotated_sample] = df

annotations_dict

In [None]:
annotations_dict['WSSS_THYst9383362'].tendon.value_counts()

In [None]:
for name, adata in slides.items():
    if name not in annotations_dict:
        continue
    annotations_df = annotations_dict[name]
    # Look each label up by spot_id so it lands on its own spot regardless of the row
    # order (or row count) of the CSV. Spots without an annotation get NaN, and
    # annotated barcodes that are absent from the AnnData object are ignored.
    tendon_by_spot = (
        annotations_df
        .drop_duplicates(subset='spot_id')  # reindex needs unique labels
        .set_index('spot_id')['tendon']
    )
    adata.obs['Tendon'] = tendon_by_spot.reindex(adata.obs_names).to_numpy()

In [None]:
slides['WSSS_THYst9383362'].obs.Tendon.value_counts()

In [None]:
# Spatial maps of the marker genes (layer 'normcounts') together with the 'Tendon'
# annotation for slide WSSS_THYst9383362
sc.pl.spatial(slides['WSSS_THYst9383362'], 
                  color=['SCX', 'MKX', 'TNMD', 'KERA', 'FMOD', 'THBS2', 'THBS4', 'EGR1', 
                  'ABI3BP', 'GAS2', 'COL3A1', 'COL1A1', 'COL6A6', 'FGF14', 'SOX9', 'Tendon'],
                  layer='normcounts',
                  size=1.25,
                  vmin=0,
                  vmax="p99",
                  frameon=False,
                  cmap="plasma",
                  save = f'WSSS_THYst9383362_tendonmarkers.png'
                  )

In [None]:
# Spatial map of the 'Tendon' annotation alone for slide WSSS_THYst9383362
# NOTE(review): layer/vmin/vmax/cmap presumably have no effect on a label column — confirm
sc.pl.spatial(slides['WSSS_THYst9383362'], 
                  color=['Tendon'],
                  layer='normcounts',
                  size=1.25,
                  vmin=0,
                  vmax="p99",
                  frameon=False,
                  cmap="plasma",
                  save = f'WSSS_THYst9383362_tendonclusters.png'
                  )

In [None]:
slides.keys()

In [None]:
adata = slides['WSSS_THYst9383362'].copy()

In [None]:
# Keep only the spots annotated as patellar or quadriceps tendon. The quadriceps label is
# spelled 'quadriceps tendon', the same spelling used when the training set is built.
# .copy() makes the result a standalone object instead of a view.
adata = adata[adata.obs['Tendon'].isin(['patellar tendon', 'quadriceps tendon']), :].copy()

In [None]:
adata

In [None]:
adata.write(os.path.join(RESULTS_FOLDERNAME, 'WSSS_THYst9383362_quadTpatTnew.h5ad'))

# Preparing the training set

In [None]:
# Start again from the full annotated slide and label every spot that has no tendon
# annotation as 'Not Tendon'.
adata = slides['WSSS_THYst9383362'].copy()
# Register the new category first, then use it to fill the missing values
adata.obs['Tendon'] = adata.obs['Tendon'].astype('category').cat.add_categories('Not Tendon')
adata.obs['Tendon'] = adata.obs['Tendon'].fillna('Not Tendon')
adata.obs['Tendon'].value_counts()

In [None]:
adata.var

In [None]:
adata.var.index = adata.var['ENSEMBL']

In [None]:
adata.X = adata.layers['counts'].copy()
np.max(adata.X)

In [None]:
# Non-tendon spots. .copy() makes `test` an independent AnnData rather than a view of
# `adata`, because an obs column is added to it further down.
test = adata[adata.obs['Tendon'] == 'Not Tendon', :].copy()
test.obs['Tendon'].value_counts()

In [None]:
# Tendon spots (patellar or quadriceps). .copy() makes `train` an independent AnnData
# rather than a view of `adata`, because an obs column is added to it further down.
train = adata[adata.obs['Tendon'].isin(['patellar tendon', 'quadriceps tendon']), :].copy()
train.obs['Tendon'].value_counts()

In [None]:
# number of tendon spots to create equal number of non-tendon pseudodonors
N = train.n_obs
# Set the seed for reproducibility
np.random.seed(42)
# Select N random observations as donors
# (replace=False: requires `test` to contain at least N spots)
donor_indices = np.random.choice(test.obs_names, size=N, replace=False)
# Create a new column 'Pseudodonor'; every spot starts with the placeholder id 1000,
# which remains the id of all spots that are not selected as donors.
# NOTE(review): ids 0..N-1 are used for `train` and N..2N-1 for the selected donors, so
# the placeholder 1000 coincides with a real id whenever N > 500 — confirm this is intended.
test.obs['Pseudodonor'] = 1000
# Assign numbers starting from N to the selected donors
donor_numbers = range(N, N + len(donor_indices))
test.obs.loc[donor_indices, 'Pseudodonor'] = donor_numbers

In [None]:
train.obs['Pseudodonor'] = range(len(train.obs))

In [None]:
import anndata as ad
combined = ad.concat([train, test])
combined

In [None]:
combined.obs['Pseudodonor'].value_counts()

In [None]:
NUM_OF_CELL_PER_DONOR = 0 # to filter out donors with less than this amount of cells
import random

def aggregate_and_filter(
    adata,
    donor_key="Pseudodonor",
    cell_identity_key="Tendon",
    replicates_per_patient= 25,
    seed=None,
):
    """
    Build pseudobulk profiles from the raw counts.

    For every category of ``cell_identity_key`` and every donor in it, the donor's
    observations are shuffled, split into up to ``replicates_per_patient`` groups, and
    the ``layers['counts']`` values of each group are summed into one pseudo-replicate.

    :param adata: AnnData with raw counts in ``layers['counts']``;
        ``obs[cell_identity_key]`` must be categorical
    :param donor_key: obs column identifying the donor
    :param cell_identity_key: obs column identifying the cell identity
    :param replicates_per_patient: maximum number of pseudo-replicates per donor; a donor
        with fewer observations yields one pseudo-replicate per observation
    :param seed: optional seed for the shuffle; ``None`` uses the global ``random`` state
    :return: dict mapping each category to an AnnData with one row per pseudo-replicate
        (obs names '<donor>_<replicate>', obs columns 'barcode', ``cell_identity_key``
        and 'replicate'); categories that yield no pseudo-replicate are left out

    Donors with at most NUM_OF_CELL_PER_DONOR (module-level constant) observations
    in a category are skipped.
    """
    # A private generator keeps the shuffle reproducible when a seed is given
    shuffler = random.Random(seed) if seed is not None else random
    categories = adata.obs[cell_identity_key].cat.categories

    pbs_cell_type_dict = {}
    for type_idx, cell_type in enumerate(categories):
        print(
            f'Processing {cell_type} ({type_idx+1} out of {len(categories)})...'
        )
        # subset adata to the given cell identity
        adata_cell_pop = adata[adata.obs[cell_identity_key] == cell_type].copy()

        # check which donors to keep according to NUM_OF_CELL_PER_DONOR;
        # observed=True: only count donors actually present in this subset, so unused
        # categories of a categorical donor column are not reported as "dropped"
        size_by_donor = adata_cell_pop.obs.groupby(donor_key, observed=True).size()
        donors_to_drop = [
            donor
            for donor in size_by_donor.index
            if size_by_donor[donor] <= NUM_OF_CELL_PER_DONOR
        ]
        if len(donors_to_drop) > 0:
            print("Dropping the following samples:")
            print(donors_to_drop)

        donors = adata_cell_pop.obs[donor_key].unique()
        pbs = []
        for donor_idx, sample in enumerate(donors):
            print(f"\tProcessing donor {donor_idx+1} out of {len(donors)}...", end="\r")
            if sample in donors_to_drop:
                continue
            samp_cell_subset = adata_cell_pop[adata_cell_pop.obs[donor_key] == sample]

            # create pseudoreplicates: shuffle the row positions and split them into groups
            positions = list(range(samp_cell_subset.n_obs))
            shuffler.shuffle(positions)
            groups = np.array_split(np.array(positions), replicates_per_patient)

            for k, pseudo_rep in enumerate(groups):
                # array_split pads with empty groups when the donor has fewer
                # observations than replicates_per_patient; those carry no data
                if len(pseudo_rep) == 0:
                    continue

                # Sum the raw counts of THIS group only (not of the whole donor).
                # np.asarray(...).reshape(1, -1) gives a plain (1, n_vars) array for
                # both sparse and dense layers.
                rep_counts = samp_cell_subset[pseudo_rep].layers['counts']
                rep_adata = sc.AnnData(X = np.asarray(rep_counts.sum(axis = 0)).reshape(1, -1),
                                       var = samp_cell_subset.var[[]])

                rep_adata.obs_names = [str(sample) + '_' + str(k)]
                # 'barcode' is the first obs name of the donor
                rep_adata.obs['barcode'] = [samp_cell_subset.obs.index[0]]
                rep_adata.obs[cell_identity_key] = samp_cell_subset.obs[cell_identity_key].iloc[0]
                rep_adata.obs['replicate'] = k

                pbs.append(rep_adata)
        print("\n")

        # A category without observations (or with every donor dropped) has nothing
        # to concatenate
        if not pbs:
            print(f"No pseudo-replicates for {cell_type}; skipping.")
            continue
        pbs_cell_type_dict[cell_type] = sc.concat(pbs)

    return pbs_cell_type_dict

In [None]:
combined.obs['Tendon'] = combined.obs['Tendon'].astype("category")
combined.obs['Pseudodonor'] = combined.obs['Pseudodonor'].astype("category")
combined.obs.dtypes

In [None]:
pb = aggregate_and_filter(combined)
pb

In [None]:
comb = ad.concat(list(pb.values()), index_unique=None, join='outer')
comb

In [None]:
comb.obs['pseudodonor_replicate'] = comb.obs.index
comb.obs

In [None]:
comb.obs.dtypes

In [None]:
# New obs index of the form '<barcode>-<current obs name>', also kept as column 'index'
comb.obs['index'] = comb.obs['barcode'].astype("str") + "-" + comb.obs.index
comb.obs.index = comb.obs['index']
comb.obs

In [None]:
comb.var

In [None]:
# Export the pseudobulk matrix as CSV, transposed so that genes are rows.
# NOTE(review): pd.DataFrame(comb.X) assumes comb.X is a dense array — confirm.
counts = pd.DataFrame(comb.X, columns = comb.var_names, index=comb.obs['index'])
counts.T.to_csv(os.path.join(RESULTS_FOLDERNAME, f'training_counts_matrix_full.csv'))

In [None]:
from scipy.sparse import csr_matrix

# Convert X to a CSR sparse matrix, then write the training set to RESULTS_FOLDERNAME
comb.X = csr_matrix(comb.X)
comb.write(os.path.join(RESULTS_FOLDERNAME, 'trainingset.h5ad'))