### Load needed libraries

In [None]:
import os
import pandas as pd
import scanpy as sc
from datetime import datetime
import warnings
from helper_functions import *

# Set scanpy's global n_jobs setting to 32.
sc.settings.n_jobs = 32
# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Directory the notebook was started from; every input/ and output/ path
# in the cells below is built relative to it.
pwd = os.getcwd()

### Build reference singleome and multiome adata from mtx files

In [None]:
# --- Reference, singleomeCR6 -------------------------------------------------
region, dataset, technology = "MTG", "reference", "singleomeCR6"

# Library-level metric columns handed to build_anndata as `to_correct`
# (what the helper does with them is defined in helper_functions).
to_correct = [
    "estimated_number_of_cells",
    "mean_reads_per_cell",
    "median_genes_per_cell",
    "number_of_reads",
    "sequencing_saturation",
    "q30_bases_in_Barcode",
    "q30.bases_in_rna_read",
    "q30_bases_in_umi",
    "reads_mapped_to_genome",
    "reads_mapped_confidently_to_genome",
    "reads_mapped_confidently_to_intergenic_regions",
    "reads_mapped_confidently_to_intronic_regions",
    "reads_mapped_confidently_to_exonic_regions",
    "reads_mapped_confidently_to_transcriptome",
    "reads_mapped_antisense_to_gene",
    "fraction_reads_in_cells",
    "total_genes_detected",
    "median_umi_counts_per_cell",
]

# Input: output/<region>_<dataset>_<technology>_donor_mtx
# Output: output/<region>_<dataset>_<technology>.<today, YYYY-MM-DD>.h5ad
build_anndata(
    datadir=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}_donor_mtx"),
    outfile=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{datetime.now().date()}.h5ad"),
    to_correct=to_correct,
    metadata_file=os.path.join(pwd, "input", "metadata", "Great_ApesMetadata_version101_20220321.csv"),
    metadata_name="Great Apes Metadata",
    adata_key="sample_id",
    external_key="sample_id",
    cell_ids="sample_id",
    remove_arid=True,
)

# --- Reference, multiome -----------------------------------------------------
# No external metadata table is joined for this object.
region, dataset, technology = "MTG", "reference", "multiome"

build_anndata(
    datadir=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}_donor_mtx"),
    outfile=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{datetime.now().date()}.h5ad"),
    metadata_file=None,
    metadata_name=None,
)

### Build AD singleome and multiome adata from mtx files

In [None]:
# --- AD, singleomeCR6 --------------------------------------------------------
region, dataset, technology = "MTG", "AD", "singleomeCR6"

# Library-level metric columns handed to build_anndata as `to_correct`
# (what the helper does with them is defined in helper_functions).
to_correct = [
    "estimated_number_of_cells",
    "mean_reads_per_cell",
    "median_genes_per_cell",
    "number_of_reads",
    "sequencing_saturation",
    "q30_bases_in_Barcode",
    "q30.bases_in_rna_read",
    "q30_bases_in_umi",
    "reads_mapped_to_genome",
    "reads_mapped_confidently_to_genome",
    "reads_mapped_confidently_to_intergenic_regions",
    "reads_mapped_confidently_to_intronic_regions",
    "reads_mapped_confidently_to_exonic_regions",
    "reads_mapped_confidently_to_transcriptome",
    "reads_mapped_antisense_to_gene",
    "fraction_reads_in_cells",
    "total_genes_detected",
    "median_umi_counts_per_cell",
]

# Donor metadata is joined from the UW clinical table: obs column
# "external_donor_name" against the table's "uwa" column.
build_anndata(
    datadir=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}_donor_mtx"),
    outfile=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{datetime.now().date()}.h5ad"),
    to_correct=to_correct,
    metadata_file=os.path.join(pwd, "input", "metadata", "NeuropathMetadata_version103_20220407.KT.sav"),
    metadata_name="UW Clinical Metadata",
    adata_key="external_donor_name",
    external_key="uwa",
    cell_ids="sample_id",
    remove_arid=True,
)

# --- AD, multiome ------------------------------------------------------------
region, dataset, technology = "MTG", "AD", "multiome"
# Nothing to correct for the multiome object.
to_correct = []

build_anndata(
    datadir=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}_donor_mtx"),
    outfile=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{datetime.now().date()}.h5ad"),
    to_correct=to_correct,
    metadata_file=os.path.join(pwd, "input", "metadata", "NeuropathMetadata_version103_20220407.KT.sav"),
    metadata_name="UW Clinical Metadata",
    adata_key="external_donor_name",
    external_key="uwa",
    cell_ids="sample_id",
    remove_arid=True,
)

## Merge multiome and singleome datasets

In [None]:
# Load the four previously built objects. Each file name carries the date
# stamp of the run that produced it, so the stamps are pinned here per file:
# output/<region>_<dataset>_<technology>.<date>.h5ad

# Reference, singleomeCR6
region, dataset, technology, date = "MTG", "reference", "singleomeCR6", "2022-04-08"
adata_ref_singleome = sc.read_h5ad(
    filename=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{date}.h5ad")
)

# Reference, multiome
region, dataset, technology, date = "MTG", "reference", "multiome", "2022-09-27"
adata_ref_multiome = sc.read_h5ad(
    filename=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{date}.h5ad")
)

# AD, singleomeCR6
region, dataset, technology, date = "MTG", "AD", "singleomeCR6", "2022-04-07"
adata_singleome = sc.read_h5ad(
    filename=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{date}.h5ad")
)

# AD, multiome
region, dataset, technology, date = "MTG", "AD", "multiome", "2022-04-07"
adata_multiome = sc.read_h5ad(
    filename=os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{date}.h5ad")
)

In [None]:
# Harmonise the obs column names/values of the four objects so they can be
# concatenated. NOTE: this cell mutates the objects in place; running it a
# second time would divide the metric columns again and the drops would fail.

# Columns removed from the AD objects.
to_drop_singleome = ["sex_y"]
to_drop_multiome = ["sex_y", "file"]

# Metric columns divided by 100 below (presumably percentages being rescaled
# to fractions -- confirm). Both singleome objects use the same set.
to_divide_reference_singleome = [
    "sequencing_saturation",
    "q30_bases_in_Barcode",
    "q30.bases_in_rna_read",
    "q30_bases_in_umi",
    "reads_mapped_to_genome",
    "reads_mapped_confidently_to_genome",
    "reads_mapped_confidently_to_intergenic_regions",
    "reads_mapped_confidently_to_intronic_regions",
    "reads_mapped_confidently_to_exonic_regions",
    "reads_mapped_confidently_to_transcriptome",
    "reads_mapped_antisense_to_gene",
    "fraction_reads_in_cells",
]
to_divide_singleome = list(to_divide_reference_singleome)

# Singleome metric names -> the names used in the merged objects; shared by
# the reference and AD singleome rename tables.
gex_column_renames = {
    "estimated_number_of_cells": "Estimated_number_of_cells",
    "fraction_reads_in_cells": "GEX_Fraction_of_transcriptomic_reads_in_cells",
    "mean_reads_per_cell": "GEX_Mean_raw_reads_per_cell",
    "median_genes_per_cell": "GEX_Median_genes_per_cell",
    "median_umi_counts_per_cell": "GEX_Median_UMI_counts_per_cell",
    "q30.bases_in_rna_read": "GEX_Q30_bases_in_read_2",
    "q30_bases_in_Barcode": "GEX_Q30_bases_in_barcode",
    "q30_bases_in_umi": "GEX_Q30_bases_in_UMI",
    "reads_mapped_antisense_to_gene": "GEX_Reads_mapped_antisense_to_gene",
    "reads_mapped_confidently_to_exonic_regions": "GEX_Reads_mapped_confidently_to_exonic_regions",
    "reads_mapped_confidently_to_genome": "GEX_Reads_mapped_confidently_to_genome",
    "reads_mapped_confidently_to_intergenic_regions": "GEX_Reads_mapped_confidently_to_intergenic_regions",
    "reads_mapped_confidently_to_intronic_regions": "GEX_Reads_mapped_confidently_to_intronic_regions",
    "reads_mapped_confidently_to_transcriptome": "GEX_Reads_mapped_confidently_to_transcriptome",
    "reads_mapped_to_genome": "GEX_Reads_mapped_to_genome",
    "total_genes_detected": "GEX_Total_genes_detected",
}

to_rename_reference_singleome = {"age": "age_at_death", **gex_column_renames}
to_rename_reference_multiome = {
    "age": "age_at_death",
    "Pipeline_version": "alignment",
}
to_rename_singleome = {"sex_x": "sex", **gex_column_renames}
to_rename_multiome = {
    "Pipeline_version": "alignment",
    "sex_x": "sex",
}

# Adding metadata
adata_ref_singleome.obs["Genome"] = "GRCh38"
adata_singleome.obs["Genome"] = "GRCh38"
# The reference objects get a "gender" column duplicated from "sex".
for ref_obs in (adata_ref_singleome.obs, adata_ref_multiome.obs):
    ref_obs["gender"] = ref_obs["sex"].copy()

# Dropping metadata
adata_singleome.obs.drop(columns=to_drop_singleome, inplace=True)
adata_multiome.obs.drop(columns=to_drop_multiome, inplace=True)

# Editing metadata
for obs_frame, scaled_columns in (
    (adata_ref_singleome.obs, to_divide_reference_singleome),
    (adata_singleome.obs, to_divide_singleome),
):
    for column in scaled_columns:
        obs_frame[column] = obs_frame[column] / 100

# Renaming metadata
for obs_frame, column_map in (
    (adata_ref_singleome.obs, to_rename_reference_singleome),
    (adata_ref_multiome.obs, to_rename_reference_multiome),
    (adata_singleome.obs, to_rename_singleome),
    (adata_multiome.obs, to_rename_multiome),
):
    obs_frame.rename(columns=column_map, inplace=True)

In [None]:
# Final reference metadata cleanup (same three steps for both reference objects)
for adata_ref in (adata_ref_singleome, adata_ref_multiome):
    # Per-cell fraction of UMIs that come from genes whose name starts with "MT-".
    mito_gene_mask = adata_ref.var_names.str.startswith("MT-")
    adata_ref.obs["fraction_mito"] = (
        adata_ref[:, mito_gene_mask].layers["UMIs"].sum(axis=1)
        / adata_ref.layers["UMIs"].sum(axis=1)
    )
    # Ages are stored as text with a " yrs" suffix; strip it (literal match,
    # not a regex) and convert to float.
    adata_ref.obs["age_at_death"] = (
        adata_ref.obs["age_at_death"].str.replace(" yrs", "", regex=False).astype("float")
    )
    # Flag marking reference cells (the AD/query object is given 0).
    adata_ref.obs["reference_cell"] = 1

# Combine reference object
# The singleome reference is merged with the multiome reference prepared
# above. The previous code passed `adata_ref_smartseq`, a name that is never
# defined in this notebook, so the cell raised NameError.
adata_ref_combined = adata_ref_singleome.concatenate(adata_ref_multiome, index_unique=None)
# The combined object takes a copy of the singleome reference's `.uns`.
adata_ref_combined.uns = adata_ref_singleome.uns.copy()

In [None]:
# Combine AD object
adata_query = adata_singleome.concatenate(adata_multiome, index_unique=None)
# The combined object takes a copy of the singleome object's `.uns`.
adata_query.uns = adata_singleome.uns.copy()
# The per-technology objects have been merged; drop their names.
del adata_singleome, adata_multiome

# Final query metadata cleanup
# Per-cell fraction of UMIs that come from genes whose name starts with "MT-".
mito_gene_mask = adata_query.var_names.str.startswith("MT-")
mito_umis = adata_query[:, mito_gene_mask].layers["UMIs"].sum(axis=1)
total_umis = adata_query.layers["UMIs"].sum(axis=1)
adata_query.obs["fraction_mito"] = mito_umis / total_umis
# Flag marking query (non-reference) cells.
adata_query.obs["reference_cell"] = 0

In [None]:
# Write each object to output/<region>_<dataset>_<technology>.<today>.h5ad,
# where <today> is the current date in YYYY-MM-DD form.

# Reference, singleomeCR6 (with the harmonised metadata)
region, dataset, technology = "MTG", "reference", "singleomeCR6"
adata_ref_singleome.write(
    os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{datetime.now().date()}.h5ad")
)

# Reference, multiome (with the harmonised metadata)
region, dataset, technology = "MTG", "reference", "multiome"
adata_ref_multiome.write(
    os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{datetime.now().date()}.h5ad")
)

# Reference, combined
region, dataset, technology = "MTG", "reference", "combined"
adata_ref_combined.write(
    os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{datetime.now().date()}.h5ad")
)

# AD, combined
region, dataset, technology = "MTG", "AD", "combined"
adata_query.write(
    os.path.join(pwd, "output", f"{region}_{dataset}_{technology}.{datetime.now().date()}.h5ad")
)