In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
from muon import prot as pt

from matplotlib import colors
%matplotlib inline

import muon as mu
from mudata import MuData
import scanpy.external as sce

import seaborn as sns
import matplotlib.pyplot as plt
import scipy

from sklearn.metrics import silhouette_score

In [None]:
# Load the filtered CITE-seq MuData object; the cells below use its "rna" and "prot" modalities.
mdata = mu.read("./citeseq_mdata_allsamples_filtered.h5mu")

In [None]:
# Flag ribosomal protein genes (names starting with RPS/RPL) and compute
# per-cell QC metrics for that gene set in place.
ribo_prefixes = ("RPS", "RPL")
mdata["rna"].var['ribo'] = mdata["rna"].var_names.str.startswith(ribo_prefixes)
sc.pp.calculate_qc_metrics(
    mdata["rna"],
    qc_vars=["ribo"],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Plot the 20 most highly expressed genes of the RNA modality.
sc.pl.highest_expr_genes(mdata["rna"], n_top=20)

In [None]:
# Per-sample QC violins for the RNA modality.
# NOTE(review): 'pct_counts_mt' is not computed in this notebook, so it is
# presumably already stored in the loaded file — confirm.
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo']
sc.pl.violin(
    mdata["rna"],
    qc_metrics,
    groupby='sample_id',
    jitter=0.4,
    rotation=45,
    size=0,
)

In [None]:
# Drop mitochondrial (MT-) and ribosomal (RPL*/RPS*) genes from the RNA modality.
excluded_prefixes = ("MT-", "RPL", "RPS")

# temporary boolean .var column marking the genes to discard
mdata["rna"].var["remove"] = mdata["rna"].var_names.str.startswith(excluded_prefixes)

# keep the genes whose flag is False, then remove the helper column again
mu.pp.filter_var(data=mdata["rna"], var="remove", func=np.logical_not)
del mdata["rna"].var["remove"]

In [None]:
# Load per-cell doublet calls; the first CSV column is used as the cell index.
doublet_info_df = pd.read_csv('citeseq_doublet_info.csv', index_col=0)

# Select the call column explicitly: assigning a whole DataFrame to a single
# .obs column only works when the CSV happens to hold exactly one data column.
if "doublet_info" in doublet_info_df.columns:
    doublet_calls = doublet_info_df["doublet_info"]
else:
    doublet_calls = doublet_info_df.iloc[:, 0]

# Align on the cell index so the row order of the CSV does not matter.
mdata.obs["doublet_info"] = doublet_calls.reindex(mdata.obs.index)

# Surface cells without a call instead of letting them turn the total into NaN.
n_missing = int(mdata.obs["doublet_info"].isna().sum())
if n_missing:
    print(f"{n_missing} cells have no entry in citeseq_doublet_info.csv")

# number of cells flagged as doublets
int(mdata.obs["doublet_info"].eq(True).sum())


In [None]:
# ensure the column is boolean (nullable, so missing calls show up as <NA>)
mdata.obs["doublet_info"] = mdata.obs["doublet_info"].astype("boolean")

# A missing call cannot be turned into a keep/drop decision; fail with a clear
# message rather than passing <NA> into the indexer.
n_uncalled = int(mdata.obs["doublet_info"].isna().sum())
if n_uncalled:
    raise ValueError(
        f"{n_uncalled} cells have no doublet call; resolve them before filtering."
    )

# Index with a plain NumPy boolean mask: a nullable-boolean Series is backed by
# a pandas BooleanArray rather than an ndarray, which is a less portable indexer.
singlet_mask = ~mdata.obs["doublet_info"].to_numpy(dtype=bool)
mdata = mdata[singlet_mask].copy()

# store back as a category for clarity
mdata.obs["doublet_info"] = mdata.obs["doublet_info"].astype("category")

In [None]:
# Normalise every cell to a total of 10,000 counts, then apply log1p.
sc.pp.normalize_total(mdata["rna"], target_sum=1e4)
sc.pp.log1p(mdata["rna"])

In [None]:
# Flag highly variable genes with mean/dispersion cut-offs; the flag is read
# back from .var.highly_variable when the genes are subset further down.
sc.pp.highly_variable_genes(mdata["rna"], min_mean=0.0125, max_mean=3, min_disp=0.5)

In [None]:
# Visualise the highly-variable-gene selection computed in the previous cell.
sc.pl.highly_variable_genes(mdata["rna"])

In [None]:
# Snapshot the normalised, log-transformed RNA data in .raw before the gene
# subset and scaling below; later dot plots request it with use_raw=True.
mdata["rna"].raw = mdata["rna"]

In [None]:
# Restrict the RNA modality to highly variable genes.
# `highly_variable` is already a boolean .var column, so muon can filter on it
# directly: no temporary copy of the column and no identity function are needed.
mu.pp.filter_var(mdata["rna"], "highly_variable")

In [None]:
# Scale the RNA matrix per gene (scanpy defaults) and clip values at max_value=10.
sc.pp.scale(mdata["rna"], max_value=10)

In [None]:
# Dimensionality reduction for the RNA modality: PCA, a 10-nearest-neighbour
# graph, and a UMAP embedding with a fixed seed.
sc.tl.pca(mdata["rna"], svd_solver='arpack')
sc.pp.neighbors(mdata["rna"], n_neighbors=10)
sc.tl.umap(mdata["rna"], spread=1., min_dist=.5, random_state=11)

In [None]:
# RNA UMAP coloured by the 'tissue' annotation, with labels drawn on the plot.
sc.pl.umap(mdata["rna"], color="tissue", legend_loc="on data")

In [None]:
# Report the value range of the protein matrix before outlier filtering.
prot_values = mdata["prot"].X
min_val, max_val = np.min(prot_values), np.max(prot_values)

print(f"Minimum value: {min_val}")
print(f"Maximum value: {max_val}")

In [None]:
# Per-feature outlier filter on the protein modality: a cell is kept only if,
# for every feature, its value lies inside that feature's percentile window.
# np.percentile takes percentages, so the window below is the 0.01th-99.99th
# percentile (not 0.1 %/99.9 %).
LOWER_PCT, UPPER_PCT = 0.01, 99.99

prot_X = mdata["prot"].X
if hasattr(prot_X, "toarray"):
    # densify sparse input; percentiles are computed on a dense array
    prot_X = prot_X.toarray()
prot_X = np.asarray(prot_X)

# one lower and one upper bound per feature, computed in a single vectorised call
lower_bounds, upper_bounds = np.percentile(prot_X, [LOWER_PCT, UPPER_PCT], axis=0)

# Inclusive comparison: with strict inequalities every cell tied with a bound
# is discarded, e.g. all cells sitting at a feature's floor value.
mask = ((prot_X >= lower_bounds) & (prot_X <= upper_bounds)).all(axis=1)

# subset the data using the mask
adata_prot = mdata["prot"][mask, :].copy()


In [None]:
# Independent copy of the RNA modality, used to rebuild the MuData container in the next cell.
adata_rna=mdata["rna"].copy()

In [None]:
# Rebuild the container from the RNA copy and the percentile-filtered protein
# data, then display its summary. The two modalities may hold different cells
# at this point; they are intersected further down.
mdata = MuData({"rna": adata_rna, "prot": adata_prot})
mdata

In [None]:
# Distribution of the per-cell total protein signal (row sums of the protein matrix).
adata_df = pd.DataFrame(mdata["prot"].X, columns=mdata["prot"].var_names)
adata_df['total_counts'] = adata_df.sum(axis=1)

# expose the positional row index as a 'cell' column for plotting
adata_long = adata_df.rename_axis('cell').reset_index()

# single violin of the totals, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(y='total_counts', data=adata_long, ax=ax)
ax.set_ylabel('Total Counts')
plt.show()

In [None]:
# One violin per protein feature: reshape the cells x features matrix into
# long format (one row per cell/feature pair) and plot expression by feature.
adata_df = pd.DataFrame(mdata["prot"].X, columns=mdata["prot"].var_names)
adata_long = adata_df.melt(var_name='feature', value_name='expression')

fig, ax = plt.subplots(figsize=(15, 6))
sns.violinplot(x='feature', y='expression', data=adata_long, ax=ax)
# feature names are long, so print them vertically
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Display the summary of the protein modality.
mdata["prot"]

In [None]:
# Re-synchronise the container-level annotations with the modalities (MuData.update).
mdata.update()

In [None]:
# Keep only the cells shared by all modalities; the protein modality was
# subset by the percentile filter above while the RNA modality was not.
mu.pp.intersect_obs(mdata)

In [None]:
# Report the value range of the protein matrix again, now after outlier
# filtering and intersecting the cells.
prot_values = mdata["prot"].X
min_val, max_val = np.min(prot_values), np.max(prot_values)

print(f"Minimum value: {min_val}")
print(f"Maximum value: {max_val}")

In [None]:
def plot_feature_distribution(adata):
    """Plot a histogram with a KDE overlay for every feature of ``adata``.

    Panels are laid out on a grid four columns wide; grid cells beyond the
    last feature are hidden.

    Parameters
    ----------
    adata
        AnnData-like object. One panel is drawn per entry of
        ``adata.var_names`` from the matching column of ``adata.X``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    n_features = adata.n_vars
    if n_features == 0:
        # nothing to draw; a zero-row subplot grid cannot be created
        return

    ncols = 4
    nrows = (n_features + ncols - 1) // ncols

    # squeeze=False keeps `axes` two-dimensional even when a single row is
    # enough (<= 4 features); otherwise 2-D indexing of the grid would fail.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                             figsize=(ncols * 5, nrows * 4), squeeze=False)
    panels = axes.ravel()  # row-major order, matching the grid layout

    for ax, feature in zip(panels, adata.var_names):
        values = adata[:, feature].X
        if hasattr(values, "toarray"):
            values = values.toarray()
        # flatten the (n_obs, 1) column so seaborn draws one plain series
        # instead of treating it as a wide-form table with a legend entry
        sns.histplot(np.asarray(values).ravel(), kde=True, ax=ax)
        ax.set_title(feature)
        ax.set_xlabel('Expression')
        ax.set_ylabel('Frequency')

    # hide unused subplots
    for ax in panels[n_features:]:
        ax.axis('off')

    fig.tight_layout()
    plt.show()

# Plot the value distribution of every protein feature.
plot_feature_distribution(mdata["prot"])


In [None]:
# Protein features whose name does not start with 'IgG' (the isotype controls).
prot_var_names = mdata["prot"].var_names
non_isotypes = prot_var_names[~prot_var_names.str.startswith('IgG')].tolist()

In [None]:
# Keep only the non-isotype features in the protein modality (filtered in place by name).
mu.pp.filter_var(mdata["prot"], non_isotypes)

In [None]:
# Re-synchronise the container-level annotations after changing the protein features.
mdata.update()

In [None]:
# Dimensionality reduction for the protein modality with scanpy defaults:
# PCA, neighbour graph, and a UMAP embedding with a fixed seed.
sc.tl.pca(mdata['prot'])
sc.pp.neighbors(mdata['prot'])
sc.tl.umap(mdata['prot'], random_state=1)

In [None]:
# Variance explained per principal component (log scale, up to 50 PCs),
# first for RNA and then for protein.
for modality in ('rna', 'prot'):
    sc.pl.pca_variance_ratio(mdata[modality], log=True, n_pcs=50)

In [None]:
# Harmony batch correction of the protein PCA across donors; the corrected
# coordinates are written to .obsm['X_harmony'].
prot_harmony_kwargs = dict(
    key='donor_id',
    basis='X_pca',
    adjusted_basis='X_harmony',
    max_iter_harmony=30,
)
sce.pp.harmony_integrate(mdata['prot'], **prot_harmony_kwargs)

In [None]:
# Harmony batch correction of the RNA PCA across donors; the corrected
# coordinates are written to .obsm['X_harmony'].
rna_harmony_kwargs = dict(
    key='donor_id',
    basis='X_pca',
    adjusted_basis='X_harmony',
    max_iter_harmony=30,
)
sce.pp.harmony_integrate(mdata['rna'], **rna_harmony_kwargs)

In [None]:
# Per-modality UMAPs coloured by tissue and donor. These are still the
# embeddings computed in the earlier cells; they are not recomputed from the
# Harmony-corrected representation until further down.
for modality_basis in ("prot:X_umap", "rna:X_umap"):
    mu.pl.embedding(
        mdata,
        basis=modality_basis,
        frameon=False,
        color=["prot:tissue", "prot:donor_id"],
    )

In [None]:
# recalculate nearest neighbors per modality, now on the Harmony-corrected
# representation instead of the plain PCA
sc.pp.neighbors(mdata['rna'], use_rep="X_harmony")
sc.pp.neighbors(mdata['prot'], use_rep="X_harmony")

# calculate weighted nearest neighbors across modalities; the joint graph is
# stored under the key 'wnn' and reused by the UMAP and Leiden steps below
mu.pp.neighbors(mdata, key_added='wnn')

In [None]:
# Joint UMAP computed from the 'wnn' graph.
mu.tl.umap(mdata, neighbors_key='wnn', random_state=10)
# Per-modality UMAPs are recomputed too; no neighbors_key is given, so they use
# each modality's default graph, which the previous cell rebuilt from X_harmony.
mu.tl.umap(mdata['rna'], random_state=10)
mu.tl.umap(mdata['prot'], random_state=10)

In [None]:
# Joint (WNN), protein and RNA UMAPs, each coloured by tissue and donor.
umap_bases = ("X_umap", "prot:X_umap", "rna:X_umap")
for umap_basis in umap_bases:
    mu.pl.embedding(
        mdata,
        basis=umap_basis,
        frameon=False,
        color=["prot:tissue", "prot:donor_id"],
    )

In [None]:
# Joint UMAP coloured by the per-cell modality weights.
# NOTE(review): the '*:mod_weight' columns are presumably written by the WNN step above — confirm.
mu.pl.umap(mdata, color=['rna:mod_weight', 'prot:mod_weight'], cmap='RdBu', size=10)

In [None]:
# Joint UMAP coloured by RNA QC metrics, one panel per row.
# NOTE(review): other MuData-level plots in this notebook use mu.pl.*; this one
# calls scanpy directly on the MuData object.
sc.pl.umap(mdata, color=['rna:pct_counts_mt',"rna:total_counts","rna:n_genes_by_counts"], ncols=1 ,legend_loc='on data')

In [None]:
# Embedding used only for scoring the clusterings: the Harmony-corrected
# coordinates of both modalities placed side by side.
X = np.hstack([mdata['rna'].obsm['X_harmony'], mdata['prot'].obsm['X_harmony']])

# Sweep Leiden resolutions on the WNN graph. For each one: report the cluster
# count and cosine silhouette, show the clusters on the UMAP, and rank RNA
# marker genes per cluster.
leiden_resolutions = (0.1, 0.2, 0.3, 0.4)
for resolution in leiden_resolutions:
    key = f'leiden_r{resolution:.1f}'
    sc.tl.leiden(
        mdata,
        neighbors_key='wnn',
        resolution=resolution,
        key_added=key,
        random_state=0,
    )

    cluster_labels = mdata.obs[key].to_numpy()
    n_found = np.unique(cluster_labels).size
    score = silhouette_score(X, cluster_labels, metric='cosine')
    print(f"{key}: n_clusters={n_found}  silhouette={score:.4f}")

    # clusters on the joint UMAP
    sc.pl.umap(mdata, color=key, legend_loc='on data')

    # RNA markers: copy the labels onto the RNA modality, then rank genes with
    # a Wilcoxon test (the ranking is overwritten on every iteration)
    mdata['rna'].obs[key] = mdata.obs[key].astype('category')
    sc.tl.rank_genes_groups(mdata['rna'], groupby=key, method='wilcoxon')
    sc.pl.rank_genes_groups_dotplot(
        mdata['rna'],
        n_genes=5,
        standard_scale='var',
        swap_axes=True,
        dendrogram=False,
    )

In [None]:
# Joint UMAP coloured by a handful of individual features, two panels per row.
features_to_show = [
    "CD8a_TotalSeqC", "CD4_TotalSeqC",
    "PRL", "IGFBP1",
    "KRT8", "SELL",
    "AUTS2", "IL1B",
]
mu.pl.embedding(
    mdata,
    basis="X_umap",
    frameon=False,
    color=features_to_show,
    ncols=2,
    size=15,
    color_map="coolwarm",
)

In [None]:
# One joint UMAP per tissue, showing only that tissue's cells coloured by the
# resolution-0.2 Leiden clusters.
tissue_labels = mdata.obs['prot:tissue']
for tissue in tissue_labels.cat.categories:
    print(tissue)
    tissue_cells = mdata[tissue_labels == tissue]
    fig = sc.pl.umap(tissue_cells, color='leiden_r0.2', return_fig=True, title=tissue)

In [None]:
# One joint UMAP per donor, showing only that donor's cells coloured by the
# resolution-0.2 Leiden clusters.
donor_labels = mdata.obs['prot:donor_id']
for donor in donor_labels.cat.categories:
    print(donor)
    donor_cells = mdata[donor_labels == donor]
    fig = sc.pl.umap(donor_cells, color='leiden_r0.2', return_fig=True, title=donor)

In [None]:
# Dot plot of selected protein markers per resolution-0.2 Leiden cluster.
cluster_key = 'leiden_r0.2'
marker_genes = [
    "CD3_TotalSeqC", "CD62L_TotalSeqC", "CD1c_TotalSeqC", 'CD14_TotalSeqC',
    "CD44_TotalSeqC", "CD39_TotalSeqC", 'CD127_TotalSeqC', 'CD11c_TotalSeqC',
    'TIGIT_TotalSeqC', 'CD28_TotalSeqC', 'CD27_TotalSeqC', 'CD45RA_TotalSeqC',
    'CD31_TotalSeqC',
]

# copy the MuData-level cluster labels onto the protein modality
mdata["prot"].obs[cluster_key] = mdata.obs[cluster_key]

# cluster dendrogram first, so the dot plot can order its groups by it
sc.tl.dendrogram(mdata["prot"], cluster_key)
sc.pl.dotplot(
    mdata["prot"],
    marker_genes,
    groupby=cluster_key,
    dendrogram=True,
    use_raw=False,
    standard_scale="var",
);

In [None]:
# Dot plot of selected RNA marker genes per resolution-0.2 Leiden cluster,
# drawn from the .raw snapshot (use_raw=True).
cluster_key = 'leiden_r0.2'
marker_genes = [
    "HLA-G", "SLPI", "NREP", "THY1", "VWF", "PECAM1", "CCL21", "PTPRC", "NCAM1",
    "FCGR3A", "KIT", "TPSAB1", "CD3E", "CD4", "CD8A", "JCHAIN", "CD14", "CD68",
    "CLEC9A", "XCR1", "CD1C", "CD19", "PRL", "IGFBP1", "GNLY", "MPO", "HLA-DRA",
]

# copy the MuData-level cluster labels onto the RNA modality
mdata["rna"].obs[cluster_key] = mdata.obs[cluster_key]

# cluster dendrogram first, so the dot plot can order its groups by it
sc.tl.dendrogram(mdata["rna"], cluster_key)
sc.pl.dotplot(
    mdata["rna"],
    marker_genes,
    groupby=cluster_key,
    dendrogram=True,
    use_raw=True,
    standard_scale="var",
);

In [None]:
# Per-cell total protein signal, stored in the protein modality's .obs.
# The frame must carry the cell names as its index: with the default
# RangeIndex, assigning the column into .obs aligns on index labels and
# yields NaN for every cell.
adata_df = pd.DataFrame(
    mdata["prot"].X,
    index=mdata["prot"].obs_names,
    columns=mdata["prot"].var_names,
)

# calculate the total counts per cell
adata_df['total_counts'] = adata_df.sum(axis=1)

# add total_counts to obs in mdata["prot"] (aligned on cell names)
mdata["prot"].obs['total_counts'] = adata_df['total_counts']

In [None]:
# Per-cell protein totals (row sums of the protein matrix), stored on the
# MuData-level .obs and indexed by the protein modality's cell names so the
# assignment aligns by cell.
mdata.obs['total_counts'] = pd.Series(
    mdata["prot"].X.sum(axis=1),
    index=mdata["prot"].obs.index,
    name='total_counts',
)

In [None]:
# Dot plot of selected RNA marker genes per resolution-0.2 Leiden cluster,
# drawn from the .raw snapshot (use_raw=True).
marker_genes = ["HLA-G","SLPI","NREP","THY1","VWF","PECAM1","CCL21","PTPRC","NCAM1",
               "FCGR3A","KIT","TPSAB1","CD3E","CD4","CD8A","JCHAIN","CD14","CD68",
               "CLEC9A","XCR1","CD1C","CD19","PRL","IGFBP1","GNLY","MPO","HLA-DRA"]
sc.tl.dendrogram(mdata["rna"], 'leiden_r0.2')
# Plot from mdata["rna"]: the bare name `rna` is never defined in this
# notebook, so the previous call raised NameError on a fresh kernel.
sc.pl.dotplot(mdata["rna"], marker_genes, groupby='leiden_r0.2', dendrogram=True, use_raw=True, standard_scale="var");

In [None]:
# Manual annotation of the resolution-0.2 Leiden clusters with coarse
# cell-type labels; several clusters share one label.
# NOTE(review): the mapping covers cluster ids "0"-"13"; any other id would
# keep its numeric string — confirm this matches the current clustering.
new_cluster_names = {
    "0": "Myelo",
    "1": "NK",
    "2": "FIB",
    "3": "TCell",
    "4": "Myelo",
    "5": "EpiCell",
    "6": "FIB",
    "7": "BCell",
    "8": "FIB_Doublet",
    "9": "NK",
    "10": "EndoCell",
    "11": "ILC",
    "12": "Granulo",
    "13": "Lymphatic",
}

col = 'leiden_r0.2'
# cluster id (as string) -> label, stored as a categorical .obs column
mdata.obs['celltype_wnn'] = (
    mdata.obs[col]
    .astype("str")
    .replace(new_cluster_names)
    .astype("category")
)

In [None]:
# Save the annotated MuData object (clusters and coarse cell-type labels) to a new .h5mu file.
mdata.write("./citeseq_mdata_allsamples_filtered_crude_clustering.h5mu")