# Max composite channels per protein in barcoding matrix

For Extended Data Fig. 3

This code produces random As within a specified maxcomposition (max. number of composite channels per protein in the barcoding matrix) and simulates decompression.

The random As produced and the results of the analysis of the simulated decompression are saved.


## Import Libraries

In [1]:
# Import libraries
import anndata as ad
from pathlib import Path
import os
import numpy as np
import pandas as pd
import errno
import itertools


In [2]:
# Import system libraries to configure code directory as module
from os.path import dirname, abspath, join
import sys

# Locate the sibling `code` directory and make its modules importable.
# NOTE: dirname() is given the string literal '__file__' (not the variable),
# so THIS_DIR is '' and the relative path below is resolved by abspath()
# against the current working directory.
THIS_DIR = dirname('__file__')
_relative_code_dir = join(THIS_DIR, '..', 'code')
CODE_DIR = abspath(_relative_code_dir)
# Append (rather than prepend) so already-importable modules keep priority
sys.path.append(CODE_DIR)

## Set up

In [3]:
# Import dictionary training fnc. (smaf)
from smaf import smaf
from utils import analyse_U_W, analyse_decoding, produce_random_As, is_valid_file
from simulate_A import simulate_A

## Input data and specify output

In [4]:
# Input: location of the preprocessed sce AnnData file
data_path = Path('/mnt/projects/data')
sce_path = data_path / '0_preprocess_th184/processed/sce.h5ad'

# Output: experiment folder with one sub-folder for U and one for the As
EXP_name = 'publication/3_A_maxcomposition'
out_path = data_path / EXP_name
U_path = out_path / "U"
A_path = out_path / "A"
# Create each output directory (and any missing parents); existing ones are left alone
for _out_dir in (out_path, U_path, A_path):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Abort early unless is_valid_file accepts sce_path for the '.h5ad' extension list
sce_file_ok = is_valid_file(sce_path, ['.h5ad'])
if not sce_file_ok:
    # Standard ENOENT error that carries the offending path as its filename
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), sce_path)

In [6]:
# Read the sce AnnData object and keep only the relevant markers
sce = ad.read_h5ad(sce_path)

# Drop the markers that are excluded from this analysis
excluded_markers = ["panCK", "CD15", "CD11c", "CD56"]
sce = sce[:, ~sce.var.index.isin(excluded_markers)]

# Remove whole-slide tonsil 1 and 2 to reduce non-tumor immune cells
sce = sce[~sce.obs.tissue.isin(["Tonsil1", "Tonsil2"]), :]

# Split the cells: a random `perc` fraction is held out for testing A,
# all remaining cells are used for training U.
perc = 0.25
split_seed = 0  # fixed seed so the train/test split is reproducible across runs
rng = np.random.default_rng(split_seed)  # local generator; global NumPy RNG state is untouched
n_cells = sce.shape[0]
heldout_idx = rng.choice(n_cells, int(n_cells * perc), replace=False)

# Build the complement by position instead of by obs name, so duplicated
# obs names cannot silently remove extra cells from the training set.
is_heldout = np.zeros(n_cells, dtype=bool)
is_heldout[heldout_idx] = True

sce_trainingA = sce[heldout_idx, :]
sce_trainingU = sce[~is_heldout, :]

## Prepare dictionary for simulating decompression
d: initial n. of modules in U

maxItr: N. of iteration for each SMAF run

nthread: N. of thread

methodW: algorithm for calculating W. "lasso" or "omp_fixedk"

ldaU: error tolerance coefficient when calculating U.

ldaW: error tolerance coefficient when calculating W.

nblocksW_lasso: N. of blocks to separate cells into when calculating W.

In [7]:
## Train the dictionary U with SMAF and save it together with its metrics

# SMAF hyper-parameters (see the parameter list in the text above)
d = 80
maxItr = 100
nthread = -1
methodW = 'lasso'
ldaU = 0.02
ldaW = 0.02
nblocksW_lasso = 1

# Keyword options forwarded unchanged to smaf()
smaf_options = dict(
    ldaW=ldaW,
    THREADS=nthread,
    X_normalization='paper_norm',
    num_blocks_W=nblocksW_lasso,
    num_blocks_U=1,
    layer=None,
    Normalize_U=True,
    saveItr=False,
)

# Run SMAF on the cells reserved for training U
U, W, X = smaf(sce_trainingU, d, maxItr, methodW, ldaU, **smaf_options)

# Collect the SMAF metrics and their column names
res_U, coln_U = analyse_U_W(U, W, X)

# Save U: rows indexed by sce.var.index, columns numbered from 1
module_ids = list(range(1, U.shape[1] + 1))
U_df = pd.DataFrame(U, columns=module_ids, index=sce.var.index)
U_df.to_csv(os.path.join(U_path, 'U.csv'))

# Metrics table; transposed because res_U is one-dimensional
df = pd.DataFrame(res_U).T
df.columns = coln_U

# Save the U metrics as csv
result_U_file = os.path.join(out_path, 'result_U.csv')
df.to_csv(path_or_buf=result_U_file)

Initialized U, W with NMF, SMAF maxItr =  100


## Produce random As
Here we produced random As with maxcomposition of 2,3,4

n_A_each: n of random As produced per unique L0sum

n_L0sum: n of unique L0 sums. The defined number of L0 sums will be selected in descending order from the max L0sum.

g: no of total genes

m: n of composite channels

n: [min,max] of channels per gene

d_thresh: distance threshold per gene

In [8]:
# Produce random Phi (A) sets with a maxcomposition of 2, 3 and 4.
# The three calls stay in this order so any random draws happen in the same sequence.
_shared_A_args = dict(n_A_each=2, m=8, g=16)  # arguments common to all three sets
Phi2 = produce_random_As(n_L0sum=9, n=(1, 2), d_thresh=0.7, **_shared_A_args)
Phi3 = produce_random_As(n_L0sum=16, n=(1, 3), d_thresh=0.7, **_shared_A_args)
Phi4 = produce_random_As(n_L0sum=16, n=(1, 4), d_thresh=0.5, **_shared_A_args)

Random 18 As with L0sum between 24 and 32. 16 genes into 8 channels, max 2 channels per gene
86 0
Random 32 As with L0sum between 33 and 48. 16 genes into 8 channels, max 3 channels per gene
260 6
Random 32 As with L0sum between 49 and 64. 16 genes into 8 channels, max 4 channels per gene
127 8


In [9]:
# Pool the three sets into one Phi, keeping the order 2 -> 3 -> 4
Phi = Phi2 + Phi3
Phi = Phi + Phi4

In [10]:
# Stack all As into one array and write it to the A output folder
phi_file = os.path.join(A_path, "Phi.npy")
phi_stack = np.array(Phi)
np.save(phi_file, phi_stack)

## Simulate decompression using random As

ldaW_dc: error tolerance coefficient when decoding W from Y = AUW using lasso.

nsr: noise_to_signal ratio when simulating Y from AX.

u_id: id of U used to simulate decompression.

wt: normalization weight of X (proteinwise) when simulating Y from AX.

cur_ROI: ROI used for simulating decompression. Here we selected 25% of X (see above), so it is noted as "25percent".

In [11]:
# Settings for the simulated decompression; each is also recorded in every result row
ldaW_dc = 0.02
nsr = 0                # no noise added when simulating Y
u_id = 0               # only one U is used
wt = 0                 # no normalization on sce_trainingA
cur_ROI = "25percent"  # ROI used for simulating decompression

# Accumulators filled by the loop over Phi:
#   sum_res_A      - general results for simulated decompression
#   genewise_res_A - proteinwise/cellwise correlation for simulated decompression
#   Mincorr        - minimum protein correlation (only used for printing progress)
sum_res_A, genewise_res_A, Mincorr = [], [], []

# Simulate decompression once per random A and collect the metrics
for phi_id, phi in enumerate(Phi):
    # Sparsity descriptors of this A: total count of non-zero entries, and
    # the largest non-zero count found in any single column
    L0sum = np.linalg.norm(phi, ord=0, axis=1).sum()
    maxcomposition = np.linalg.norm(phi, ord=0, axis=0).max()

    X, Xhat, W, Y = simulate_A(
        sce_trainingA, U, phi, nsr,
        decoding_lasso_lda=ldaW_dc,
        outpath=None,
        THREADS=-1,
        layer=None,
        num_blocks=20,
    )
    res_A, coln_A, detail_A = analyse_decoding(phi, U, W, X, Xhat, name="", detail=True)

    # Track entry 1 of the metrics for the progress report; the metadata
    # below is appended at the end, so this index is not shifted by it
    Mincorr = np.append(Mincorr, res_A[1])

    # Run metadata appended to the metrics row and, in the same order, to its column names
    run_info = {
        "U_id": u_id,
        "A_id": phi_id,
        "A_L0_sum": L0sum,
        "maxcomposition": maxcomposition,
        "inv_SNratio": nsr,
        "ldaW_simA": ldaW_dc,
        "Xnorm_weight": wt,
        "ROI": cur_ROI,
    }
    res_A.extend(run_info.values())
    coln_A.extend(run_info.keys())

    sum_res_A.append(res_A)
    genewise_res_A.append(detail_A[0])

    # Report progress only after every 100th A
    if (phi_id + 1) % 100 != 0:
        continue
    idx = np.argsort(Mincorr)  # ascending order
    best5 = Mincorr[idx[-5:][::-1]]  # five largest values, largest first
    print("current itr: {}, top5 mincorr:".format(phi_id) ,['{:.3f}'.format(e) for e in best5])





  dist = 1.0 - uv / np.sqrt(uu * vv)


## save results

In [12]:
# Summary table: one row per simulated A, columns named by coln_A
df = pd.DataFrame(sum_res_A)
df.columns = coln_A

# Write the summary results as csv
result_A_file = os.path.join(out_path, 'result_A.csv')
df.to_csv(result_A_file)


# Genewise table: one row per simulated A, columns relabelled with sce.var.index
df = pd.DataFrame(genewise_res_A)
df.columns = sce.var.index
genewise_file = os.path.join(out_path, 'result_A_genewise.csv')
df.to_csv(genewise_file)