# Finalize SMAF parameters

For Extended Data Fig. 5ab

This code runs SMAF and simulates decompression on 200 random As for all 14 tissue conditions and for all 12 SMAF parameter conditions. Different tissues were tested to assess the stability of performance across tissues.


## Import Libraries

In [1]:
# Import libraries
import anndata as ad
from pathlib import Path
import os
import numpy as np
import pandas as pd
import errno
import itertools

## setup

In [2]:
# Import system libraries to configure code directory as module
from os.path import dirname, abspath, join
import sys

# Find code directory relative to our directory.
# NOTE(review): '__file__' is a quoted string literal, not the module variable,
# so dirname() returns '' and CODE_DIR resolves to '../code' relative to the
# current working directory. Presumably deliberate, because notebooks do not
# define __file__ -- confirm before "fixing" it.
THIS_DIR = dirname('__file__')
CODE_DIR = abspath(join(THIS_DIR, '..', 'code'))
# Add code directory to systems paths; guarded so that re-running this cell
# does not stack duplicate entries on sys.path.
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [3]:
# Import dictionary training fnc. (smaf)
from smaf import smaf
from utils import analyse_U_W, produce_random_As, analyse_decoding, is_valid_file
from simulate_A import simulate_A

## input data and specify output

In [4]:
# Location of the preprocessed single-cell object (.h5ad) that is read below
data_path = Path('/mnt/projects/data')
sce_path = data_path / '0_preprocess_th184/processed/sce.h5ad'

# Everything this notebook writes goes under <data_path>/<EXP_name>:
# the trained dictionaries in "U", the random composition matrices in "A"
EXP_name = 'publication/6_Finalize_SMAF_parameters'
out_path = data_path / EXP_name
U_path = out_path / "U"
A_path = out_path / "A"
# Make sure the output tree exists before anything is written into it
# (missing parents are created; already existing directories are left alone)
for directory in (out_path, U_path, A_path):
    directory.mkdir(parents=True, exist_ok=True)

In [5]:
# Fail early unless is_valid_file accepts the input path
# (it is called with '.h5ad' as the only allowed extension)
sce_file_ok = is_valid_file(sce_path, ['.h5ad'])
if not sce_file_ok:
    # Same error a failed open() would raise: ENOENT plus the offending path
    raise FileNotFoundError(
        errno.ENOENT,
        os.strerror(errno.ENOENT),
        sce_path,
    )

In [6]:
# Load the sce object and keep only the relevant markers
sce = ad.read_h5ad(sce_path)

# Markers (columns of sce) dropped from the analysis
excluded_markers = ["panCK", "CD15", "CD11c", "CD56"]
keep_marker = ~sce.var.index.isin(excluded_markers)
sce = sce[:, keep_marker]

# Remove whole slide tonsil 1, 2 to reduce non-tumor immune cells
excluded_tissues = ["Tonsil1", "Tonsil2"]
keep_cell = ~sce.obs.tissue.isin(excluded_tissues)
sce = sce[keep_cell, :]

## Set SMAF parameters tested
d: initial n. of modules in U

maxItr: N. of iteration for each SMAF run

nthread: N. of thread

l_methodW: algorithm for calculating W. "lasso" or "omp_fixedk"

l_ldaU: error tolerance coefficient when calculating U.

l_ldaW: error tolerance coefficient when calculating W. (only for lasso)

l_nblocksW_lasso: N. of blocks to separate cells into when calculating W. (only for lasso)

l_k_fixedk: Sparsity of W (N. of non-zero input per cell in W) (only for omp_fixedk)

In [7]:
## Parameter grid used to train the Us

# Fixed settings shared by every SMAF run
d = 80          # initial number of modules in U
maxItr = 5      # iterations per SMAF run; NOTE(review): the original trailing comment read "100", presumably the full-run value
nthread = -1    # number of threads handed to smaf

# Main iterators: every methodW is combined with every ldaU
l_methodW = ['lasso', 'omp_fixedk']
l_ldaU = [0.2, 0.02]

# Iterators that only apply to one methodW
l_ldaW = [0.2, 0.02]          # lasso only
l_nblocksW_lasso = [1, 200]   # lasso only
l_k_fixedk = [1, 3]           # omp_fixedk only

## set ROIs (tissues) tested
In addition to the individual tissues, we added "1" and "2", which take the first or second ROI from all tissues, respectively, and "percent25", which takes a random 25% of cells. These additional ROIs can be used to assess the general performance of the different SMAF parameter conditions.

In [8]:
# ROIs used to split the cells between training U and simulating A:
# every tissue present in sce, then the first/second ROI of all tissues
# ("1", "2") and a random 25% of cells ("percent25")
ROI_list = [*sce.obs.tissue.unique().tolist(), "1", "2", "percent25"]

## Produce random 200 As
Here we produced random As with maxcomposition of 2

n_A_each: n of random As produced per unique L0sum

n_L0sum: n of unique L0 sums. The specified number of L0 sums is selected in descending order, starting from the max L0 sum.

g: no of total genes

m: n of composite channels

n: [min,max] of channels per gene

d_thresh: distance threshold per gene

In [9]:
# Draw the random composition matrices (Phi), at most 2 channels per gene
Phi = produce_random_As(
    g=16,           # total number of genes
    m=8,            # number of composite channels
    n=(1, 2),       # (min, max) channels per gene
    d_thresh=0.7,   # distance threshold per gene
    n_L0sum=4,      # number of unique L0 sums to cover
    n_A_each=1,     # random As per unique L0 sum; NOTE(review): the original trailing comment read "50"
)


Random 4 As with L0sum between 29 and 32. 16 genes into 8 channels, max 2 channels per gene
6 2


In [10]:
# Persist the random As as a single .npy array inside the "A" output directory
phi_file = A_path / "Phi.npy"
np.save(phi_file, np.asarray(Phi))

## Run SMAF and decompression simulation
For each tissue condition, Us were calculated for all SMAF parameter conditions.

For each U, decompression was simulated using 200 random As

In [11]:
## SMAF params
expid = 0  # running experiment id: names each saved U file and links U rows to A rows

# Params for simulate A (also for test ROI)
nsr = 0          # passed to simulate_A; recorded as "inv_SNratio" in the A results
ldaW_dc = 0.02   # passed to simulate_A as decoding_lasso_lda

# results container
sum_res_U = []       # one row per trained U
sum_res_A = []       # one row per (U, A) decompression simulation
genewise_res_A = []  # first element of analyse_decoding's detail output, per simulation

# SMAF and decompression simulation
# NOTE(review): the [8:] slice skips the first eight entries of ROI_list; the
# notebook text says all ROIs are tested, so this looks like a quick-run
# leftover. Kept as-is -- drop the slice for the full run.
for cur_ROI in ROI_list[8:]:
    print(cur_ROI)
    # subset for A and U based on the cur_ROI
    if cur_ROI in sce.obs.tissue.unique():
        X_trainingA = sce[sce.obs.tissue==cur_ROI,:]
    elif cur_ROI in sce.obs.ROI_per_tissue.unique():
        X_trainingA = sce[sce.obs.ROI_per_tissue==cur_ROI,:]
    elif "percent" in cur_ROI:
        # "percentNN" -> random NN% of all cells, drawn without replacement
        perc = int(cur_ROI.split("percent")[-1])/100
        X_trainingA = sce[np.random.choice(sce.shape[0], int(sce.shape[0]*perc),replace=False),:]
    else:
        # unknown ROI label: nothing to hold out, skip it
        continue
    # rest of the cells are for training U
    X_trainingU = sce[~sce.obs.index.isin(X_trainingA.obs.index),:]

    # Us trained for this ROI, and the expid each one was saved under
    Us = []
    U_expids = []

    ## SMAF producing U
    for (methodW, ldaU) in itertools.product(l_methodW, l_ldaU):
        # Build the (ldaW, k, num_blocks_W) triples for this methodW. Parameters
        # that do not apply to the method are NaN, both when passed to smaf and
        # when stored in the results (np.nan: the np.NaN alias was removed in
        # NumPy 2.0).
        if methodW == 'lasso':
            sub_params = [(ldaW, np.nan, num_blocks_W)
                          for ldaW, num_blocks_W in itertools.product(l_ldaW, l_nblocksW_lasso)]
        elif methodW == 'omp_fixedk':
            sub_params = [(np.nan, k, np.nan) for k in l_k_fixedk]
        else:
            sub_params = []

        for ldaW, k, num_blocks_W in sub_params:
            # calc U
            U,W,X = smaf(X_trainingU,d,maxItr,methodW,ldaU, ldaW=ldaW,k=k, THREADS=nthread,  X_normalization='paper_norm',
                      num_blocks_W=num_blocks_W, num_blocks_U=1, layer=None, Normalize_U=True, saveItr=False)
            # get SMAF results
            res_U, coln_U = analyse_U_W(U, W, X)
            # save U (rows labelled with sce.var_names, columns numbered from 1)
            pd.DataFrame(U, columns=list(range(1, U.shape[1]+1)),index=sce.var_names).to_csv(
                os.path.join(U_path, 'U_expid{:d}.csv'.format(expid)))
            # append Us together with the id they were saved under
            Us.append(U)
            U_expids.append(expid)
            # append results
            res_U.extend([expid, maxItr, methodW, ldaU, num_blocks_W,ldaW,k, cur_ROI ])
            coln_U.extend(['expid','maxItr', 'methodW', 'ldaU', 'num_blocks_W','ldaW','k','trainingA_ROI'])
            sum_res_U.append(res_U)
            expid += 1

    ## Simulate A
    # u_id is the expid recorded when the U was trained, so "U_id" in the A
    # results always matches 'expid' in the U results and the U_expid*.csv file
    # names, even when an ROI was skipped above.
    for u_id, U in zip(U_expids, Us):
        Mincorr = np.array([])
        for phi_id, phi in enumerate(Phi):
            # total number of non-zero entries in phi
            L0sum = np.linalg.norm(phi,ord = 0, axis = 1).sum()

            X, Xhat, W, Y = simulate_A(X_trainingA, U, phi, nsr, decoding_lasso_lda = ldaW_dc,
                                 outpath=None, THREADS=-1, layer=None, num_blocks=20)

            res_A, coln_A, detail_A = analyse_decoding(phi,U,W,X,Xhat, name="", detail=True)
            res_A.extend([u_id,phi_id,L0sum,nsr,ldaW_dc, cur_ROI])
            coln_A.extend(["U_id", "A_id","A_L0_sum","inv_SNratio","ldaW_simA", "ROI"])
            sum_res_A.append(res_A)
            genewise_res_A.append(detail_A[0])

            # NOTE(review): res_A[1] is presumably the min correlation, judging
            # by the "mincorr" label printed below -- confirm in analyse_decoding
            Mincorr = np.append(Mincorr,res_A[1])

            # progress report every 100 As
            if (phi_id+1)%100 == 0:
                idx = np.argsort(Mincorr)  # get sorted index
                best5 = Mincorr[idx[-5:][::-1]] # Best 5
                print("current itr: {}, top5 mincorr:".format(phi_id) ,['{:.3f}'.format(e) for e in best5])




tonsilW
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)
  Anorm = (A.T/np.linalg.norm(A, axis=1)).T
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


1
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)


2
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)
  Anorm = (A.T/np.linalg.norm(A, axis=1)).T
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


percent25
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)


## Save results

In [12]:
# Write the three result tables as CSV files into out_path:
#   - U results (one row per trained U)
#   - A results (one row per decompression simulation)
#   - genewise A results (columns labelled with the markers in sce.var.index)
for records, header, filename in (
    (sum_res_U, coln_U, 'result_U.csv'),
    (sum_res_A, coln_A, 'result_A.csv'),
    (genewise_res_A, sce.var.index, 'result_A_genewise.csv'),
):
    # Transform the collected rows into a DF and label its columns
    df = pd.DataFrame(records)
    df.columns = header
    # save as csv
    df.to_csv(path_or_buf=os.path.join(out_path, filename))