# SMAF parameters

For Extended Data Fig. 1d-k and Extended Data Fig. 2

This code runs SMAF looped over SMAF parameters, and saves U and the analysis results.


## Import Libraries

In [1]:
# Import libraries
import anndata as ad
from pathlib import Path
import os
import numpy as np
import pandas as pd
import errno
import itertools

## Set up

In [2]:
# Import system libraries to configure the code directory as an importable module
from os.path import dirname, abspath, join
import sys

# Find the code directory relative to our directory.
# NOTE: '__file__' is a string literal here (not the __file__ variable), so
# dirname() returns '' and CODE_DIR is resolved against the current working
# directory, i.e. <cwd>/../code.
THIS_DIR = dirname('__file__')
CODE_DIR = abspath(join(THIS_DIR, '..', 'code'))
# Add the code directory to the module search path only once, so that
# re-running this cell does not keep appending duplicate entries.
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [3]:
# Import dictionary training fnc. (smaf)
from smaf import smaf
from utils import analyse_U_W, is_valid_file

## Input data

In [4]:
# Locations of the input sce file and of this experiment's outputs
data_path = Path('/mnt/projects/data')
sce_path = data_path / '0_tissues_th182/sce/sce.h5ad'

EXP_name = 'publication/2_SMAF_parameters'
out_path = data_path / EXP_name
U_path = out_path / "U"
# Make sure both output directories exist (missing parents are created too)
for output_dir in (out_path, U_path):
    output_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Validate the input sce file before trying to load it.
# is_valid_file receives the path and the list of accepted extensions;
# presumably it checks existence and extension - confirm in utils.
sce_file_ok = is_valid_file(sce_path, ['.h5ad'])
if not sce_file_ok:
    # Validation failed: raise the standard ENOENT error with the path attached
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),sce_path)

In [6]:
# Read the sce object and keep only the relevant markers
excluded_markers = ["panCK","CD15", "CD11c", "CD56"]
sce = ad.read_h5ad(sce_path)

# Boolean mask over the variables: True for every marker that is not excluded
keep_marker = ~sce.var.index.isin(excluded_markers)
sce = sce[:, keep_marker]



## Set variables
d: initial n. of modules in U

maxItr: N. of iterations for each SMAF run

nthread: N. of threads

### main iterator
l_rep: N. of replicates

l_methodW: algorithm for calculating W. "lasso" or "omp_fixedk"

l_ldaU: ldaU, error tolerance coefficient when calculating U.


### methodW specific iterator
#### only for lasso
l_ldaW: ldaW, error tolerance coefficient when calculating W.

l_nblocksW: N. of blocks to separate cells when calculating W.
#### only for omp_fixedk
l_k: Sparsity of W (N. of non-zero input per cell in W)



In [7]:
# --- Fixed settings used by every SMAF run ---
d = 80
# initial n. of modules in U
maxItr = 5
# N. of iterations for each SMAF run (alternative kept from the original cell: 100)
nthread = -1
# N. of threads
saveU = True
# if True, U is saved as csv for each SMAF run

# --- Main iterator: every combination of these is run ---
l_rep = range(2)
# replicate ids (alternative: range(10))
l_methodW = ['lasso', 'omp_fixedk']
# algorithms for calculating W
l_ldaU = [0.2, 0.02]
# ldaU values (alternative: [0.3,0.2,0.1,0.05,0.02,0.005])

# --- methodW specific iterators ---
# only for lasso
l_ldaW = [0.2, 0.02]
# ldaW values (alternative: [0.3,0.2,0.1,0.05,0.02,0.005])
l_nblocksW = [1, 20]
# N. of blocks for W (alternative: [1, 20, 200])
# only for omp_fixedk
l_k = [1, 2]
# sparsity of W (alternative: [1,2,3,4])


## Run SMAF

In [8]:
## SMAF
# Sweep SMAF over the parameter grid. Every (replicate, methodW, ldaU)
# combination of the main iterator is expanded with the parameters that are
# specific to methodW; each expanded condition is one SMAF run that gets its
# own experiment id.
sum_res = []  # one analyse_U_W result per SMAF run
cond = []     # the condition row belonging to each entry of sum_res
expid = 0     # experiment id. Used to match results and the saved U files

# total no. of main iterations, only used for printing the progress
totalitr = len(list(l_rep)) * len(l_methodW) * len(l_ldaU)

for itrid, (replicate, methodW, ldaU) in enumerate(itertools.product(l_rep, l_methodW, l_ldaU)):
    print('CurrentItr:{}/{} Rep:{},methodW:{},ldaU:{}'.format(itrid+1,totalitr,replicate, methodW, ldaU))

    # Build the (ldaW, k, num_blocks_W) settings to run for this methodW.
    # Parameters that the method does not iterate over stay NaN, both in the
    # smaf call and in the stored condition row. np.nan is used because the
    # np.NaN alias no longer exists in NumPy 2.0.
    if methodW == 'lasso':
        sub_conditions = [(ldaW, np.nan, num_blocks_W)
                          for ldaW, num_blocks_W in itertools.product(l_ldaW, l_nblocksW)]
    elif methodW == 'omp_fixedk':
        sub_conditions = [(np.nan, k, np.nan) for k in l_k]
    else:
        # Any other methodW has no sub-iterator and therefore produces no runs.
        sub_conditions = []

    for ldaW, k, num_blocks_W in sub_conditions:
        U, W, X = smaf(sce, d, maxItr, methodW, ldaU, ldaW=ldaW, k=k, THREADS=nthread,
                       X_normalization='paper_norm', num_blocks_W=num_blocks_W, num_blocks_U=1,
                       layer=None, Normalize_U=True, saveItr=False)
        # obtain results by analysing U and W.
        res, coln = analyse_U_W(U, W, X)
        # save U (if saveU == True); the file name carries expid so it can be
        # matched with the corresponding row of the results table
        if saveU:
            pd.DataFrame(U, columns=list(range(1, U.shape[1]+1)), index=sce.var_names).to_csv(
                os.path.join(U_path, 'U_expid{:d}.csv'.format(expid)))
        # Store results together with the condition that produced them
        sum_res.append(res)
        cond.append([maxItr, replicate, methodW, ldaU, num_blocks_W, ldaW, k, expid])
        expid += 1



CurrentItr:1/8 Rep:0,methodW:lasso,ldaU:0.2
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
CurrentItr:2/8 Rep:0,methodW:lasso,ldaU:0.02
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
CurrentItr:3/8 Rep:0,methodW:omp_fixedk,ldaU:0.2
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
CurrentItr:4/8 Rep:0,methodW:omp_fixedk,ldaU:0.02
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
CurrentItr:5/8 Rep:1,methodW:lasso,ldaU:0.2
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
CurrentItr:6/8 Rep:1,methodW:lasso,ldaU:0.02
Initialized U, W wi

## Save results

In [9]:
# Convert the collected analysis results and run conditions into DataFrames
sum_res = pd.DataFrame(sum_res)
sum_res.columns = coln  # column names returned by analyse_U_W

# One label per entry of the condition rows appended during the SMAF loop
cond_columns = ['maxItr', 'replicate', 'methodW', 'ldaU', 'num_blocks_W','ldaW','k','expid']
cond = pd.DataFrame(cond)
cond.columns = cond_columns

In [10]:
# Put each result row next to its condition row (rows are matched on the index)
df = sum_res.join(cond, how='left')
df

Unnamed: 0,U_l1_mean,U_l0_mean,d_modules,U_90p_coherence,SMAF_W_l0_mean,SMAF_Fit,maxItr,replicate,methodW,ldaU,num_blocks_W,ldaW,k,expid
0,1.370012,4.421053,19,0.358903,1.866852,0.817289,5,0,lasso,0.2,1.0,0.2,,0
1,1.18513,2.133333,15,0.158145,3.886277,0.797542,5,0,lasso,0.2,20.0,0.2,,1
2,1.149721,2.533333,15,0.07922,8.989232,0.968967,5,0,lasso,0.2,1.0,0.02,,2
3,1.07042,1.466667,15,0.0,11.260972,0.973113,5,0,lasso,0.2,20.0,0.02,,3
4,1.764385,11.444444,27,0.435933,1.630952,0.822877,5,0,lasso,0.02,1.0,0.2,,4
5,2.069051,14.807692,26,0.497879,3.187096,0.799948,5,0,lasso,0.02,20.0,0.2,,5
6,1.438748,6.125,24,0.329186,6.943088,0.97945,5,0,lasso,0.02,1.0,0.02,,6
7,1.362382,5.913043,23,0.230768,9.039808,0.979453,5,0,lasso,0.02,20.0,0.02,,7
8,1.705907,7.222222,18,0.391955,1.0,0.828744,5,0,omp_fixedk,0.2,,,1.0,8
9,1.213149,2.333333,18,0.226426,1.953855,0.889262,5,0,omp_fixedk,0.2,,,2.0,9


In [11]:
# Write the combined results table to result.csv inside the output directory
result_csv = os.path.join(out_path, 'result.csv')
df.to_csv(path_or_buf=result_csv)