# Finalize A
For Extended Data Figure 6. To create Extended Data Figure 6a, this code needs to be run with number of composite channels of 7, 8, and 9.

This code simulates decompression with random As and selects the best A in 2 rounds of evaluation.

1st round: 4-fold simulation using 75% of cells for training U with finalized SMAF parameters and 25% of cells for simulating decompression with 2000 random As in each fold. The 50 As with the best "minimum protein correlation" were selected in each fold, so 200 As were selected in total.

2nd round: Another 4-fold simulation with the fixed 200 As from the 1st round. Each A's performance was simulated in all 4 folds, and its mean performance ("minimum protein correlation") over the 4-fold experiments was used for selecting the best A.



In [1]:
# Import libraries
import anndata as ad
from pathlib import Path
import os
import numpy as np
import pandas as pd
import errno

## setup

In [2]:
# Make the project's code directory importable as a source of modules
from os.path import dirname, abspath, join
import sys

# NOTE: '__file__' is a quoted string here, not the module attribute, so
# dirname() returns '' and the path below is resolved relative to the
# current working directory.
THIS_DIR = dirname('__file__')
# Absolute path of the "code" folder one level above that directory
relative_code_dir = join(THIS_DIR, '..', 'code')
CODE_DIR = abspath(relative_code_dir)
# Register the folder at the end of the module search path
sys.path += [CODE_DIR]

In [3]:
# Import fnc.
from smaf import smaf
from utils import analyse_U_W, analyse_decoding, produce_random_As, is_valid_file
from simulate_A import simulate_A

## Input data and specify output 
### ! Specify the number of composite channels ("ncch") at the beginning so that results for different "ncch" values are saved into separate subfolders

In [4]:
# Specify the number of composite channels.
# This value is embedded in the output folder name (EXP_name) and is used as
# the number of composite channels "m" when the random As are produced.
ncch = 9

In [5]:
# Location of the preprocessed single-cell dataset
data_path = Path('/mnt/projects/data')
sce_path = data_path / '0_preprocess_th184/processed/sce.h5ad'

# Results are written to a run folder whose name carries the number of
# composite channels, with separate "U" and "A" subfolders
EXP_name = 'publication/8_Finalize_A_16_to_{:d}'.format(ncch)
out_path = data_path / EXP_name
U_path = out_path / "U"
A_path = out_path / "A"
# Create the output folders (and any missing parents) if they don't exist yet
for folder in (out_path, U_path, A_path):
    folder.mkdir(parents=True, exist_ok=True)

In [6]:
# Validate the input path with the project helper, allowing only '.h5ad'
sce_file_ok = is_valid_file(sce_path, ['.h5ad'])
if not sce_file_ok:
    # Stop here with a standard "no such file or directory" error
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), sce_path)

In [7]:
# Load the single-cell dataset
sce = ad.read_h5ad(sce_path)
# Keep only the relevant markers by dropping the excluded ones
excluded_markers = ["panCK","CD15", "CD11c", "CD56"]
marker_is_kept = ~sce.var.index.isin(excluded_markers)
sce = sce[:, marker_is_kept]
# Drop cells from Tonsil1 and Tonsil2 to reduce non-tumor immune cells
excluded_tissues = ["Tonsil1","Tonsil2"]
cell_is_kept = ~sce.obs.tissue.isin(excluded_tissues)
sce = sce[cell_is_kept, :]

## Set SMAF parameters

In [8]:
## Parameters for computing U with SMAF
# General settings, passed positionally / as THREADS to smaf() in both rounds.
# maxItr: the original cell carried "100" as a commented-out alternative.
d, maxItr, nthread = 80, 5, -1

# Main parameters: method used for W and the U penalty
methodW, ldaU = 'lasso', 0.02

# Parameters specific to the chosen methodW
nblocksW_lasso, ldaW = 1, 0.02

## Set parameters for random As
Here we produced random As with maxcomposition of 2

n_A_each: n of random As produced per unique L0sum

n_L0sum: n of unique L0 sums. The defined number of L0 sums is selected in descending order, starting from the max L0sum.

g: no of total genes

m: n of composite channels 

n: [min,max] of channels per gene

d_thresh: distance threshold per gene

In [9]:
## Parameters controlling the random As
# n of random As produced per unique L0sum (the original cell carried "500"
# as a commented-out alternative)
n_A_each = 5
# n of L0 sums, counted down from the max
n_L0sum = 4
# prob of choosing n_max
maxp = 1

# no of total genes = number of columns of the data matrix
g = sce.X.shape[1]
# n of composite channels
m = ncch
# [min,max] of channels per gene
n = (1,2)
# distance threshold per gene
d_thresh = 0.8

# Range of L0 sums covered: the n_L0sum largest values up to max-per-gene * g
max_per_gene = n[1]
L0sum_max = max_per_gene*g
L0sum_min = L0sum_max-n_L0sum+1
summary_msg = "Random {} As with L0sum between {} and {}. {} genes into {} channels, max {} channels per gene".format(
    n_A_each*n_L0sum, L0sum_min, L0sum_max, g, m, max_per_gene)
print(summary_msg)

Random 20 As with L0sum between 29 and 32. 16 genes into 9 channels, max 2 channels per gene


In [10]:
# Set params for simulating A
# Both values are handed to simulate_A() in the two simulation rounds:
# nsr as its 4th positional argument and ldaW_dc as decoding_lasso_lda.
# NOTE(review): nsr is presumably a noise-to-signal ratio - confirm in simulate_A.
nsr = 0   
ldaW_dc = 0.002

## Round 1
4-fold simulation using 75% of cells for training U with finalized SMAF parameters and 25% of cells for simulating decompression with 2000 random As in each fold. The 50 As with the best "minimum protein correlation" were selected in each fold, so 200 As were selected in total.

In [11]:
## Round 1
#
# For every fold: produce random As, train U on the cells outside the fold,
# simulate decompression of the cells inside the fold with every A, and keep
# the n_bestAs_perfold As with the highest minimum protein correlation.
# Results used by later cells: sum_res_U / coln_U, sum_res_A / coln_A and
# best_Phi_r1.

# select the best As per fold (notebook text: 50 out of 2000 As for 4 folds
# -> 200 selected As; the values set here are smaller)
n_fold = 4
n_bestAs_perfold = 1#50
# Fold_ID list for randomly splitting the cells into n_fold subsets
# (not seeded, so the split differs between runs)
fold_id_list = np.random.permutation(np.repeat(np.arange(n_fold), int(sce.shape[0]/n_fold)+1)[:sce.shape[0]])
# initialize summary result
sum_res_U = []
sum_res_A = []
# to store selected As
best_Phi_r1 = []

for fold_id in range(n_fold):
    # Produce random As
    Phi = produce_random_As(n_A_each=n_A_each, n_L0sum=n_L0sum, m=m , n=n, d_thresh=d_thresh, g=g)
    # save As
    np.save(os.path.join(A_path,"Phi_{:d}.npy".format(fold_id)),np.array(Phi))
    # subset into training U (cells outside the fold) and A (cells in the fold)
    X_trainingU = sce[fold_id_list != fold_id]
    X_trainingA = sce[fold_id_list == fold_id]
    # calc U
    U,W,X = smaf(X_trainingU,d,maxItr,methodW,ldaU, ldaW=ldaW, THREADS=nthread,  X_normalization='paper_norm',
              num_blocks_W=nblocksW_lasso, num_blocks_U=1, layer=None, Normalize_U=True, saveItr=False)
    # get SMAF results
    res_U, coln_U = analyse_U_W(U, W, X)
    sum_res_U.append(res_U)
    # save U
    pd.DataFrame(U, columns=list(range(1, U.shape[1]+1)),index=sce.var_names).to_csv(
        os.path.join(U_path, 'U_r1_fold{:d}.csv'.format(fold_id)))
    # initialize Mincorr per fold, this is used for selecting best As (minimum protein correlation)
    Mincorr = np.array([])
    # simulate As
    for phi_id, phi in enumerate(Phi):
        # use the same thread setting as smaf() instead of a second hard-coded -1
        X, Xhat, W, Y = simulate_A(X_trainingA, U, phi, nsr, decoding_lasso_lda = ldaW_dc,
                             outpath=None, THREADS=nthread, layer=None, num_blocks=20)
        # get simulation results
        res_A, coln_A, detail_A = analyse_decoding(phi,U,W,X,Xhat, name="", detail=True)
        res_A.extend([fold_id,phi_id])
        coln_A.extend(["fold_id", "A_id"])
        sum_res_A.append(res_A)
        # second input of res_A is the minimum protein correlation
        Mincorr = np.append(Mincorr,res_A[1])
        # this is for printing: progress report every 100 As
        if (phi_id+1)%100 == 0:
            idx = np.argsort(Mincorr)  # get sorted index
            best5 = Mincorr[idx[-5:][::-1]] # Best 5
            print("current fold:{}, phi_id: {}, top5 mincorr:".format(fold_id, phi_id) ,['{:.3f}'.format(e) for e in best5])
    # select best As per fold.
    # np.argsort places NaN after every number, so an A whose minimum
    # correlation is NaN would otherwise land in the "best" slice; rank NaN as
    # the worst possible score instead.
    ranking_scores = np.where(np.isnan(Mincorr), -np.inf, Mincorr)
    order = np.argsort(ranking_scores)
    # explicit start index: a plain [-k:] slice would return everything for k == 0
    best_idx = order[max(len(order) - n_bestAs_perfold, 0):]
    best_Phi_r1.extend(list(np.array(Phi)[best_idx,:,:]))



Random 20 As with L0sum between 29 and 32. 16 genes into 9 channels, max 2 channels per gene
16 6
Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)


Random 20 As with L0sum between 29 and 32. 16 genes into 9 channels, max 2 channels per gene
10 2
Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)


Random 20 As with L0sum between 29 and 32. 16 genes into 9 channels, max 2 channels per gene
10 5
Initialized U, W with NMF, SMAF maxItr =  5
Random 20 As with L0sum between 29 and 32. 16 genes into 9 channels, max 2 channels per gene
25 5
Initialized U, W with NMF, SMAF maxItr =  5


## save results for round 1

In [12]:
# Write the round-1 summaries as CSV: first the SMAF (U) results, then the
# simulation (A) results, each with its matching column names
for rows, header, csv_name in (
    (sum_res_U, coln_U, 'R1_result_U.csv'),
    (sum_res_A, coln_A, 'R1_result_A.csv'),
):
    df = pd.DataFrame(rows)
    df.columns = header
    df.to_csv(path_or_buf=os.path.join(out_path, csv_name))

# Store the As selected in round 1 as a single array
best_Phi_r1_file = os.path.join(A_path, 'best_Phi_r1.npy')
np.save(best_Phi_r1_file, np.array(best_Phi_r1))

## Round 2

Another 4-fold simulation with the fixed 200 As from the 1st round. Each A's performance was simulated in all 4 folds, and its mean performance ("minimum protein correlation") over the 4-fold experiments was used for selecting the best A.

In [13]:
## Round 2
#
# Re-evaluate every A kept in round 1 (best_Phi_r1) on a fresh random n_fold
# split and pick the A with the highest mean minimum protein correlation.
# Results used by later cells: sum_res_U / coln_U, sum_res_A / coln_A,
# mean_mincorr and best_phi.

# select the best A out of the round-1 As over 4 folds
n_fold = 4
# Fold_ID list for randomly splitting the cells into n_fold subsets
# (not seeded, so the split differs between runs)
fold_id_list = np.random.permutation(np.repeat(np.arange(n_fold), int(sce.shape[0]/n_fold)+1)[:sce.shape[0]])
# initialize summary result
sum_res_U = []
sum_res_A = []

# now each A has 4-fold simulation so Mincorr has to be (n_fold x n_As)
Mincorr_arr = np.zeros((n_fold, len(best_Phi_r1)))

for fold_id in range(n_fold):
    # subset into training U and A
    X_trainingU = sce[fold_id_list != fold_id]
    # remove CISI_negative cells for trainingU
    # NOTE(review): round 1 does not apply this filter - confirm this is intended
    X_trainingU = X_trainingU[X_trainingU.obs.CISImarker == 'pos',:]
    X_trainingA = sce[fold_id_list == fold_id]
    # calc U
    U,W,X = smaf(X_trainingU,d,maxItr,methodW,ldaU, ldaW=ldaW, THREADS=nthread,  X_normalization='paper_norm',
              num_blocks_W=nblocksW_lasso, num_blocks_U=1, layer=None, Normalize_U=True, saveItr=False)
    # get SMAF results
    res_U, coln_U = analyse_U_W(U, W, X)
    sum_res_U.append(res_U)
    # save U
    pd.DataFrame(U, columns=list(range(1, U.shape[1]+1)),index=sce.var_names).to_csv(
        os.path.join(U_path, 'U_r2_fold{:d}.csv'.format(fold_id)))
    # initialize Mincorr per fold. this is for printing only. Mincorr_arr is the one used for selecting best A
    Mincorr = np.array([])
    # simulate As
    for phi_id, phi in enumerate(best_Phi_r1):
        # use the same thread setting as smaf() instead of a second hard-coded -1
        X, Xhat, W, Y = simulate_A(X_trainingA, U, phi, nsr, decoding_lasso_lda = ldaW_dc,
                             outpath=None, THREADS=nthread, layer=None, num_blocks=20)
        # get simulation results
        res_A, coln_A, detail_A = analyse_decoding(phi,U,W,X,Xhat, name="", detail=True)
        res_A.extend([fold_id,phi_id])
        coln_A.extend(["fold_id", "A_id"])
        sum_res_A.append(res_A)
        # update Mincorr_arr (used for selecting best A);
        # second input of res_A is the minimum protein correlation
        Mincorr_arr[fold_id,phi_id] = res_A[1]
        # for printing: progress report every 100 As
        Mincorr = np.append(Mincorr,res_A[1])
        if (phi_id+1)%100 == 0:
            idx = np.argsort(Mincorr)  # get sorted index
            best5 = Mincorr[idx[-5:][::-1]] # Best 5
            print("current fold:{}, phi_id: {}, top5 mincorr:".format(fold_id, phi_id) ,['{:.3f}'.format(e) for e in best5])

# select best A - done once, after all folds, so that every row of Mincorr_arr
# is filled (previously this ran inside the fold loop on a partially
# zero-filled array; only the last pass mattered)
mean_mincorr = np.mean(Mincorr_arr, axis = 0) # get mean Mincorr over 4-fold for each A
# np.argmax would return the first NaN entry; rank an A whose mean is NaN as
# the worst possible score so it can never be picked as the best A
mean_mincorr = np.where(np.isnan(mean_mincorr), -np.inf, mean_mincorr)
best_phi = best_Phi_r1[np.argmax(mean_mincorr)] # best mean Mincorr is selected


Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)


Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)


Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)


Initialized U, W with NMF, SMAF maxItr =  5


  dist = 1.0 - uv / np.sqrt(uu * vv)


## save results for round 2

In [14]:
# Write the round-2 summaries as CSV: first the SMAF (U) results, then the
# simulation (A) results, each with its matching column names
for rows, header, csv_name in (
    (sum_res_U, coln_U, 'R2_result_U.csv'),
    (sum_res_A, coln_A, 'R2_result_A.csv'),
):
    df = pd.DataFrame(rows)
    df.columns = header
    df.to_csv(path_or_buf=os.path.join(out_path, csv_name))

# Position (within the round-1 selection) of the A with the best mean Mincorr;
# it becomes part of both output file names
best_phi_id = np.argmax(mean_mincorr)

# save best A for round 2 as .npy
np.save(os.path.join(A_path, 'best_phi_id_{}.npy'.format(best_phi_id)), np.array(best_phi))
# save also as csv, with the marker names as column labels
best_phi_df = pd.DataFrame(best_phi)
best_phi_df.columns = sce.var.index
best_phi_csv = os.path.join(A_path, 'best_phi_id_{}.csv'.format(best_phi_id))
best_phi_df.to_csv(path_or_buf=best_phi_csv)