# X normalization

For Extended Data Fig. 4ab and 5c.

This code is for evaluating the effect of normalizing proteins in X when simulating decompression.

This code produces 12 Us, each with a unique SMAF parameter condition; for each U, 5 normalization weights and 200 random As are used to simulate decompression.

## Import libraries

In [1]:
# Import libraries
import anndata as ad
from pathlib import Path
import os
import numpy as np
import pandas as pd
import errno
import itertools
from sklearn.mixture import GaussianMixture

## Set up

In [2]:
# Import system libraries to configure code directory as module
from os.path import dirname, abspath, join
import sys

# Find the code directory relative to this notebook/script.
# `__file__` is not defined inside a Jupyter kernel, so fall back to the current
# working directory there. The previous `dirname('__file__')` operated on a
# string literal and therefore always produced '', i.e. it silently resolved
# against the working directory even when a real `__file__` was available.
try:
    THIS_DIR = dirname(abspath(__file__))
except NameError:
    THIS_DIR = abspath('.')
CODE_DIR = abspath(join(THIS_DIR, '..', 'code'))
# Add the code directory to the module search path (only once, so re-running
# this cell does not keep appending duplicates).
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [3]:
# Import dictionary training fnc. (smaf)
from smaf import smaf
from utils import analyse_U_W, produce_random_As, analyse_decoding, is_valid_file
from simulate_A import simulate_A

## Input data and specify output

In [4]:
# Location of the input sce (.h5ad) file
data_path = Path('/mnt/projects/data')
sce_path = data_path / '0_tissues_th182/sce/sce.h5ad'

# Output locations for this experiment: one sub-directory for Us, one for As
EXP_name = 'publication/4_X_normalization'
out_path = data_path / EXP_name
U_path = out_path / "U"
A_path = out_path / "A"

# Create the output directories if they don't exist yet
for _directory in (out_path, U_path, A_path):
    _directory.mkdir(parents=True, exist_ok=True)

In [5]:
# Make sure the input sce file passes is_valid_file for the '.h5ad' extension
sce_file_ok = is_valid_file(sce_path, ['.h5ad'])
if not sce_file_ok:
    # Abort early with a standard "no such file" error that names the path
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), sce_path)

In [6]:
# Read sce and keep only the relevant markers
sce = ad.read_h5ad(sce_path)

excluded_markers = ["panCK", "CD15", "CD11c", "CD56"]
marker_is_kept = ~sce.var.index.isin(excluded_markers)
sce = sce[:, marker_is_kept]

# Remove whole-slide tonsil and appendix data
whole_slide_tissues = ["AppendixW", "TonsilW"]
cell_is_kept = ~sce.obs.tissue.isin(whole_slide_tissues)
sce = sce[cell_is_kept, :]

# Split sce: a random 25% of the cells go to training A, the rest to training U.
# NOTE(review): no random seed is set in the visible code, so this split
# differs between runs.
n_cells = sce.shape[0]
cells_for_A = np.random.choice(n_cells, int(n_cells*0.25), replace=False)
X_trainingA = sce[cells_for_A, :]
cell_in_A = sce.obs.index.isin(X_trainingA.obs.index)
X_trainingU = sce[~cell_in_A, :]



## Set SMAF parameters tested
d: initial n. of modules in U

maxItr: N. of iteration for each SMAF run

nthread: N. of thread

l_methodW: algorithm for calculating W. "lasso" or "omp_fixedk"

l_ldaU: error tolerance coefficient when calculating U.

l_ldaW: error tolerance coefficient when calculating W. (only for lasso)

l_nblocksW_lasso: N. of blocks to separate cells into when calculating W. (only for lasso)

l_k_fixedk: Sparsity of W (N. of non-zero input per cell in W) (only for omp_fixedk)

In [7]:
## Prepare Us with different parameters
# Fixed settings shared by every SMAF run:
#   d       - initial number of modules in U
#   maxItr  - number of iterations per SMAF run (the original comment notes 100
#             as an alternative value, presumably the full-run setting)
#   nthread - number of threads
d, maxItr, nthread = 80, 5, -1

# Grids iterated for every run
l_methodW = ['lasso', 'omp_fixedk']
l_ldaU = [0.2, 0.02]

# Grids that only apply to one methodW
l_ldaW = [0.2, 0.02]          # lasso only
l_nblocksW_lasso = [1, 200]   # lasso only
l_k_fixedk = [1, 3]           # omp_fixedk only

## Produce Us

In [8]:
## SMAF params
sum_res_U = []  # analysis results (one entry per U, from analyse_U_W)
cond_U = []  # parameter conditions (one row per U)
expid = 0  # experiment id. Used to match results and U
cur_ROI = "percent25"
# initialize U_list
Us = []

## SMAF producing Us
for methodW, ldaU in itertools.product(l_methodW, l_ldaU):
    # Build the method-specific grid of (ldaW, k, num_blocks_W).
    # Parameters that do not apply to the current methodW are set to NaN so
    # they are recorded as missing in the saved conditions.
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    if methodW == 'lasso':
        sub_conditions = [
            (ldaW, np.nan, num_blocks_W)
            for ldaW, num_blocks_W in itertools.product(l_ldaW, l_nblocksW_lasso)
        ]
    elif methodW == 'omp_fixedk':
        sub_conditions = [(np.nan, k, np.nan) for k in l_k_fixedk]
    else:
        # Unknown methodW: nothing is trained (same as the previous behaviour).
        sub_conditions = []

    for ldaW, k, num_blocks_W in sub_conditions:
        # calc U
        U, W, X = smaf(X_trainingU, d, maxItr, methodW, ldaU, ldaW=ldaW, k=k,
                       THREADS=nthread, X_normalization='paper_norm',
                       num_blocks_W=num_blocks_W, num_blocks_U=1, layer=None,
                       Normalize_U=True, saveItr=False)
        # get SMAF results (coln_U is reused later as the result column names)
        res_U, coln_U = analyse_U_W(U, W, X)
        # save U: rows are labelled with the markers, columns numbered from 1
        pd.DataFrame(U, columns=list(range(1, U.shape[1]+1)), index=sce.var_names).to_csv(
            os.path.join(U_path, 'U_expid{:d}.csv'.format(expid)))
        # append Us and results list
        Us.append(U)
        sum_res_U.append(res_U)
        cond_U.append([expid, maxItr, methodW, ldaU, num_blocks_W, ldaW, k, cur_ROI])
        expid += 1

Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5
Initialized U, W with NMF, SMAF maxItr =  5


## Produce random 200 As
Here we produced random As with maxcomposition of 2

n_A_each: n of random As produced per unique L0sum

n_L0sum: n of unique L0 sums. The specified number of L0 sums is selected in descending order, starting from the max L0sum.

g: no of total genes

m: n of composite channels

n: [min,max] of channels per gene

d_thresh: distance threshold per gene

In [18]:
# Produce random As (Phi) with a max composition of 2, i.e. n=(1, 2).
# The original line carried a trailing "#50" note (an alternative value,
# presumably for the full run -- confirm which argument it refers to).
phi_settings = dict(n_A_each=1, n_L0sum=4, m=8, n=(1, 2), d_thresh=0.7, g=16)
Phi = produce_random_As(**phi_settings)


Random 4 As with L0sum between 29 and 32. 16 genes into 8 channels, max 2 channels per gene
11 0


In [19]:
# Save the generated As as a single array in the A output directory
phi_file = os.path.join(A_path, "Phi.npy")
phi_stack = np.array(Phi)
np.save(phi_file, phi_stack)

## GMM and L2norm for normalizing with mean intensity of positive cell population

GMM is used to define the positive cell population for each protein and to obtain its mean intensity. Proteins in X were scaled to equalize the obtained mean intensities.

L2norm (||x||_2) is used for standard normalization of X.

In [20]:
# Fit a two-component GMM to each protein and summarise its positive population.
# The component with the higher mean intensity is treated as "positive".
GMMsignals = []  # mean intensity of positive cell population.
GMMnpcells = []  # no of positive cells
GMMsnrs = []     # (mean intensity of positive cell population)/(mean intensity of negative cell population)

for marker in sce.var.index:
    intensities = sce.X[:, sce.var.index == marker]
    gmm = GaussianMixture(n_components=2, random_state=230420)
    labels = gmm.fit_predict(intensities.reshape(-1, 1))

    mean_label1 = np.mean(intensities[labels == 1])
    mean_label0 = np.mean(intensities[labels == 0])
    n_label1 = np.sum(labels)  # labels are 0/1, so the sum counts label-1 cells

    if mean_label1 >= mean_label0:
        # label 1 is the positive population
        pos_mean, neg_mean = mean_label1, mean_label0
        n_positive = n_label1
    else:
        # label 0 is the positive population
        pos_mean, neg_mean = mean_label0, mean_label1
        n_positive = intensities.shape[0] - n_label1

    GMMsignals.append(float(pos_mean))
    GMMnpcells.append(n_positive)
    GMMsnrs.append(pos_mean/neg_mean)

In [21]:
# L2 norm of each gene (||x||_2), used for the standard normalization
gene_l2 = np.linalg.norm(sce.X, axis=0)
n_genes = sce.shape[1]

# Constant vectors (one entry per gene) holding the across-gene average of the
# L2 norms and of the GMM positive-population signals; they are the scaling
# references during normalization.
ave_genel2 = np.full(n_genes, np.mean(gene_l2))
GMMsignals = np.array(GMMsignals)
ave_GMMsignals = np.full(n_genes, np.mean(GMMsignals))

In [22]:
# Collect the per-protein norm statistics of X into one table and save it.
# The last column is the L2-norm / GMM-signal ratio, rescaled by its own mean.
l2_over_gmm = gene_l2/GMMsignals
norm_columns = {
    "GMMsignal": GMMsignals,
    "GMMnpcells": GMMnpcells,
    "GMMsnr": GMMsnrs,
    "genel2": gene_l2,
    "genel2_GMMsignal_ratio": l2_over_gmm/(np.mean(l2_over_gmm)),
}
df = pd.DataFrame(data=norm_columns, index=sce.var.index)

norms_csv = os.path.join(out_path, 'result_norms.csv')
df.to_csv(path_or_buf=norms_csv)

## Simulate decompression with X_normalization
### set parameters

In [23]:
# Parameters passed to simulate_A: nsr, and ldaW_dc as decoding_lasso_lda
nsr, ldaW_dc = 0, 0.02

# Normalization weights to test (1: normalize 100%, 0: no normalization).
# Plain numbers use the L2 norm; entries with a "_GMM" suffix use the GMM signal.
wt_list = [0, 0.5, 1, "0.5_GMM", "1_GMM"]

# Result containers: one summary row and one gene-wise row per simulation
sum_res_A, genewise_res_A = [], []

# Working copy of X for A whose .X is overwritten with the weighted normalization
X_trainingA_normed = X_trainingA.copy()

### simulate decompression

In [24]:
## Simulate decompression for every (weight, U, A) combination
for wt in wt_list:
    print(wt)
    # Pick the per-gene scale and its across-gene average for this weight:
    # "<w>_GMM" entries use the GMM signals, plain numbers use the L2 norms.
    wt_label = str(wt)
    if "_GMM" in wt_label:
        cwt = float(wt_label.split("_GMM")[0])
        weight, per_gene, average = cwt, GMMsignals, ave_GMMsignals
    else:
        weight, per_gene, average = wt, gene_l2, ave_genel2
    # Weighted blend between "no normalization" (average) and the per-gene scale
    normalizer = (average*(1-weight) + per_gene*weight)/average
    X_trainingA_normed.X = X_trainingA.X / normalizer

    # loop through Us
    for u_id, U in enumerate(Us):
        mincorr_seen = []  # res_A[1] of every A tried so far for this U

        # loop through As
        for phi_id, phi in enumerate(Phi):
            # simulate decompression
            X, Xhat, W, Y = simulate_A(
                X_trainingA_normed, U, phi, nsr,
                decoding_lasso_lda=ldaW_dc, outpath=None,
                THREADS=-1, layer=None, num_blocks=20,
            )

            # L0sum of this A: row-wise L0 norms, summed
            L0sum = np.linalg.norm(phi, ord=0, axis=1).sum()

            # analyse results and tag them with the current conditions
            res_A, coln_A, detail_A = analyse_decoding(phi, U, W, X, Xhat, name="", detail=True)
            res_A.extend([u_id, phi_id, L0sum, nsr, ldaW_dc, wt, cur_ROI])
            coln_A.extend(["U_id", "A_id", "A_L0_sum", "inv_SNratio", "ldaW_simA", "Xnorm_weight", "ROI"])
            sum_res_A.append(res_A)
            genewise_res_A.append(detail_A[0])

            # progress report after every 100th A: the five largest values so far
            mincorr_seen.append(res_A[1])
            if (phi_id+1) % 100 == 0:
                best5 = np.sort(np.array(mincorr_seen, dtype=float))[::-1][:5]
                print("current itr: {}, top5 mincorr:".format(phi_id), ['{:.3f}'.format(e) for e in best5])




0


  dist = 1.0 - uv / np.sqrt(uu * vv)


0.5
1
0.5_GMM
1_GMM


## save results

In [25]:
# --- U results: conditions and analysis results, joined on the row index ---
cond_U = pd.DataFrame(cond_U)
cond_U.columns = ['expid', 'maxItr', 'methodW', 'ldaU', 'num_blocks_W', 'ldaW', 'k', 'trainingA_ROI']
sum_res_U = pd.DataFrame(sum_res_U)
sum_res_U.columns = coln_U
result_U_csv = os.path.join(out_path, 'result_U.csv')
df = sum_res_U.join(cond_U)
df.to_csv(path_or_buf=result_U_csv)

# --- A results: one row per simulated decompression ---
result_A_csv = os.path.join(out_path, 'result_A.csv')
df = pd.DataFrame(sum_res_A)
df.columns = coln_A
df.to_csv(path_or_buf=result_A_csv)

# --- Gene-wise A results: one column per marker in sce ---
genewise_csv = os.path.join(out_path, 'result_A_genewise.csv')
df = pd.DataFrame(genewise_res_A)
df.columns = sce.var.index
df.to_csv(path_or_buf=genewise_csv)