In [1]:
import sys, math, csv
import pandas as pd
import numpy as np
from functools import reduce
import scipy
from scipy.stats import rankdata
from scipy import interpolate

**Federated Upper Quartile normalization:**  
*server functions:*

In [2]:
def upperquartile_federated_server_zeros(*args):
    """Return the row indices that every client reported as all-zero.

    Each positional argument is one client's array of zero-row indices.
    The arrays are folded left to right with ``np.intersect1d``, so only
    indices present in all of them remain. A single argument is returned
    unchanged.
    """
    return reduce(
        lambda shared_rows, client_rows: np.intersect1d(shared_rows, client_rows),
        args,
    )
    
def upperquartile_federated_server(*args):
    """Pool the clients' upper-quartile vectors and return their geometric mean.

    Each positional argument is one client's vector of upper quartiles.
    All values are flattened into one array and the result is
    ``exp(mean(log(values)))``. With no arguments an error message is
    printed and ``None`` is returned.
    """
    if not args:
        print('Error: the function needs at least one vector to work.')
        return None

    # Seed with an empty float array so the pooled dtype matches what
    # repeated np.append onto an empty list would produce.
    pieces = [np.array([])]
    pieces.extend(np.ravel(client_vector) for client_vector in args)
    pooled = np.concatenate(pieces)

    return np.exp(np.log(pooled).mean())

*client functions:*

In [5]:
def upperquartile_federated_client_zeros(data):
    """Return the positional indices of the rows of ``data`` that are entirely zero.

    ``data`` is the client's DataFrame. If it contains any missing value,
    an error message is printed and ``None`` is returned instead.
    """
    contains_missing = data.isnull().values.any()
    if contains_missing:
        print("Error: the function can't handle NaN in data.\n")
        return None

    # Boolean flag per row: True when every column of that row equals 0.
    row_is_zero = data.eq(0).all(axis=1)
    return np.nonzero(row_is_zero.to_numpy())[0]
    
    
#Calculates for each sample (column) of the client the upper quartile.
#The implementation is based on the implementation of the
#calcNormFactors method in bioconductor edgeR.
#Robinson and Smyth, 2020
def upperquartile_federated_client_calc(data,all_zero):
    """Return the 75th percentile of every column after dropping the given rows.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        The client's matrix; quantiles are taken down each column.
    all_zero : iterable of int
        Positional row indices to exclude before computing the quantile.
        Indices outside the row range are ignored.

    Returns
    -------
    numpy.ndarray
        One upper quartile per column. If fewer than two rows remain after
        filtering, a vector of ones (one entry per column) is returned.

    Notes
    -----
    The rows are filtered for DataFrame and plain-array input alike, so every
    client works on the same set of rows. The quantile is not divided by the
    column sums here.
    """
    values = data.to_numpy() if isinstance(data, pd.DataFrame) else np.asarray(data)
    n, m = values.shape

    # Sorted positional indices of the rows to keep; setdiff1d silently
    # ignores indices that do not occur in range(n).
    dropped = np.asarray(list(all_zero), dtype=np.intp)
    keep_rows = np.setdiff1d(np.arange(n), dropped)
    values = values[keep_rows]

    if values.shape[0] <= 1:
        return np.array(m * [1])

    # np.quantile does not need pre-sorted input.
    return np.quantile(values, 0.75, axis=0)

def upperquartile_federated_client_result(data, uquartile, global_result):
    """Scale the client's data by its per-column upper-quartile factors.

    Parameters
    ----------
    data : pandas.DataFrame
        The client's matrix.
    uquartile : numpy.ndarray
        One upper quartile per column of ``data``.
    global_result : float
        The aggregated value returned by the server.

    Returns
    -------
    pandas.DataFrame
        ``data`` divided column-wise by ``uquartile / global_result``.
    """
    # Per-column scaling factor relative to the global value.
    f_s = uquartile / global_result
    return data / f_s

**Federated Quantile normalization:**  
*server function:*

In [6]:
#Aggregates the mean values of the clients.
def quantile_federated_server_new(*means):
    """Combine the clients' ``[count, sum]`` pairs into one global mean.

    Parameters
    ----------
    *means : sequence
        One pair per client: ``pair[0]`` is the number of samples that went
        into the sum, ``pair[1]`` is the client's sum (scalar or vector).

    Returns
    -------
    The sum of all client sums divided by the total sample count, or ``None``
    (after printing an error) when called without arguments.

    Notes
    -----
    A single client is handled like any other number of clients, so the
    return value is always the mean rather than the raw input pair.
    """
    if len(means) == 0:
        print('Error: the function needs at least one vector to work.')
        return None

    total_samples = sum(pair[0] for pair in means)

    # Accumulate with `+` (not `+=`) so the first client's array is not mutated.
    total_sum = means[0][1]
    for pair in means[1:]:
        total_sum = total_sum + pair[1]

    return total_sum / total_samples

*client functions:*

In [7]:
#Prepares the calculation of the quantiles. Calculates the sum vector of the client.
#The implementation is based on the implementation of the
#normalizeBetweenArrays method in bioconductor limma.
#Gordon and Smyth, 2005
def quantile_federated_client_mean_new(data):
    """Compute the client's contribution to the global quantile means.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D numpy.ndarray
        The client's matrix; each column is sorted independently.

    Returns
    -------
    ([count, local_sum], nobs) or None
        ``count`` is the number of columns, ``local_sum`` the row-wise sum of
        the column-sorted matrix, and ``nobs`` the number of non-NaN values
        per column. For a single-row input, ``count`` and ``local_sum`` are the
        number and sum of the non-NaN entries and ``nobs`` is a vector of ones
        of that length. ``None`` is returned (after printing "Error") when
        ``data`` is not two-dimensional.
    """
    if isinstance(data, pd.DataFrame):
        data = data.to_numpy()

    try:
        n, m = data.shape
    except ValueError:
        print("Error")
        return None

    # Always work on a float copy so the caller's data is never modified.
    arr = data.astype(np.float64)

    if m == 1:
        # Only a notice: processing continues with the single column.
        print("Error: you need more than one sample")
    if n == 1:
        arr = arr[~np.isnan(arr)]
        return [len(arr), np.sum(arr)], np.array(len(arr) * [1])

    sorted_cols = np.full((n, m), np.nan)
    nobs = np.array(m * [n])
    # Evenly spaced positions in [0, 1], one per row.
    grid = np.arange(n) / (n - 1)

    for j in range(m):
        col = np.sort(arr[:, j])  # np.sort places NaN at the end
        nobsj = col.size - np.count_nonzero(np.isnan(col))
        if nobsj < n:
            # Column has missing values: stretch the observed sorted values
            # onto the full n-point grid by linear interpolation.
            nobs[j] = nobsj
            col = col[~np.isnan(col)]
            f = scipy.interpolate.interp1d(np.arange(nobsj) / (nobsj - 1), col)
            sorted_cols[:, j] = f(grid)
        else:
            sorted_cols[:, j] = col

    local_sum = np.sum(sorted_cols, axis=1)

    return [m, local_sum], nobs
    
#Calculates the result of the normalization.
def quantile_federated_client_result_new(data, mean, nobs):
    """Map every value of ``data`` onto the global quantile means by its rank.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D numpy.ndarray
        The client's matrix.
    mean : array-like
        The global mean vector, one entry per row of ``data``
        (a scalar when ``data`` has a single row).
    nobs : sequence of int
        Number of non-NaN values per column of ``data``.

    Returns
    -------
    pandas.DataFrame, numpy.ndarray or None
        A DataFrame of normalized values with NaN positions left as NaN.
        For a single-row input, an array holding ``mean`` once per column.
        ``None`` (after printing "Error") when ``data`` is not two-dimensional.
    """
    if isinstance(data, pd.DataFrame):
        data = data.to_numpy()

    try:
        n, m = data.shape
    except ValueError:
        print("Error")
        return None

    # Float copy: the caller's data is never modified.
    arr = data.astype(np.float64)

    if n == 1:
        return np.array(m * [mean])

    grid = np.arange(n) / (n - 1)
    # The interpolator depends only on grid and mean, so build it once.
    f = scipy.interpolate.interp1d(grid, mean)

    for j in range(m):
        if nobs[j] < n:
            # Rank only the observed values: recent SciPy versions propagate
            # NaN through rankdata, which would turn every rank into NaN.
            isna = np.isnan(arr[:, j])
            r = rankdata(arr[~isna, j], method='average')
            arr[~isna, j] = f((r - 1) / (nobs[j] - 1))
        else:
            r = rankdata(arr[:, j], method='average')
            arr[:, j] = f((r - 1) / (n - 1))

    return pd.DataFrame(arr)

-----------------------------------------------------------------------------------
**Examples:**

In [8]:
# Load the three clients' count matrices (the CSV files have no header row).
client01, client02, client03 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("A2.counts.csv", "A7.counts.csv", "A8.counts.csv")
)

# Round 1: each client reports its all-zero rows; the server keeps the rows
# that are zero on every client.
zero_client01, zero_client02, zero_client03 = (
    upperquartile_federated_client_zeros(frame)
    for frame in (client01, client02, client03)
)
global_zeros = upperquartile_federated_server_zeros(zero_client01, zero_client02, zero_client03)

# Round 2: each client computes its upper quartiles without the shared zero
# rows; the server aggregates them into one global value.
result01, result02, result03 = (
    upperquartile_federated_client_calc(frame, global_zeros)
    for frame in (client01, client02, client03)
)
uq = upperquartile_federated_server(result01, result02, result03)

# Round 3: each client rescales its own data with the global value.
local_result01, local_result02, local_result03 = (
    upperquartile_federated_client_result(frame, quartiles, uq)
    for frame, quartiles in (
        (client01, result01),
        (client02, result02),
        (client03, result03),
    )
)

# To preview a result, evaluate e.g. local_result01.head(5) in a cell.

In [9]:
# Load the three clients' count matrices (the CSV files have no header row).
client01, client02, client03 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("A2.counts.csv", "A7.counts.csv", "A8.counts.csv")
)

# Round 1: each client sends its [count, sum] pair; nobs stays on the client.
(tupel_client1, nobs1), (tupel_client2, nobs2), (tupel_client3, nobs3) = (
    quantile_federated_client_mean_new(frame)
    for frame in (client01, client02, client03)
)

# Server: combine the pairs into the global mean vector.
global_mean = quantile_federated_server_new(tupel_client1, tupel_client2, tupel_client3)

# Round 2: each client maps its own values onto the global means.
I, II, III = (
    quantile_federated_client_result_new(frame, global_mean, observed)
    for frame, observed in (
        (client01, nobs1),
        (client02, nobs2),
        (client03, nobs3),
    )
)

# To preview a result, evaluate e.g. I.head(5) in a cell.