## Load data 

The data has been generated using the R package splatter.

In [1]:
# system packages
import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import scipy.linalg as la
import scipy.sparse.linalg as lsa
import os.path as op
import os


# Machine-specific absolute paths; adjust them before running on another system.
# basedir: root folder under which scanpy's figure directory is placed
# (see the sc.settings.figdir assignment in this cell).
basedir = '/home/anne/Documents/manuscripts/horizontal-pca'
# pcapath: location of the simulation code; appended to sys.path in this cell
# so that the `python.PCA...` imports resolve.
pcapath = '/home/anne/Documents/featurecloud/pca/federated_dp_pca'
# outdir: results folder. NOTE(review): not referenced by any of the visible cells.
outdir = '/home/anne/Documents/featurecloud/pca/horizontal-pca/results/single-cell-simulated'


# simulation code: put `pcapath` on the module search path. This has to run
# before the `python.PCA...` imports that follow, otherwise they cannot be found.
sys.path.append(pcapath)
from python.PCA.horizontal.horizontal_pca_benchmark import wrapper_k_variation
import python.PCA.shared_functions as sh
import python.PCA.horizontal.horizontal_pca_power_iteration as h
import python.PCA.horizontal.balcan as b
import python.PCA.horizontal.bai as bai
import python.PCA.horizontal.proxy_covariance as proxy
import python.PCA.vertical.simulate_federated_vertically_partionned_pca as vertical
import python.PCA.horizontal.leave1out as l1o

import python.PCA.comparison as co
# scanpy plotting defaults used for every figure of this notebook.
sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=True)
# Figure directory used by scanpy, located below `basedir`.
sc.settings.figdir=op.join(basedir, 'figures/single-cell/')

## Simulate PCA on three different kinds of simulated count data

Reduce the dimensionality of the data by running principal component analysis (PCA), which reveals the main axes of variation and denoises the data.

To understand the potential problems of using principal component analysis for dimensionality reduction in a federated setting, federated PCA is simulated with two different approaches: federated power iteration (an exact algorithm) and the approximate PCA algorithm from the following publication:

*Maria-Florina Balcan, Vandana Kanchanapally, Yingyu Liang, and David Woodruff. Improved distributed principal component analysis. In Proceedings of the 27th International Conference on Neural Information Processing Systems - Volume 2, NIPS’14, pages 3113–3121, Cambridge, MA, USA, 2014. MIT Press.*

In [6]:
# Seed NumPy's global random number generator so that repeated runs draw the same numbers.
# NOTE(review): presumably the simulation code draws from np.random - confirm.
np.random.seed(40)

Split the data into different chunks. Here, the data is horizontally partitioned. This means that the cells are partitioned over several virtual sites, but measurements for all genes are available.

In [27]:
def run_PCAs(inputfile, k=50, maxit=1000):
    """Run the centralised PCA and two simulated federated PCAs on one data set.

    The file is read with ``sc.read_h5ad`` and scaled in place with
    ``sc.pp.scale``. The scaled matrix is then split by the values of
    ``obs['Batch']``; every batch is handed to the simulation code as the data
    of one site. Three decompositions are attached to the returned object:

    * scanpy's standard PCA (``sc.tl.pca`` with its default arguments),
    * simulated federated power iteration, stored as ``obsm['X_pca_pow']``
      (projected data) and ``varm['PCs_pow']`` (components),
    * the simulated federated PCA of the ``balcan`` module, stored as
      ``obsm['X_pca_approx']`` and ``varm['PCs_approx']``.

    Parameters
    ----------
    inputfile : str
        Path of the ``.h5ad`` file to analyse. Its ``obs`` table must contain
        a ``'Batch'`` column.
    k : int, optional
        Number of components requested from the two federated simulations.
        Defaults to 50, the value that used to be hard-coded. It is not
        forwarded to ``sc.tl.pca``, which keeps using its own default.
    maxit : int, optional
        Forwarded as ``maxit`` to the power iteration simulation. Defaults to
        1000, the value that used to be hard-coded.

    Returns
    -------
    The object returned by ``sc.read_h5ad``, scaled and extended with the
    entries listed above.
    """
    adata = sc.read_h5ad(inputfile)
    sc.pp.scale(adata)

    # One data matrix per batch. The loop variable is deliberately not called
    # `b`: that name is the alias of the balcan module used further down.
    batch_labels = adata.obs['Batch']
    site_data = [adata[batch_labels == batch].X for batch in np.unique(batch_labels)]

    # compute scanpy's standard PCA
    sc.tl.pca(adata)

    # simulated federated power iteration; only the first of the three return
    # values (the components) is needed here
    pcs_pow, _, _ = h.simulate_distributed_horizontal(site_data, k, maxit=maxit)
    adata.obsm['X_pca_pow'] = np.dot(adata.X, pcs_pow)
    adata.varm['PCs_pow'] = pcs_pow

    # simulated federated PCA from the balcan module; the second return value
    # is not used
    pcs_approx, _ = b.simulate_federated_horizontal_pca(site_data, k)
    adata.obsm['X_pca_approx'] = np.dot(adata.X, pcs_approx)
    adata.varm['PCs_approx'] = pcs_approx
    return adata

In [28]:
def save_results(major_group_outliers, filename):
    """Write the first two components of every PCA variant to a TSV file.

    For each of the three embeddings stored in ``obsm`` the table contains one
    row per observation with the columns ``Cell``, ``Batch``, ``Group`` (copied
    from ``obs``), ``PCA`` (label of the method) and ``PC1``/``PC2`` (first two
    columns of the embedding). The three blocks are stacked in the order
    ``SIT`` (``X_pca_pow``), ``APSTACK`` (``X_pca_approx``), ``Centralised``
    (``X_pca``).

    All data is taken from the ``major_group_outliers`` argument. Reading a
    notebook-level variable instead would make the output depend on whatever
    happens to be bound to that global name.

    Parameters
    ----------
    major_group_outliers : AnnData-like
        Object providing ``obs`` with the columns ``Cell``, ``Batch`` and
        ``Group``, ``shape`` and the ``obsm`` entries ``X_pca_pow``,
        ``X_pca_approx`` and ``X_pca``.
    filename : str
        Path of the tab-separated file to write (without index column).
    """
    adata = major_group_outliers
    n_obs = adata.shape[0]
    # (label written to the 'PCA' column, key of the embedding in .obsm)
    methods = [
        ('SIT', 'X_pca_pow'),
        ('APSTACK', 'X_pca_approx'),
        ('Centralised', 'X_pca'),
    ]

    blocks = []
    for label, obsm_key in methods:
        # column vector holding the method label once per observation
        label_column = np.atleast_2d(np.array([label] * n_obs)).T
        blocks.append(
            np.concatenate([adata.obs[['Cell', 'Batch', 'Group']],
                            label_column,
                            adata.obsm[obsm_key][:, 0:2]], axis=1)
        )

    result = pd.DataFrame(
        np.concatenate(blocks, axis=0),
        columns=['Cell', 'Batch', 'Group', 'PCA', 'PC1', 'PC2'])
    result.to_csv(filename, sep='\t', index=False)


In [29]:
# Data set 1 (major_group_outlier.h5ad): run the centralised and the simulated
# federated PCAs, then write the first two components of each method to a TSV file.
# The absolute paths are machine-specific.
inputfile='/home/anne/Documents/featurecloud/pca/horizontal-pca/data/simulated/major_group_outlier.h5ad'
outputfile= '/home/anne/Documents/featurecloud/pca/horizontal-pca/data/simulated/major_group_outlier_pca.tsv'

major_group_outlier = run_PCAs(inputfile)
save_results(major_group_outlier, outputfile)

converged: 1000
(100, 1000)
(100, 1000)
(100, 1000)
(100, 1000)
(100, 1000)
Intermediate dimensions100
(500, 1000)


In [30]:
# Data set 2 (major_group.h5ad): same pipeline as for the first data set.
# NOTE(review): the result is bound to the name `major_group_outlier` again,
# although it now holds the major_group data; `inputfile`/`outputfile` are
# overwritten as well.
inputfile='/home/anne/Documents/featurecloud/pca/horizontal-pca/data/simulated/major_group.h5ad'
outputfile= '/home/anne/Documents/featurecloud/pca/horizontal-pca/data/simulated/major_group_pca.tsv'

major_group_outlier = run_PCAs(inputfile)
save_results(major_group_outlier, outputfile)

converged: 1000
(100, 1000)
(100, 1000)
(100, 1000)
(100, 1000)
(100, 1000)
Intermediate dimensions100
(500, 1000)


In [31]:
# Data set 3 (major_site.h5ad): same pipeline as for the first data set.
# NOTE(review): the result is bound to the name `major_group_outlier` again,
# although it now holds the major_site data; `inputfile`/`outputfile` are
# overwritten as well.
inputfile='/home/anne/Documents/featurecloud/pca/horizontal-pca/data/simulated/major_site.h5ad'
outputfile= '/home/anne/Documents/featurecloud/pca/horizontal-pca/data/simulated/major_site_pca.tsv'

major_group_outlier = run_PCAs(inputfile)
save_results(major_group_outlier, outputfile)

converged: 1000
(100, 1000)
(100, 1000)
(100, 1000)
(100, 1000)
(100, 1000)
Intermediate dimensions100
(500, 1000)
