In [1]:
import anndata as ad
import scanpy as sc
import numpy as np
import scipy as scp
import pandas as pd
import os
import re

In [2]:
# Read AnnData files from ./jointAdata/ into a list; only the directory
# entries from position 6 onward are loaded here.
# NOTE(review): os.listdir returns names in arbitrary order, so which files
# the [6:] slice picks is not fixed across machines — confirm this is intended.
adataNames = os.listdir('./jointAdata/')
adataList = [sc.read_h5ad('./jointAdata/' + fname) for fname in adataNames[6:]]
adataList

[AnnData object with n_obs × n_vars = 23216 × 60623 
     layers: 'spliced', 'unspliced',
 AnnData object with n_obs × n_vars = 22549 × 60623 
     layers: 'spliced', 'unspliced',
 AnnData object with n_obs × n_vars = 22840 × 60623 
     layers: 'spliced', 'unspliced',
 AnnData object with n_obs × n_vars = 22751 × 60623 
     layers: 'spliced', 'unspliced',
 AnnData object with n_obs × n_vars = 23142 × 60623 
     layers: 'spliced', 'unspliced',
 AnnData object with n_obs × n_vars = 23215 × 60623 
     layers: 'spliced', 'unspliced']

In [3]:
# Extract the sample label ('S' followed by digits) from every file name.
# The pattern also consumes the one character after the digits; the [:-1]
# slice strips that character off again.
samples = [re.search('S[0-9]+.', fname).group()[:-1] for fname in adataNames]
samples

['S4', 'S17', 'S16', 'S5', 'S15', 'S2', 'S13', 'S1', 'S3', 'S18', 'S6', 'S14']

In [4]:
# remove duplicated values that have different conditions
#
# A row is dropped when its barcode occurs more than once in the table AND its
# (barcode, treatment) combination occurs only once among those repeated rows.
# Repeated barcodes whose rows agree on 'treatment' are kept (all copies), so
# pD can still contain the same barcode several times.
phenoDat = pd.read_csv('phenoSCCall.csv')
# Non-inplace drop: re-running the cell never mutates an already displayed frame.
phenoDat = phenoDat.drop('Unnamed: 0', axis=1)
print(phenoDat.shape)
# Rows whose barcode appears more than once.
dup = phenoDat.duplicated('barcodes', keep=False)
dupPhenoDat = phenoDat.loc[dup, :]
# Among those, rows whose (barcode, treatment) pair also appears more than once.
dup2 = dupPhenoDat.duplicated(['barcodes', 'treatment'], keep=False)
dupKeep = dupPhenoDat.loc[dup2, :].sort_values('barcodes')
# dupKeep is exactly dupPhenoDat[dup2], so the rows to remove are the ones where
# dup2 is False. Taking them from the mask is linear, unlike testing each label
# against a freshly built list of dupKeep's index.
remove = list(dupPhenoDat.index[~dup2.values])
pD = phenoDat.drop(remove, axis=0)
print(pD.shape)
# Number of distinct barcodes left after the filtering.
len(set(pD['barcodes']))

(23914, 4)
(23573, 4)


23415

In [5]:
# Plain Python list of the values in pD's 'barcodes' column, in row order.
keep = [barcode for barcode in pD['barcodes']]
keep[:5]

['AAACCTGAGTTGTCGT',
 'AAACCTGCACAGCGTC',
 'AAACCTGGTACTTCTT',
 'AAAGATGGTGTGAATA',
 'AAAGCAAGTAGCTGCC']

In [6]:
# Scratch check: a range object placed inside a list literal stays a single
# element of that list; it is not expanded into its values.
[0, range(2, 6)]

[0, range(2, 6)]

In [4]:
%%time
# here use all, subset with keep later
checkSameVar = []
for i in range(1,len(adataList)):
    checkSameVar.append(all(adataList[i].var.index == adataList[i-1].var.index))
check = all(checkSameVar)
assert check == True, "The vars are not same in all adata"
#convert each sparse matrix to array
print('converting to array')
XList = []
SList = []
UList = []
obs = []
var_names = list(adataList[0].var.index)
obsNames = []
for an in adataList:
    XList.append(an.X.toarray())
    SList.append(an.layers['spliced'].toarray())
    UList.append(an.layers['unspliced'].toarray())
    obsNames += list(an.obs.index.values)
cellBarcodes = set(obsNames)
rows = len(cellBarcodes)
cols = adataList[0].shape[1]
cellDict = dict(zip(cellBarcodes,range(rows)))
X = np.zeros((rows,cols))
S = np.zeros((rows,cols))
U = np.zeros((rows,cols))
print('adding values')
for i in range(len(adataList)):
    for j in range(adataList[i].shape[0]):
        if j % 10000 == 0:
            print('done ' + str(j) + ' of file no ' + str(i+1) + ' of ' + str(len(adataList)))
        idx = cellDict[adataList[i].obs.index[j]]
        X[idx] = X[idx] + XList[i][j]
        S[idx] = S[idx] + SList[i][j]
        U[idx] = U[idx] + UList[i][j]
print('converting into sparse matrices')
X = scp.sparse.csr_matrix(X)
S = scp.sparse.csr_matrix(S)
U = scp.sparse.csr_matrix(U)
concatAdata = ad.AnnData(X,
              {'obs_names': cellBarcodes},
              {'var_names': var_names},
               layers={'spliced':S,
                      'unspliced':U})
filename = 'finalConcatanatedData.h5ad'
print('final adata is:')
print(concatAdata)
print('writing file ' + filename)
concatAdata.write(filename)

converting to array
adding values
done 0 of file no 1 of 2
done 10000 of file no 1 of 2
done 20000 of file no 1 of 2
done 0 of file no 2 of 2
done 10000 of file no 2 of 2
done 20000 of file no 2 of 2
converting into sparse matrices
final adata is:
AnnData object with n_obs × n_vars = 23585 × 60623 
    layers: 'spliced', 'unspliced'
writing file finalConcatanatedData.h5ad
CPU times: user 1min 6s, sys: 45.5 s, total: 1min 52s
Wall time: 1min 52s


In [2]:
# Read the two h5ad files below into adataList (this replaces any list
# previously bound to that name).
# NOTE(review): these look like renamed outputs of earlier partial merges —
# confirm where the _1 / _2 files come from.
adataList = [
    ad.read_h5ad(path)
    for path in ('finalConcatanatedData_1.h5ad', 'finalConcatanatedData_2.h5ad')
]