In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import muon as mu
import matplotlib.pyplot as plt
from muon import atac as ac
from muon import prot as pt
import anndata as ad
import random
import os
import scipy

In [None]:
# Directory holding the SHARE-seq K562 ATAC counts (matrix.mtx / features.tsv / barcodes.tsv)
path_ATAC= '/Users/alexandra/Desktop/Data/SingleCell/K652 /ISSAAC-seq-main/other_methods_preprocessing/share-seq/K562/ATAC'
matrix_ATAC_file, peaks_ATAC_file, barcodes_ATAC_file = (
    os.path.join(path_ATAC, file_name)
    for file_name in ('matrix.mtx', 'features.tsv', 'barcodes.tsv')
)

# Count matrix: parsed from Matrix Market format and converted to CSC
sparse_ATAC_matrix = scipy.io.mmread(matrix_ATAC_file).tocsc()
# Sparse-DataFrame wrapper around the same counts
# NOTE(review): no later use of `matrix_ATAC` was found in the cells reviewed - confirm it is needed
matrix_ATAC = pd.DataFrame.sparse.from_spmatrix(sparse_ATAC_matrix)

# Peak table: tab-separated, no header row, so columns are labelled 0, 1, 2, ...
peaks_ATAC = pd.read_csv(peaks_ATAC_file, sep='\t', header=None)

# Cell barcodes: tab-separated, no header row
barcodes_ATAC = pd.read_csv(barcodes_ATAC_file, sep='\t', header=None)

In [None]:
# Assemble the ATAC AnnData object. The count matrix is transposed so that its
# rows line up with `barcodes_ATAC` (obs) and its columns with `peaks_ATAC` (var).
adata_ATAC = ad.AnnData(
    X=sparse_ATAC_matrix.transpose(),
    obs=barcodes_ATAC,
    var=peaks_ATAC,
)



In [None]:
# Preview the ATAC peak table before renaming: its columns still carry the
# positional labels 0, 1, 2 because features.tsv was read without a header
adata_ATAC.var.head()

Unnamed: 0,0,1,2
0,chr1,540931,541007
1,chr1,713821,714533
2,chr1,740180,740374
3,chr1,752681,753146
4,chr1,762023,763282


In [None]:
# Replace the positional column labels of the peak table with descriptive names
peak_column_names = {0: 'Chr', 1: 'Start', 2: 'End'}
adata_ATAC.var.rename(columns=peak_column_names, inplace=True)

# Store the coordinates as strings so they can be concatenated into peak names
for coord_col in ('Start', 'End'):
    adata_ATAC.var[coord_col] = adata_ATAC.var[coord_col].astype(str)

adata_ATAC.var.head()

Unnamed: 0,Chr,Start,End
0,chr1,540931,541007
1,chr1,713821,714533
2,chr1,740180,740374
3,chr1,752681,753146
4,chr1,762023,763282


In [None]:
# Name every peak "<Chr>_<Start>_<End>" and use that string as the var index
peak_table = adata_ATAC.var
peak_ids = peak_table['Chr'] + "_" + peak_table['Start'] + "_" + peak_table['End']
adata_ATAC.var_names = peak_ids
adata_ATAC.var_names[:5]

Index(['chr1_540931_541007', 'chr1_713821_714533', 'chr1_740180_740374',
       'chr1_752681_753146', 'chr1_762023_763282'],
      dtype='object')

In [None]:
# Label the single barcode column of the ATAC obs table (read headerless as column 0)
atac_obs_column_names = {0: 'Cell_index'}
adata_ATAC.obs.rename(columns=atac_obs_column_names, inplace=True)
adata_ATAC.obs.head()

Unnamed: 0,Cell_index
0,"R1.75,R2.54,R3.52,P1.33"
1,"R1.65,R2.43,R3.90,P1.37"
2,"R1.82,R2.52,R3.18,P1.36"
3,"R1.89,R2.40,R3.58,P1.36"
4,"R1.86,R2.28,R3.80,P1.37"


In [None]:
# Use the barcode string of each cell as the ATAC obs index
adata_ATAC.obs_names = adata_ATAC.obs.loc[:, "Cell_index"]
adata_ATAC.obs_names[:5]

Index(['R1.75,R2.54,R3.52,P1.33', 'R1.65,R2.43,R3.90,P1.37',
       'R1.82,R2.52,R3.18,P1.36', 'R1.89,R2.40,R3.58,P1.36',
       'R1.86,R2.28,R3.80,P1.37'],
      dtype='object', name='Cell_index')

In [None]:
# Reorder the ATAC cells alphabetically by cell name.
# Indexing an AnnData object returns a view that still references the unsorted
# parent; `.copy()` materialises the reordered data as a standalone object, so
# later writes do not trigger an implicit copy (and the accompanying warning).
adata_ATAC = adata_ATAC[adata_ATAC.obs.sort_index().index].copy()
adata_ATAC.obs_names[0:5]

Index(['R1.65,R2.01,R3.04,P1.31', 'R1.65,R2.01,R3.28,P1.31',
       'R1.65,R2.01,R3.47,P1.33', 'R1.65,R2.02,R3.13,P1.36',
       'R1.65,R2.02,R3.53,P1.31'],
      dtype='object', name='Cell_index')

In [None]:
# Directory holding the SHARE-seq K562 RNA counts (matrix.mtx / features.tsv / barcodes.tsv)
path_RNA = '/Users/alexandra/Desktop/Data/SingleCell/K652 /ISSAAC-seq-main/other_methods_preprocessing/share-seq/K562/RNA'
matrix_RNA_file, genes_RNA_file, barcodes_RNA_file = (
    os.path.join(path_RNA, file_name)
    for file_name in ('matrix.mtx', 'features.tsv', 'barcodes.tsv')
)

# Count matrix: parsed from Matrix Market format and converted to CSC
sparse_RNA_matrix = scipy.io.mmread(matrix_RNA_file).tocsc()
# Sparse-DataFrame wrapper around the same counts
# NOTE(review): no later use of `matrix_RNA` was found in the cells reviewed - confirm it is needed
matrix_RNA = pd.DataFrame.sparse.from_spmatrix(sparse_RNA_matrix)

# Gene table: tab-separated, no header row, so columns are labelled 0, 1, ...
genes_RNA = pd.read_csv(genes_RNA_file, sep='\t', header=None)

# Cell barcodes: tab-separated, no header row
barcodes_RNA = pd.read_csv(barcodes_RNA_file, sep='\t', header=None)

In [None]:
# Assemble the RNA AnnData object. The count matrix is transposed so that its
# rows line up with `barcodes_RNA` (obs) and its columns with `genes_RNA` (var).
adata_RNA = ad.AnnData(
    X=sparse_RNA_matrix.transpose(),
    obs=barcodes_RNA,
    var=genes_RNA,
)



In [None]:
# Label the gene column of the RNA var table (read headerless as column 0),
# then use the gene names as the var index
rna_var_column_names = {0: 'Gene'}
adata_RNA.var.rename(columns=rna_var_column_names, inplace=True)
adata_RNA.var_names = adata_RNA.var.loc[:, 'Gene']
adata_RNA.var.head()

Unnamed: 0_level_0,Gene
Gene,Unnamed: 1_level_1
5S_rRNA,5S_rRNA
7SK,7SK
A1BG,A1BG
A1CF,A1CF
A2M,A2M


In [None]:
# Label the barcode column of the RNA obs table (read headerless as column 0),
# then use the barcode strings as the obs index
rna_obs_column_names = {0: 'Cell_index'}
adata_RNA.obs.rename(columns=rna_obs_column_names, inplace=True)
adata_RNA.obs_names = adata_RNA.obs.loc[:, 'Cell_index']
adata_RNA.obs.head()

Unnamed: 0_level_0,Cell_index
Cell_index,Unnamed: 1_level_1
"R1.65,R2.01,R3.04,P1.39","R1.65,R2.01,R3.04,P1.39"
"R1.65,R2.01,R3.28,P1.39","R1.65,R2.01,R3.28,P1.39"
"R1.65,R2.01,R3.47,P1.41","R1.65,R2.01,R3.47,P1.41"
"R1.65,R2.01,R3.93,P1.40","R1.65,R2.01,R3.93,P1.40"
"R1.65,R2.02,R3.13,P1.44","R1.65,R2.02,R3.13,P1.44"


In [None]:
# Reorder the RNA cells alphabetically by cell name.
# Indexing an AnnData object returns a view that still references the unsorted
# parent; `.copy()` materialises the reordered data as a standalone object, so
# later writes do not trigger an implicit copy (and the accompanying warning).
adata_RNA = adata_RNA[adata_RNA.obs.sort_index().index].copy()
adata_RNA.obs_names[0:5]

Index(['R1.65,R2.01,R3.04,P1.39', 'R1.65,R2.01,R3.28,P1.39',
       'R1.65,R2.01,R3.47,P1.41', 'R1.65,R2.01,R3.93,P1.40',
       'R1.65,R2.02,R3.13,P1.44'],
      dtype='object', name='Cell_index')

In [None]:
# Compare cell names between ATAC and RNA by eye: first the sorted RNA names
adata_RNA.obs.index

Index(['R1.65,R2.01,R3.04,P1.39', 'R1.65,R2.01,R3.28,P1.39',
       'R1.65,R2.01,R3.47,P1.41', 'R1.65,R2.01,R3.93,P1.40',
       'R1.65,R2.02,R3.13,P1.44', 'R1.65,R2.02,R3.53,P1.39',
       'R1.65,R2.02,R3.55,P1.41', 'R1.65,R2.02,R3.60,P1.42',
       'R1.65,R2.02,R3.84,P1.40', 'R1.65,R2.03,R3.13,P1.41',
       ...
       'R1.96,R2.92,R3.57,P1.42', 'R1.96,R2.93,R3.08,P1.45',
       'R1.96,R2.93,R3.16,P1.39', 'R1.96,R2.94,R3.12,P1.43',
       'R1.96,R2.94,R3.12,P1.45', 'R1.96,R2.94,R3.61,P1.39',
       'R1.96,R2.94,R3.89,P1.45', 'R1.96,R2.95,R3.90,P1.42',
       'R1.96,R2.96,R3.40,P1.45', 'R1.96,R2.96,R3.51,P1.41'],
      dtype='object', name='Cell_index', length=8515)

In [63]:
# Sorted ATAC cell names, for comparison with the RNA cell names
adata_ATAC.obs.index

Index(['R1.65,R2.01,R3.04,P1.31', 'R1.65,R2.01,R3.28,P1.31',
       'R1.65,R2.01,R3.47,P1.33', 'R1.65,R2.02,R3.13,P1.36',
       'R1.65,R2.02,R3.53,P1.31', 'R1.65,R2.02,R3.55,P1.33',
       'R1.65,R2.02,R3.60,P1.34', 'R1.65,R2.03,R3.17,P1.37',
       'R1.65,R2.03,R3.43,P1.37', 'R1.65,R2.04,R3.09,P1.35',
       ...
       'R1.96,R2.93,R3.08,P1.37', 'R1.96,R2.93,R3.16,P1.31',
       'R1.96,R2.94,R3.12,P1.35', 'R1.96,R2.94,R3.12,P1.37',
       'R1.96,R2.94,R3.61,P1.31', 'R1.96,R2.94,R3.89,P1.37',
       'R1.96,R2.95,R3.90,P1.34', 'R1.96,R2.95,R3.90,P1.37',
       'R1.96,R2.96,R3.40,P1.37', 'R1.96,R2.96,R3.51,P1.33'],
      dtype='object', name='Cell_index', length=7744)

In [None]:
# Cell names shared by RNA and ATAC when the complete "R1,R2,R3,P1" string is compared
set(adata_RNA.obs_names).intersection(adata_ATAC.obs_names)

# Result: no full R1/R2/R3/P1 combination occurs in both modalities

set()

In [None]:
# R1, R2 and R3 might be rounds 1-3 of the barcoding process (assumption - TODO confirm).
# Keep only these first three comma-separated fields of every cell name.
barcode_1 = adata_ATAC.obs_names
barcode_2 = adata_RNA.obs_names


def _first_three_fields(fields):
    """Join the first three barcode fields back into one comma-separated string."""
    return ','.join(fields[:3])


# Break each cell name into its comma-separated fields
barcode_1_split = [cell_name.split(',') for cell_name in barcode_1]
barcode_2_split = [cell_name.split(',') for cell_name in barcode_2]

# One-column tables of the truncated keys (1 = ATAC, 2 = RNA), in cell order
barcode_unique_1 = pd.DataFrame({'Cell_index': list(map(_first_three_fields, barcode_1_split))})
barcode_unique_2 = pd.DataFrame({'Cell_index': list(map(_first_three_fields, barcode_2_split))})

In [None]:
# Check whether the truncated R1,R2,R3 key identifies RNA cells uniquely
rna_round_keys = barcode_unique_2['Cell_index']
print("RNA is uique: ", rna_round_keys.is_unique)

# Every RNA key that occurs more than once (all occurrences are kept)
duplicates = rna_round_keys.loc[rna_round_keys.duplicated(keep=False)]
duplicates

RNA is uique:  False


144     R1.65,R2.39,R3.81
145     R1.65,R2.39,R3.81
172     R1.65,R2.46,R3.84
173     R1.65,R2.46,R3.84
186     R1.65,R2.50,R3.41
              ...        
8435    R1.96,R2.64,R3.20
8502    R1.96,R2.91,R3.23
8503    R1.96,R2.91,R3.23
8508    R1.96,R2.94,R3.12
8509    R1.96,R2.94,R3.12
Name: Cell_index, Length: 300, dtype: object

In [None]:
# Check whether the truncated R1,R2,R3 key identifies ATAC cells uniquely
atac_round_keys = barcode_unique_1['Cell_index']
print("ATAC is uique: ", atac_round_keys.is_unique)

# Every ATAC key that occurs more than once (all occurrences are kept)
duplicates = atac_round_keys.loc[atac_round_keys.duplicated(keep=False)]
duplicates

ATAC is uique:  False


127     R1.65,R2.39,R3.81
128     R1.65,R2.39,R3.81
165     R1.65,R2.50,R3.41
166     R1.65,R2.50,R3.41
202     R1.65,R2.60,R3.21
              ...        
7731    R1.96,R2.91,R3.23
7736    R1.96,R2.94,R3.12
7737    R1.96,R2.94,R3.12
7740    R1.96,R2.95,R3.90
7741    R1.96,R2.95,R3.90
Name: Cell_index, Length: 256, dtype: object

In [95]:
# Distinct R1,R2,R3 keys present in both modalities. Sets drop repeated keys,
# so this counts distinct keys rather than individual cells.
atac_key_set = set(barcode_unique_1['Cell_index'])
rna_key_set = set(barcode_unique_2['Cell_index'])
overlapped_cells = atac_key_set.intersection(rna_key_set)
print("Number of overlapped cells in RNA and ATAC: ", len(overlapped_cells))

Number of overlapped cells in RNA and ATAC:  7102
