In [1]:
import sys
import os
import muon as mu


# Add the Functions folder to your path
sys.path.append(os.path.abspath("/Users/alexandra/Desktop/EG-CLR/Functions"))

# Now import your functions
from QC_functions import *
from CLR_functions import *

In [2]:
# Load the multimodal dataset (.h5mu file) with muon
mdata = mu.read("/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/processed_data/multiome_data.h5mu")

# Pull the two modalities out by key so each can be handled on its own below
adata_gem = mdata['rna']
adata_atac = mdata['atac']

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


In [3]:
# Obtain the original read counts from the 10x filtered feature-barcode matrix
adata_org = sc.read_10x_mtx(
    '/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/filtered_feature_bc_matrix',
    gex_only=False,  # keep every feature type; the peaks are selected below
)

# Boolean masks over the features (the columns of adata_org), one per feature type
gex_rows = (adata_org.var['feature_types'] == 'Gene Expression').tolist()
atac_rows = (adata_org.var['feature_types'] == 'Peaks').tolist()

# One independent copy per modality, each with de-duplicated feature names
adata_gem_org = adata_org[:, gex_rows].copy()
adata_gem_org.var_names_make_unique()

adata_atac_org = adata_org[:, atac_rows].copy()
adata_atac_org.var_names_make_unique()

In [4]:
# Replace the read counts with the original read counts:
# keep only the features and cells that are present in the processed objects
gem_index = adata_gem.var_names
atac_index = adata_atac.var.index

cell_index = adata_gem.obs.index

# Features first, then cells; a single copy at the end materialises the result
adata_gem_new = adata_gem_org[:, adata_gem_org.var_names.isin(gem_index)]
adata_gem_new = adata_gem_new[adata_gem_new.obs_names.isin(cell_index)].copy()

adata_atac_new = adata_atac_org[:, adata_atac_org.var_names.isin(atac_index)]
adata_atac_new = adata_atac_new[adata_atac_new.obs_names.isin(cell_index)].copy()


In [5]:
# Normalize the scRNA-seq counts of the chromosome subset
features_tsv = '/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/filtered_feature_bc_matrix/features.tsv.gz'

adata_gem_new = assign_chr(features_tsv, adata_gem_new)
# NOTE(review): -1 presumably selects chrX (the variable names say so) — confirm in QC_functions
adata_gem_chrx = subset_adata_by_chromosome(adata_gem_new, -1)

# Scale each cell to 1e4 total counts, then log1p-transform (both modify adata_gem_chrx)
sc.pp.normalize_total(adata_gem_chrx, target_sum=1e4)
sc.pp.log1p(adata_gem_chrx)
adata_gem_chrx

AnnData object with n_obs × n_vars = 10359 × 503
    var: 'gene_ids', 'feature_types', 'Chromosome', 'Start', 'End'
    uns: 'log1p'

In [6]:
# Define promoter, enhancer and gene body regions for the peaks, then attach chromosome coordinates
features_tsv = '/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/filtered_feature_bc_matrix/features.tsv.gz'
peak_annotation_tsv = "/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/atac_peak_annotation.tsv"

adata_atac_new = assign_loc(features_tsv, peak_annotation_tsv, adata_atac_new)
adata_atac_new = assign_chr(features_tsv, adata_atac_new)

# NOTE(review): -1 presumably selects chrX (the variable names say so) — confirm in QC_functions
adata_atac_chrx = subset_adata_by_chromosome(adata_atac_new, -1)
adata_atac_chrx

AnnData object with n_obs × n_vars = 10359 × 3424
    var: 'gene_ids', 'feature_types', 'gene', 'distance', 'peak_type', 'Chromosome', 'Start', 'End'

In [7]:
# Binarize scATAC-seq: every entry greater than 0 becomes 1, everything else becomes 0
adata_atac_chrx.X = (adata_atac_chrx.X > 0).astype(int)

In [11]:
# Compute the matrix that is passed to CLR_Matrix in the next cell, from the
# chromosome-subset expression data and the binarized accessibility data.
# NOTE(review): presumably a peak-by-gene mutual-information matrix — confirm in CLR_functions
MI_Matrix_chrx = MI_Matrix_MIinfoClassif(adata_gem_chrx, adata_atac_chrx)

In [12]:
# Derive the CLR matrix from MI_Matrix_chrx (CLR_Matrix comes from CLR_functions)
CLR_matrix_chrx = CLR_Matrix(MI_Matrix_chrx)

In [None]:
# NOTE(review): GATA1 is not assigned anywhere above this cell; it is first created in a
# later cell (GATA1 = CLR_matrix_chrx["GATA1"].copy()). On a fresh kernel this call only
# works if something else provides that name — TODO confirm which object should be written.
write_matrixes("ChrX_allgene_allpeaks",GATA1)

In [None]:
# Wrap the CLR values in an AnnData object: the rows (obs) take the peak annotations
# from the ATAC object and the columns (var) take the gene annotations from the RNA object.
adata_CLR_matrix_chrx = ad.AnnData(X=CLR_matrix_chrx.values)
adata_CLR_matrix_chrx.obs = adata_atac_chrx.var.copy()
adata_CLR_matrix_chrx.var = adata_gem_chrx.var.copy()
adata_CLR_matrix_chrx

AnnData object with n_obs × n_vars = 3424 × 503
    obs: 'gene_ids', 'feature_types', 'gene', 'distance', 'peak_type', 'Chromosome', 'Start', 'End'
    var: 'gene_ids', 'feature_types', 'Chromosome', 'Start', 'End', 'gene'

In [None]:
# Label the CLR matrix so it can be queried by name: rows are peaks, columns are genes.
# Both axes are labelled with the feature names (var_names) of the corresponding AnnData.
# A lookup of var['geng_ids'] raises KeyError: no such column exists in adata_gem_chrx.var.
# NOTE(review): assumes the ATAC var_names are "chrX:start-end" strings and the RNA
# var_names are gene symbols, as the displayed result (peak-coordinate rows, a "GATA1"
# column) indicates — confirm against the objects.
CLR_matrix_chrx = pd.DataFrame(
    CLR_matrix_chrx.values,
    index=adata_atac_chrx.var_names,
    columns=adata_gem_chrx.var_names,
)

# CLR score of every peak against GATA1
GATA1 = CLR_matrix_chrx["GATA1"].copy()
GATA1

chrX:62333-63291            0.000000
chrX:63937-64751            1.680467
chrX:68722-69507            0.000000
chrX:69671-70530            0.083656
chrX:162739-163466          0.000000
                              ...   
chrX:155249188-155250096    1.749103
chrX:155253186-155254082    0.000000
chrX:155254614-155255426    1.475307
chrX:155258082-155258986    0.000000
chrX:155259726-155260695    1.619834
Name: GATA1, Length: 3424, dtype: float64

In [89]:
# Look up one gene's column in a CLR matrix and return the genomic coordinates
# of the peaks whose CLR value for that gene is positive.
def EG_pair_by_name(gene, CLR_Matrix):
    """Return the coordinates of the peaks with a positive CLR value for ``gene``.

    Parameters
    ----------
    gene : label of the column to read from ``CLR_Matrix``.
    CLR_Matrix : DataFrame with one column per gene and an index of peak
        labels written as ``chr<name>:<start>-<end>``.

    Returns
    -------
    DataFrame with columns ``chr``, ``start`` and ``end`` and a fresh 0..n-1
    index, one row per peak whose value is > 0. ``start`` and ``end`` are the
    captured text, not integers; a label that does not match the pattern
    yields a row of missing values. If ``gene`` is not a column, a warning is
    printed and ``None`` is returned.
    """
    if gene in CLR_Matrix.columns:
        scores = CLR_Matrix[gene].copy()
        positive_scores = scores[scores > 0]

        # Split each peak label into its chromosome / start / end parts
        coordinates = positive_scores.index.str.extract(r'^(chr\w+):(\d+)-(\d+)$')
        coordinates.columns = ['chr', 'start', 'end']
        return coordinates

    print(f"Warning: {gene} not found in CLR matrix.")
    return None

In [90]:
# Coordinates (chr, start, end) of the peaks whose CLR value for GATA1 is positive
GATA1_EG_pair = EG_pair_by_name("GATA1", CLR_matrix_chrx)
GATA1_EG_pair

Unnamed: 0,chr,start,end
0,chrX,63937,64751
1,chrX,69671,70530
2,chrX,167850,168779
3,chrX,176215,177159
4,chrX,192744,193400
...,...,...,...
1953,chrX,155232071,155232857
1954,chrX,155240875,155241468
1955,chrX,155249188,155250096
1956,chrX,155254614,155255426


In [85]:
# Read sheet 'Supplementary Table 6a'; header=1 takes the column names from the second row
CRISPRi_data = pd.read_excel(
    '/Users/alexandra/Desktop/Data/CRISPRiFlowFISH/41588_2019_538_MOESM3_ESM.xlsx',
    sheet_name='Supplementary Table 6a',
    skiprows=0,
    header=1,
)

# Keep the rows for GATA1 and only the columns used further down
CRISPRi_GATA1 = CRISPRi_data.loc[
    CRISPRi_data['Gene'] == 'GATA1',
    ['chr', 'start', 'end', 'class', 'Significant'],
].copy()
CRISPRi_GATA1

  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)


Unnamed: 0,chr,start,end,class,Significant
196,chrX,48641372,48641493,intergenic,True
197,chrX,48659028,48659074,intergenic,True
198,chrX,48660375,48660526,promoter,True
1848,chrX,46655945,46656445,intergenic,False
1849,chrX,46696085,46696605,promoter,False
...,...,...,...,...,...
2117,chrX,49962985,49963545,genic,False
2118,chrX,49969065,49969605,intergenic,False
2119,chrX,50191545,50192045,genic,False
2120,chrX,50322885,50323405,intergenic,False


In [95]:
# Split the GATA1 elements by the table's "Significant" flag.
# Both comparisons are spelled out so rows whose flag is neither True nor False land in neither set.
CRISPRi_GATA1_TP = CRISPRi_GATA1.loc[CRISPRi_GATA1["Significant"] == True]
CRISPRi_GATA1_TN = CRISPRi_GATA1.loc[CRISPRi_GATA1["Significant"] == False]

In [112]:
# Reduce the significant elements to their three coordinate columns
CRISPRi_GATA1_TP = CRISPRi_GATA1_TP.loc[:, ['chr', 'start', 'end']].copy()
CRISPRi_GATA1_TP

Unnamed: 0,chr,start,end
196,chrX,48641372,48641493
197,chrX,48659028,48659074
198,chrX,48660375,48660526
1993,chrX,48619903,48620645
1999,chrX,48643025,48643685
2003,chrX,48700065,48700565
2065,chrX,49040725,49041225
2081,chrX,49100665,49101185


In [97]:
CRISPRi_GATA1_TN

Unnamed: 0,chr,start,end,class,Significant
1848,chrX,46655945,46656445,intergenic,False
1849,chrX,46696085,46696605,promoter,False
1850,chrX,46696685,46697185,promoter,False
1851,chrX,46697525,46698025,genic,False
1852,chrX,46771460,46772025,promoter,False
...,...,...,...,...,...
2117,chrX,49962985,49963545,genic,False
2118,chrX,49969065,49969605,intergenic,False
2119,chrX,50191545,50192045,genic,False
2120,chrX,50322885,50323405,intergenic,False


In [103]:
# Bounds of the region covered by the GATA1 CRISPRi elements:
# the smallest start and the largest end over all rows (significant or not)
CRISPRi_GATA1_start =  CRISPRi_GATA1['start'].min()
CRISPRi_GATA1_end = CRISPRi_GATA1['end'].max() 

print("CRISPR for GATA1 starts at ",CRISPRi_GATA1_start, " end at ", CRISPRi_GATA1_end)

CRISPR for GATA1 starts at  46655945  end at  50435165


In [113]:
# Keep the peaks that lie strictly inside the CRISPRi window
# (start above the window start and end below the window end)
peak_starts = GATA1_EG_pair["start"].values.astype(int)
peak_ends = GATA1_EG_pair["end"].values.astype(int)
inside_window = (peak_ends < CRISPRi_GATA1_end) & (peak_starts > CRISPRi_GATA1_start)

GATA1_EG_pair_limited = GATA1_EG_pair[inside_window]
GATA1_EG_pair_limited

Unnamed: 0,chr,start,end
597,chrX,46762892,46763815
598,chrX,46771506,46772278
599,chrX,46772417,46772801
600,chrX,46828749,46829653
601,chrX,46907130,46908041
...,...,...,...
748,chrX,49686778,49687624
749,chrX,49901127,49901946
750,chrX,49933733,49934640
751,chrX,49944910,49945812


In [114]:
GATA1_EG_pair_limited

Unnamed: 0,chr,start,end
597,chrX,46762892,46763815
598,chrX,46771506,46772278
599,chrX,46772417,46772801
600,chrX,46828749,46829653
601,chrX,46907130,46908041
...,...,...,...
748,chrX,49686778,49687624
749,chrX,49901127,49901946
750,chrX,49933733,49934640
751,chrX,49944910,49945812


In [115]:

CRISPRi_GATA1_TP

Unnamed: 0,chr,start,end
196,chrX,48641372,48641493
197,chrX,48659028,48659074
198,chrX,48660375,48660526
1993,chrX,48619903,48620645
1999,chrX,48643025,48643685
2003,chrX,48700065,48700565
2065,chrX,49040725,49041225
2081,chrX,49100665,49101185


In [121]:
import pandas as pd
import pybedtools

# Toy example: turn a DataFrame of intervals into a pybedtools BedTool.
# Example DataFrame
df = pd.DataFrame({
    "chr": ["chr1", "chr1", "chr2"],
    "start": [100, 200, 300],
    "end": [150, 250, 350]
})

# Convert to BedTool: serialise as tab-separated text without header or index,
# then let BedTool parse that text
bed_str = df.to_csv(sep="\t", header=False, index=False)
bed = pybedtools.BedTool(bed_str, from_string=True)

# Now you can use .intersect(), .saveas(), etc.
print(bed)

chr1	100	150
chr1	200	250
chr2	300	350



In [124]:
bed_str

'chr1\t100\t150\nchr1\t200\t250\nchr2\t300\t350\n'

In [125]:
# Peaks inside the CRISPRi window -> tab-separated text -> BedTool
GATA1_EG_pair_limited_bed_str = GATA1_EG_pair_limited.to_csv(sep="\t", header=False, index=False)
GATA1_EG_pair_limited_bed = pybedtools.BedTool(GATA1_EG_pair_limited_bed_str, from_string=True)

# Significant CRISPRi elements -> tab-separated text -> BedTool
CRISPRi_GATA1_TP_bed_str = CRISPRi_GATA1_TP.to_csv(sep="\t", header=False, index=False)
CRISPRi_GATA1_TP_bed = pybedtools.BedTool(CRISPRi_GATA1_TP_bed_str, from_string=True)

# Intersect with u=True: each peak (A) that overlaps a CRISPRi element (B) is reported once,
# and only the peak's own coordinates are returned
overlap = GATA1_EG_pair_limited_bed.intersect(CRISPRi_GATA1_TP_bed, u=True)

# Back to a DataFrame with the same three column names
overlap_df = overlap.to_dataframe(names=["chr", "start", "end"])
overlap_df


Unnamed: 0,chr,start,end
0,chrX,48619991,48620883
1,chrX,48640998,48641905
2,chrX,48658644,48659432
3,chrX,48659993,48660815
4,chrX,49040546,49041486


# remove peaks that do not exist at the beginning