In [None]:
import os
import sys

import muon as mu
import pandas as pd


# Add the Functions folder to your path
sys.path.append(os.path.abspath("/Users/alexandra/Desktop/EG-CLR/Functions"))

# Now import your functions
from QC_functions import *
from CLR_functions import *
from Verfication_functions import *

# Run EG_CLR

In [None]:
# Read the processed multiome container from disk
mdata = mu.read(
    "/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/processed_data/multiome_data.h5mu"
)

# Pull out the 'rna' and 'atac' modalities in one step
adata_gem, adata_atac = mdata['rna'], mdata['atac']

In [None]:
# Reload the original read counts from the 10x filtered matrix
# (gex_only=False keeps every feature type, not just gene expression)
adata_org = sc.read_10x_mtx(
    '/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/filtered_feature_bc_matrix',
    gex_only=False,
)

# Boolean selectors (one entry per feature) for the two feature types
gex_rows = [ftype == 'Gene Expression' for ftype in adata_org.var['feature_types']]
atac_rows = [ftype == 'Peaks' for ftype in adata_org.var['feature_types']]

# Split the combined matrix into independent RNA and ATAC objects
adata_gem_org = adata_org[:, gex_rows].copy()
adata_atac_org = adata_org[:, atac_rows].copy()

adata_gem_org.var_names_make_unique()
adata_atac_org.var_names_make_unique()

# Replace the processed values with the original read counts: keep only the
# features and cells that are present in the processed objects
gem_index = adata_gem.var_names
atac_index = adata_atac.var.index

cell_index = adata_gem.obs.index

adata_gem_new = adata_gem_org[
    adata_gem_org.obs_names.isin(cell_index),
    adata_gem_org.var_names.isin(gem_index),
].copy()
adata_atac_new = adata_atac_org[
    adata_atac_org.obs_names.isin(cell_index),
    adata_atac_org.var_names.isin(atac_index),
].copy()

# Normalise the scRNA-seq counts again: scale each cell to a total of 1e4,
# then log1p-transform.
# NOTE(review): assign_chr presumably adds the var["Chromosome"] column that
# later cells filter on; confirm against its definition.
adata_gem_new = assign_chr(
    '/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/filtered_feature_bc_matrix/features.tsv.gz',
    adata_gem_new,
)
sc.pp.normalize_total(adata_gem_new, target_sum=1e4)
sc.pp.log1p(adata_gem_new)
adata_gem_new

In [None]:
# Keep only the genes whose "Chromosome" annotation is chrX
on_chrx = adata_gem_new.var["Chromosome"] == 'chrX'
adata_gem_chrx = adata_gem_new[:, on_chrx].copy()
adata_gem_chrx.obs

In [None]:
# Annotate the ATAC peaks (promoter, enhancer and gene body regions) and
# their chromosomes, then take the chromosome subset used below
features_tsv = '/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/filtered_feature_bc_matrix/features.tsv.gz'
peak_annotation_tsv = "/Volumes/G-DRIVE mobile USB-C/Single-cell_data/K562/10x/ISSAACC-seq_generated/hg19_10xCloud_aligned_data/atac_peak_annotation.tsv"

adata_atac_new = assign_loc(features_tsv, peak_annotation_tsv, adata_atac_new)
adata_atac_new = assign_chr(features_tsv, adata_atac_new)

# NOTE(review): the meaning of -1 is defined by subset_adata_by_chromosome;
# presumably it selects chrX (the result is named *_chrx) - confirm.
adata_atac_chrx = subset_adata_by_chromosome(adata_atac_new, -1)
adata_atac_chrx

In [None]:
# Checkpoint run: MI matrix for all chrX genes against all chrX peaks,
# written out under the "ChrX_AllGenes_AllPeaks_NormByCell" label.
# NOTE(review): the accessibility matrix is overwritten in place with its
# 0/1 form, so every later cell that reads adata_atac_chrx sees binary values.
is_open = adata_atac_chrx.X > 0
adata_atac_chrx.X = is_open.astype(int)

MI_Matrix_chrx = MI_Matrix_MIinfoClassif(
    adata_gem_chrx,
    adata_atac_chrx,
)
write_matrixes(
    "ChrX_AllGenes_AllPeaks_NormByCell",
    MI_Matrix_chrx,
)

In [None]:
# Select genes that are accessible (promoter + gene body) and expressed

# Split the chrX peaks into the three groups returned by the helper
(
    adata_CRE,
    adata_gene,
    adata_promoter,
) = separate_GRE_gene_promotor(adata_atac_chrx)

# Restrict to genes that are accessible in gene body and promoter and are
# expressed. Note that adata_CRE is bound again here, replacing the value
# assigned just above.
(
    adata_rna_flitered,
    adata_atac_gene_filtered,
    adata_CRE,
    adata_atac_promoter_filtered,
) = define_open_express_gene(adata_gem_chrx, adata_atac_chrx)


In [None]:
# Binarise scATAC-seq: any positive count becomes 1, everything else 0.
# The matrix is overwritten in place.
is_open = adata_atac_chrx.X > 0
adata_atac_chrx.X = is_open.astype(int)

In [None]:
# ChrX, RNA vs ATAC: build the MI matrix for the selected genes against the
# chrX peaks, then derive the CLR matrix from it
MI_Matrix_chrx = MI_Matrix_MIinfoClassif(
    adata_rna_flitered,
    adata_atac_chrx,
)
CLR_matrix_chrx = CLR_Matrix(MI_Matrix_chrx)

In [None]:
# Re-wrap the MI values with explicit labels: rows take the ATAC
# var['gene_ids'] values, columns take the RNA var['gene'] values
row_labels = adata_atac_chrx.var['gene_ids']
column_labels = adata_rna_flitered.var['gene']
MI_Matrix_chrx = pd.DataFrame(
    MI_Matrix_chrx.values,
    index=row_labels,
    columns=column_labels,
)

# Save the labelled matrix
write_matrixes("ChrX_SelectGenes_AllPeaks_NormByCell", MI_Matrix_chrx)

In [None]:
# Re-wrap the CLR values with explicit labels: rows take the ATAC
# var['gene_ids'] values, columns take the RNA var['gene'] values
CLR_matrix_chrx = pd.DataFrame(
    CLR_matrix_chrx.values,
    index=adata_atac_chrx.var['gene_ids'],
    columns=adata_rna_flitered.var['gene'],
)

# Run CRISPRi comparison

In [None]:
# Look up the gene-peak pairs for GATA1 in the labelled CLR matrix
GATA1_EG_pair = EG_pair_by_name(
    "GATA1",
    CLR_matrix_chrx,
)
GATA1_EG_pair

In [None]:
# Load the CRISPRi table; header=1 takes the column names from the second
# row of the sheet
CRISPRi_data = pd.read_excel(
    '/Users/alexandra/Desktop/Data/CRISPRiFlowFISH/41588_2019_538_MOESM3_ESM.xlsx',
    sheet_name='Supplementary Table 6a',
    skiprows=0,
    header=1,
)

# Keep the GATA1 rows and only the columns used downstream
is_gata1 = CRISPRi_data['Gene'] == 'GATA1'
CRISPRi_GATA1 = CRISPRi_data.loc[
    is_gata1,
    ['chr', 'start', 'end', 'class', 'Significant', 'Gene'],
].copy()

CRISPRi_GATA1

In [None]:
# Genomic span covered by the GATA1 CRISPRi elements
CRISPRi_GATA1_start = CRISPRi_GATA1['start'].min()
CRISPRi_GATA1_end = CRISPRi_GATA1['end'].max()

print("CRISPR for GATA1 starts at ",CRISPRi_GATA1_start, " end at ", CRISPRi_GATA1_end)

# Keep the peaks that lie strictly inside that span (end below the maximum
# and start above the minimum). Coordinates are cast to int before comparing.
# NOTE(review): peaks that only partly overlap the span are dropped - confirm
# this is intended.
ends_before_limit = GATA1_EG_pair["end"].values.astype(int) < CRISPRi_GATA1_end
starts_after_limit = GATA1_EG_pair["start"].values.astype(int) > CRISPRi_GATA1_start
inside_span = ends_before_limit & starts_after_limit

GATA1_EG_pair_limited = GATA1_EG_pair[inside_span]

GATA1_EG_pair_limited

In [None]:
# Compare the in-range CLR peaks with the GATA1 CRISPRi elements
CRISPRi_GATA1_limited = find_detected_peaks(
    GATA1_EG_pair_limited,
    CRISPRi_GATA1,
)
CRISPRi_GATA1_limited

In [None]:
# Renumber the in-range peaks 0..n-1; the boolean filter that produced this
# table kept the row labels of the full GATA1 table.
# Re-binding the name (instead of inplace=True) leaves any other reference to
# the filtered frame untouched and makes the cell safe to re-run.
GATA1_EG_pair_limited = GATA1_EG_pair_limited.reset_index(drop=True)

In [None]:
# Overlap between two interval data sets (chr, start, end)
def CRISPRi_comparison(A, B):
    """Return the intervals of ``A`` that overlap at least one interval of ``B``.

    Both tables are written out as headerless, tab-separated text and handed
    to bedtools through pybedtools, so their columns must already be in BED
    order (chromosome, start, end).

    Parameters
    ----------
    A : pandas.DataFrame
        Query intervals; these are the rows that are reported.
    B : pandas.DataFrame
        Reference intervals that ``A`` is tested against.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr``, ``start`` and ``end``. Every overlapping interval of
        ``A`` appears exactly once. When nothing overlaps the frame is empty
        but still carries the three columns, so callers can merge on them.
    """
    columns = ["chr", "start", "end"]

    # Nothing can overlap an empty table, so skip the bedtools call entirely.
    if len(A) == 0 or len(B) == 0:
        return pd.DataFrame(columns=columns)

    A_bed = pybedtools.BedTool(
        A.to_csv(sep="\t", header=False, index=False), from_string=True
    )
    B_bed = pybedtools.BedTool(
        B.to_csv(sep="\t", header=False, index=False), from_string=True
    )

    # u=True (bedtools -u) writes each A interval once if it has any overlap
    # in B. With wa=True alone an A interval is repeated once per overlapping
    # B interval, which would duplicate rows in any merge on the result.
    overlap = A_bed.intersect(B_bed, u=True)

    # Guard the conversion: an empty result has no rows to parse.
    if overlap.count() == 0:
        return pd.DataFrame(columns=columns)

    return overlap.to_dataframe(names=columns)

In [None]:
# True positives: CRISPRi elements whose 'Significant' value equals True,
# reduced to their coordinates
is_significant = CRISPRi_GATA1_limited['Significant'] == True
CRISPRi_TP = CRISPRi_GATA1_limited.loc[is_significant, ['chr', 'start', 'end']].copy()

# Intersect the in-range CLR peak coordinates with those true positives
clr_peak_coords = GATA1_EG_pair_limited[['chr', 'start', 'end']].copy()
overlap_df = CRISPRi_comparison(clr_peak_coords, CRISPRi_TP)
overlap_df

In [None]:
# Attach the full gene-peak pair information to every overlapping peak.
# The pair table's start/end are cast with astype(int) wherever they are
# compared numerically, so they may not be stored as integers, while
# overlap_df is parsed back from BED text (presumably as integers). Coerce the
# join keys to int first so the merge does not fail or miss rows on a dtype
# mismatch; this is a no-op when they are already integers.
pair_table = GATA1_EG_pair_limited.astype({'start': int, 'end': int})
merge_df = pd.merge(overlap_df, pair_table, on=['chr', 'start', 'end'], how='left')
merge_df