# Extract reference counts for new cell type marker regions

## Load required libraries

In [None]:
import pandas as pd
import re
import glob
import os
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

## Load the raw counts and the markers

In [None]:
# Load the raw counts matrix from raw_counts_matrix.csv.
# The first row is used as the header and no column is used as the index
# (index_col=None), so the frame gets a default RangeIndex and every CSV
# column -- including the chrom/start/end columns the later merge joins on --
# stays a regular column.
raw_df = pd.read_csv(
    "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/raw_counts_matrix.csv",
    sep=",",
    header=0,
    index_col=None,
)

# NOTE: the former `raw_df.reset_index(inplace=False)` call was removed. Its
# return value was discarded, so it never modified raw_df (and no columns were
# renamed). Assigning it back would have added a spurious 'index' column to
# the count matrix, so dropping the call is the behavior-preserving fix.

In [None]:
# Load the cell type marker file (tab-separated BED, no header row)
bed_file = '/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/new_pairwise_cell_types_markers.bed'
# Read only the first three fields. Without usecols, a BED file that carries
# additional fields (name, score, strand, ...) would have its leading columns
# silently turned into the index, shifting the wrong values under
# chrom/start/end and breaking the coordinate merge downstream.
regions = pd.read_csv(
    bed_file,
    sep='\t',
    header=None,
    usecols=[0, 1, 2],
    names=['chrom', 'start', 'end'],
)

In [None]:
# Preview the first rows of both inputs (counts first, then marker regions)
print(raw_df.head(), regions.head(), sep='\n')

## Extract the cell type marker regions

In [None]:
# Inner join on the exact coordinates: keeps only the count rows whose
# (chrom, start, end) also appear in the marker regions table.
extracted_df = regions.merge(
    right=raw_df,
    how='inner',
    on=['chrom', 'start', 'end'],
)


In [None]:
# Sanity check: report (rows, columns) of the merged table ...
print("Dimensions:", extracted_df.shape)
# ... and show its first five rows
print(extracted_df.head(n=5))

In [None]:
# Replace the three coordinate columns with a single 'chrom-start-end' label
# used as the row index, leaving only the remaining (non-coordinate) columns.
extracted_df = (
    extracted_df
    .assign(
        region=(
            extracted_df['chrom']
            + '-' + extracted_df['start'].astype(str)
            + '-' + extracted_df['end'].astype(str)
        )
    )
    .set_index('region')
    .drop(columns=['chrom', 'start', 'end'])
)

# Inspect the reformatted table
print(extracted_df.head())


In [None]:
# Write the marker-region counts to disk; the 'region' index is written as
# the first CSV column (to_csv keeps the index by default).
extracted_df.to_csv(
    path_or_buf='/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/extracted_raw_counts_pairwise_marker_region.csv'
)
