#  Extract original reference counts for cell type marker regions

## Load required libraries

In [None]:
import pandas as pd
import re
import glob
import os
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

## Load raw ATAC-seq count data

In [None]:
# Read the tab-separated raw count table, then expose the row labels as a
# regular 'region' column.
# NOTE(review): this presumes the file's first column (the region IDs) is
# picked up by pandas as the index; confirm against the file layout.
raw_df = (
    pd.read_csv(
        "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/gabriel/markers_identification_input_files/raw_counts.txt",
        sep='\t',
        header=0,
        index_col=None,
    )
    .reset_index()
    .rename(columns={"index": "region"})
)

## Load cell type marker regions in bed format

In [None]:
# Load the cell type marker file
# The file is parsed as a headerless, tab-separated BED table.  BED files may
# carry optional columns after the first three (name, score, strand, ...).
# `usecols` restricts parsing to the first three columns so that `names`
# always labels chrom/start/end; without it, pandas can treat surplus leading
# columns as an implicit index and shift the supplied names onto the wrong
# fields.  For a plain three-column file the result is unchanged.
bed_file = '/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/cell_type_markers.bed'

regions = pd.read_csv(
    bed_file,
    sep='\t',
    header=None,
    usecols=[0, 1, 2],
    names=['chrom', 'start', 'end'],
)

In [None]:
# Preview the first rows of the count table and of the marker regions
print(raw_df.head(), regions.head(), sep="\n")

## Extract chromosome, start, and end from region column

In [None]:
# Split "chrN:start-end" region labels into three coordinate columns.
# Labels that do not match the pattern yield NaN in all three columns.
region_pattern = r'^(chr[^:]+):(\d+)-(\d+)$'
coordinate_parts = raw_df['region'].str.extract(region_pattern, expand=True)
coordinate_parts.columns = ['chrom', 'start', 'end']
raw_df[['chrom', 'start', 'end']] = coordinate_parts

In [None]:
# The regex capture groups are strings; convert both coordinates to integers
# in one pass so they can be compared with the BED coordinates.
raw_df = raw_df.astype({'start': int, 'end': int})

In [None]:
# Move the coordinate columns to the front and drop the original 'region'
# label; all remaining columns keep their existing relative order.
leading_columns = ['chrom', 'start', 'end']
excluded_columns = set(leading_columns) | {'region'}
desired_order = leading_columns + [
    name for name in raw_df.columns if name not in excluded_columns
]
raw_df = raw_df.loc[:, desired_order]

## Load sample metadata and harmonize sample names

In [None]:
# Read the tab-separated sample metadata table (first line is the header)
meta_df = pd.read_table(
    '/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/gabriel/markers_identification_input_files/metadata.txt'
)

In [None]:
# Optional, currently disabled: combine subgroups into main groups
# meta_df['groups'] = meta_df['groups'].replace({
    # 'Naive_CD8_Tcells': 'CD8_Tcells',
    # 'Non_Naive_CD8_Tcells': 'CD8_Tcells',
    # 'Naive_CD4_Tcells': 'CD4_Tcells',
    # 'Non_Naive_CD4_Tcells': 'CD4_Tcells',
    # 'Tregs': 'CD4_Tcells'
# })

In [None]:
# Preview the first five metadata rows
print(meta_df.head(n=5))

## Create unique sample identifiers by combining group and sample

In [None]:
# Label every sample as "<group>_<sample>" so that renamed count columns
# stay unique per sample instead of collapsing onto the group name.
meta_df['group_with_sample'] = meta_df['groups'] + "_" + meta_df['sample']

# Lookup table: original sample ID -> combined group/sample label
sample_to_group_with_sample = dict(
    zip(meta_df['sample'], meta_df['group_with_sample'])
)

# Columns with no metadata entry map to themselves and are left unchanged
mapped_columns = {
    name: (
        sample_to_group_with_sample[name]
        if name in sample_to_group_with_sample
        else name
    )
    for name in raw_df.columns
}

raw_df = raw_df.rename(columns=mapped_columns)


In [None]:
# Preview the count table after the sample columns were renamed
print(raw_df.head(n=5))

## Extract the cell type marker regions

In [None]:
# Keep only the count rows whose (chrom, start, end) exactly matches a marker
# region.  Inner join: marker regions with no matching count row are dropped.
extracted_df = pd.merge(
    regions,
    raw_df,
    how='inner',
    on=['chrom', 'start', 'end'],
)


In [None]:
# Sanity check: report (rows, columns) of the merged table, then preview it
print("Dimensions:", extracted_df.shape)
print(extracted_df.head(n=5))

In [None]:
# Build a "chrom-start-end" label for every row
region_labels = (
    extracted_df['chrom']
    + '-' + extracted_df['start'].astype(str)
    + '-' + extracted_df['end'].astype(str)
)

# Use the label as the row index and discard the separate coordinate columns,
# leaving only the per-sample count columns.
extracted_df = (
    extracted_df
    .assign(region=region_labels)
    .set_index('region')
    .drop(columns=['chrom', 'start', 'end'])
)

# Check the reformatted DataFrame
print(extracted_df.head())


In [None]:
# Write the marker-region count table to CSV; the row index is written as
# the first column.
output_csv = '/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/extracted_raw_counts_marker_region.csv'
extracted_df.to_csv(output_csv, index=True)
