## Extraction of the new cell-type markers from the sc-zhang reference matrix
- Groups biologically similar cell types by averaging
- Intersects the grouped matrix with the marker regions (≥400 bp overlap)

## Import required libraries

In [None]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyranges as pr

## Load sc-zhang reference matrix

In [None]:
# Load the formatted Zhang reference matrix from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load trusted files here.
subset_path = '/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/subset_by_celltype.pkl'
subset = pd.read_pickle(subset_path)

In [None]:
# Preview the first 5 rows as a quick check of the loaded matrix
subset.head(n=5)

## Load extended cell type marker bed file

In [None]:
# Read the tab-separated marker BED file. It is parsed without a header row
# (header=None), so the column names are supplied explicitly.
bed_path = "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/new_pairwise_cell_types_markers.bed"
bed_columns = ["chrom", "start", "end"]
df_bed = pd.read_csv(bed_path, sep="\t", header=None, names=bed_columns)


In [None]:
# Preview the first 5 marker regions
print(df_bed.head(n=5))


## Define and aggregate cell type groups
- Biologically similar or closely related cell types (e.g. alveolar type 1 and 2) are grouped.
- Single cell types that are already specific or not part of any group are handled individually.

In [None]:
# Map each group name to the reference-matrix columns that belong to it.
# The key order is significant: it determines the order of the grouped
# columns wherever the keys are listed.
cell_type_groups = {
    "Endothelial_General": [
        "Endothelial General 1",
        "Endothelial General 2",
        "Endothelial General 3",
    ],
    "Colon_Epithelial": [
        "Colon Epithelial 1",
        "Colon Epithelial 2",
        "Colon Epithelial 3",
    ],
    "Mammary_Luminal_Epi": [
        "Mammary Luminal Epi 1",
        "Mammary Luminal Epi 2",
    ],
    "Fibro": [
        "Fibro General",
        "Fibro Epithelial",
        "Fibro GI",
        "Fibro Nerve",
        "Fibro Muscle",
        "Fibro Liver Adrenal",
    ],
    "Alveolar": [
        "Alveolar Type 1",
        "Alveolar Type 2",
    ],
    "Macrophage": [
        "Macrophage General",
        "Macrophage Gen or Alv",
    ],
    "Plasma_Memory_B": [
        "Plasma B",
        "Memory B",
    ],
}

# Columns that are kept as they are because they belong to no group
cell_type_single = [
    "T Lymphocyte 1 (CD8+)",
    "T lymphocyte 2 (CD4+)",
    "Naive T",
    "Natural Killer T",
    "Acinar",
    "Ductal",
    "Airway Goblet",
    "Colon Goblet",
    "Mast",
    "Tuft",
    "Hepatocyte",
]

## Averaging similar cell types

In [None]:
# Collapse every group into a single column holding the row-wise mean of
# its member columns, keyed by the group name
group_means = {
    group_name: subset[columns].mean(axis=1)
    for group_name, columns in cell_type_groups.items()
}

# Start from the ungrouped cell types and append the group averages after
# them; this yields a new DataFrame and leaves `subset` untouched
df_grouped = subset[cell_type_single].assign(**group_means)

# Display the new DataFrame with grouped cell types
print(df_grouped.head())


## Clean up and prepare for intersection

In [None]:
# Turn the index into a regular column; the code below reads it as 'Region'
df_grouped = df_grouped.reset_index()

# Split each 'chrom:start-end' region label at the colon ...
locus = df_grouped['Region'].str.split(':', expand=True)
locus.columns = ['chrom', 'start_end']

# ... and the coordinate part at the dash, converting both bounds to integers
bounds = locus['start_end'].str.split('-', expand=True).astype(int)
bounds.columns = ['start', 'end']

# Append the parsed fields as the last three columns
df_grouped = df_grouped.assign(
    chrom=locus['chrom'],
    start=bounds['start'],
    end=bounds['end'],
)

In [None]:
# Preview the first 5 rows, now including the parsed coordinate columns
print(df_grouped.head(n=5))

## Perform region overlap with marker bed file

In [None]:
# Rename the coordinate columns to the names PyRanges expects
pyranges_columns = {"chrom": "Chromosome", "start": "Start", "end": "End"}
df_bed_renamed = df_bed.rename(columns=pyranges_columns)
df_grouped_renamed = df_grouped.rename(columns=pyranges_columns)

# Create PyRanges objects
pr_bed = pr.PyRanges(df_bed_renamed)
pr_grouped = pr.PyRanges(df_grouped_renamed)

# Join every marker region with the matrix regions it overlaps; the
# coordinates of the matrix region are read back below under the
# "_matrix" suffix
intersection = pr_bed.join(pr_grouped, suffix="_matrix")

# Calculate the exact overlap length: the shared interval starts at the
# larger of the two starts and ends at the smaller of the two ends.
# Column-wise max/min is vectorised, unlike a row-by-row apply(max/min).
intersection_df = intersection.df.copy()
intersection_df["OverlapStart"] = intersection_df[["Start", "Start_matrix"]].max(axis=1)
intersection_df["OverlapEnd"] = intersection_df[["End", "End_matrix"]].min(axis=1)
intersection_df["OverlapLength"] = intersection_df["OverlapEnd"] - intersection_df["OverlapStart"]

# Define minimum overlap required (same unit as the coordinates)
min_overlap_length = 400

# Keep only the pairs whose overlap reaches the minimum length
intersection_filtered = intersection_df[intersection_df["OverlapLength"] >= min_overlap_length]

# Check the number of resulting entries
print(f"Number of entries after limiting overlap: {intersection_filtered.shape[0]}")

# Show results
print(intersection_filtered.head())


## Remove redundant overlaps

In [None]:
# Order the rows from largest to smallest overlap so that, for every BED
# marker, its largest-overlap row comes first
intersection_filtered_sorted = intersection_filtered.sort_values('OverlapLength', ascending=False)

# A BED marker is identified by its coordinates; flag every row that repeats
# a marker already seen and keep only the first (largest-overlap) row
marker_key = ['Chromosome', 'Start', 'End']
is_repeat = intersection_filtered_sorted.duplicated(subset=marker_key, keep='first')
intersection_unique = intersection_filtered_sorted[~is_repeat]

# Final check of number of unique entries
print(f"Number of unique entries after removing duplicates: {len(intersection_unique)}")

# Display final results
print(intersection_unique.head())


## Clean and save final matrix

In [None]:
# Remove the matrix-side coordinates, the overlap bookkeeping columns and
# the Region label; all remaining columns are kept
columns_to_drop = ['Start_matrix', 'End_matrix', 'OverlapStart', 'OverlapEnd', 'OverlapLength','Region']
intersection_final = intersection_unique.drop(labels=columns_to_drop, axis="columns")

# Check the cleaned DataFrame
print(intersection_final.head())

In [None]:
# Write the final DataFrame to CSV without the row index
output_csv = "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/sc_pairwise_marker_regions.csv"
intersection_final.to_csv(output_csv, index=False)