# Original reference cell-type specific marker plotting in bulk data

## Load required libraries

In [None]:
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore


## Load markers

In [None]:
# Directory that holds the marker table (and, as globbed in a later cell,
# the per-sample CSV files)
data_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/subset_trimmed_data_markers/"
markers_file = os.path.join(data_dir, "all_cell_types_markers.csv")

# Read the marker table and preview its first rows
df_markers = pd.read_csv(filepath_or_buffer=markers_file)
print(df_markers.head(n=5))


## Merge the per-sample marker-region counts

In [None]:
# Collect every CSV in data_dir except the marker table itself.
# glob.glob returns paths in arbitrary, filesystem-dependent order, so the
# list is sorted to keep the merge order (and therefore the order of the
# sample columns in df_merged) reproducible between runs and machines.
bulk_files = sorted(
    f
    for f in glob.glob(os.path.join(data_dir, "*.csv"))
    if "all_cell_types_markers.csv" not in f
)

# Start from the marker table so that every marker row is retained
df_merged = df_markers.copy()

# Attach one column per sample file
for file in bulk_files:
    # Sample name = file name with the read-count suffix removed.
    # NOTE(review): names that do not contain this exact suffix are left
    # unchanged; the rename table later in the notebook keys on names that
    # still end in ".final_preprocessed_read_counts.csv" - confirm the suffix.
    sample_name = os.path.basename(file).replace("_aligned_sorted.final_preprocessed_read_counts.csv", "")

    df_bulk = pd.read_csv(file)

    # The fourth column is taken as the count column and renamed to the
    # sample name (presumably chrom/start/end precede it - the merge below
    # requires those three columns to exist)
    count_column = df_bulk.columns[3]
    df_bulk = df_bulk.rename(columns={count_column: sample_name})

    # Left join: keep all rows of df_merged, matching on region coordinates
    df_merged = pd.merge(df_merged, df_bulk, on=["chrom", "start", "end"], how="left")

    print(f"Merged: {sample_name} | Current shape: {df_merged.shape}")

# Regions with no match in a sample file come out of the left join as NaN;
# replace them with 0
df_merged = df_merged.fillna(0)


In [None]:
# Inspect the merged table (marker columns plus one column per sample file)
print(df_merged)

## Integrate Corces samples

In [None]:
# Root folder and the per-cohort sub-folders that contain the BED files
base_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/corces/"
dirs = [
    "BRCA_per_sample_scores",
    "COAD_per_sample_scores",
    "LUAD_per_sample_scores",
    "LUSC_per_sample_scores",
]

# Every "*_marker_regions.bed" file found in those sub-folders
bed_files = [
    path
    for directory in dirs
    for path in glob.glob(f"{base_dir}{directory}/*_marker_regions.bed")
]

# Drop any path that mentions "pairwise" or "global"
filtered_bed_files = [
    path for path in bed_files
    if not ("pairwise" in path or "global" in path)
]

# Read each remaining file as a header-less, tab-separated four-column table,
# keyed by its file name (the text after the last "/")
bed_dfs = {}
for path in filtered_bed_files:
    bed_dfs[path.rsplit("/", 1)[-1]] = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["chrom", "start", "end", "score"],
    )

# Quick sanity check: row count and first rows of every table
for name, df in bed_dfs.items():
    print(f"{name} has {len(df)} rows")
    print(df.head())


In [None]:
# Give the join keys of the left-hand table fixed dtypes so they compare
# equal to the keys coming from the BED tables
df_merged['chrom'] = df_merged['chrom'].astype(str)
df_merged['start'] = df_merged['start'].astype(int)
df_merged['end'] = df_merged['end'].astype(int)

for filename, df in bed_dfs.items():

    # Sample name = file name up to "_marker_regions.bed"
    sample_name = filename.split("_marker_regions.bed")[0]

    # rename() returns a new frame, so the cached table in bed_dfs is untouched
    df_sample = df.rename(columns={'score': sample_name})

    # Same key dtypes as on the left-hand side
    df_sample['chrom'] = df_sample['chrom'].astype(str)
    df_sample['start'] = df_sample['start'].astype(int)
    df_sample['end'] = df_sample['end'].astype(int)

    # Left join: keep all rows of df_merged, add this sample's score column
    df_merged = df_merged.merge(df_sample, on=['chrom', 'start', 'end'], how='left')

# Rows without a match in a BED table are NaN after the left joins above.
# The earlier fillna(0) ran before these columns existed, and the z-score
# step later in the notebook uses scipy.stats.zscore, whose default
# nan_policy propagates NaN - one missing row would turn that sample's whole
# z-scored column into NaN. Fill them with 0, as is done for the bulk samples.
# NOTE(review): confirm that 0 is a sensible "no signal" value for these scores.
n_missing = int(df_merged.isna().sum().sum())
if n_missing:
    print(f"Filling {n_missing} missing values with 0 after merging the BED scores")
df_merged = df_merged.fillna(0)

# Check after merge
print(df_merged.head())


In [None]:
# Split the merged table into region metadata and per-sample value columns
metadata_cols = ["cell_type", "chrom", "start", "end"]
count_cols = [col for col in df_merged.columns if col not in metadata_cols]

df_metadata = df_merged[metadata_cols].copy()
df_counts = df_merged[count_cols].copy()

# Rescale every sample column independently to the range [-1, 1]
# (the +/-4 limits used for plotting are applied later, when clipping).
# NOTE(review): a later cell z-scores each column, and a z-score is unchanged
# by a per-column linear rescaling, so this step probably has no effect on
# the final heatmaps - confirm whether it is still needed.
scaler = MinMaxScaler(feature_range=(-1, 1))

# fit_transform returns a bare array; rebuild the frame with the original
# row index so the index-aligned concat below pairs each row with its own
# metadata even if df_merged does not carry a default 0..n-1 index
df_counts_scaled = pd.DataFrame(
    scaler.fit_transform(df_counts),
    columns=count_cols,
    index=df_counts.index,
)

# Metadata columns first, scaled sample columns after
df_scaled_final = pd.concat([df_metadata, df_counts_scaled], axis=1)

In [None]:
# Inspect the table after min-max scaling (metadata + scaled sample columns)
print(df_scaled_final)

In [None]:
# Sample columns only: drop the region metadata
metadata_cols = ["cell_type", "chrom", "start", "end"]
df_counts = df_scaled_final.drop(metadata_cols, axis="columns")

# z-score each sample column separately (apply works column-wise by default)
df_counts_zscore = df_counts.apply(zscore)

# Metadata first, z-scored sample columns after
df_zscore_normalized = pd.concat(
    [df_scaled_final[metadata_cols].copy(), df_counts_zscore],
    axis=1,
)

In [None]:
# Summary statistics of the z-scored table (describe() reports numeric columns)
print(df_zscore_normalized.describe())

In [None]:
# Cap the z-scores: anything below -4 becomes -4, anything above 4 becomes 4
df_counts_zscore_clipped = df_counts_zscore.clip(lower=-4, upper=4)

# Put the metadata columns back in front of the clipped values
df_zscore_clipped = pd.concat(
    [df_scaled_final[metadata_cols].copy(), df_counts_zscore_clipped],
    axis="columns",
)

# Preview the result
print(df_zscore_clipped.head())


## Clean up naming and order

In [None]:
# Replace the long per-sample column names with short display labels.
# NOTE: all seven hepatocyte samples receive the same label, so the table
# ends up with seven columns named "Hepatocytes".
df_zscore_clipped = df_zscore_clipped.rename(columns={
    # Columns that make up the reference heatmap
    "bcell_SRR2920492.final_preprocessed_read_counts.csv": "Bcells",
    "CD4Tcell_SRR2920493.final_preprocessed_read_counts.csv": "CD4_Tcells",
    "CD8Tcell_SRR2920494.final_preprocessed_read_counts.csv": "CD8_Tcells",
    "NKcell_SRR2920495.final_preprocessed_read_counts.csv": "NK",
    "Macrophages_SRR12810628.final_preprocessed_read_counts.csv": "Macrophages",
    "DCs_SRR11297453.final_preprocessed_read_counts.csv": "DCs",
    "Neutrophils_SRR11909927.final_preprocessed_read_counts.csv": "Neutrophils",
    "Endothelial_SRR11149143.final_preprocessed_read_counts.csv": "Endothelial",
    "Fibroblasts_SRR13276223.final_preprocessed_read_counts.csv": "Fibroblasts",
    # Tissue samples
    "colon_SRR14305369.final_preprocessed_read_counts.csv": "Colon",
    "pancreas_SRR14305136.final_preprocessed_read_counts.csv": "Pancreas",
    "breast_SRR14305402.final_preprocessed_read_counts.csv": "Breast",
    "lung_SRR14107854.final_preprocessed_read_counts.csv": "Lung",
    "liver_SRR14305414.final_preprocessed_read_counts.csv": "Liver",
    # Hepatocyte samples (shared label)
    "hepatocytes_SRR20002338.final_preprocessed_read_counts.csv": "Hepatocytes",
    "hepatocytes_SRR15567295.final_preprocessed_read_counts.csv": "Hepatocytes",
    "hepatocytes_SRR20002337.final_preprocessed_read_counts.csv": "Hepatocytes",
    "hepatocytes_SRR20002336.final_preprocessed_read_counts.csv": "Hepatocytes",
    "hepatocytes_SRR20002339.final_preprocessed_read_counts.csv": "Hepatocytes",
    "hepatocytes_SRR16213317.final_preprocessed_read_counts.csv": "Hepatocytes",
    "hepatocytes_SRR15567296.final_preprocessed_read_counts.csv": "Hepatocytes",
    # Remaining bulk samples
    "HT29_SRR23524783.final_preprocessed_read_counts.csv": "HT29",
    "colon_adenocarcinoma_SRR8192659.final_preprocessed_read_counts.csv": "Colon_Adenocarcinoma",
    # Samples merged from the BED score files
    "BRCA_000CFD9F_ADDF_4304_9E60_6041549E189C_X017_S06": "BRCA",
    "COAD_0B139EBC_D372_4EC3_90DB_4CC9BC6F38DC_X006_S05": "COAD",
    "LUAD_18545E4A_6285_4DB7_A52C_9E12075D6B89_X033_S04": "LUAD",
    "LUSC_19B5174F_A29F_4022_8BF8_025685B32CB8_X034_S01": "LUSC",
})

# Preview the renamed table
print(df_zscore_clipped.head())


In [None]:
# Row order for the heatmaps: marker regions are grouped by `cell_type`
# in exactly this sequence
desired_cell_type_order = [
    "Bcells", "CD4_Tcells", "CD8_Tcells", "NK", "Macrophages",
    "DCs", "Neutrophils", "Endothelial", "Fibroblasts"
]

# pd.Categorical silently turns every value that is not one of `categories`
# into NaN. Report such values first so that a misspelt or unexpected cell
# type does not vanish without notice (those rows would sort last and lose
# their label).
unexpected_cell_types = [
    value
    for value in df_zscore_clipped["cell_type"].dropna().unique()
    if value not in desired_cell_type_order
]
if unexpected_cell_types:
    print(
        "Warning: cell_type values missing from desired_cell_type_order "
        f"will become NaN: {unexpected_cell_types}"
    )

# Convert `cell_type` to an ordered categorical so sorting follows the list
# above instead of alphabetical order
df_zscore_clipped["cell_type"] = pd.Categorical(
    df_zscore_clipped["cell_type"],
    categories=desired_cell_type_order,
    ordered=True
)

# Sort rows by the ordered `cell_type` column
df_zscore_clipped = df_zscore_clipped.sort_values("cell_type")

# Display the first few rows to verify the new order
print(df_zscore_clipped.head())


In [None]:
# Column groups used to assemble the reference-only table
metadata_cols = ["cell_type", "chrom", "start", "end"]
desired_column_order = [
    "Bcells", "CD4_Tcells", "CD8_Tcells", "NK", "Macrophages",
    "DCs", "Neutrophils", "Endothelial", "Fibroblasts"
]
# Columns left out of this figure
excluded_columns = ["HT29", "Colon_Adenocarcinoma", "Hepatocytes",
                    "Colon", "Pancreas", "Breast", "Lung", "Liver",
                    "BRCA", "COAD", "LUAD", "LUSC"]

# Work on a copy so df_zscore_clipped itself is left as it is
df_subset = df_zscore_clipped.copy()

# Any column that is neither metadata, nor explicitly ordered, nor excluded
# is appended at the end in its existing order
remaining_columns = [
    col
    for col in df_subset.columns
    if not (
        col in desired_column_order
        or col in metadata_cols
        or col in excluded_columns
    )
]

# Metadata first, then the ordered sample columns, then the leftovers
final_column_order = metadata_cols + desired_column_order + remaining_columns
df_subset = df_subset[final_column_order]

# Preview to verify the layout
print(df_subset.head())


In [None]:
# Build the y-axis labels: each run of identical consecutive cell types gets
# a single label on its middle row; all other rows get an empty string
cell_types = df_subset["cell_type"].tolist()
n = len(cell_types)
cell_type_labels = [""] * n

run_start = 0
for pos in range(1, n + 1):
    # The run continues while the value still equals the run's first value
    if pos < n and cell_types[pos] == cell_types[run_start]:
        continue
    # The run covers rows run_start .. pos-1; label its middle row
    cell_type_labels[(run_start + pos - 1) // 2] = cell_types[run_start]
    run_start = pos

# Heatmap matrix = everything except the metadata columns
metadata_cols = ["cell_type", "chrom", "start", "end"]
df_numeric = df_subset.drop(columns=metadata_cols)

# One x tick label per remaining column
x_labels = df_numeric.columns

# Blue -> pale yellow -> red colormap, given as (position, colour) stops
cmap_colors = [
    (0.00, "#4169E1"), (0.15, "#6A98E1"), (0.25, "#A4C5F5"),
    (0.50, "#FFFFCC"),
    (0.75, "#FFD699"), (0.85, "#D73027"), (1.00, "#B40426"),
]
custom_cmap = LinearSegmentedColormap.from_list("custom_cmap_smooth", cmap_colors, N=1024)

# Map the value range [-4, 4] onto the colormap, clipping anything outside
color_norm = plt.Normalize(vmin=-4, vmax=4, clip=True)

# Draw the heatmap.
# NOTE(review): vmin/vmax repeat the limits already carried by `norm`, and
# `robust` presumably only matters when the limits are inferred from the
# data - confirm against the seaborn version in use before trimming these.
plt.figure(figsize=(10, 8), dpi=300)
ax = sns.heatmap(
    df_numeric,
    cmap=custom_cmap,
    norm=color_norm,
    vmin=-4,
    vmax=4,
    center=0,
    robust=True,
    xticklabels=x_labels,
    yticklabels=cell_type_labels,
)

# Hide the y tick marks and the left spine, but keep the text labels
plt.tick_params(axis='y', which='both', left=False, labelleft=True)
ax.spines["left"].set_visible(False)

# Vertical x labels, horizontal y labels, both at font size 16
plt.xticks(rotation=90, fontsize=16)
plt.yticks(rotation=0, fontsize=16)

# Colorbar: fixed ticks and matching label size
cbar = ax.collections[0].colorbar
cbar.set_ticks([-4, -2, 0, 2, 4])
cbar.ax.tick_params(labelsize=16)

# Centred bold title with some extra padding
plt.title("Original reference samples", fontsize=20, fontweight="bold", loc="center", pad=15)

# Write the figure to disk, then display it
plt.savefig("heatmap_original_reference_samples.png", dpi=600, bbox_inches="tight")
plt.show()


In [None]:
# Column groups used to assemble the reference + tissue table
metadata_cols = ["cell_type", "chrom", "start", "end"]
desired_column_order = [
    "Bcells", "CD4_Tcells", "CD8_Tcells", "NK", "Macrophages",
    "DCs", "Neutrophils", "Endothelial", "Fibroblasts"
]
# Columns left out of this figure
excluded_columns = ["HT29", "Colon_Adenocarcinoma", "Hepatocytes",
                    "BRCA", "COAD", "LUAD", "LUSC"]

# Work on a copy so df_zscore_clipped itself is left as it is
df_subset_tissue = df_zscore_clipped.copy()

# Columns that are neither metadata, nor explicitly ordered, nor excluded
# are kept and appended at the end in their existing order
remaining_columns = [
    col
    for col in df_subset_tissue.columns
    if not (
        col in desired_column_order
        or col in metadata_cols
        or col in excluded_columns
    )
]

# Metadata first, then the ordered sample columns, then the leftovers
final_column_order = metadata_cols + desired_column_order + remaining_columns
df_subset_tissue = df_subset_tissue[final_column_order]

# Preview to verify the layout
print(df_subset_tissue.head())


In [None]:
# Build the y-axis labels: walk through the rows run by run, where a run is
# a stretch of rows whose cell type equals that of the run's first row, and
# put one label on the middle row of each run
cell_types = df_subset_tissue["cell_type"].tolist()
n = len(cell_types)
cell_type_labels = [""] * n

block_start = 0
while block_start < n:
    # Advance block_end to the first row that no longer belongs to the run
    block_end = block_start + 1
    while block_end < n and cell_types[block_end] == cell_types[block_start]:
        block_end += 1
    # The run covers rows block_start .. block_end-1; label its middle row
    cell_type_labels[(block_start + block_end - 1) // 2] = cell_types[block_start]
    block_start = block_end

# Heatmap matrix = everything except the metadata columns
metadata_cols = ["cell_type", "chrom", "start", "end"]
df_numeric = df_subset_tissue.drop(columns=metadata_cols)

# One x tick label per remaining column
x_labels = df_numeric.columns

# Blue -> pale yellow -> red colormap, given as (position, colour) stops
cmap_colors = [
    (0.00, "#4169E1"), (0.15, "#6A98E1"), (0.25, "#A4C5F5"),
    (0.50, "#FFFFCC"),
    (0.75, "#FFD699"), (0.85, "#D73027"), (1.00, "#B40426"),
]
custom_cmap = LinearSegmentedColormap.from_list("custom_cmap_smooth", cmap_colors, N=1024)

# Map the value range [-4, 4] onto the colormap, clipping anything outside
color_norm = plt.Normalize(vmin=-4, vmax=4, clip=True)

# Draw the heatmap.
# NOTE(review): vmin/vmax repeat the limits already carried by `norm`, and
# `robust` presumably only matters when the limits are inferred from the
# data - confirm against the seaborn version in use before trimming these.
plt.figure(figsize=(10, 8), dpi=300)
ax = sns.heatmap(
    df_numeric,
    cmap=custom_cmap,
    norm=color_norm,
    vmin=-4,
    vmax=4,
    center=0,
    robust=True,
    xticklabels=x_labels,
    yticklabels=cell_type_labels,
)

# Hide the y tick marks and the left spine, but keep the text labels
plt.tick_params(axis='y', which='both', left=False, labelleft=True)
ax.spines["left"].set_visible(False)

# Vertical x labels, horizontal y labels, both at font size 16
plt.xticks(rotation=90, fontsize=16)
plt.yticks(rotation=0, fontsize=16)

# Colorbar: fixed ticks and matching label size
cbar = ax.collections[0].colorbar
cbar.set_ticks([-4, -2, 0, 2, 4])
cbar.ax.tick_params(labelsize=16)

# Centred bold title with some extra padding
plt.title("Original reference and tissue samples", fontsize=20, fontweight="bold", loc="center", pad=15)

# Write the figure to disk, then display it
plt.savefig("heatmap_original_reference_tissue_samples.png", dpi=600, bbox_inches="tight")
plt.show()
