# Plotting new reference cell-type-specific markers in single-cell data

## Load required libraries

In [None]:
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore

## Load markers

In [None]:
# Read the pairwise cell-type marker table from CSV
markers_csv = "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/new_pairwise_cell_types_markers.csv"
df_markers = pd.read_csv(markers_csv)

# Preview the first rows to confirm the file was parsed as expected
df_markers_preview = df_markers.head()
print(df_markers_preview)


In [None]:
# Read the single-cell matrix for the pairwise marker regions from CSV
# (this is the data matrix, not the marker list loaded into df_markers)
sc_matrix_csv = "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/sc_pairwise_marker_regions.csv"
df_sc_matrix = pd.read_csv(sc_matrix_csv)

# Preview the first rows to confirm the file was parsed as expected
df_sc_matrix_preview = df_sc_matrix.head()
print(df_sc_matrix_preview)

In [None]:
# Harmonise the coordinate column names so both tables share the same join keys
coordinate_renames = {'chrom': 'Chromosome', 'start': 'Start', 'end': 'End'}
df_markers.rename(columns=coordinate_renames, inplace=True)

# Left join: keep every row of df_sc_matrix and attach the marker annotation
# wherever the three genomic coordinates match exactly
join_keys = ['Chromosome', 'Start', 'End']
df_merged = pd.merge(df_sc_matrix, df_markers, how='left', on=join_keys)

# Move 'cell_type' to the front, leaving all other columns in their current order
columns_order = ['cell_type']
columns_order.extend(col for col in df_merged.columns if col != 'cell_type')
df_merged = df_merged[columns_order]

# Preview the merged table
print(df_merged.head())


In [None]:
# Total number of missing values across every column of the merged table
n_missing_total = df_merged.isna().sum().sum()
print(n_missing_total)

In [None]:
# Columns that describe each region rather than hold signal values
metadata_cols = ["cell_type", "Chromosome", "Start", "End"]
count_cols = [col for col in df_merged.columns if col not in metadata_cols]

# Separate metadata and count data
df_metadata = df_merged[metadata_cols].copy()
df_counts = df_merged[count_cols].copy()

# Rescale every count column independently to the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))

# fit_transform returns a bare NumPy array, so restore both the column names
# and the original row index. Without the index, the concat below would align
# the scaled values on a fresh 0..n-1 index and could pair rows with the wrong
# metadata whenever df_merged does not carry a default RangeIndex.
df_counts_scaled = pd.DataFrame(
    scaler.fit_transform(df_counts),
    columns=count_cols,
    index=df_counts.index,
)

# Merge metadata back with scaled counts (row alignment is by index)
df_scaled_final = pd.concat([df_metadata, df_counts_scaled], axis=1)

In [None]:
# Inspect the min-max scaled table (metadata columns followed by the scaled counts)
print(df_scaled_final)

In [None]:
# Split off the value columns; the metadata columns are not standardised
metadata_cols = ["cell_type", "Chromosome", "Start", "End"]
df_counts = df_scaled_final.drop(columns=metadata_cols)

# Standardise each column independently with scipy's zscore
# (apply() works column by column by default)
df_counts_zscore = df_counts.apply(zscore)

# Put the metadata columns back in front of the standardised values
df_zscore_normalized = pd.concat(
    [df_scaled_final[metadata_cols].copy(), df_counts_zscore],
    axis=1,
)

In [None]:
# Summary statistics of the z-scored table, as a quick sanity check
zscore_summary = df_zscore_normalized.describe()
print(zscore_summary)

In [None]:
# Cap extreme z-scores so that every value lies in [-4, 4]
df_counts_zscore_clipped = df_counts_zscore.clip(lower=-4, upper=4)

# Put the metadata columns back in front of the clipped values
df_zscore_clipped = pd.concat(
    [df_scaled_final[metadata_cols].copy(), df_counts_zscore_clipped],
    axis=1,
)

# Print the resulting table
print(df_zscore_clipped)


## Clean up naming and order

In [None]:
# Map the cell-type column names in the matrix to the short labels used for plotting
rename_dict = {
    "T Lymphocyte 1 (CD8+)": "CD8_Tcells",
    "T lymphocyte 2 (CD4+)": "CD4_Tcells",
    "Naive T": "Naive T",
    "Natural Killer T": "NK",
    "Fibro": "Fibroblasts",
    "Endothelial_General": "Endothelial",
    "Macrophage": "Macrophages",
    "Plasma_Memory_B": "Bcells",
}

# Keep only the entries whose source column is actually present in the table
present_columns = set(df_zscore_clipped.columns)
rename_dict = {
    old_name: new_name
    for old_name, new_name in rename_dict.items()
    if old_name in present_columns
}

# Apply the renames to the table in place
df_zscore_clipped.rename(columns=rename_dict, inplace=True)

# Print the table to check the new column names
print(df_zscore_clipped)


In [None]:
# Drop the marker rows annotated as 'DCs' or 'Neutrophils'
excluded_cell_types = ['DCs', 'Neutrophils']
is_excluded = df_zscore_clipped['cell_type'].isin(excluded_cell_types)
df_zscore_clipped = df_zscore_clipped[~is_excluded]

# Drop the rows that have no cell-type annotation at all
df_zscore_clipped = df_zscore_clipped.dropna(subset=['cell_type'])

# Check what is left: remaining labels, rows per label, and NaNs per column
remaining_cell_types = df_zscore_clipped['cell_type']
print(remaining_cell_types.unique())
print(remaining_cell_types.value_counts())
print(df_zscore_clipped.isna().sum())


In [None]:
# Row order for the plot: marker rows are grouped by `cell_type` in this order
desired_cell_type_order = [
    "Bcells", "CD4_Tcells", "CD8_Tcells", "NK", "Macrophages", "Endothelial", "Fibroblasts", "Hepatocytes"
]

# Build an ordered categorical from the current labels; any label that is not
# listed in desired_cell_type_order becomes NaN
ordered_cell_type = pd.Categorical(
    df_zscore_clipped["cell_type"],
    categories=desired_cell_type_order,
    ordered=True,
)

# Replace the column via assign() instead of writing into the frame in place:
# df_zscore_clipped was produced by row filtering, and an in-place column
# assignment on such a frame can raise pandas' SettingWithCopyWarning.
# Sorting on the ordered categorical then follows desired_cell_type_order.
df_zscore_clipped = (
    df_zscore_clipped
    .assign(cell_type=ordered_cell_type)
    .sort_values("cell_type")
)

# Print the table to verify the new row order
print(df_zscore_clipped)


In [None]:
# Discard rows whose cell type is missing (labels outside the category list
# were turned into NaN by the categorical conversion)
has_cell_type = df_zscore_clipped["cell_type"].notna()
df_zscore_clipped = df_zscore_clipped[has_cell_type]

# Preview the first rows to verify the changes
print(df_zscore_clipped.head())


In [None]:
# Cell-type columns to show first, in this order
# ("Hepatocytes" is the renamed "Hepatocyte" column, see below)
desired_column_order = [
    "Bcells", "CD4_Tcells", "CD8_Tcells", "NK", "Macrophages",
    "Endothelial", "Fibroblasts", "Hepatocytes"
]

# Columns that describe each row rather than hold values
metadata_cols = ["cell_type", "Chromosome", "Start", "End"]

# Rename "Hepatocyte" to "Hepatocytes"; rename() leaves the columns untouched
# when there is no "Hepatocyte" column
df_zscore_clipped = df_zscore_clipped.rename(columns={"Hepatocyte": "Hepatocytes"})

# Cell-type columns that are dropped from the final table
unwanted_cell_types = ["Naive T", "Acinar", "Ductal", "Airway Goblet", "Colon Goblet",
                        "Mast", "Tuft", "Colon_Epithelial", "Mammary_Luminal_Epi", "Alveolar"]

# Every other column that is neither explicitly ordered, metadata, nor unwanted
already_placed = set(desired_column_order) | set(metadata_cols)
remaining_columns = [
    col
    for col in df_zscore_clipped.columns
    if col not in already_placed and col not in unwanted_cell_types
]

# Final layout: metadata, then the ordered cell types that exist, then the rest
ordered_present = [col for col in desired_column_order if col in df_zscore_clipped.columns]
final_column_order = metadata_cols + ordered_present + remaining_columns

# Reorder (and thereby subset) the columns
df_zscore_clipped = df_zscore_clipped[final_column_order]

# Preview the first rows to verify
print(df_zscore_clipped.head())


In [None]:
# Build the y-axis labels: one label per run of consecutive rows with the same
# cell type, placed on the middle row of that run; all other rows stay blank.
cell_types = df_zscore_clipped["cell_type"].astype(str).tolist()
n = len(cell_types)
middle_labels = [""] * n
start = 0
for i in range(1, n + 1):
    # A run ends at the end of the list or where the cell type changes
    if i == n or cell_types[i] != cell_types[start]:
        middle_labels[(start + i - 1) // 2] = cell_types[start]
        start = i

# Exclude metadata columns so only the plotted values remain
metadata_cols = ["cell_type", "Chromosome", "Start", "End"]
df_numeric = df_zscore_clipped.drop(columns=metadata_cols)

# X-axis labels: one per remaining column
x_labels = df_numeric.columns

# Diverging colormap given as (position in [0, 1], colour) stops:
# blue at the low end, pale yellow in the middle, red at the high end
cmap_colors = [
    (0.00, "#4169E1"),
    (0.15, "#6A98E1"),
    (0.25, "#A4C5F5"),
    (0.50, "#FFFFCC"),
    (0.75, "#FFD699"),
    (0.85, "#D73027"),
    (1.00, "#B40426")
]
custom_cmap = LinearSegmentedColormap.from_list("custom_cmap_smooth", cmap_colors, N=1024)

# Generate heatmap on an explicit figure/axes pair.
# The colour limits are given only through vmin/vmax (matching the [-4, 4]
# clipping of the z-scores). A separate `norm=` is deliberately not passed:
# matplotlib rejects a Normalize instance combined with vmin/vmax, which can
# raise depending on the seaborn/matplotlib versions installed. `robust=True`
# is also omitted because it has no effect once vmin and vmax are set.
fig, ax = plt.subplots(figsize=(8, 8), dpi=300)
sns.heatmap(
    df_numeric,
    cmap=custom_cmap,
    center=0,
    xticklabels=x_labels,
    yticklabels=middle_labels,
    vmin=-4,
    vmax=4,
    ax=ax,
)

# Hide the y-axis tick marks but keep their labels
ax.tick_params(axis='y', which='both', left=False, labelleft=True)

# Remove the left axis spine
ax.spines["left"].set_visible(False)

# Customize tick label font sizes and rotations
plt.setp(ax.get_xticklabels(), rotation=90, fontsize=16)
plt.setp(ax.get_yticklabels(), rotation=0, fontsize=16)

# Customize colorbar ticks and font size (the colorbar belongs to the heatmap mesh)
cbar = ax.collections[0].colorbar
cbar.set_ticks([-4, -2, 0, 2, 4])
cbar.ax.tick_params(labelsize=16)

# Add title
ax.set_title("Validation in single-cell data", fontsize=20, fontweight="bold", loc="center", pad=15)

# Save the figure (written at 600 dpi, independent of the on-screen dpi)
fig.savefig("heatmap_validation_new_markers_single_cell_samples.png", dpi=600, bbox_inches="tight")

# Show the figure
plt.show()
