# Dimensionality reduction and visualization of original marker-based cell types

## Load required libraries

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import re
import umap

## Load normalized marker counts

In [None]:
# Read the normalized marker-count table from disk; the first CSV column
# is used as the row index of the resulting DataFrame.
normalized_df = pd.read_csv(
    "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/normCounts_cell_type_markers.csv",
    index_col=0,
)

In [None]:
# print(normalized_df.head())

In [None]:
# Function to rename columns
def rename_columns(col):
    """Map a raw column name onto its cell-type group label.

    Names containing 'CD4' or 'Tregs' become 'CD4_Tcells', names containing
    'CD8' become 'CD8_Tcells' (the CD4/Tregs check wins when both match).
    Any other name is cut at its first '.', keeping only the leading part.
    """
    if any(tag in col for tag in ('CD4', 'Tregs')):
        return 'CD4_Tcells'
    if 'CD8' in col:
        return 'CD8_Tcells'

    # No T-cell tag: keep everything before the first dot (whole name if none).
    prefix, _, _ = col.partition('.')
    return prefix

# Replace every column label with its cell-type group name
normalized_df.columns = list(map(rename_columns, normalized_df.columns))

# Check the resulting DataFrame
# print(normalized_df.head())

In [None]:
# Remove every column whose label contains 'Monocytes'
is_monocyte_column = normalized_df.columns.str.contains('Monocytes')
normalized_df = normalized_df.loc[:, ~is_monocyte_column]

# Check result
# print(normalized_df)

In [None]:
# PCA calculation
# normalized_df is transposed so that each of its columns becomes one
# observation (row) for the PCA.
# random_state is fixed because scikit-learn's default 'auto' solver policy may
# select the randomized SVD for larger matrices; without a seed the components
# (and the silhouette score and plots derived from them) could differ between
# runs. The seed matches the one used for UMAP further down.
pca = PCA(n_components=3, random_state=42)
pca_results = pca.fit_transform(normalized_df.T)

# Create DataFrame with PCA results (one row per column of normalized_df,
# in the same order, with a default integer index)
pca_df = pd.DataFrame(pca_results, columns=['PC1', 'PC2', 'PC3'])

# Extract cell type labels: the part of each column name before the first '.'
pca_df['Cell_type_group'] = [col.split('.')[0] for col in normalized_df.columns]

# Generate distinct color palette: one "husl" color per cell type, assigned in
# order of first appearance
unique_types = pca_df['Cell_type_group'].unique()
palette = sns.color_palette("husl", len(unique_types))
color_dict = dict(zip(unique_types, palette))

## Scale from -1 to 1

In [None]:
# Min-max rescale each principal component independently onto [-1, 1]
pc_columns = ['PC1', 'PC2', 'PC3']
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_pca = scaler.fit_transform(pca_df[pc_columns])

# Build the scaled table and carry the cell-type labels over from pca_df
pca_df_scaled = pd.DataFrame(scaled_pca, columns=pc_columns).assign(
    Cell_type_group=pca_df['Cell_type_group']
)

# Quick look at the first rows
print(pca_df_scaled.head())


## Compute the Silhouette Score

In [None]:
# Silhouette score of the cell-type labels in the scaled 3-component PCA space
silhouette_features = pca_df_scaled[['PC1', 'PC2', 'PC3']]
silhouette_labels = pca_df_scaled['Cell_type_group']
overall_silhouette = silhouette_score(silhouette_features, silhouette_labels)

print(f"Overall Silhouette Score: {overall_silhouette:.3f}")


## 3D PCA plot

In [None]:
# Set up a 12x9 figure holding a single 3D axes
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=30, azim=-45)  # camera elevation / azimuth

# White background for the axes first, then the figure patch
for surface in (ax, fig.patch):
    surface.set_facecolor('white')

# Faint gray grid
ax.grid(color='gray', linestyle='-', linewidth=0.4, alpha=0.4)

# One scatter series per cell type, each point followed by a thin vertical
# line dropped from the point down to z = -1.
# PC2 is drawn on the x axis and PC1 on the y axis.
marker_style = dict(s=40, alpha=0.9, edgecolors='k', linewidths=0.6)
for ctype in unique_types:
    idx = pca_df_scaled['Cell_type_group'] == ctype
    xs = pca_df_scaled.loc[idx, 'PC2']
    ys = pca_df_scaled.loc[idx, 'PC1']
    zs = pca_df_scaled.loc[idx, 'PC3']

    ax.scatter(xs, ys, zs, label=ctype, color=color_dict[ctype], **marker_style)

    for x, y, z in zip(xs, ys, zs):
        ax.plot([x, x], [y, y], [-1, z], color='gray', linewidth=0.2, alpha=0.4)

# Same tick positions and text on all three axes
ticks = [-1, -0.5, 0, 0.5, 1]
tick_text = [str(t) for t in ticks]
axis_tick_setters = (
    (ax.set_xticks, ax.set_xticklabels),
    (ax.set_yticks, ax.set_yticklabels),
    (ax.set_zticks, ax.set_zticklabels),
)
for set_ticks, set_ticklabels in axis_tick_setters:
    set_ticks(ticks)
    set_ticklabels(tick_text)

# Bold axis labels with a little extra padding
axis_label_style = dict(fontsize=16, fontweight='bold', labelpad=10)
ax.set_xlabel('PC2', **axis_label_style)
ax.set_ylabel('PC1', **axis_label_style)
ax.set_zlabel('PC3', **axis_label_style)

# Centered title
plt.title("PCA of original markers", fontsize=20, fontweight="bold", loc="center")

# Legend placed outside the axes, without a frame
legend = ax.legend(
    title='Cell types',
    bbox_to_anchor=(1.10, 0.85),
    loc='upper left',
    fontsize=20,
    title_fontsize=20,
    frameon=False,
    markerscale=2.5,
)

# Print the silhouette score inside the axes area (axes-fraction coordinates)
ax.text2D(
    0.15,
    0.75,
    f"Silhouette score: {overall_silhouette:.2f}",
    transform=ax.transAxes,
    fontsize=16,
    ha='left',
    va='top',
)

# Uniform tick-label font size on x, y and z
for tick in [*ax.get_xticklabels(), *ax.get_yticklabels(), *ax.get_zticklabels()]:
    tick.set_fontsize(16)

# Write the figure to a PNG file
plt.savefig("PCA_original_markers.png", dpi=600, bbox_inches="tight")

# Display the figure
plt.show()


## UMAP plot

In [None]:
# Fixed list of 20 hex colors used for the UMAP scatter
distinct_colors = [
    "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00",
    "#FFFF33", "#A65628", "#F781BF", "#999999", "#66C2A5",
    "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F",
    "#E5C494", "#B3B3B3", "#1B9E77", "#D95F02", "#7570B3",
]

# Map each cell type to a color, wrapping around the list if there are more
# cell types than colors. Note this rebinds the color_dict name.
num_cell_types = len(unique_types)
palette_size = len(distinct_colors)
color_dict = {
    ctype: distinct_colors[position % palette_size]
    for position, ctype in enumerate(unique_types)
}

# Fit a 2D UMAP on every column of pca_df_scaled except the label column
umap_model = umap.UMAP(
    n_components=2,
    n_neighbors=30,
    min_dist=0.9,
    metric='euclidean',
    random_state=42,
)
umap_input = pca_df_scaled.drop(columns=['Cell_type_group'])
embedding = umap_model.fit_transform(umap_input)

# Copy of the scaled PCA table with the two UMAP coordinates appended
umap_df = pca_df_scaled.assign(UMAP1=embedding[:, 0], UMAP2=embedding[:, 1])

# New 14x10 figure
plt.figure(figsize=(14, 10))

# One scatter series per cell type so each gets its own legend entry
point_style = dict(s=120, alpha=0.95, edgecolors='black', linewidths=1, marker='o')
for ctype in unique_types:
    idx = umap_df['Cell_type_group'] == ctype
    plt.scatter(
        umap_df.loc[idx, 'UMAP1'],
        umap_df.loc[idx, 'UMAP2'],
        label=ctype,
        color=color_dict[ctype],
        **point_style,
    )

# Bold axis labels
axis_label_style = dict(fontsize=20, fontweight='bold')
plt.xlabel("UMAP1", **axis_label_style)
plt.ylabel("UMAP2", **axis_label_style)

# Title with extra padding above the axes
plt.title("UMAP of original markers", fontsize=30, fontweight="bold", pad=20)

# Frameless legend placed to the right of the axes
plt.legend(
    title="Cell types",
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
    fontsize=20,
    title_fontsize=20,
    frameon=False,
)

# Write the figure to a PNG file
plt.savefig("UMAP_original_markers.png", dpi=600, bbox_inches="tight")

# Display the figure
plt.show()
