In [None]:
import numpy as np
import pandas as pd
from nimfa import Nmf
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import os
import pyreadr
from sklearn.cluster import KMeans

In [None]:
################# Perform NMF-based clustering of T cells using neighboring cell compositions
#################
#################
#################
#################
#################
#################

In [None]:
#################(1). Read cell composition data
# Cell composition data was generated in Step09_1, line 500.
# The first CSV column becomes the row index of the DataFrame.
composition_csv = "/user/space/analysis/neighbor/allcancer/tcell/cellcomposition_merge.csv"
cellcomposition = pd.read_csv(composition_csv, index_col=0)
# Preview only the first five columns.
print(cellcomposition.iloc[:, :5])

In [None]:
#################(2). Keep only the numeric columns of the cell composition data
# Nothing is converted here: columns with a non-numeric dtype are dropped.
# Selecting on the generic "number" dtype (instead of only float64/int64) keeps
# numeric columns of any width, and the dropped columns are reported so that
# losing them is never silent.
cellcomposition_numeric = cellcomposition.select_dtypes(include="number")
dropped_columns = cellcomposition.columns.difference(cellcomposition_numeric.columns)
if len(dropped_columns) > 0:
    # Show at most ten names so a wide table does not flood the output.
    print(
        f"Dropped {len(dropped_columns)} non-numeric column(s), "
        f"e.g. {list(dropped_columns[:10])}"
    )
print(cellcomposition_numeric.iloc[:, :5])

In [None]:
#################(3). Evaluate the appropriate settings for K
# Fit one factorization per candidate rank (2..11) and record its Euclidean
# distance; the recorded values are plotted in step (4).
# NOTE(review): no seed is passed to Nmf, so this curve is presumably not
# reproducible between runs — confirm against the nimfa documentation.
# NOTE(review): nimfa also provides a dedicated Nsnmf class; verify that
# Nmf(..., method="nsnmf") really selects non-smooth NMF and is not ignored.
k_range = range(2, 12)
divergences = []
composition_matrix = cellcomposition_numeric.values
for k in k_range:
    fit = Nmf(composition_matrix, rank=k, method="nsnmf", max_iter=2000)()
    divergences.append(fit.distance(metric='euclidean'))  # Loss for this rank

In [None]:
#################(4). Visualize the loss to select appropriate value of K
# Blue circles joined by a line, one point per candidate rank from step (3).
fig, ax = plt.subplots()
ax.plot(k_range, divergences, 'bo-')
ax.set_xlabel('Rank (K)')
ax.set_ylabel('Divergence (Euclidean)')
ax.set_title('Elbow Method for Optimal K')
plt.show()

In [None]:
#################(5). Run NMF
def run_nmf(data, rank=8, method="nsnmf", max_iter=2000):
    """Factorize ``data`` with nimfa's ``Nmf`` and return the fitted result.

    Args:
        data: Matrix to factorize (this notebook passes
            ``cellcomposition_numeric.values``).
        rank: Number of factors. Defaults to 8, the value that was previously
            hard-coded here.
        method: Forwarded unchanged to ``Nmf`` as its ``method`` keyword.
        max_iter: Forwarded unchanged to ``Nmf`` as its ``max_iter`` keyword.

    Returns:
        Whatever calling the configured ``Nmf`` model returns; step (6) reads
        ``.coef()`` from it.
    """
    # NOTE(review): nimfa also provides a dedicated Nsnmf class; verify that the
    # `method` keyword is honoured by Nmf and not ignored.
    # NOTE(review): no seed is passed, so results presumably differ between
    # runs — confirm against the nimfa documentation.
    model = Nmf(data, rank=rank, method=method, max_iter=max_iter)
    return model()


nmf_result = run_nmf(cellcomposition_numeric.values)

In [None]:
#################(6). Extract cell-factor matrix for downstream analysis
# The coefficient matrix is transposed so that every row lines up with one column
# name of the input table (colnames(cellcomposition) in R) and every column is
# one factor, labelled Factor_1, Factor_2, ...
H_transposed = nmf_result.coef().T
factor_names = ["Factor_" + str(j) for j in range(1, H_transposed.shape[1] + 1)]
cell_by_factor = pd.DataFrame(
    H_transposed,
    index=cellcomposition_numeric.columns,
    columns=factor_names,
)
print(cell_by_factor.head())

In [None]:
#################(7). Write cell-factor matrix to .csv file for analysis in R
# The row index is written as the first column under the header "Cell_ID".
cell_by_factor_csv = "/user/space/analysis/neighbor/allcancer/tcell/3_7_cell_by_factor.csv"
cell_by_factor.to_csv(cell_by_factor_csv, index=True, index_label="Cell_ID")

In [None]:
# Reload the matrix written in step (7); presumably this lets the clustering
# below start from the saved file without re-running NMF.
cell_by_factor = pd.read_csv(
    "/user/space/analysis/neighbor/allcancer/tcell/3_7_cell_by_factor.csv",
    index_col=0,
)
# Preview the first eight columns.
print(cell_by_factor.iloc[:, :8])

In [None]:
#################(8). Perform K-means clustering

In [None]:
# Check the data (ensure there are no NaN/Inf values)
print(cell_by_factor.isnull().sum())  # Missing values per column; every count should be 0
# isnull() does not flag +/-Inf, so infinite entries are counted explicitly.
numeric_part = cell_by_factor.select_dtypes(include="number")
print(np.isinf(numeric_part).sum())  # Infinite values per numeric column; every count should be 0
print(cell_by_factor.describe())

In [None]:
# Standardization (Z-score normalization, to avoid differences in factor units)
from sklearn.preprocessing import StandardScaler

# The final K-means cell adds a 'Cluster' column to cell_by_factor in place.
# Drop it here (when present) so that re-running this cell afterwards still
# scales only the factor columns and never feeds the labels back in as a feature.
factor_values = cell_by_factor.drop(columns=["Cluster"], errors="ignore")
scaler = StandardScaler()
cell_scaled = scaler.fit_transform(factor_values)

In [None]:
#################(9). Search for the appropriate K value
# Fit K-means for k = 1..9 on the standardized factors and record inertia_ for each k.
# NOTE(review): n_init is left at the library default, which presumably depends on
# the installed scikit-learn version — confirm and pin it if results must be stable.
candidate_ks = range(1, 10)
inertia = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(cell_scaled)
    inertia.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(candidate_ks, inertia, marker='o')
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Within-cluster SSE (Inertia)")
ax.set_title("Elbow Method")
plt.show()

In [None]:
# K-means cluster: 4 clusters
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(cell_scaled)
# Store one label per row next to the factor values (adds a column in place).
cell_by_factor['Cluster'] = kmeans.labels_

# Check the number of samples in each cluster
cluster_sizes = cell_by_factor['Cluster'].value_counts()
print(cluster_sizes)

In [None]:
#################(10). Export the cluster label of every row, keeping the row names (cell ID) and the column name (Cluster)
# Only the 'Cluster' column is written; the factor values are not part of this file.
cluster_csv = "/user/space/analysis/neighbor/allcancer/tcell/3_10_cluster_kmeans_4.csv"
cell_by_factor['Cluster'].to_csv(cluster_csv, header=True, index=True)

In [None]:
# Preview the first five cluster assignments.
print(cell_by_factor['Cluster'].iloc[:5])