# Spectral Clustering as a Classification Method for Gut Microbiome Data

In [127]:
#importing libraries 
import pandas as pd
from scipy.spatial import distance
from utils_preprocessing import  merge_dataset
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import train_test_split
import numpy as np

In [128]:
# Build the working frame from the relative-abundance table and its metadata file.
# NOTE(review): level=5 presumably selects the family rank (columns are prefixed f__) — confirm in utils_preprocessing.
dataset = merge_dataset(
    path_otu="data/curatedMD_diseased_train/20221207_diseased_taxrelabund_train.csv",
    path_metadata="data/curatedMD_diseased_train/20221207_diseased_metadata_train_18_columns.csv",
    level=5,
)

In [129]:
# Preview the first rows of the frame returned by merge_dataset.
dataset.head()

Unnamed: 0,sample_id,study_name,subject_id,body_site,antibiotics_current_use,study_condition,disease,age,age_category,country,...,f__Sutterellaceae,f__Synergistaceae,f__Tannerellaceae,f__Thermaceae,f__Tissierellia_unclassified,f__Veillonellaceae,f__Vibrionaceae,f__Victivallaceae,f__Xanthomonadaceae,f__Yersiniaceae
0,SID31004,FengQ_2015,SID31004,stool,missing,CRC,CRC;fatty_liver;hypertension,64,adult,AUT,...,0.0,0.0,0.0,0.0,0.0,0.1302,0.0,0.0,0.0,0.0
1,SID31030,FengQ_2015,SID31030,stool,missing,adenoma,adenoma;fatty_liver;hypertension,70,senior,AUT,...,0.0,0.0,0.28971,0.0,0.0,0.18229,0.0,0.0,0.0,0.0
2,SID31159,FengQ_2015,SID31159,stool,missing,CRC,CRC,73,senior,AUT,...,0.0,0.0,0.45937,0.0,0.0,1.14717,0.0,0.0,0.0,0.0
3,SID31223,FengQ_2015,SID31223,stool,missing,CRC,CRC,65,adult,AUT,...,0.0,0.05291,0.32816,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SID31237,FengQ_2015,SID31237,stool,missing,CRC,CRC;fatty_liver;hypertension,67,senior,AUT,...,0.0,0.0,0.02137,0.0,0.0,0.00266,0.0,0.0,0.0,0.0


In [130]:
# Count the rows per study_condition value to see how the classes are distributed.
dataset['study_condition'].value_counts()

study_condition
IBD        654
CRC        555
IGT        233
T2D        198
adenoma    171
ACVD       171
Name: count, dtype: int64

In [131]:
# Keep only the two conditions of interest and balance them by downsampling
# the larger IBD group to the size of the CRC group.
ibd_class = dataset[dataset['study_condition'] == 'IBD']
crc_class = dataset[dataset['study_condition'] == 'CRC']

# The target size is derived from the CRC group instead of being hard-coded,
# so the classes stay balanced if the input data changes. sample() raises
# ValueError if IBD has fewer rows than CRC (sampling is without replacement).
# The fixed seed keeps the subsample reproducible.
ibd_class_subset = ibd_class.sample(n=len(crc_class), random_state=42)

# Combine the subsets of both classes
final_dataset = pd.concat([ibd_class_subset, crc_class])

# Shuffle the rows so they are no longer grouped by class; the original row
# labels are kept at this point.
final_dataset = final_dataset.sample(frac=1, random_state=42)

In [132]:
# Display the balanced, shuffled frame (pandas truncates the rendering of large frames).
final_dataset

Unnamed: 0,sample_id,study_name,subject_id,body_site,antibiotics_current_use,study_condition,disease,age,age_category,country,...,f__Sutterellaceae,f__Synergistaceae,f__Tannerellaceae,f__Thermaceae,f__Tissierellia_unclassified,f__Veillonellaceae,f__Vibrionaceae,f__Victivallaceae,f__Xanthomonadaceae,f__Yersiniaceae
1697,SAMD00114897,YachidaS_2019,sub_10670,stool,missing,CRC,CRC,69,senior,JPN,...,0.00000,0.0,0.38378,0.0,0.0,29.50806,0.0,0.0,0.0,0.0
1224,V1_UC26_4,NielsenHB_2014,V1_UC26,stool,missing,IBD,IBD,36,adult,ESP,...,0.00000,0.0,1.89772,0.0,0.0,0.91117,0.0,0.0,0.0,0.0
355,CSM7KOTC,HMP_2019_ibdmdb,C3023,stool,no,IBD,IBD,60,adult,USA,...,0.62574,0.0,0.00000,0.0,0.0,0.72030,0.0,0.0,0.0,0.0
1480,mix35_mix36-N712-S507_GTAGAGGA-AAGGAGTA,ThomasAM_2019_c,mix35_mix36-N712-S507_GTAGAGGA-AAGGAGTA,stool,missing,CRC,CRC,28,adult,JPN,...,0.00000,0.0,2.58337,0.0,0.0,0.28302,0.0,0.0,0.0,0.0
1507,MMRS42780924ST-27-0-0,VogtmannE_2016,MMRS42780924ST-27-0-0,stool,missing,CRC,CRC,65,adult,USA,...,0.01482,0.0,2.86593,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168,O2_UC37_2,NielsenHB_2014,O2_UC37,stool,missing,IBD,IBD,44,adult,ESP,...,0.83215,0.0,0.56393,0.0,0.0,8.86506,0.0,0.0,0.0,0.0
200,CSM67UFV,HMP_2019_ibdmdb,C3006,stool,no,IBD,IBD,32,adult,USA,...,0.00000,0.0,0.36014,0.0,0.0,0.00615,0.0,0.0,0.0,0.0
1884,SZAXPI017456-23,YuJ_2015,SZAXPI017456-23,stool,missing,CRC,CRC,77,senior,CHN,...,0.00000,0.0,0.10567,0.0,0.0,0.45300,0.0,0.0,0.0,0.0
1959,CCIS72607085ST-4-0,ZellerG_2014,FR-824,stool,missing,CRC,CRC,74,senior,FRA,...,0.00000,0.0,5.88908,0.0,0.0,4.27232,0.0,0.0,0.0,0.0


In [133]:
# Copy the condition label into a trailing 'class' column to use as the target.
final_dataset['class'] = final_dataset['study_condition']

In [134]:
# Drop the leading metadata columns and keep everything from position 18 onward.
# NOTE(review): assumes the first 18 columns are metadata (the metadata file name
# mentions 18 columns) — confirm against merge_dataset.
N_METADATA_COLUMNS = 18

# The slice is positional, so running this cell twice would silently discard
# 18 more columns each time. Only slice while a metadata column is still present.
if 'study_condition' in final_dataset.columns:
    final_dataset = final_dataset.iloc[:, N_METADATA_COLUMNS:]

In [135]:
# Replace the shuffled row labels with a fresh 0..n-1 index; the old labels are discarded.
final_dataset=final_dataset.reset_index(drop=True)

In [12]:
# Check the class balance of the prepared frame.
final_dataset['class'].value_counts()

class
CRC    555
IBD    555
Name: count, dtype: int64

In [142]:
# Write the prepared frame to a CSV file (path relative to the working directory), omitting the row index.
final_dataset.to_csv("microbiome_family_level.csv", index=False)

In [None]:
# Separate the target labels from the remaining (feature) columns.
y = final_dataset['class']
X = final_dataset.drop(columns='class')

In [None]:
# TODO: split X and y to create a hold-out set that will be used to test the final model with the optimal thresholds

## Spectral Clustering Analysis

In [140]:
def get_distance_matrix(dataset, metric="braycurtis"):
    """Return the square matrix of pairwise distances between the rows of ``dataset``.

    Parameters
    ----------
    dataset : array-like
        One observation per row; passed directly to ``scipy.spatial.distance.pdist``.
    metric : str, default "braycurtis"
        Distance metric name forwarded to ``pdist``.

    Returns
    -------
    numpy.ndarray
        Symmetric (n_rows, n_rows) distance matrix in which every NaN entry
        has been replaced by 0.
    """
    condensed = distance.pdist(dataset, metric=metric)
    square = distance.squareform(condensed)
    # Undefined distances (NaN) are treated as zero distance.
    square[np.isnan(square)] = 0
    return square

def get_accuracy(cluster_labels: np.ndarray, y_train: pd.core.series.Series)->float:
  """Score a clustering against known classes using majority-class purity.

  Each cluster is assigned the class that occurs most often among its
  members; the score is the fraction of samples that belong to the majority
  class of their own cluster. Nothing forces different clusters to map to
  different classes, so two clusters may both be credited to the same class.

  Parameters
  ----------
  cluster_labels : array-like
      Cluster id of every sample.
  y_train : pandas.Series or other iterable
      True class of every sample, in the same order as ``cluster_labels``.

  Returns
  -------
  float
      Purity rounded to three decimals, or NaN when there are no samples.

  Raises
  ------
  ValueError
      Raised by pandas when the two inputs have different lengths.
  """
  assignments = pd.DataFrame({'cluster': list(cluster_labels), 'class': list(y_train)})
  # number of samples for every (cluster, class) pair that actually occurs
  pair_counts = assignments.groupby(by=["cluster", "class"]).size()
  total_number_of_samples = int(pair_counts.sum())
  if total_number_of_samples == 0:
    # nothing to score; the ratio is undefined
    return float("nan")
  # size of the majority class inside each cluster, summed over clusters
  correctly_predicted = int(pair_counts.groupby(level="cluster").max().sum())
  return float(np.round(correctly_predicted / total_number_of_samples, 3))

def _cluster_and_score(features, labels, thresh, seed):
  """Spectrally cluster one subset into two groups and score it with get_accuracy."""
  # distance matrix is computed using braycurtis distance (get_distance_matrix default)
  graph = get_distance_matrix(features)
  # distances at or above the threshold are saturated to 1, i.e. similarity 0
  graph[graph >= thresh] = 1
  similarity = 1 - graph
  clusterer = SpectralClustering(n_clusters=2, random_state=seed, affinity="precomputed", n_jobs=-1)
  return get_accuracy(clusterer.fit_predict(similarity), labels)


def iterated_hold_out_clustering(X:pd.core.frame.DataFrame,y:pd.core.series.Series,iterations:int,thresh:float)->dict:
  """Repeat a 67/33 split and score a two-cluster spectral clustering on each part.

  For iteration ``i`` the data is split with ``random_state=i``. The training
  part and the test part are then clustered separately: the clusterer fitted
  on the training part is not reused for the test part, so the test score is
  the score of a fresh clustering of the test samples.

  Parameters
  ----------
  X : pandas.DataFrame
      Feature table, one sample per row.
  y : pandas.Series
      Class label of every row of ``X``.
  iterations : int
      Number of splits to evaluate; seeds 0..iterations-1 are used.
  thresh : float
      Distances greater than or equal to this value are set to 1 before the
      distance matrix is converted to a similarity matrix (1 - distance).

  Returns
  -------
  dict
      ``{'training_accuracy': [...], 'test_accuracy': [...]}`` with one
      get_accuracy score per iteration in each list.
  """
  results={'training_accuracy':[], 'test_accuracy':[]}
  for i in range(iterations):
    # splitting dataset in training and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=i)
    results['training_accuracy'].append(_cluster_and_score(X_train, y_train, thresh, i))
    # same procedure, run independently on the held-out part
    results['test_accuracy'].append(_cluster_and_score(X_test, y_test, thresh, i))

  return results

In [138]:
# TODO: write a function that tries different values for the threshold
def threshold_grid_search():
  """Placeholder for the planned threshold search; does nothing yet and returns None."""
  return None

In [139]:
# Two hold-out repetitions with the distance threshold set to 0.957.
iterated_hold_out_clustering(X, y, iterations=2, thresh=0.957)

{'training_accuracy': [0.643, 0.646], 'test_accuracy': [0.646, 0.61]}

In [None]:
# TODO: produce the charts showing how the performance varies with different threshold values

In [None]:
# Use the best parameters to test the final clustering 
