# Spectral Clustering as a Classification Method for Gut Microbiome Data

In [3]:
#importing libraries 
import pandas as pd
from scipy.spatial import distance
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score

In [54]:
#importing utils
from utils_spectral_clustering_model import get_distance_matrix,get_accuracy,get_accuracy_test,get_test_set_distances,grid_search_threshold

In [44]:
# See the dataset_preprocessing notebook for how this dataset was generated.
final_dataset = pd.read_csv(
    "data/microbiome_family_level.csv"
)

In [45]:
# Split the table into the target column ("class") and the remaining
# columns, which are used as features.
y = final_dataset["class"]
X = final_dataset.drop(columns=["class"])

In [46]:
# Hold out 33% of the samples for testing; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=10,
)

## Performing GridSearch to Select the Best Threshold for the Spectral Clustering Model

In [None]:
# Evaluate the candidate thresholds on the training split only.
# NOTE(review): the meaning of the last positional argument (100) is defined
# inside grid_search_threshold, which is not shown here - TODO confirm.
grid_search_results = grid_search_threshold(
    X_train,
    y_train,
    [0.5, 0.7, 0.9, 0.97],
    100,
)

In [51]:
# Saving the grid-search results (left commented out; the next cell reloads this CSV from disk)
#pd.DataFrame(grid_search_results).to_csv("experiments_results/grid_search_spectral_clustering.csv", index=None)

In [12]:
# Reload the stored grid-search results and round every numeric column to
# three decimals so the values printed on the plot stay compact.
grid_search_results = pd.read_csv(
    "experiments_results/grid_search_spectral_clustering.csv"
).round(3)

### Best Threshold Selection



In [13]:
import plotly.express as px

In [30]:
# Line chart of test accuracy against the threshold values stored in the
# grid-search results; each point is labelled with its accuracy value.
fig = px.line(
    grid_search_results,
    x="thresh",
    y="test_accuracy",
    # Plotly titles accept a small HTML subset; the bold tag has to be
    # closed with </b> (the original used a second opening <b> instead).
    title="<b>Spectral Clustering Accuracy vs Threshold</b>",
    color_discrete_sequence=[px.colors.qualitative.Bold[2]],
    text="test_accuracy",
)
# Text label that accompanies the dashed vertical marker added further down.
fig.add_annotation(
    x=0.94,
    y=0.7,
    text="Optimal Threshold",
    showarrow=False,
)
# Put the value labels below/right of the points and thicken the line.
fig.update_traces(textposition="bottom right", line=dict(width=4))
fig.update_xaxes(title="Threshold Value")
# Transparent plot background, no legend (there is a single trace).
fig.update_layout(plot_bgcolor="rgba(0,0,0,0)", showlegend=False)
# Dashed vertical line marking the threshold highlighted as optimal.
fig.add_vline(
    x=0.97,
    line_width=3,
    line_dash="dash",
    line_color=px.colors.qualitative.Dark2[7],
    opacity=0.7,
)
# Kept as the last expression: update_layout returns the figure, so the
# notebook renders it as the cell output.
fig.update_layout(font=dict(size=10))
#fig.write_html("optimal_threshold.html")

## Testing the Model

In [51]:
# Fit the spectral clustering model on the whole training split using the
# selected threshold, then score it on the held-out test split.
# NOTE(review): 0.975 is not one of the values passed to the grid search
# ([0.5, 0.7, 0.9, 0.97]) and the plot marks 0.97 as optimal - confirm which
# value is intended before relying on the reported accuracies.
optimal_threshold=0.975
# Fitting the model using the whole training set.
# Per the original author's note, the distance matrix is computed using the
# Bray-Curtis distance (inside get_distance_matrix, not shown here).
X_train_graph=get_distance_matrix(X_train)
# Distances at or above the threshold are set to 1 (in place), so they become
# a similarity of exactly 0 after the conversion on the next line.
X_train_graph[X_train_graph>=optimal_threshold]=1
X_train_graph_similarity=1-X_train_graph
# affinity="precomputed": the matrix given to fit_predict is used directly as
# the affinity (similarity) matrix; random_state is fixed for reproducibility.
clusterer = SpectralClustering(n_clusters=2, random_state=6, affinity="precomputed", n_jobs=-1)
# Cluster the training samples from their thresholded similarity matrix.
cluster_labels = clusterer.fit_predict(X_train_graph_similarity)
# get_accuracy returns the training accuracy plus a second value that is
# passed on to get_accuracy_test below - presumably the class assigned to
# each cluster; verify against the helper's definition.
accuracy_training,result_classes_per_cluster=get_accuracy(cluster_labels,y_train)
# Testing the model on the test set: distances are computed from X_test and
# X_train by get_test_set_distances (helper not shown here).
X_test_graph=get_test_set_distances(X_test,X_train) 
# Apply the same threshold that was used for the training graph.
X_test_graph[X_test_graph>=optimal_threshold]=1
# Convert distance to similarity.
X_test_graph_similarity=1-X_test_graph
# Get the test accuracy from the training/test similarities, the training
# cluster labels, the true test labels and the per-cluster class result.
test_accuracy=get_accuracy_test(X_train_graph_similarity, X_test_graph_similarity, cluster_labels, y_test, result_classes_per_cluster)

In [52]:
# Report the accuracy obtained on the training split and on the test split.
print(
    f"training accuracy:{accuracy_training}, test accuracy:{test_accuracy}"
)

training accuracy:0.661, test accuracy:0.621


## Training Various Machine Learning Models and Conducting a Comparative Analysis with Spectral Clustering

In [None]:
# TODO perform classification task using various machine learning models