# Spectral Clustering as a Classification Method for Gut Microbiome Data

In [3]:
#importing libraries 
import pandas as pd
from scipy.spatial import distance
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score

In [4]:
#importing utils
from utils_spectral_clustering_model import get_distance_matrix,get_accuracy,get_accuracy_test,get_test_set_distances,grid_search_threshold

In [5]:
# The dataset_preprocessing notebook documents how this dataset was generated.
dataset_path = "data/microbiome_family_level.csv"
final_dataset = pd.read_csv(dataset_path)

In [6]:
# Split the table into the label column ('class') and the remaining feature columns.
label_column = 'class'
y = final_dataset[label_column]
X = final_dataset.drop(columns=[label_column])

In [7]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=10,
)

## Performing GridSearch to Select the Best Threshold for the Spectral Clustering Model

In [None]:
# Candidate thresholds evaluated by the grid search on the training split.
candidate_thresholds = [0.5, 0.7, 0.9, 0.97]
# NOTE(review): the last argument (100) is presumably the number of repetitions
# per threshold -- confirm against utils_spectral_clustering_model.
grid_search_results = grid_search_threshold(X_train, y_train, candidate_thresholds, 100)

In [51]:
#saving the results of the experiments (uncomment to regenerate the CSV file that the next cell reads)
#pd.DataFrame(grid_search_results).to_csv("experiments_results/grid_search_spectral_clustering.csv", index=None)

In [12]:
# Load the saved grid-search results and round the numeric columns to three decimals.
grid_search_results = pd.read_csv(
    "experiments_results/grid_search_spectral_clustering.csv"
).round(3)

### Best Threshold Selection



In [26]:
import plotly.express as px

In [30]:
# Line chart of the "test_accuracy" column against the "thresh" column of the
# grid-search results, with each point labelled by its accuracy value.
fig = px.line(
    grid_search_results,
    x="thresh",
    y="test_accuracy",
    # Plotly titles accept a small HTML subset; the bold tag must be closed with </b>.
    title="<b>Spectral Clustering Accuracy vs Threshold</b>",
    color_discrete_sequence=[px.colors.qualitative.Bold[2]],
    text="test_accuracy",
)
# Text label placed just to the left of the dashed vertical line drawn below.
fig.add_annotation(x=0.94, y=0.7, text="Optimal Threshold", showarrow=False)
# Thicker line, with the point labels rendered below and to the right of each point.
fig.update_traces(textposition="bottom right", line=dict(width=4))
fig.update_xaxes(title="Threshold Value")
# Dashed vertical line marking the threshold value 0.97.
fig.add_vline(x=0.97, line_width=3, line_dash="dash", line_color=px.colors.qualitative.Dark2[7], opacity=0.7)
# Transparent plot background, no legend, small font. This call is the last
# expression of the cell, so the figure it returns is what the notebook displays.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', showlegend=False, font=dict(size=10))
#fig.write_html("optimal_threshold.html")

## Testing the Model

In [51]:
# Fit spectral clustering on the whole training split, then score it on the test split.
# NOTE(review): the grid search above evaluated 0.5/0.7/0.9/0.97 and the plot marks 0.97,
# but this cell uses 0.975 -- confirm which threshold is intended.
optimal_threshold=0.975
#fitting the model using the whole training set
# NOTE(review): the original author's note says the distance matrix is computed with the
# Bray-Curtis distance -- confirm in get_distance_matrix.
X_train_graph=get_distance_matrix(X_train)
# distances at or above the threshold are overwritten with 1 (in place), so they become
# similarity 0 after the conversion on the next line
X_train_graph[X_train_graph>=optimal_threshold]=1
#convert distance to similarity
X_train_graph_similarity=1-X_train_graph
# two clusters; the similarity matrix is passed directly as a precomputed affinity
clusterer = SpectralClustering(n_clusters=2, random_state=6, affinity="precomputed", n_jobs=-1)
# cluster assignment for every training sample
cluster_labels = clusterer.fit_predict(X_train_graph_similarity)
# NOTE(review): get_accuracy presumably maps each cluster to a class and scores that
# mapping against y_train -- confirm in utils_spectral_clustering_model
accuracy_training,result_classes_per_cluster=get_accuracy(cluster_labels,y_train)
#testing the model on the test set
# NOTE(review): presumably the distances between test samples and training samples -- confirm
X_test_graph=get_test_set_distances(X_test,X_train) 
#use the threshold
X_test_graph[X_test_graph>=optimal_threshold]=1
#convert distance to similarity
X_test_graph_similarity=1-X_test_graph
#get test accuracy
test_accuracy=get_accuracy_test(X_train_graph_similarity, X_test_graph_similarity, cluster_labels, y_test, result_classes_per_cluster)

In [52]:
# Report the spectral-clustering training and test accuracies.
accuracy_report = f"training accuracy:{accuracy_training}, test accuracy:{test_accuracy}"
print(accuracy_report)

training accuracy:0.661, test accuracy:0.621


## Training Various Machine Learning Models and Conducting a Comparative Analysis with Spectral Clustering

In [9]:
from utils_classification_model import iterated_hold_out_nn

In [None]:
# NOTE(review): 1000 is presumably the number of hold-out iterations (the comparison
# chart title mentions 1000 iterations) -- confirm in utils_classification_model.
n_hold_out_iterations = 1000
results_neural_networks = iterated_hold_out_nn(X_train, y_train, n_hold_out_iterations)

In [15]:
# Average the per-iteration training and test accuracies of the hold-out runs.
accuracy_training_hold_out, accuracy_test_hold_out = (
    np.mean(results_neural_networks[metric])
    for metric in ('training_accuracy', 'test_accuracy')
)

In [17]:
# Summarise the mean hold-out accuracies, rounded to three decimals.
hold_out_report = f"Hold-out result MLPClassifier: \n accuracy_training:{round(accuracy_training_hold_out,3)}, accuracy_test:{round(accuracy_test_hold_out,3)} "
print(hold_out_report)

Hold-out result MLPClassifier: 
 accuracy_training:0.65, accuracy_test:0.633 


## Training and testing the NN (MLPClassifier) model

In [8]:
from sklearn.neural_network import MLPClassifier

In [18]:
# Train an MLPClassifier on the training split and score it on both splits.
# Two hidden layers (99 and 50 units), ReLU activations, Adam solver, fixed seed.
# NOTE(review): max_iter=3 is unusually low and will normally end training before
# convergence -- confirm that this is intentional.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(99, 50),
    activation='relu',
    solver='adam',
    alpha=1e-5,
    random_state=0,
    max_iter=3,
)
# train the neural network
mlp_classifier.fit(X_train, y_train)
# predict on the training set
y_pred_training = mlp_classifier.predict(X_train)
# accuracy on the training set; accuracy_score expects (y_true, y_pred).
# NOTE: this reassigns accuracy_training, which the spectral-clustering cell also assigns.
accuracy_training = accuracy_score(y_train, y_pred_training)
# predict on the test set
y_pred = mlp_classifier.predict(X_test)
# accuracy on the test set
accuracy_test = accuracy_score(y_test, y_pred)



In [19]:
# Report the MLPClassifier accuracies, rounded to three decimals.
mlp_report = f"MLPClassifier training accuracy: {round(accuracy_training,3)} and test accuracy: {round(accuracy_test,3)}"
print(mlp_report)

MLPClassifier training accuracy: 0.697 and test accuracy: 0.649


In [20]:
# Display the unrounded training accuracy currently stored in accuracy_training.
accuracy_training

0.6971736204576043

In [21]:
# Display the unrounded test accuracy currently stored in accuracy_test.
accuracy_test

0.6485013623978202

## Comparison between MLPClassifier and Spectral Clustering (Generate Charts)

### Iterated Hold-Out Procedure Comparison

In [36]:
# Accuracies from the iterated hold-out procedure; each list is ordered like 'model'.
# The MLPClassifier test accuracy is 0.633, matching the hold-out result printed
# earlier in this notebook.
# NOTE(review): the Spectral Clustering figures are hard-coded -- confirm them against
# the saved grid-search results.
data = {
    'model': ['Spectral Clustering', 'MLPClassifier'],
    'training_accuracy': [0.662, 0.650],
    'test_accuracy': [0.665, 0.633],
}

In [46]:
# Long-format table: one row per (model, accuracy type) pair, models sorted by name.
df = pd.DataFrame(data).sort_values(by=['model'], ascending=True)
df_melted = df.melt(id_vars='model', var_name='accuracy_type', value_name='accuracy')

# Two colours, repeated twice to form the four-entry colour sequence.
bar_colors = [px.colors.qualitative.Bold[2], px.colors.qualitative.Dark2[7]] * 2

# Grouped horizontal bars coloured by accuracy type, with the values printed on the bars.
fig = px.bar(
    df_melted,
    x='accuracy',
    y='model',
    color='accuracy_type',
    color_discrete_sequence=bar_colors,
    barmode='group',
    orientation='h',
    text_auto=True,
    labels={'accuracy': 'Accuracy', 'model': 'Model', 'accuracy_type': 'Accuracy Type'},
    title='Comparison of Training and Test Accuracies Iterated Hold-out Procedure (1000 iterations)',
)
# Transparent plot background and small font.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', font=dict(size=10))
fig.show()

In [47]:
#fig.write_html("charts/iterated_hold_out_model_selection_procedure_comparison.html")

### Final Comparison with the optimal models

In [51]:
# Test-set accuracies of the two final models; the list is ordered like 'model'.
# Spectral Clustering: 0.621 and MLPClassifier: 0.649, as printed by the respective
# "test accuracy" cells earlier in this notebook (0.633 is the MLP hold-out figure,
# not its test-set accuracy).
data = {
    'model': ['Spectral Clustering', 'MLPClassifier'],
    'test accuracy': [0.621, 0.649],
}

In [52]:
# Horizontal bar chart of the test-set accuracy of each model.
df = pd.DataFrame(data)
fig = px.bar(
    df,
    x='test accuracy',
    y='model',
    color_discrete_sequence=[px.colors.qualitative.Bold[2]],
    orientation='h',
    text_auto=True,
    # Label keys must match the DataFrame column names; the x column is
    # 'test accuracy', so that is the key that carries the axis label.
    labels={'test accuracy': 'Accuracy', 'model': 'Model'},
    title='Comparison of Spectral Clustering and MLP on the Test Set',
)
# Transparent plot background and small font.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', font=dict(size=10))
fig.show()

In [53]:
#fig.write_html("charts/comparison_two_models_test_set.html")