# 🧬 Oral Microbiome Analysis: Biomarker Discovery for OSCC

This Colab notebook demonstrates how to load 16S rRNA feature data, visualize taxonomic profiles, and apply machine learning models to classify samples as Healthy or OSCC.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve


## 📂 Load the Feature Table and Metadata

In [None]:
# Upload the input files through Colab's file picker when running in Colab.
# Outside Colab the google.colab package does not exist, so fall back to an
# empty upload mapping and rely on the files already being in the working
# directory; this keeps "Restart & Run All" usable in a local Jupyter session.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
    print("google.colab is not available; expecting the input files in the working directory.")
else:
    uploaded = files.upload()
# Example filenames: feature-table.tsv, metadata.csv


In [None]:
# Adjust this section according to uploaded filenames
feature_table = pd.read_csv("feature-table.tsv", sep="\t", index_col=0)
metadata = pd.read_csv("metadata.csv")

# Join metadata to feature table.
# The feature table is transposed so that its columns become the row index,
# which is then inner-joined against the metadata 'SampleID' column.
data = feature_table.T.merge(metadata, left_index=True, right_on='SampleID')
if data.empty:
    # An inner join with no matching IDs would otherwise only surface later
    # as an obscure error inside train_test_split.
    raise ValueError(
        "No samples matched between the feature table columns and metadata['SampleID']."
    )

# Keep only the columns that originate from the feature table. Dropping just
# 'SampleID' and 'Class' would let any additional metadata column slip into
# the model as if it were a microbial feature.
is_feature_column = data.columns.isin(feature_table.index)
X = data.loc[:, is_feature_column]  # microbial features
y = data['Class']  # Healthy or OSCC


## 📊 Visualize Top Taxa

In [None]:
# Rank features by their total count across all samples and keep the ten largest.
feature_totals = X.sum(axis=0)
top_taxa = feature_totals.sort_values(ascending=False).head(10)

# Draw the ranking as a bar chart on an explicitly created Axes.
fig, ax = plt.subplots()
top_taxa.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title("Top 10 Abundant Microbial Taxa")
ax.set_ylabel("Total Counts")
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
plt.show()


## 🤖 Random Forest Classifier

In [None]:
# Hold out 30% of the samples for evaluation. stratify=y keeps the class
# proportions the same in the train and test partitions, so a small or
# imbalanced cohort cannot end up with a test set missing one of the classes
# (which would make the report and ROC curve meaningless).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fixed random_state makes the forest reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Fix the row/column order of the matrix to the classifier's own class order
# and reuse that same list for the tick labels. Hard-coded tick labels can be
# swapped or mismatched in size when the labels in the data differ from the
# assumed ones or a class is absent from the test split.
class_labels = list(rf.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
# predict_proba columns follow rf.classes_, so look up the positive class
# explicitly instead of assuming it is column 1.
positive_class = 'OSCC'
positive_idx = list(rf.classes_).index(positive_class)  # ValueError if the label was never seen
y_probs = rf.predict_proba(X_test)[:, positive_idx]

# Binarize the ground truth once (1 = OSCC, 0 = everything else) and reuse it
# for both the curve and the AUC.
y_true = (y_test == positive_class).astype(int)
fpr, tpr, _ = roc_curve(y_true, y_probs)
roc_auc = roc_auc_score(y_true, y_probs)

plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend()
plt.grid()
plt.show()
