In [None]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# Input file locations
SAMPLE_GROUP_PATH = "data/sample.group"
SPECIES_PROFILE_PATH = "data/mpa4_species.profile"  # species-level abundance profile

# Read the sample group metadata table (tab-delimited text)
print("Loading sample group data...")
sample_group = pd.read_csv(SAMPLE_GROUP_PATH, delimiter="\t")

Loading sample group data...


In [None]:
# Read the species-level abundance profile (tab-delimited text)
print("Loading species-level abundance data...")
species_abundance = pd.read_csv(SPECIES_PROFILE_PATH, delimiter="\t")

Loading species-level abundance data...


In [None]:
# Reshape the wide abundance table into long form: "name" is kept as the
# identifier, every other column header becomes a value in "Sample", and the
# cell values land in "Abundance".
species_abundance_melted = pd.melt(
    species_abundance,
    id_vars=["name"],
    var_name="Sample",
    value_name="Abundance",
)

# Show the column names of both inputs for a quick sanity check
abundance_columns = list(species_abundance.columns)
group_columns = list(sample_group.columns)
print("Species abundance columns:", abundance_columns)
print("Sample group columns:", group_columns)

Species abundance columns: ['name', '850945', '907005', '907544', '907995', '910252', '917369', '920071', 'A0002', 'A0005', 'A0006', 'A0010', 'A0011', 'A0019', 'A0020', 'A0021', 'A0022', 'A0027', 'A0032', 'A324942', 'A327859', 'A673425', 'A833717', 'B182623', 'B266801', 'B481792', 'B489623', 'B506558', 'B899205', 'B921006', 'B935810', 'C0002', 'C0003', 'C0004', 'C0005', 'C0006', 'C0007', 'C0010', 'C0011', 'C0014', 'C0015', 'C0017', 'C0019', 'C0020', 'C0021', 'C0023', 'C0024', 'C0027', 'C0028', 'C0029', 'C0031', 'C0032', 'C0033', 'C0035', 'C0036', 'C0037', 'C0038', 'C0041', 'C0042', 'C0043', 'C0044', 'C0046', 'C0047', 'C0048', 'C0049', 'C0050', 'C0051', 'C0052', 'C0053', 'C0054', 'C0055', 'C0056', 'C0057', 'C0058', 'C0060', 'C0061', 'C0063', 'C0068', 'C0069', 'C0070', 'C0071', 'C0072', 'C0075', 'C0076', 'C0078', 'C0079', 'C0080', 'C0082', 'C0083', 'C0085', 'C0086', 'C0088', 'C0089', 'C0090', 'C0091', 'C0092', 'C0093', 'C0094', 'C0095', 'C0098', 'C0099', 'C01', 'C0100', 'C0102', 'C0103',

In [None]:

# Merge species abundance data with sample metadata.
#
# `species_abundance` is wide: a "name" column plus one column per sample ID,
# so it has no "Sample" column and merging it directly raises KeyError: 'Sample'.
# Pivot it so every row is one sample and every column is one "name" entry,
# then expose the sample IDs (the former column headers) as a "Sample" column.
species_by_sample = species_abundance.set_index("name").T
species_by_sample.index.name = "Sample"
species_by_sample.columns.name = None
species_by_sample = species_by_sample.reset_index()

# NOTE(review): assumes the metadata table carries the sample IDs in a column
# called "Sample" -- fail with a readable message if it does not.
if "Sample" not in sample_group.columns:
    raise KeyError(
        "sample_group has no 'Sample' column to merge on; "
        f"available columns: {sample_group.columns.tolist()}"
    )

# Inner join (pandas default): only samples present in both tables are kept.
merged_data = species_by_sample.merge(sample_group, on="Sample")

# The modelling cells further down read this table under the name `data`, which
# no visible cell defines. Bind it here so the notebook runs on a fresh kernel;
# a copy keeps `merged_data` untouched if `data` is modified later.
data = merged_data.copy()

KeyError: 'Sample'

In [None]:
# Train and evaluate a Random Forest on `data`, then save feature importances.

# Encode labels for Random Forest (Control = 0, Disease = 1)
print("Encoding labels for Random Forest...")
# Build the label vector without overwriting data["Group"], so re-running this
# cell cannot re-encode already-encoded labels (which would turn every label
# into 0). "Disease" and an already-encoded 1 both map to 1; anything else to 0.
y = data["Group"].isin(["Disease", 1]).astype(int)

# Prepare features (X): every column except the identifier/metadata columns
X = data.drop(columns=["Sample", "Group", "Project", "Project_1"])

# Split data into training and testing sets (70% / 30%, fixed seed)
print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest classifier
print("Training Random Forest classifier...")
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate Random Forest classifier
print("Evaluating model performance...")
y_pred = model.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
accuracy = accuracy_score(y_test, y_pred)

print(f"ROC AUC Score: {roc_auc:.3f}")
print(f"Accuracy: {accuracy:.3f}")

# Save results
print("Saving feature importances...")
feature_importances = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

# to_csv does not create missing directories, so make sure "results/" exists
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
feature_importances.to_csv(results_dir / "feature_importances.csv", index=False)

print("All analyses complete. Results saved to the 'results/' directory.")


In [None]:
# NOTE(review): this cell repeats the modelling cell above; consider deleting
# it. It is written so that running it again gives the same result.

# Encode labels for Random Forest (Control = 0, Disease = 1)
print("Encoding labels for Random Forest...")
# Build the label vector without overwriting data["Group"]. An earlier cell may
# already have encoded that column to 0/1; re-applying the string comparison
# would then turn every label into 0. "Disease" and an already-encoded 1 both
# map to 1; anything else maps to 0.
y = data["Group"].isin(["Disease", 1]).astype(int)

# Prepare features (X): every column except the identifier/metadata columns
X = data.drop(columns=["Sample", "Group", "Project", "Project_1"])

# Split data into training and testing sets (70% / 30%, fixed seed)
print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest classifier
print("Training Random Forest classifier...")
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate Random Forest classifier
print("Evaluating model performance...")
y_pred = model.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
accuracy = accuracy_score(y_test, y_pred)

print(f"ROC AUC Score: {roc_auc:.3f}")
print(f"Accuracy: {accuracy:.3f}")

# Save results
print("Saving feature importances...")
feature_importances = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

# to_csv does not create missing directories, so make sure "results/" exists
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
feature_importances.to_csv(results_dir / "feature_importances.csv", index=False)

print("All analyses complete. Results saved to the 'results/' directory.")


In [None]:
# NOTE(review): this cell repeats the modelling cells above; consider deleting
# it. It is written so that running it again gives the same result.

# Encode labels for Random Forest (Control = 0, Disease = 1)
print("Encoding labels for Random Forest...")
# Build the label vector without overwriting data["Group"]. An earlier cell may
# already have encoded that column to 0/1; re-applying the string comparison
# would then turn every label into 0. "Disease" and an already-encoded 1 both
# map to 1; anything else maps to 0.
y = data["Group"].isin(["Disease", 1]).astype(int)

# Prepare features (X): every column except the identifier/metadata columns
X = data.drop(columns=["Sample", "Group", "Project", "Project_1"])

# Split data into training and testing sets (70% / 30%, fixed seed)
print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest classifier
print("Training Random Forest classifier...")
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate Random Forest classifier
print("Evaluating model performance...")
y_pred = model.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
accuracy = accuracy_score(y_test, y_pred)

print(f"ROC AUC Score: {roc_auc:.3f}")
print(f"Accuracy: {accuracy:.3f}")

# Save results
print("Saving feature importances...")
feature_importances = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

# to_csv does not create missing directories, so make sure "results/" exists
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
feature_importances.to_csv(results_dir / "feature_importances.csv", index=False)

print("All analyses complete. Results saved to the 'results/' directory.")
