In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim



In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [5]:
microbiome_data = pd.read_csv('genera.counts.tsv',delimiter='\t')
print(microbiome_data.head())


   Age        BMI       Stage  \
0   57  26.880952  Stage_I_II   
1   65  26.562500     Healthy   
2   40  25.000000     Healthy   
3   67  20.173253     Healthy   
4   77  24.464602     Healthy   

   d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Monoglobales;f__Firm-18;g__UBA1775  \
0                                                  0                                 
1                                                  0                                 
2                                                  0                                 
3                                                  0                                 
4                                                  0                                 

   d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Peptostreptococcales;f__Acidaminobacteraceae;g__Fusibacter_A  \
0                                                 47                                                           
1                                                162        

In [None]:
print(microbiome_data.shape)

(346, 11945)


In [7]:
microbiome_data['Stage'] = microbiome_data['Stage'].apply(lambda x: 0 if x == 'Healthy' else 1)

scaler = StandardScaler()
microbiome_data_scaled = scaler.fit_transform(microbiome_data)

In [8]:
stage = microbiome_data['Stage'].values
print(stage)
full_data = pd.DataFrame(microbiome_data_scaled, columns=microbiome_data.columns)
full_data['Stage'] = stage
print(full_data.head())

[1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0
 1 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 1 1 0 1 0 1
 1 1 1 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0
 1 1 0 0 1 1 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1
 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0
 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0
 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 1
 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 0 0 1 1 1 1
 1 1 0 1 1 1 1 0 0 1 1 0 1]
        Age       BMI  Stage  \
0 -0.605229  1.209710      1   
1  0.173968  1.109950      0   
2 -2.261022  0.620475      0   
3  0.368767 -0.891571      0   
4  1.342763  0.452754      0   

   d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Monoglobales;f__Firm-18;g__UBA1775  \
0                 

In [9]:
X = full_data.drop('Stage', axis = 1)
y = full_data['Stage']

In [10]:
print(len(set(y)))

2


In [11]:
# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X,  # Features
    y,  # Target
    test_size=0.2, random_state=42
)

In [12]:
X_train,y_train = torch.tensor(X_train.values,dtype = torch.float), torch.tensor(y_train.values,dtype = torch.int)
X_test, y_test = torch.tensor(X_test.values,dtype = torch.float), torch.tensor(y_test.values,dtype = torch.int)



In [13]:
print(X_train)

tensor([[ 0.6610,  0.7886, -0.2589,  ...,  0.0000,  0.0000, -0.0538],
        [ 0.0766, -0.5839, -0.2589,  ...,  0.0000,  0.0000, -0.0538],
        [ 0.6610, -0.3483, -0.2589,  ...,  0.0000,  0.0000, -0.0538],
        ...,
        [-1.5792, -0.4906, -0.2589,  ...,  0.0000,  0.0000, -0.0538],
        [ 0.5636,  0.3952, -0.2589,  ...,  0.0000,  0.0000, -0.0538],
        [-0.5078,  3.6932, -0.2589,  ...,  0.0000,  0.0000, -0.0538]])


In [14]:
train_data = TensorDataset(X_train,y_train)
test_data = TensorDataset(X_test,y_test)
train_loader = DataLoader(train_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size = 32)

In [15]:
model = nn.Sequential(
    nn.LazyLinear(1024, bias = True),
    nn.ReLU(),
    nn.Dropout(0.25),
    nn.LazyLinear(2),
    nn.Softmax(-1)
)

model.to(device)


Sequential(
  (0): LazyLinear(in_features=0, out_features=1024, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.25, inplace=False)
  (3): LazyLinear(in_features=0, out_features=2, bias=True)
  (4): Softmax(dim=-1)
)

In [16]:
learning_rate = 0.001
num_epochs = 20

In [18]:
# Training loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Move data to the device (e.g., GPU)
        inputs = inputs.to(device)
        labels = labels.to(device).long()  # Convert labels to LongTensor

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if (i+1) % 10 == 0:  # Print every 10 batches
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

print("Training complete!")

# Save the model checkpoint
torch.save(model.state_dict(), 'model.pth')


Training complete!


In [22]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():  # Disable gradient computation
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)  # Forward pass
        _, predicted = torch.max(outputs, 1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_preds, all_labels)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.59


In [None]:
# Initialize the logistic regression model
model = SVC()

# Fit the model on the training data
model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{cm}')

# ROC Curve and AUC
auc_score = roc_auc_score(y_test, y_prob)
print(f'AUC: {auc_score:.4f}')

# Plot ROC
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
X_train, X_test, y_train, y_test = train_test_split( full_data.drop('Stage', axis=1),
    full_data['Stage'], test_size=0.2, random_state=42)
print(np.unique(y_train))


In [None]:
#Check model agaisnt mutiple solvers
solvers = [
    ('SAG', LogisticRegression(penalty='l2', solver='sag', max_iter=1000, random_state=42)),
    ('SAGA', LogisticRegression(penalty='l1', solver='saga', max_iter=1000, random_state=42)),
    ('lbfgs', LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, random_state=42)),
    ('liblinear', LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000, random_state=42)),
    ('SGD', SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)),
    ('Passive-Aggressive', PassiveAggressiveClassifier(max_iter=1000, random_state=42)),
    ('Perceptron', Perceptron(max_iter=1000, random_state=42))
]
results = []
for name, model in solvers:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_prob)
    else:
        auc_score = np.nan

    results.append((name, accuracy, auc_score))

results_df = pd.DataFrame(results, columns=["Solver", "Accuracy", "AUC"])
print(results_df)


In [None]:
from sklearn.linear_model import LogisticRegressionCV
lasso_model = LogisticRegressionCV(
    Cs=10, penalty='l1', solver='SAGA', max_iter=10000, cv=5, random_state=42
)

lasso_model.fit(X_train, y_train)
selected_features = np.where(lasso_model.coef_ != 0)[1]
print(f'Selected Features: {selected_features}')

In [None]:
selected_feature_names = X_train.columns[selected_features]
X_train_selected = X_train[selected_feature_names]
X_test_selected = X_test[selected_feature_names]

print(f"Selected features: {selected_feature_names}")

In [None]:
final_model = LogisticRegression(solver='SAGA', penalty='l1', max_iter=10000)
final_model.fit(X_train_selected, y_train)

In [None]:
y_pred = final_model.predict(X_test_selected)
y_prob = final_model.predict_proba(X_test_selected)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, final_model.predict_proba(X_test_selected)[:, 1])
print(f"Accuracy: {accuracy}")

cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{cm}')

# ROC Curve and AUC
auc_score = roc_auc_score(y_test, y_prob)
print(f'AUC: {auc_score:.4f}')

# Plot ROC
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()