In [None]:
# Import necessary libraries
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Assuming the previous notebook is called 'Model_Training_Assessment.ipynb'
%run 'Model_Training_Assessment.ipynb'

In [None]:
# Assuming 'kmer_matrix_dense' (and ideally 'true_labels') are available from the
# previously run notebook.
X = kmer_matrix_dense  # Feature matrix

# Use the real labels when they exist. Otherwise fall back to placeholder binary labels
# drawn from a *seeded* generator, so that a Restart & Run All at least reproduces the
# same numbers. Accuracies computed on placeholder labels are chance-level and say
# nothing about the models.
if 'true_labels' in globals():
    y = np.asarray(true_labels)
else:
    warnings.warn(
        "'true_labels' is not defined; falling back to seeded random placeholder labels. "
        "Reported accuracies are meaningless until real labels are supplied."
    )
    y = np.random.default_rng(42).integers(0, 2, size=X.shape[0])

# Split the data into train (60%), validation (20%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize the data (important for many models).
# The scaler is fitted on the training split only and then applied to the other
# splits, so no statistics from validation/test data leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:
# Function to measure training time and accuracy for different models
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, time the fit, and score its predictions on the test data.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the labels the
        predictions are compared against.

    Returns
    -------
    tuple
        ``(training_time, accuracy)``: the seconds spent inside ``model.fit`` and
        the value returned by ``accuracy_score(y_test, y_pred)``.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() follows the wall clock, which can be adjusted mid-run
    # and would then yield a wrong (even negative) duration.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Prediction happens outside the timed region: only training is measured here.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return training_time, accuracy

# Models to evaluate
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    # The wrapped estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old keyword was
    # removed in 1.4, whereas the positional form works before and after the rename.
    ('Bagging', BaggingClassifier(RandomForestClassifier(), n_estimators=50, random_state=42)),
]

# Store results (one record per model)
results = []

# Evaluate each model: fit on the training split, score on the test split
for name, model in models:
    training_time, accuracy = train_and_evaluate(model, X_train, y_train, X_test, y_test)
    results.append({'Model': name, 'Training Time (s)': training_time, 'Accuracy': accuracy})

In [None]:
# Tabulate the per-model records collected above and show the table
results_df = pd.DataFrame(results)
print(results_df)

# Scatter each model's training time against its accuracy and label every point
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(results_df['Training Time (s)'], results_df['Accuracy'], color='blue')
ax.set_title('Training Time vs Accuracy for Different Models')
ax.set_xlabel('Training Time (seconds)')
ax.set_ylabel('Accuracy')
point_labels = zip(results_df['Model'], results_df['Training Time (s)'], results_df['Accuracy'])
for model_name, seconds, score in point_labels:
    ax.annotate(model_name, (seconds, score))
ax.grid(True)
plt.show()

In [None]:
# 2. Time vs Hyperparameters (here: n_estimators for Random Forest)
# We'll use Random Forest to demonstrate this

# Try different configurations of RandomForest (different n_estimators)
n_estimators_range = [10, 50, 100, 200, 500]
training_times = []
accuracies = []

for n_estimators in n_estimators_range:
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    training_time, accuracy = train_and_evaluate(rf, X_train, y_train, X_test, y_test)
    training_times.append(training_time)
    accuracies.append(accuracy)

# Plot training time and accuracy against n_estimators.
# The two quantities live on different scales (seconds vs. a 0-1 fraction), so each
# gets its own y-axis; on a shared axis the accuracy curve is squashed flat as soon
# as training takes more than about a second.
fig, ax_time = plt.subplots(figsize=(8, 6))
ax_acc = ax_time.twinx()

time_line, = ax_time.plot(n_estimators_range, training_times,
                          color='tab:blue', marker='o', label='Training Time (s)')
acc_line, = ax_acc.plot(n_estimators_range, accuracies,
                        color='tab:orange', marker='o', label='Accuracy')

ax_time.set_title('Time vs Accuracy for Random Forest (Different n_estimators)')
ax_time.set_xlabel('Number of Estimators')
ax_time.set_ylabel('Training Time (s)')
ax_acc.set_ylabel('Accuracy')
ax_time.grid(True)

# One combined legend, attached to the top-most axes so neither curve is drawn over it
ax_acc.legend(handles=[time_line, acc_line], loc='best')
plt.show()

In [None]:
# 3. Deployment Time: Measure prediction time (inference time)
def measure_inference_time(model, X_test):
    """Return the seconds taken by a single ``model.predict(X_test)`` call.

    Parameters
    ----------
    model : object
        Already-fitted estimator exposing ``predict(X)``.
    X_test
        Samples to predict on; the predictions themselves are discarded.

    Returns
    -------
    float
        Elapsed time of the one ``predict`` call, in seconds.
    """
    # perf_counter() is monotonic and high-resolution, unlike the wall clock
    # time.time(), which may be adjusted while the measurement is running.
    start_time = time.perf_counter()
    model.predict(X_test)  # Make predictions
    return time.perf_counter() - start_time

# Measure inference time for each model
# Every estimator in `models` was already fitted on (X_train, y_train) when it was
# evaluated in the training-time comparison, so it is timed as-is here. Refitting
# would only repeat the most expensive step of the notebook without changing the
# measured prediction time.
inference_times = []

for name, model in models:
    inference_time = measure_inference_time(model, X_test)  # Measure inference time
    inference_times.append({'Model': name, 'Inference Time (s)': inference_time})

# Create DataFrame for inference times
inference_times_df = pd.DataFrame(inference_times)
print(inference_times_df)

In [None]:
# 4. Summarize findings
# Here, you will summarize your results:
# - Discuss how training time varies across models and configurations.
# - Discuss how the number of estimators or model complexity affects training time and accuracy.
# - Discuss the deployment time and how long predictions take in real-world scenarios.

# Summary (expected trends -- confirm each one against results_df and inference_times_df before reporting):
# - Logistic Regression is expected to train faster but be less accurate than Random Forest and Bagging.
# - Random Forest accuracy typically improves with more estimators, at the cost of longer training time.
# - Bagging (which wraps Random Forest base estimators in this notebook) may improve on a single model but requires considerably more computation.
# - Inference time is relatively quick for most models, but larger ensembles (e.g., Random Forest, Bagging) will have higher prediction times.

# Persist both result tables as CSV files (index column omitted) for further review
for frame, csv_path in (
    (results_df, "model_training_comparison.csv"),
    (inference_times_df, "model_inference_times.csv"),
):
    frame.to_csv(csv_path, index=False)
