# Model 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lars, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import scipy.stats as stats


# Load data
# The path is relative, so it is resolved against the current working
# directory of the running process rather than against this file's location.
file_path = '../matrix/otu_merged_data.csv'
# Read with pandas' default CSV settings; the code below selects columns by
# name, so the file is expected to start with a header row.
df = pd.read_csv(file_path)

# Function to map drug concentration
def map_drug_concentration(value):
    """Bucket a single drug concentration into a categorical label.

    Returns "none" for 0, "low" for 5 or 10, "high" for 50 or 100, and
    "unknown" for anything else (including NaN and None, which compare
    unequal to every listed level).
    """
    # Checked top to bottom, left to right; the first matching level wins.
    buckets = (
        ("none", (0,)),
        ("low", (5, 10)),
        ("high", (50, 100)),
    )
    for label, levels in buckets:
        if any(value == level for level in levels):
            return label
    return "unknown"


# Create 'Drug Set' column
# Every row gets one label per drug (via map_drug_concentration), joined with
# "_" in the order the drug columns are listed here.
drug_columns = [
    'amoxicillin',
    'oxytetracycline_dihydrate',
    'sulfadiazine',
    'trimethoprim',
    'tylosin_tartrate',
    'ciprofloxacin',
]
df['Drug Set'] = df[drug_columns].apply(
    lambda concentrations: '_'.join(map(map_drug_concentration, concentrations)),
    axis=1,
)

# Filter data to only include specific bacterial families
# NOTE(review): the column names carry an "o__" prefix, which usually marks a
# taxonomic order rather than a family - confirm the intended rank.
bacterial_families = [
    "o__Bacillales;",
    "o__Lactobacillales;",
    "o__Enterobacteriales;",
    "o__Burkholderiales;",
    "o__Actinomycetales;",
    "o__Aeromonadales;",
    "o__Pseudomonadales;",
]
# Keep only the sample metadata, the derived 'Drug Set' label and the
# selected abundance columns.
df_filtered = df[['SampleID', 'Group', 'Isolation_source', 'Drug Set', *bacterial_families]]

# Prepare training and test sets
# Rows from groups G1-G3 are used for fitting; group G4 is held out for
# evaluation. The boolean-mask selections are copied explicitly so that the
# column assignments below write to independent frames instead of to slices
# of df_filtered (which triggers SettingWithCopyWarning and is not guaranteed
# to take effect).
train_df = df_filtered[df_filtered['Group'].isin(['G1', 'G2', 'G3'])].copy()
test_df = df_filtered[df_filtered['Group'] == 'G4'].copy()

# Label encode the categorical features
# Each encoder is fitted on the training rows only and stored in
# label_encoders, keyed by column name, so the same mapping can be reused.
# NOTE: LabelEncoder.transform raises ValueError for a category that was not
# seen during fit, so every category in the test rows must also occur in the
# training rows.
label_encoders = {}
for col in ['Isolation_source', 'Drug Set']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

# Define features and target variables
# The two label-encoded columns are the predictors; every column listed in
# bacterial_families is a separate regression target.
feature_columns = ['Isolation_source', 'Drug Set']
X_train, y_train = train_df[feature_columns], train_df[bacterial_families]
X_test, y_test = test_df[feature_columns], test_df[bacterial_families]

# Standardize the features
# The scaling statistics are learned from the training features only and are
# then applied unchanged to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
# Maps a display name to an estimator; all estimators use their default
# hyper-parameters. The random forest is the only stochastic model here, so
# it is seeded to make the reported metrics reproducible between runs.
models = {
    'Least Angle Regression': Lars(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Lasso': Lasso(),
    'Elastic-Net': ElasticNet(),
}

# Initialize evaluation metrics dictionary
# Layout: evaluation_metrics[target column][model name] -> dict of metrics.
evaluation_metrics = {}

# Use Leave-One-Out Cross-Validation
# The splitter holds no state, so a single instance is shared by every
# target/model combination instead of being rebuilt inside the loops.
loo = LeaveOneOut()

# Train and evaluate models
for family in bacterial_families:
    evaluation_metrics[family] = {}
    for model_name, model in models.items():
        # With leave-one-out every fold scores exactly one held-out sample,
        # so each negated score is that sample's squared prediction error.
        squared_errors = -cross_val_score(
            model,
            X_train_scaled,
            y_train[family],
            cv=loo,
            scoring='neg_mean_squared_error',
        )
        # Per-sample absolute errors (root of a single squared error).
        rmse_scores = np.sqrt(squared_errors)

        # RMSE is the root of the *mean* squared error. Averaging the
        # per-fold roots would yield the mean absolute error instead, which
        # is not comparable with the test RMSE computed below.
        rmse_mean = np.sqrt(np.mean(squared_errors))
        # NOTE: this is the standard error of the per-sample absolute errors
        # across folds, not a standard error of the pooled RMSE itself.
        rmse_std_error = stats.sem(rmse_scores)

        # Fit on the entire training set and evaluate on the test set
        # (cross_val_score works on clones, so this is the first fit of the
        # shared estimator for the current target; it is refitted per target).
        model.fit(X_train_scaled, y_train[family])
        y_pred = model.predict(X_test_scaled)
        test_rmse = np.sqrt(mean_squared_error(y_test[family], y_pred))

        # Store evaluation metrics
        evaluation_metrics[family][model_name] = {
            'Train RMSE': rmse_mean,
            'Train Standard Error': rmse_std_error,
            'Test RMSE': test_rmse
        }


In [None]:
# Display the collected metrics (nested dict: target column -> model name -> metrics).
evaluation_metrics