# Model 2: Optimal Feature Selection and Evaluation

This notebook demonstrates the process of identifying the best number of features for model performance. It includes:

- Data preprocessing and cleaning
- Feature selection and transformation
- Model training with various classifiers
- Evaluation of model performance to determine the optimal feature set

The goal is to achieve the highest accuracy in predicting antibiotic combinations by selecting the appropriate number of top features.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Function to map antibiotic concentration to categories
def map_concentration(value):
    """Translate a raw antibiotic concentration into a coarse dose label.

    Missing values and any concentration outside the recognised dose levels
    are reported as "unknown"; 0 is "none", 5 and 10 are "low", and 50 and
    100 are "high".
    """
    if pd.isna(value):
        return "unknown"

    # Each entry pairs the accepted concentrations with the label they map to
    dose_levels = (
        ((0,), "none"),
        ((5, 10), "low"),
        ((50, 100), "high"),
    )
    for doses, label in dose_levels:
        if value in doses:
            return label
    return "unknown"

# Load and preprocess data
def load_and_preprocess_data(filepath):
    """Read the merged CSV table and derive the antibiotic-combination label.

    Every antibiotic concentration column is replaced by the category that
    ``map_concentration`` assigns to it, and the six categories of a row are
    joined with "_" into a new ``antibiotic_combination`` column.  Rows that
    still contain a missing value in any column are dropped from the result.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The preprocessed DataFrame without rows containing missing values.
    """
    antibiotics = [
        'amoxicillin',
        'oxytetracycline_dihydrate',
        'sulfadiazine',
        'trimethoprim',
        'tylosin_tartrate',
        'ciprofloxacin',
    ]

    def join_levels(row):
        # Concatenate the per-antibiotic categories of one sample
        return '_'.join(row)

    table = pd.read_csv(filepath)
    for column in antibiotics:
        table[column] = table[column].apply(map_concentration)

    table['antibiotic_combination'] = table[antibiotics].apply(join_levels, axis=1)
    complete_rows = table.dropna()
    return complete_rows

# Evaluate models and select the best number of features
def evaluate_models(df_cleaned, feature_columns, target_column):
    """Score each candidate feature count by its best hold-out accuracy.

    The feature matrix is built from ``feature_columns`` plus the
    ``Isolation_source`` and ``Group`` columns (one-hot encoded); the target
    is the integer category code of ``target_column``.  The data is split
    once into train/test (80/20, ``random_state=42``).  For every ``k`` in
    10, 20, ..., 100 the chi-squared selector and the scaler are fitted on
    the training rows only, then four classifiers are trained and scored on
    the held-out rows.

    Args:
        df_cleaned: DataFrame holding the feature, metadata and target columns.
        feature_columns: Names of the feature columns to use (the chi-squared
            test requires non-negative values).
        target_column: Name of the column containing the class labels.

    Returns:
        dict mapping each evaluated ``k`` to the highest test accuracy reached
        by any of the classifiers with that many features.  Values of ``k``
        larger than the number of available columns are not evaluated.
    """
    X = pd.concat([df_cleaned[feature_columns], df_cleaned[['Isolation_source', 'Group']]], axis=1)
    X = pd.get_dummies(X)
    y = df_cleaned[target_column].astype('category').cat.codes

    models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "SVM": SVC(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(random_state=42)
    }

    # Split BEFORE fitting anything: selecting features on the full data set
    # would let the test rows influence which features are kept (leakage).
    # The split does not depend on k, so it is computed once.
    X_train_full, X_test_full, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    n_features = X.shape[1]

    accuracy_scores = {}
    for k in range(10, 101, 10):
        # SelectKBest rejects k larger than the number of columns
        if k > n_features:
            break

        selector = SelectKBest(chi2, k=k)
        X_train = selector.fit_transform(X_train_full, y_train)
        X_test = selector.transform(X_test_full)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model_accuracies = []
        for model in models.values():
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            model_accuracies.append(accuracy_score(y_test, y_pred))

        # Keep only the best classifier's accuracy for this feature count
        accuracy_scores[k] = max(model_accuracies)

    return accuracy_scores

# Main execution
file_path = '../matrix/otu_merged_data.csv'
df_cleaned = load_and_preprocess_data(file_path)

# Feature columns are the ones whose name carries the 'o__' prefix
feature_columns = list(filter(lambda name: name.startswith('o__'), df_cleaned.columns))
accuracy_scores = evaluate_models(df_cleaned, feature_columns, 'antibiotic_combination')

# Bar chart of the best accuracy obtained for each feature count
feature_counts = list(accuracy_scores)
best_accuracies = [accuracy_scores[count] for count in feature_counts]

plt.figure(figsize=(12, 6))
plt.bar(feature_counts, best_accuracies, color='lightblue')
plt.xticks(range(10, 101, 10))
plt.xlabel('Number of Top Features')
plt.ylabel('Accuracy')
plt.title('Model Accuracy by Number of Features')
plt.show()


In [None]:
# Feature count with the highest accuracy (the first one wins on ties)
best_k = max(accuracy_scores.items(), key=lambda item: item[1])[0]
best_k

# Draw the confusion matrix for the given input

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import warnings

# Silence every warning raised from here on; note that this also hides
# warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Read the data from the CSV file (``pd`` is not imported in this cell; it
# comes from the import in the first code cell)
df = pd.read_csv('../matrix/otu_merged_data.csv')

# Function to map concentration values to categories
def map_concentration(value):
    """Return the dose category for one antibiotic concentration value.

    The result is "none" for 0, "low" for 5 or 10, "high" for 50 or 100 and
    "unknown" for missing values or any other concentration.
    """
    label = "unknown"
    if not pd.isna(value):
        if value == 0:
            label = "none"
        elif value == 5 or value == 10:
            label = "low"
        elif value == 50 or value == 100:
            label = "high"
    return label

# Apply concentration mapping to antibiotic columns
antibiotics = [
    'amoxicillin',
    'oxytetracycline_dihydrate',
    'sulfadiazine',
    'trimethoprim',
    'tylosin_tartrate',
    'ciprofloxacin',
]
# Categorise all antibiotic columns in a single column-wise pass
df[antibiotics] = df[antibiotics].apply(lambda column: column.apply(map_concentration))

# Function to map antibiotic concentrations to set names
def map_to_set(row):
    """Name the treatment set described by a row of concentration categories.

    Returns 'Unknown' when every category in ``row`` is 'unknown', the set
    name ('Set 1' .. 'Set 6' or 'Control') when the "_"-joined categories
    match a known combination, and 'Other' for every remaining combination.
    """
    if not any(level != 'unknown' for level in row):
        return 'Unknown'

    named_sets = dict([
        ('high_high_high_high_high_high', 'Set 1'),
        ('high_high_high_none_none_none', 'Set 2'),
        ('high_none_none_none_none_none', 'Set 3'),
        ('low_low_low_low_low_low', 'Set 4'),
        ('low_low_low_none_none_none', 'Set 5'),
        ('low_none_none_none_none_none', 'Set 6'),
        ('none_none_none_none_none_none', 'Control'),
    ])
    combination = '_'.join(row)
    if combination in named_sets:
        return named_sets[combination]
    return 'Other'

# Apply set mapping to create a new 'set_name' column
df['set_name'] = df[antibiotics].apply(map_to_set, axis=1)

# Drop rows with missing values
df_cleaned = df.dropna()

# Select feature columns starting with 'o__'
feature_columns = [col for col in df_cleaned.columns if col.startswith('o__')]

# Combine them with the one-hot encoded 'Isolation_source' and 'Group' columns
X = pd.concat([df_cleaned[feature_columns], df_cleaned[['Isolation_source', 'Group']]], axis=1)
X = pd.get_dummies(X, columns=['Isolation_source', 'Group'])

# Target variable, encoded as integer class ids
y = df_cleaned['set_name']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Local import: unlike scikit-learn's Pipeline, the imblearn one accepts a
# sampler step and applies it while fitting only (never when predicting).
from imblearn.pipeline import Pipeline as ImbPipeline

# Define the models to be evaluated (seeded so the figure is reproducible)
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42)
}

# Define the parameter grids for grid search
param_grids = {
    "Random Forest": {'n_estimators': [100, 200], 'max_depth': [10, 20]},
    "SVM": {'C': [1, 10], 'kernel': ['rbf', 'linear']},
    "Decision Tree": {'max_depth': [5, 10]},
    "Logistic Regression": {'C': [1, 10]}
}

# Create subplots for each model
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
axes = axes.flatten()

# Perform grid search and cross-validation for each model
for ax, (name, model) in zip(axes, models.items()):
    # Feature selection, scaling and oversampling live INSIDE the pipeline so
    # that each CV split fits them on its training folds only.  Doing these
    # steps on the whole data set first would put copies of the same sample
    # (and test-fold statistics) on both sides of every split.
    # ``best_k`` is the value computed in the previous cell.
    pipeline = ImbPipeline([
        ('select', SelectKBest(chi2, k=best_k)),
        ('scale', StandardScaler()),
        ('oversample', RandomOverSampler(random_state=42)),
        ('model', model),
    ])
    # Route the classifier hyper-parameters to the pipeline's 'model' step
    pipeline_grid = {f'model__{param}': values for param, values in param_grids[name].items()}

    grid_search = GridSearchCV(pipeline, pipeline_grid, cv=5, scoring='roc_auc_ovr')
    grid_search.fit(X, y_encoded)
    best_model = grid_search.best_estimator_

    # Out-of-fold predictions for the original (non-resampled) samples
    y_pred = cross_val_predict(best_model, X, y_encoded, cv=5)
    y_true_labels = label_encoder.inverse_transform(y_encoded)
    y_pred_labels = label_encoder.inverse_transform(y_pred)
    cm = confusion_matrix(y_true_labels, y_pred_labels, labels=label_encoder.classes_)

    # Row-normalise so every true class sums to 1; annotate count and share
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    annot_array = np.vectorize(lambda count, share: f'{count}\n({share:.2%})')(cm, cm_normalized)

    sns.heatmap(cm_normalized, annot=annot_array, fmt="", cmap='Blues', ax=ax)
    ax.set_title(f'Normalized Confusion Matrix for {name}', fontsize=14)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_xticklabels(labels=label_encoder.classes_, rotation=45)
    ax.set_yticklabels(labels=label_encoder.classes_, rotation=45)

plt.tight_layout()
plt.savefig('confusion_matrices.pdf', format='pdf', dpi=300)


# Draw the ROC plot on the test set

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import warnings

# Silence every warning raised from here on; note that this also hides
# warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

# NOTE(review): hard-coded feature count. It replaces whatever ``best_k`` an
# earlier cell computed - confirm that 40 is the intended value.
best_k=40
# Reload the raw table (``pd`` comes from the import in the first code cell)
df = pd.read_csv('../matrix/otu_merged_data.csv') 

# Function to map concentration values to categories
def map_concentration(value):
    """Classify one antibiotic concentration as none / low / high / unknown.

    Missing values are "unknown", 0 is "none", 5 and 10 are "low", 50 and 100
    are "high", and every other concentration is "unknown".
    """
    if pd.isna(value):
        return "unknown"
    if value == 0:
        return "none"
    if value in (5, 10):
        return "low"
    return "high" if value in (50, 100) else "unknown"

antibiotics = [
    'amoxicillin',
    'oxytetracycline_dihydrate',
    'sulfadiazine',
    'trimethoprim',
    'tylosin_tartrate',
    'ciprofloxacin',
]
# Replace each antibiotic column by its dose category
df = df.assign(**{column: df[column].apply(map_concentration) for column in antibiotics})

# Function to map antibiotic concentrations to set names
def map_to_set(row):
    """Name the treatment set described by a row of concentration categories.

    The categories are joined with "_" and looked up among the known
    combinations ('Set 1' .. 'Set 6' and 'Control'); any combination that is
    not listed yields 'Other'.
    """
    combination = '_'.join(row)
    named_sets = {
        'high_high_high_high_high_high': 'Set 1',
        'high_high_high_none_none_none': 'Set 2',
        'high_none_none_none_none_none': 'Set 3',
        'low_low_low_low_low_low': 'Set 4',
        'low_low_low_none_none_none': 'Set 5',
        'low_none_none_none_none_none': 'Set 6',
        'none_none_none_none_none_none': 'Control',
    }
    try:
        return named_sets[combination]
    except KeyError:
        return 'Other'

df['set_name'] = df[antibiotics].apply(map_to_set, axis=1)

df_cleaned = df.dropna()

# Selecting feature columns
feature_columns = [col for col in df_cleaned.columns if col.startswith('o__')]
X = df_cleaned[feature_columns]
X = pd.concat([X, df_cleaned[['Isolation_source', 'Group']]], axis=1)
X = pd.get_dummies(X, columns=['Isolation_source', 'Group'])

scaler = StandardScaler()

models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42, probability=True),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42)
}

param_grids = {
    "Random Forest": {'n_estimators': [100, 200], 'max_depth': [10, 20]},
    "SVM": {'C': [1, 10], 'kernel': ['rbf', 'linear']},
    "Decision Tree": {'max_depth': [5, 10]},
    "Logistic Regression": {'C': [1, 10]}
}

sets = ['Set 1', 'Set 2', 'Set 3', 'Set 4', 'Set 5', 'Set 6', 'Control']

# Local import: unlike scikit-learn's Pipeline, the imblearn one accepts a
# sampler step and applies it while fitting only (never when predicting).
from imblearn.pipeline import Pipeline as ImbPipeline

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.ravel()

# One subplot per treatment set (the control group gets no panel of its own)
treatment_sets = [candidate for candidate in sets if candidate != 'Control']
for ax, set_name in zip(axes, treatment_sets):
    # Binary one-vs-rest target: 1 for the current set, 0 for all other rows
    y_binary = (df_cleaned['set_name'] == set_name).astype(int)

    # Hold out the test rows BEFORE any fitting or resampling, so that no
    # oversampled copy of a test sample can end up in the training data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_binary, test_size=0.3, stratify=y_binary, random_state=42
    )

    for name, model in models.items():
        # Selection, scaling and oversampling are fitted on training data only
        # (GridSearchCV clones the pipeline, so the shared scaler is not mutated)
        pipeline = ImbPipeline([
            ('select', SelectKBest(chi2, k=best_k)),
            ('scale', scaler),
            ('oversample', RandomOverSampler(random_state=42)),
            ('model', model),
        ])
        # Route the classifier hyper-parameters to the pipeline's 'model' step
        pipeline_grid = {f'model__{param}': values for param, values in param_grids[name].items()}

        grid_search = GridSearchCV(pipeline, pipeline_grid, cv=5, scoring='roc_auc')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        # Score the model that was fitted on the training split against the
        # untouched test split (previously new models were cross-validated on
        # the test split itself, so the tuned model was never evaluated).
        y_probas = best_model.predict_proba(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_probas[:, 1])
        roc_auc = auc(fpr, tpr)

        ax.plot(fpr, tpr, label=f'{name} (area = {roc_auc:.2f})')

    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    # The negative class is every other row, not only the control group, so
    # the panel is labelled one-vs-rest.  To compare against the control group
    # alone, restrict the rows to [set_name, 'Control'] before splitting.
    ax.set_title(f'ROC Curves for {set_name} vs Rest')
    ax.legend(loc="lower right")

plt.tight_layout()
plt.show()
