In [1]:
#import required libraries after installing the packages from the Autosklearn_installation_guide.ipynb file

import autosklearn
import joblib
import pandas as pd
import numpy as np
import warnings
import autosklearn.classification
import autosklearn.classification as classifier

In [2]:
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import (accuracy,
                                 f1,
                                 roc_auc,
                                 precision,
                                 average_precision,
                                 recall,
                                 log_loss,
                                 r2,
                                 mean_squared_error,
                                 mean_absolute_error,
                                 )
from sklearn.utils.fixes import _joblib_parallel_args
from sklearn.model_selection import train_test_split, StratifiedKFold

In [None]:
# Read the resistance phenotype (MIC) workbook; the first spreadsheet column becomes the row index.
resistance_data = pd.read_excel(
    io='PA_phenotypes_(MIC).xlsx',
    index_col=0,
)

resistance_data

In [None]:
# Read the first sheet of the expression workbook (first column -> row index) and
# immediately discard the 'PA14_1' and 'PA14_2' columns.
# NOTE(review): no rows are skipped on load (skiprows=None). If the workbook carries
# extra header rows that must be excluded, pass them via skiprows -- confirm.
expression = pd.read_excel(
    io='PA_Expression_data.xlsx',
    sheet_name=0,
    index_col=0,
    skiprows=None,
).drop(columns=['PA14_1', 'PA14_2'])

# Show the resulting frame
expression

In [None]:
# Workbook containing the feature set identified using GA.
# Annotated feature sets: follow Dataset EV6; iteration-specific feature sets: follow Dataset EV1.
# NOTE(review): 'Dataset_name' looks like a placeholder -- replace with the real file path.
file_path = 'Dataset_name'

# Gene names are read from column A of the chosen sheet, with no header row.
# Change sheet_name according to the antibiotic and feature set.
df = pd.read_excel(
    io=file_path,
    sheet_name=1,
    usecols="A",
    header=None,
)
df

In [None]:
# Gene names taken from the first column of the feature-set sheet
Log_reg_acc_genes = df.iloc[:, 0].to_list()

# Keep only the expression rows labelled with those genes (row order follows the gene list)
expression_red = expression.loc[Log_reg_acc_genes, :]
expression_red

In [None]:
# Transpose so the selected genes become columns and the original columns become rows.
# NOTE(review): this rebinds `expression`; re-running the gene-selection cell afterwards
# would index the transposed frame instead of the loaded one.
expression = expression_red.transpose()
expression

In [None]:
# Feature matrix: the transposed, gene-filtered expression table
X = expression
X

In [None]:
# Target table: resistance rows aligned to X's row labels
# (labels missing from resistance_data produce all-NaN rows)
y = resistance_data.reindex(index=X.index)
y

In [None]:
# Discard rows that have no CAZ value
y_nonan = y.dropna(axis=0, subset=["CAZ"])
y_nonan

In [None]:
# Keep only the feature rows whose label is still present after the CAZ filter
X_new = X.loc[X.index.isin(y_nonan.index)]
X_new

In [None]:
# Target vector for a single drug (CAZ), used to inspect fitting results for that drug
y_CAZ = y_nonan.loc[:, 'CAZ']
y_CAZ

In [None]:
# Two-bin histogram of the CAZ values to check how the MICs are distributed
hist, bins = np.histogram(a=y_CAZ, bins=2)

print("Histogram:", hist)   # counts per bin
print("Bins:", bins)        # bin edges

In [None]:
# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y_CAZ,
    test_size=0.2,
    stratify=y_CAZ,
    random_state=1,
)

# Report the shape of each partition, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

In [15]:
# Three-fold stratified CV splitter handed to auto-sklearn as its resampling strategy
skf = StratifiedKFold(n_splits=3)

# Search configuration. Per the auto-sklearn docs, time_left_for_this_task is in
# seconds and memory_limit in MB. max_models_on_disc is left at its default.
clf = AutoSklearnClassifier(
    time_left_for_this_task=18000,
    memory_limit=10240,
    resampling_strategy=skf,
    ensemble_kwargs={'ensemble_size': 3},
    metric=f1,
    scoring_functions=[
        roc_auc,
        average_precision,
        accuracy,
        f1,
        precision,
        recall,
        log_loss,
    ],
)

In [None]:
# Run the search on the training split; the held-out split is passed alongside it.
# NOTE(review): per the auto-sklearn docs X_test/y_test are only used to track test
# scores during the search -- confirm for the installed version.
clf.fit(
    X=X_train,
    y=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# sprint_statistics() returns a multi-line string; print it so the line breaks
# render instead of showing the repr with literal "\n" escapes.
print(clf.sprint_statistics())

In [None]:
# Tabulate the per-configuration search results, best mean test score first
df_cv_results = pd.DataFrame(clf.cv_results_)
df_cv_results = df_cv_results.sort_values('mean_test_score', ascending=False)
df_cv_results

In [None]:
# Persist the search results; you can name the file according to the no. of features and antibiotic
df_cv_results.to_excel(excel_writer="AutoML_clf_run_results.xlsx")

In [None]:
# Detailed leaderboard of every evaluated model, not just the ensemble members
clf.leaderboard(ensemble_only=False, detailed=True)

In [None]:
# Capture the detailed leaderboard (all evaluated models) as a DataFrame for export
df_cv_leaderboard = pd.DataFrame(
    data=clf.leaderboard(ensemble_only=False, detailed=True)
)
df_cv_leaderboard

In [None]:
# Persist the leaderboard; you can name the file according to the no. of features and antibiotic.
# The file name carries a single ".xlsx" extension (previously it was doubled).
df_cv_leaderboard.to_excel("AutoML_clf_run_leaderboard.xlsx")

In [None]:
# Serialise the fitted AutoML object to disk.
# You can name the file according to the no. of features and antibiotic.
joblib.dump(value=clf, filename='AutoML_clf_run_model.joblib')

In [None]:
# Restore the saved AutoML object from disk.
# joblib.load unpickles its input, so only load files you trust.
clf2 = joblib.load(filename='AutoML_clf_run_model.joblib')

In [None]:
# sprint_statistics() returns a multi-line string; print it so the line breaks
# render instead of showing the repr with literal "\n" escapes.
print(clf2.sprint_statistics())

In [None]:
# Call the method (the parentheses were missing) so the cell shows the estimator's
# parameter dictionary rather than the bound-method object.
clf2.get_params()

In [None]:
# Refit the models found during the search on the full training split.
# NOTE(review): the auto-sklearn docs describe refit as required before predict when
# the search used cross-validation -- confirm for the installed version.
clf2.refit(
    X=X_train,
    y=y_train,
)

In [None]:
import sklearn

In [None]:
# Import the metric explicitly: a bare `import sklearn` does not by itself guarantee
# that the `sklearn.metrics` submodule is loaded.
from sklearn.metrics import accuracy_score

# Predict on the held-out split and report plain accuracy
y_hat_test = clf2.predict(X_test)
print("Accuracy score", accuracy_score(y_test, y_hat_test))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 summary for the held-out predictions
print(classification_report(y_true=y_test, y_pred=y_hat_test))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels against predicted labels on the held-out split
cm = confusion_matrix(y_true=y_test, y_pred=y_hat_test)
cm

In [None]:
# Inspect every pipeline in the final ensemble: its weight, data preprocessing,
# feature preprocessor, classifier, and balancing strategy.
models = clf2.get_models_with_weights()

for weight, model in models:
    print("Weight:", weight)
    print("Model:", model)
    # Each component of the pipeline is looked up by its step name
    print("Preprocessing steps:", model.named_steps['data_preprocessor'])
    # The feature preprocessor is a separate step from data preprocessing;
    # guard the lookup in case a pipeline does not define it
    if 'feature_preprocessor' in model.named_steps:
        print("Feature preprocessor:", model.named_steps['feature_preprocessor'])
    print("Classifier:", model.named_steps['classifier'])
    # Balancing strategy (if any) will be part of preprocessing or classifier depending on the algorithm
    if 'balancing' in model.named_steps:
        print("Balancing strategy:", model.named_steps['balancing'])
    else:
        print("None.")