In [None]:
# Google Colab only: mount Google Drive into the runtime under /content/drive/
# so that files stored on Drive can be accessed from this notebook.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
cd "/content/drive/MyDrive/fypj_final_submission"

In [None]:
pip install -r requirements.txt

In [None]:
pip install pipelineprofiler

In [None]:
pip install auto-sklearn

In [None]:
import PipelineProfiler
import sklearn
import pickle
import matplotlib.pyplot as plt
import pandas as pd 
import pickle
import os

In [None]:
# load one model
def load_pickle_data(pickle_path):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    pickle_path : str or path-like
        Location of the pickle file to read.

    Returns
    -------
    object
        The object that was serialised into the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    pickle.UnpicklingError
        If the file content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code; only use this on files
    # that come from a trusted source.
    # The context manager guarantees the file handle is closed even when
    # unpickling fails (the previous one-liner never closed it).
    with open(pickle_path,'rb') as pickle_file:
        return pickle.load(pickle_file)

# List the saved models once, in sorted order, so that the indices printed in
# the menu are guaranteed to refer to the same entries when the selection is
# resolved (os.listdir returns entries in arbitrary order).
available_models = sorted(os.listdir("./saved_models"))
if not available_models:
    raise FileNotFoundError("No saved models were found in ./saved_models")

menu_str=""" Load what model? \n
"""
for index_count, k in enumerate(available_models):
    menu_str+=f"{index_count}. {k} \n"
print(menu_str)

# Keep prompting until the user enters an integer that is a valid menu index.
# Only ValueError is caught, so interrupting the kernel still stops the loop.
while True:
    raw_choice = input("Select a model to load (enter integer index): ").strip()
    try:
        selected_model_index = int(raw_choice)
    except ValueError:
        print("Please select a valid model index.")
        continue
    # Reject out-of-range values here instead of failing later with an
    # IndexError (or silently picking from the end for negative numbers).
    if 0 <= selected_model_index < len(available_models):
        break
    print("Please select a valid model index.")

model_name = available_models[selected_model_index]
print(f"Selected model to load: index = {selected_model_index}, model_name = {model_name}")
name_of_model = model_name
# SECURITY: unpickling can execute arbitrary code; only load trusted models.
with open(f"./saved_models/{model_name}/{model_name}.pkl","rb") as f:
    automl_classification = pickle.load(f)
print("Selected model has been loaded successfully!")

# load data
# Each saved model keeps its own train/test split next to the model pickle.
DATADIR = f"./saved_models/{model_name}/data/"
X_test = load_pickle_data(DATADIR+"X_test.pkl")
X_train = load_pickle_data(DATADIR+"X_train.pkl")
y_test = load_pickle_data(DATADIR+"y_test.pkl")
y_train = load_pickle_data(DATADIR+"y_train.pkl")
predictions = automl_classification.predict(X_test)
accu_ = sklearn.metrics.accuracy_score(y_test,predictions)
# `pos_label` is only honoured for average="binary"; with a weighted average
# scikit-learn ignores it and emits a warning, so it is not passed here.
f1_ = sklearn.metrics.f1_score(y_test,predictions,average="weighted")
print("Training and testing data has been loaded successfully!\n")
print(f"Resulting accuracy : {accu_} , Resulting f1 : {f1_}")



# EVALUATE PERFORMANCE
# ONLY USE ON DATA WITHOUT UNKNOWN CLASS
# The probability table and the precision/recall calls below are written for
# exactly two classes, with "Present" as the positive label.
try:
    print(f"Ensemble size = {automl_classification.ensemble_size}")
    data = PipelineProfiler.import_autosklearn(automl_classification)
    PipelineProfiler.plot_pipeline_matrix(data)

    # evaluate performance
    predictions_proba = automl_classification.predict_proba(X_test)
    predictions = automl_classification.predict(X_test)
    # training performance
    predictions_training = automl_classification.predict(X_train)
    # probability dataframe
    # NOTE(review): the column names assume predict_proba returns the classes
    # in the order (Absent, Present) — confirm against the model's classes.
    proba_df = pd.DataFrame(data=predictions_proba,columns=["Absent Probability","Present Probability"])
    proba_df["Predicted Label"] = predictions
    # NOTE(review): if y_test is a pandas Series with a non-default index,
    # this assignment aligns on index rather than on position — confirm.
    proba_df["Actual Label"] = y_test
    # accuracy and f1 score
    # `pos_label` is only honoured for average="binary"; with a weighted
    # average scikit-learn ignores it and emits a warning, so it is omitted.
    print("Ensemble Accuracy on testing set: ",sklearn.metrics.accuracy_score(y_test,predictions))
    print("Ensemble f1_score on testing set: ",sklearn.metrics.f1_score(y_test,predictions,average="weighted"),"\n")

    print("Ensemble Accuracy on training set: ",sklearn.metrics.accuracy_score(y_train,predictions_training))
    print("Ensemble f1_score on training set: ",sklearn.metrics.f1_score(y_train,predictions_training,average="weighted"),"\n")

    # precision and recall
    print("Ensemble Precision on testing set:",sklearn.metrics.precision_score(y_test,predictions,pos_label="Present"))
    print("Ensemble Recall on testing set:",sklearn.metrics.recall_score(y_test,predictions,pos_label="Present"),"\n")

    # The training-set precision used to be computed with recall_score, so
    # the "Precision" line simply repeated the recall value.
    print("Ensemble Precision on training set:",sklearn.metrics.precision_score(y_train,predictions_training,pos_label="Present"))
    print("Ensemble Recall on training set:",sklearn.metrics.recall_score(y_train,predictions_training,pos_label="Present"),"\n")

    print(proba_df)
    # 2x2 grid: confusion matrices on the top row, ROC curves on the bottom
    # row; test set on the left for ROC / top-left for the confusion matrix.
    # NOTE(review): plot_confusion_matrix / plot_roc_curve only exist in older
    # scikit-learn releases — keep the installed version in mind when
    # upgrading dependencies.
    fig,ax = plt.subplots(ncols=2,nrows=2,figsize=(15,10))
    sklearn.metrics.plot_confusion_matrix(X=X_test,y_true=y_test,estimator=automl_classification,ax=ax[0,0])
    ax[0,0].title.set_text("Confusion matrix of model on testing set")
    sklearn.metrics.plot_confusion_matrix(X=X_train,y_true=y_train,estimator=automl_classification,ax=ax[0,1])
    ax[0,1].title.set_text("Confusion matrix of model on training set")

    # show roc curve on both training and testing set
    sklearn.metrics.plot_roc_curve(automl_classification,X_test,y_test,ax=ax[1,0])
    ax[1,0].title.set_text("ROC Curve of model on testing set")
    sklearn.metrics.plot_roc_curve(automl_classification,X_train,y_train,ax=ax[1,1])
    ax[1,1].title.set_text("ROC Curve of model on training set")
except Exception as exc:
    # Best-effort cell: the evaluation is expected to fail on data that
    # contains the unknown class. The real exception is printed as well so
    # that unrelated failures are not mistaken for that case, and
    # KeyboardInterrupt is no longer swallowed.
    print("Data contains unknown class. Cannot display ROC Curve")
    print(f"Underlying error: {type(exc).__name__}: {exc}")


In [None]:
# load , evaluate and produce dataframe
# Every model under ./saved_models is scored on its own stored test split.
# Results are collected in a plain list and turned into a DataFrame once at
# the end, instead of growing the DataFrame one row at a time.
evaluation_rows = []
# Sorted so the rows of the report come out in a reproducible order
# (os.listdir returns entries in arbitrary order).
for k in sorted(os.listdir("./saved_models")):
    model_name = k
    model_path = f"./saved_models/{model_name}/{model_name}.pkl"
    # Entries without a model pickle (stray files, checkpoint folders, ...)
    # are reported and skipped rather than aborting the whole evaluation.
    if not os.path.isfile(model_path):
        print(f"Skipping {model_name}: no model file found at {model_path}.")
        continue
    # SECURITY: unpickling can execute arbitrary code; only load trusted models.
    with open(model_path,"rb") as f:
        automl_classification = pickle.load(f)
    # load data
    DATADIR = f"./saved_models/{model_name}/data/"
    X_test = load_pickle_data(DATADIR+"X_test.pkl")
    X_train = load_pickle_data(DATADIR+"X_train.pkl")
    y_test = load_pickle_data(DATADIR+"y_test.pkl")
    y_train = load_pickle_data(DATADIR+"y_train.pkl")

    predictions = automl_classification.predict(X_test)
    # `pos_label` is only honoured for average="binary"; with a weighted
    # average scikit-learn ignores it and emits a warning, so it is omitted.
    resulting_f1=sklearn.metrics.f1_score(y_test,predictions,average="weighted")
    resulting_acc=sklearn.metrics.accuracy_score(y_test,predictions)
    evaluation_rows.append([k, resulting_acc, resulting_f1])
    print(f"Evaluation of model {model_name} successful.")

evaluate_model_df = pd.DataFrame(data=evaluation_rows,columns=["model_name","accuracy","f1"])
print(evaluate_model_df)
evaluate_model_df.to_excel("./f1_score_accuracy_model_results.xlsx",index=False)