In [None]:
# purely for google colab
# Mounts Google Drive at /content/drive/ so that the project folder entered in
# the next cell (under /content/drive/MyDrive/) is reachable from the runtime.
# NOTE(review): google.colab is presumably only importable inside Colab —
# skip this cell when running the notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
cd "/content/drive/MyDrive/fypj_final_submission"

In [None]:
pip install -r requirements.txt

In [None]:
pip install pipelineprofiler

In [None]:
pip install auto-sklearn

In [None]:
import autosklearn 
import autosklearn.classification
import autosklearn.metrics 
import PipelineProfiler
import sklearn
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd 
import numpy as np

In [None]:
# load data
# Directory (relative to the working directory) holding the pickled train/test splits.
DATADIR ="./1.0.1/pickled_data/"
menu_str = """
Select a model data to load, what has been done to the data is described in parenthesis \n
1. Load final_model_v2 (Data is balanced, Data augmentation has been applied, Data scarcity is addressed, noise reduction is applied , only recordings were murmur is audbile is considered, unknown data is removed) \n
2. Load final_model_v1 (Data is balanced, Data augmentation has been applied, Data scarcity is addressed, noise reduction is applied, unknown data is removed) \n
3. Load model_imbalanced_noaug (noise reduction is applied, unknown data is removed) \n
4. Load model_imbalanced_noaug_unknown (noise reduction is applied) \n
5. Load model_imbalanced_noaug_unknown_noisy \n
-1. Load, train and evaluate all
"""
print(menu_str)
raw_choice = input("Select model data to load (Input integer index):")
try:
    index_data = int(raw_choice)
except ValueError:
    # Same exception type as a bare int(...) would raise, but with a message
    # that tells the user what was expected.
    raise ValueError(f"Expected an integer menu index, got {raw_choice!r}.") from None
# Reject indices that are not on the menu here, before any long-running
# training starts, instead of failing later with an unrelated error.
if index_data not in (-1, 1, 2, 3, 4, 5):
    raise ValueError(f"Menu index must be one of -1, 1, 2, 3, 4, 5; got {index_data}.")
# mode_num == -1 marks the "train and evaluate all" batch mode, in which the
# detailed per-model evaluation report is skipped; 0 means a single-dataset run.
mode_num = -1 if index_data == -1 else 0
def load_pickle_data(pickle_path):
    """Deserialize and return the object stored in the pickle file at ``pickle_path``.

    Parameters
    ----------
    pickle_path : str or path-like
        Location of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # SECURITY: unpickling can execute arbitrary code — only load files from a
    # trusted source.
    # The context manager guarantees the file handle is closed even when
    # unpickling fails.
    with open(pickle_path,'rb') as pickle_file:
        return pickle.load(pickle_file)
def load_train_evaluate(index_data,mode_num):
    """Load one pickled dataset, train an auto-sklearn classifier on it and evaluate it.

    Parameters
    ----------
    index_data : int
        Menu index (1-5) selecting which train/test pickles to load from
        ``DATADIR``.
    mode_num : int
        ``-1`` selects batch mode: the evaluation report is skipped and only
        the test-set predictions are computed. Any other value prints the
        report: accuracy and weighted F1 on the test set for every dataset,
        plus training-set metrics, precision/recall, confusion matrices and
        ROC curves for datasets 1-3.

    Returns
    -------
    tuple
        ``(automl_classification, predictions, y_test, model_name)`` — the
        fitted classifier, its predictions on the test set, the test labels
        and the name of the selected dataset/model.

    Raises
    ------
    ValueError
        If ``index_data`` is not one of the known dataset indices.
    """
    # Menu index -> (reported model name, suffix shared by its four pickle files).
    dataset_registry = {
        1: ("final_model_v2", "autosklearn_v2"),
        2: ("final_model_v1", "autosklearn"),
        3: ("model_imbalanced_noaug", "imbalanced_noaug"),
        4: ("model_imbalanced_noaug_unknown", "imbalanced_noaug_unknown"),
        5: ("model_imbalanced_noaug_unknown_noisy", "imbalanced_noaug_unknown_noisy"),
    }
    # Fail fast on an unknown index, before any data is loaded or training starts.
    if index_data not in dataset_registry:
        valid_indices = ", ".join(str(key) for key in dataset_registry)
        raise ValueError(
            f"Unknown dataset index {index_data!r}; expected one of: {valid_indices}."
        )
    model_name, file_suffix = dataset_registry[index_data]

    X_test = load_pickle_data(f"{DATADIR}X_test_{file_suffix}.pickle")
    X_train = load_pickle_data(f"{DATADIR}X_train_{file_suffix}.pickle")
    y_test = load_pickle_data(f"{DATADIR}y_test_{file_suffix}.pickle")
    y_train = load_pickle_data(f"{DATADIR}y_train_{file_suffix}.pickle")

    print(f"Loaded model data at index {index_data}!\n")

    # start training
    # Training the autosklearn classifier
    print(f"Initiating training of model with data from index {index_data}\n")
    automl_classification = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=240,
        ensemble_size=1,
        memory_limit=10240,
        metric=autosklearn.metrics.f1_weighted,
        resampling_strategy="cv",
        resampling_strategy_arguments={'folds':15}
    )
    automl_classification.fit(X_train,y_train)
    # NOTE(review): with resampling_strategy="cv" auto-sklearn's documentation
    # asks for an explicit refit on the full training data before predicting —
    # confirm against the installed version.
    automl_classification.refit(X_train,y_train)

    # Batch mode: no report, the caller computes the summary metrics itself.
    if mode_num == -1:
        print(f"Model run {index_data} complete.")
        predictions = automl_classification.predict(X_test)
        return automl_classification,predictions,y_test,model_name

    # evaluate performance
    print("Evaluating performance of trained model.\n")
    print(f"Ensemble size = {automl_classification.ensemble_size}")
    data = PipelineProfiler.import_autosklearn(automl_classification)
    PipelineProfiler.plot_pipeline_matrix(data)

    predictions = automl_classification.predict(X_test)
    # accuracy and f1 score
    # pos_label is deliberately not passed to f1_score: it only applies to
    # average="binary" and is ignored (with a warning) for average="weighted".
    print("Ensemble Accuracy on testing set: ",sklearn.metrics.accuracy_score(y_test,predictions))
    print("Ensemble f1_score on testing set: ",sklearn.metrics.f1_score(y_test,predictions,average="weighted"),"\n")

    # Datasets 4 and 5 only get the accuracy / weighted-F1 report.
    # NOTE(review): per the menu text these keep the "unknown" recordings, so
    # they are presumably not binary and the "Present"-vs-rest metrics and ROC
    # curves below would not apply — confirm.
    if index_data in (4, 5):
        return automl_classification,predictions,y_test,model_name

    # training performance
    predictions_training = automl_classification.predict(X_train)
    print("Ensemble Accuracy on training set: ",sklearn.metrics.accuracy_score(y_train,predictions_training))
    print("Ensemble f1_score on training set: ",sklearn.metrics.f1_score(y_train,predictions_training,average="weighted"),"\n")

    # precision and recall, treating "Present" as the positive class
    print("Ensemble Precision on testing set:",sklearn.metrics.precision_score(y_test,predictions,pos_label="Present"))
    print("Ensemble Recall on testing set:",sklearn.metrics.recall_score(y_test,predictions,pos_label="Present"),"\n")

    print("Ensemble Precision on training set:",sklearn.metrics.precision_score(y_train,predictions_training,pos_label="Present"))
    print("Ensemble Recall on training set:",sklearn.metrics.recall_score(y_train,predictions_training,pos_label="Present"),"\n")

    # 2x2 figure: confusion matrices on the top row, ROC curves on the bottom
    # row; testing set in the left column, training set in the right column.
    # NOTE(review): plot_confusion_matrix / plot_roc_curve exist only in older
    # scikit-learn releases; they are kept because auto-sklearn appears to pin
    # such a release — confirm against requirements.txt.
    fig,ax = plt.subplots(ncols=2,nrows=2,figsize=(15,10))
    sklearn.metrics.plot_confusion_matrix(X=X_test,y_true=y_test,estimator=automl_classification,ax=ax[0,0])
    ax[0,0].title.set_text("Confusion matrix of model on testing set")
    sklearn.metrics.plot_confusion_matrix(X=X_train,y_true=y_train,estimator=automl_classification,ax=ax[0,1])
    ax[0,1].title.set_text("Confusion matrix of model on training set")

    # show roc curve on both training and testing set
    sklearn.metrics.plot_roc_curve(automl_classification,X_test,y_test,ax=ax[1,0])
    ax[1,0].title.set_text("ROC Curve of model on testing set")
    sklearn.metrics.plot_roc_curve(automl_classification,X_train,y_train,ax=ax[1,1])
    ax[1,1].title.set_text("ROC Curve of model on training set")

    return automl_classification,predictions,y_test,model_name
if index_data == -1:
    # Batch mode: train one model per dataset (menu indices 1-5) and tabulate
    # test-set accuracy and weighted F1 for each.
    result_rows = []
    for dataset_index in tqdm(range(1, 6)):
        automl_classification,predictions,y_test,model_name = load_train_evaluate(dataset_index,mode_num)
        accuracy_ = sklearn.metrics.accuracy_score(y_test,predictions)
        # pos_label only applies to average="binary"; it is ignored (with a
        # warning) for average="weighted", so it is not passed.
        f1_score_ = sklearn.metrics.f1_score(y_test,predictions,average="weighted")
        result_rows.append({
            "model_data": model_name,
            "accuracy": accuracy_,
            "f1": f1_score_,
        })
    # Build the summary frame once instead of growing it row by row.
    final_results = pd.DataFrame(result_rows,columns=["model_data","accuracy","f1"])
    print("\n",final_results,"\n")
else:
    # Keep the returned objects in notebook scope so later cells (e.g. the
    # model export cell) can reference the trained classifier.
    automl_classification,predictions,y_test,model_name = load_train_evaluate(index_data,mode_num)


In [None]:
# used to export trained model
import os 
import sys
# sys.exit() raises SystemExit, which halts this cell right here, so nothing
# below runs. NOTE(review): this looks like a deliberate guard against
# exporting during "Run All" — remove or comment it out to actually export.
sys.exit()
exported_model = automl_classification
# Name of the sub-folder (and .pkl file) to create under ./saved_models/;
# must be filled in before exporting.
model_name = ""
if not model_name:
    raise ValueError("Set model_name to a non-empty name before exporting the model.")
# Creates ./saved_models/<model_name>/ and its data/ sub-folder in one call;
# already-existing directories are fine, any other failure is raised instead
# of being silently swallowed.
os.makedirs(f"./saved_models/{model_name}/data",exist_ok=True)
# NOTE(review): X_test, X_train and y_train are only assigned inside
# load_train_evaluate in the visible notebook, so they must be defined in
# notebook scope before this cell can run — TODO confirm where they come from.
export_targets = [
    (f"./saved_models/{model_name}/{model_name}.pkl", exported_model),
    (f"./saved_models/{model_name}/data/X_test.pkl", X_test),
    (f"./saved_models/{model_name}/data/y_test.pkl", y_test),
    (f"./saved_models/{model_name}/data/X_train.pkl", X_train),
    (f"./saved_models/{model_name}/data/y_train.pkl", y_train),
]
for export_path, export_object in export_targets:
    # The context manager flushes and closes each file even if pickling fails.
    with open(export_path,"wb") as export_file:
        pickle.dump(export_object,export_file)
