# What this notebook does

* Tries out different configurations to find the best model for the transformed data

# Imports and Setup

In [21]:
import os
import shutil
import warnings

import dill
import flaml
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ppscore as pps
import seaborn as sns
from pandas_profiling import ProfileReport
from phik import phik_matrix
from sklearn.metrics import (accuracy_score,
                             f1_score)

%matplotlib inline
sns.set()
warnings.filterwarnings("ignore")


# Loading the data

In [3]:
# Read the train / validation / test splits from the Data folder in one pass.
trainset, valset, testset = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/trainset.csv',
        '../Data/valset.csv',
        '../Data/testset.csv',
    )
)

In [4]:
# The three target columns; every remaining column is used as a feature.
target_columns = ["Anxiety", "Depression", "Compulsive behavior"]

# Training split: features plus one label series per target.
X_train = trainset.drop(columns=target_columns)
y_train_anxiety, y_train_depression, y_train_compulsive_behavior = (
    trainset[target] for target in target_columns
)

# Validation split.
X_val = valset.drop(columns=target_columns)
y_val_anxiety, y_val_depression, y_val_compulsive_behavior = (
    valset[target] for target in target_columns
)

# Test split.
X_test = testset.drop(columns=target_columns)
y_test_anxiety, y_test_depression, y_test_compulsive_behavior = (
    testset[target] for target in target_columns
)

# Function to create models

In [5]:
def create_model(X_train, y_train, X_val, y_val, time_limit, ensemble, metric, path):
    """Run a FLAML AutoML search, report its scores and persist the result.

    The search is fitted on the training data and tuned against the supplied
    validation data. Accuracy and F1 on both splits are printed and written to
    ``<path>/metrics.txt``, the search settings are written to
    ``<path>/config.txt`` and the fitted model is dumped with joblib to
    ``<path>/model.dgx``.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels handed to ``AutoML.fit``.
    time_limit : search time budget; rounded and passed as FLAML's
        ``time_budget``.
    ensemble : passed as FLAML's ``ensemble`` setting. It also selects what is
        reported as the estimator: the model itself when truthy, otherwise
        ``automl.model.estimator``.
    metric : metric name passed to FLAML (e.g. ``"f1"``).
    path : output directory for the artifacts; created if missing.

    Returns
    -------
    The fitted ``automl.model``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path, exist_ok=True)

    automl_settings = {
        "time_budget": round(time_limit),
        "metric": metric,
        "task": 'classification',
        "ensemble": ensemble,
        "verbose": 0,
        "log_file_name": "",
        "n_jobs": -1,
        "estimator_list": ["lgbm", "rf", "catboost", "xgboost", "extra_tree", "xgb_limitdepth"],
    }

    # FLAML names this setting `early_stop`. The constructor accepts arbitrary
    # keyword settings, so the previous `early_stopping=True` spelling raised
    # no error but did not turn early stopping on.
    automl = flaml.AutoML(early_stop=True)
    automl.fit(X_train, y_train, X_val=X_val, y_val=y_val, **automl_settings)

    # Predict once per split and reuse the results for every metric, instead
    # of re-running the model for each printed and written line.
    train_pred = automl.predict(X_train)
    val_pred = automl.predict(X_val)

    metric_lines = [
        f"Accuracy Score on train set: {round(accuracy_score(y_train, train_pred)*100, 4)}%",
        f"Accuracy Score on validation set: {round(accuracy_score(y_val, val_pred)*100, 4)}%",
        f"F1 Score on train set: {round(f1_score(y_train, train_pred), 4)}",
        f"F1 Score on validation set: {round(f1_score(y_val, val_pred), 4)}",
    ]
    estimator_text = str(automl.model if ensemble else automl.model.estimator)
    separator = "\n--------------------\n\n"

    # Console report (same layout as before).
    print(" ")
    print("Model Performance")
    print(" ")
    for line in metric_lines:
        print(line)
    print(separator)
    print("Estimator:")
    print(estimator_text)
    print(separator)

    # File report: one metric per line, in the same order as the console
    # report (previously the lines were written without any newline).
    with open(os.path.join(path, "metrics.txt"), "w") as f:
        f.write("Model Performance\n\n")
        f.write("\n".join(metric_lines))
        f.write(separator)
        f.write("Estimator:\n")
        f.write(estimator_text)
        f.write(separator)

    with open(os.path.join(path, "config.txt"), "w") as f:
        f.write("Configuration:\n")
        f.write(str(automl_settings))
        # early_stop is passed to the constructor, not via automl_settings,
        # so record it explicitly.
        f.write("\nNote: early_stop was set to True")

    joblib.dump(automl.model, os.path.join(path, "model.dgx"))
    return automl.model
    

In [6]:
%%time
# Anxiety target: 5-minute FLAML search with stacking, optimised for F1.
model_1 = create_model(
    X_train=X_train,
    y_train=y_train_anxiety,
    X_val=X_val,
    y_val=y_val_anxiety,
    time_limit=5 * 60,
    ensemble=True,
    metric="f1",
    path="../Artifacts/models/trial_1_anxiety",
)

 
Model Performance
 
Accuracy Score on train set: 77.0213%
Accuracy Score on validation set: 100.0%
F1 Score on train set: 0.4375
F1 Score on validation set: 1.0

--------------------


Estimator:
StackingClassifier(estimators=[('lgbm',
                                <flaml.model.LGBMEstimator object at 0x000001648343B430>),
                               ('rf',
                                <flaml.model.RandomForestEstimator object at 0x00000164832E6430>)],
                   final_estimator=<flaml.model.LGBMEstimator object at 0x0000016482421BB0>,
                   n_jobs=-1, passthrough=True)

--------------------


CPU times: total: 5min 37s
Wall time: 5min 32s


In [7]:
%%time
# Depression target: 5-minute FLAML search with stacking, optimised for F1.
model_2 = create_model(
    X_train=X_train,
    y_train=y_train_depression,
    X_val=X_val,
    y_val=y_val_depression,
    time_limit=5 * 60,
    ensemble=True,
    metric="f1",
    path="../Artifacts/models/trial_1_depression",
)

 
Model Performance
 
Accuracy Score on train set: 87.6596%
Accuracy Score on validation set: 100.0%
F1 Score on train set: 0.7852
F1 Score on validation set: 1.0

--------------------


Estimator:
StackingClassifier(estimators=[('lgbm',
                                <flaml.model.LGBMEstimator object at 0x0000016482FCF5B0>),
                               ('rf',
                                <flaml.model.RandomForestEstimator object at 0x00000164833BEC70>)],
                   final_estimator=<flaml.model.XGBoostSklearnEstimator object at 0x000001648361E610>,
                   n_jobs=-1, passthrough=True)

--------------------


CPU times: total: 7min 34s
Wall time: 5min 31s


In [8]:
%%time
# Compulsive-behavior target: 5-minute FLAML search with stacking, optimised for F1.
model_3 = create_model(
    X_train=X_train,
    y_train=y_train_compulsive_behavior,
    X_val=X_val,
    y_val=y_val_compulsive_behavior,
    time_limit=5 * 60,
    ensemble=True,
    metric="f1",
    path="../Artifacts/models/trial_1_compulsive_behavior",
)

 
Model Performance
 
Accuracy Score on train set: 94.0426%
Accuracy Score on validation set: 100.0%
F1 Score on train set: 0.4615
F1 Score on validation set: 1.0

--------------------


Estimator:
StackingClassifier(estimators=[('lgbm',
                                <flaml.model.LGBMEstimator object at 0x0000016483009FA0>),
                               ('rf',
                                <flaml.model.RandomForestEstimator object at 0x000001648366CEB0>)],
                   final_estimator=<flaml.model.XGBoostSklearnEstimator object at 0x0000016481388520>,
                   n_jobs=-1, passthrough=True)

--------------------


CPU times: total: 6min 47s
Wall time: 5min 37s


# Moving finalized models to the appropriate directory

In [19]:
# Collect the path of every file ending in .dgx found under the artifacts
# models folder (searched recursively).
list_of_files = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk("../Artifacts/models")
    for name in names
    if name.endswith('.dgx')
]

In [24]:
# Copy each finalized model into the Models folder under its target's name.
#
# os.walk yields directories in arbitrary (often alphabetical) order, so a
# file's position in list_of_files does not identify which target it belongs
# to: with alphabetical order the compulsive-behavior model comes before the
# depression model and the two would be saved under each other's names.
# Look each model up by the trial folder it was saved in instead.
models_by_trial = {
    os.path.basename(os.path.dirname(model_path)): model_path
    for model_path in list_of_files
}

# shutil.copy does not create missing destination folders.
os.makedirs("../Models", exist_ok=True)

shutil.copy(models_by_trial["trial_1_anxiety"], "../Models/anxiety_model.dgx")
shutil.copy(models_by_trial["trial_1_depression"], "../Models/depression_model.dgx")
shutil.copy(models_by_trial["trial_1_compulsive_behavior"], "../Models/compulsive_behavior_model.dgx")

'../Models/compulsive_behavior_model.dgx'

# Done!