### In this notebook we will learn:

* Getting Data: How to import data from PyCaret repository
* Setting up the sqlite database: To store the model artifacts, it is important to have a persistent database through sqlite.
* Setting up Environment: How to setup an experiment in PyCaret and get started with building classification models
* Create & Compare Models: How to create and compare various models, perform stratified cross validation and evaluate classification metrics
* Tune Model: How to automatically tune the hyper-parameters of a classification model
* Custom Tuning: How to manually change the searching strategy for hyperparameter tuning
* Ensemble Models: How to create ensemble of best performing models to improve the performance
* Plot Model: How to analyze model performance using various plots
* Finalize Model: How to finalize the best model at the end of the experiment
* Predict Model: How to make predictions on new / unseen data
* Save / Load Model: How to save / load a model for future use

## Loading Data 

### 1.1 Importing packages

In [None]:
# function
import os
import sys

# Project root the notebook is meant to run from.
new_directory = "E:/airflow/airflow"
# Directory the kernel was started in (captured before any chdir happens).
current_directory = os.getcwd()
# Absolute path of the `scripts` folder that sits next to the launch directory.
scripts_path = os.path.abspath(os.path.join(current_directory, os.pardir, 'scripts'))

def change_directory(current_directory, new_directory, scripts_path):
    """Switch the working directory and make the scripts folder importable.

    Parameters
    ----------
    current_directory : str
        Directory the kernel is currently in; only used for the log message.
    new_directory : str
        Directory to change into. The argument is honoured as given (it is no
        longer overwritten by a hard-coded path inside the function).
    scripts_path : str
        Folder appended to ``sys.path``.

    Notes
    -----
    A failure to change directory is reported with ``print`` and not raised,
    so the notebook keeps running; ``scripts_path`` is added to ``sys.path``
    in either case.
    """
    print(f'Current directory: {current_directory}')
    try:
        os.chdir(new_directory)
        # Re-read the cwd so the message shows what the OS actually resolved.
        print(f'Current directory changed to: {os.getcwd()}')
    except FileNotFoundError:
        print(f'Error: The directory "{new_directory}" does not exist.')
    except PermissionError:
        print(f'Error: Permission denied to change to "{new_directory}".')
    except Exception as e:
        print(f'An unexpected error occurred: {e}')
    # Re-running the cell must not pile up duplicate sys.path entries.
    if scripts_path not in sys.path:
        sys.path.append(scripts_path)

    
# Move the kernel to the project root and append the scripts folder to sys.path.
change_directory(current_directory, new_directory, scripts_path)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scripts.utils import *
from pycaret.classification import *
# Other Libraries
import mlflow

In [None]:

# --- Project directories (all derived from root_folder) ----------------------
root_folder = "E:/airflow/airflow"
database_path = f"{root_folder}/database/"
data_directory = f"{root_folder}/data/raw/"
data_profile_path = f"{root_folder}/data/profile_report/"
intermediate_data_path = f"{root_folder}/data/interim/"
final_processed_data_path = f"{root_folder}/data/processed/"

# Same locations as data_directory / intermediate_data_path under other names.
old_data_directory = f"{root_folder}/data/raw/"
new_data_directory = f"{root_folder}/data/new/"
intermediate_path = f"{root_folder}/data/interim/"


# --- Databases ----------------------------------------------------------------
db_path = f"{root_folder}/database/"  # same value as database_path
db_file_name = "feature_store_v01.db"
drfit_db_name = "drift_db_name.db"  # misspelled twin of drift_db_name, kept so existing references still resolve
date_columns = [
    'registration_init_time',
    'transaction_date_min',
    'transaction_date_max',
    'membership_expire_date_max',
    'last_login',
]
drift_db_name = "drift_db_name.db"

# --- MLflow -------------------------------------------------------------------
mlflow_tracking_uri = "http://Localhost:6006"
ml_flow_model_path = f"{root_folder}/mlruns/2/cb66e22bcbf74ded99dc219eb29e7609/artifacts/models/"
ml_flow_path = f"{root_folder}/mlruns/2/cb66e22bcbf74ded99dc219eb29e7609"

# --- Run switches -------------------------------------------------------------
# NOTE(review): none of these are read by the cells visible in this notebook;
# presumably they are consumed by helper code - confirm.
run_on = "old"  # presumably "old" or "new", matching the data directories above - TODO confirm
append = False
date_transformation = False
start_date = '2017-03-01'
end_date = '2017-03-31'


 ### 1.2 Reading Data

In [None]:
%%time
final_data = "final_train_data_process_1729853409.csv" # set the data recieved from the previous notebook
dataset = load_data( [f"{final_processed_data_path}{final_data}",
                            ]
                         )[0] #since we are only loading single data, we can access it with index 0, since it return multiple dfs in list
dataset.shape

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

 ### 1.3 Splitting the data to seen and unseen
 

In [None]:
# Split the loaded frame into a modeling set and a held-out "unseen" set.
# NOTE(review): the exact meaning of validation_frac / sample / sample_frac is
# defined by the project helper get_validation_unseen_set - confirm there.
data_for_model, data_unseen = get_validation_unseen_set(
    dataset,
    validation_frac=0.05,
    sample=True,
    sample_frac=0.1,
)
print(f'Data for Modeling: {data_for_model.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

### 2 Setting up the sqlite database

In [None]:
import subprocess

# Create (or open) the SQLite file that backs the MLflow tracking server.
mlflow_db_file = "mlflow_v01.db"
create_sqlit_connection(database_path, mlflow_db_file)

# Start the tracking server in the background. subprocess.run() would wait for
# the server process to exit, so the cell would never return and nothing below
# it could run. The backend store and artifact root are built from the notebook
# configuration so the server uses the database created just above.
# Stop the server with mlflow_server.terminate() before re-running this cell,
# otherwise the second instance fails to bind port 6006.
mlflow_server = subprocess.Popen([
    'mlflow', 'server',
    '--backend-store-uri', f'sqlite:///{database_path}{mlflow_db_file}',
    '--default-artifact-root', f'{root_folder}/mlruns',
    '--port=6006',
    '--host=0.0.0.0',
])


### 3 Setting up Environment: 

In [None]:
# Point the MLflow client at the tracking server configured in mlflow_tracking_uri.
mlflow.set_tracking_uri(mlflow_tracking_uri)


In [None]:
# Parse every column listed in date_columns to datetime, column by column,
# writing the result back into the modeling frame.
data_for_model[date_columns] = data_for_model[date_columns].apply(pd.to_datetime)

In [None]:
# Alternative: start the MLflow tracking server manually from a terminal.
# Create the `mlruns/` folder first, then run:
#mlflow server --backend-store-uri='sqlite:///mlflow_v01.db' --default-artifact-root="mlruns/" --port=6006 --host=0.0.0.0

In [None]:
# Gather the experiment configuration in one mapping so the options can be
# grouped and annotated, then hand it to PyCaret's setup().
setup_options = dict(
    # Data and target
    data=data_for_model,
    target='is_churn',
    ignore_features=['msno'],
    date_features=date_columns,  # the same five columns listed in date_columns
    # Preprocessing
    fix_imbalance=True,
    normalize=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    # transformation=True and ignore_low_variance=True were left disabled by the author.
    # Cross-validation and reproducibility
    fold_shuffle=True,
    session_id=42,
    # Compute
    n_jobs=-1,
    use_gpu=True,
    # MLflow logging
    log_experiment=True,
    experiment_name='model_exp02',
    log_plots=True,
    log_data=True,
    log_profile=False,
    verbose=True,
)
exp_clf102 = setup(**setup_options)


### 4 Compare models: 

In [None]:
# Cross-validate a shortlist of estimators with 5 folds; compare_models returns
# the top-ranked one.
candidate_models = ['lightgbm', 'rf', 'et', 'lr']
best_model = compare_models(fold=5, include=candidate_models)

### 4.1 Create a Model

In [None]:
# LightGBM is selected for its overall performance; train it with 5-fold CV.
lgbm = create_model('lightgbm', fold=5)

In [None]:
# Confusion matrix of the baseline LightGBM model, shown as percentages.
plot_model(lgbm, plot='confusion_matrix', plot_kwargs={'percent': True})

### 5 Tuning the Models
When a model is created using the `create_model()` function it uses the default hyperparameters. In order to tune hyperparameters, the `tune_model()` function is used. This function automatically tunes the hyperparameters of a model on a pre-defined search space and scores it using stratified cross validation. The output prints a score grid that shows Accuracy, AUC, Recall, Precision, F1 and Kappa by fold.

> Note: `tune_model()` takes a trained model object as input (here, the `lgbm` object returned by `create_model()`), not the abbreviated model name string that `create_model()` accepts. Like the other functions in `pycaret.classification` used below, it requires a trained model object as an argument.

In [None]:
# Tune the baseline model with PyCaret's default search, scoring on F1 over
# 5 folds. choose_better=True keeps the original model if tuning does not
# improve it; return_tuner=True also hands back the tuner object.
tuned_lgbm, tuner_0 = tune_model(
    lgbm,
    fold=5,
    optimize='F1',
    choose_better=True,
    return_tuner=True,
)

### 5.1 Custom Tuning

While creating models using `create_model()` method, you can observe the model's cross validation training score. 

To observe it, set `return_train_score=True`; by default it is set to `False`.
If False, returns the CV Validation scores only. If True, returns the CV training scores along with the CV validation scores. This is useful when the user wants to do bias-variance tradeoff. A high CV training score with a low corresponding CV validation score indicates overfitting.


We can change the searching strategy used for Hyperparameter tuning using the parameter `search_algorithm` 
* search_algorithm: str, default = None

The search algorithm depends on the `search_library` parameter. Some search algorithms require additional libraries to be installed. If None, will use search library-specific default algorithm.

> ‘scikit-learn’ possible values:
* ‘random’ : random grid search (default)
* ‘grid’ : grid search

> ‘scikit-optimize’ possible values:
* ‘bayesian’ : Bayesian search (default)

> ‘tune-sklearn’ possible values:
* ‘random’ : random grid search (default)
*  ‘grid’ : grid search
* ‘bayesian’ : pip install scikit-optimize
* ‘hyperopt’ : pip install hyperopt
* ‘optuna’ : pip install optuna
* ‘bohb’ : pip install hpbandster ConfigSpace

> ‘optuna’ possible values:
* ‘random’ : randomized search
* ‘tpe’ : Tree-structured Parzen Estimator search (default)

#### 5.1.1 Custom tuning using `scikit-optimize` (Bayesian search) and `optuna` (random search)

In [None]:
# Same tuning objective (F1, 5 folds), but searching with scikit-optimize.
# No search_algorithm is given, so the library's default algorithm is used.
tuned_lgbm_skopt, tuner_1 = tune_model(
    lgbm,
    search_library='scikit-optimize',
    fold=5,
    optimize='F1',
    choose_better=True,
    return_tuner=True,
)

In [None]:
# Tune with Optuna's randomized search; return_train_score=True adds the CV
# training scores to the score grid next to the validation scores.
tuned_lgbm_optuna, tuner_2 = tune_model(
    lgbm,
    search_library='optuna',
    search_algorithm='random',
    fold=5,
    optimize='F1',
    return_train_score=True,
    choose_better=True,
    return_tuner=True,
)

### 6 Analyzing the model performance


6.1 Learning Curve

In [None]:
# Learning curve of the scikit-optimize-tuned model.
plot_model(tuned_lgbm_skopt, plot='learning')

6.2 AUC Curve

In [None]:
# ROC / AUC curve of the scikit-optimize-tuned model.
plot_model(tuned_lgbm_skopt, plot='auc')

6.3 Precision-recall Curve

In [None]:
# Precision-recall curve of the scikit-optimize-tuned model.
plot_model(tuned_lgbm_skopt, plot='pr')

6.4 Confusion Matrix

In [None]:
# Confusion matrix of the scikit-optimize-tuned model, shown as percentages.
plot_model(tuned_lgbm_skopt, plot='confusion_matrix', plot_kwargs={'percent': True})

6.5 Feature Importance

In [None]:
# Feature-importance plot of the scikit-optimize-tuned model.
plot_model(tuned_lgbm_skopt, plot='feature')

6.6 Model Interpretation

In [None]:
#pip install shap

In [None]:
# Default interpretation plot for the tuned model (no plot type given; per the
# PyCaret docs the default is the SHAP summary plot).
interpret_model(tuned_lgbm_skopt)

In [None]:
# Prediction-error plot of the scikit-optimize-tuned model.
plot_model(tuned_lgbm_skopt, plot='error')

In [None]:
from pycaret.classification import interpret_model

import shap

# interpret_model accepts only 'summary', 'correlation', 'reason', 'pdp', 'msa'
# and 'pfi' as plot types; 'shap' is rejected with a ValueError. The `feature`
# argument applies to the dependence-style plots, so the SHAP dependence
# ('correlation') plot is drawn for the 'bd' column.
interpret_model(tuned_lgbm_skopt, plot='correlation', feature='bd')

# NOTE(review): get_shap_values is not part of pycaret.classification;
# presumably it is provided by the `scripts.utils` star import - confirm.
shap_values = get_shap_values(tuned_lgbm_skopt)  # SHAP values for the tuned model
print(shap_values.shape)  # shape of the SHAP value array


In [None]:
# SHAP dependence ('correlation') plot for the registration_duration feature.
interpret_model(
    tuned_lgbm_skopt,
    plot='correlation',
    feature='registration_duration',
)

In [None]:
# Force ('reason') plot for a single prediction; observation is the row index
# in the test data.
interpret_model(tuned_lgbm_skopt, plot='reason', observation=0)

### 7 Evaluating the model

In [None]:
# Parse the date columns of the held-out frame the same way as the modeling frame.
data_unseen[date_columns] = data_unseen[date_columns].apply(pd.to_datetime)
# Score the baseline model on the unseen data (the returned frame is not kept).
predict_model(lgbm, data=data_unseen)
# Score the Optuna-tuned model and preview the first predictions.
predict_model(tuned_lgbm_optuna, data=data_unseen).head()