### In this notebook we will learn:

* Getting Data: How to import data from PyCaret repository
* Setting up the sqlite database: To store the model artifacts, it is important to have a persistent database through sqlite.
* Setting up Environment: How to setup an experiment in PyCaret and get started with building classification models
* Create & Compare Models: How to create and compare various models, perform stratified cross validation and evaluate classification metrics
* Predict Model: How to make predictions on new / unseen data

## Loading Data 

### 1.1 Importing packages

In [1]:
# function
import os
import sys

new_directory = "E:/airflow/airflow"
current_directory = os.getcwd()
scripts_path = os.path.abspath(os.path.join(os.getcwd(), '../scripts'))

def change_directory(current_directory, new_directory,scripts_path):
    # Get the current working directory
    print(f'Current directory: {current_directory}')
    # Define the path to change to
    #new_directory = "E:/airflow/airflow"
    try:
        # Change the current working directory
        os.chdir(new_directory)
        # Verify the change
        current_directory = os.getcwd()
        print(f'Current directory changed to: {current_directory}')
    except FileNotFoundError:
        print(f'Error: The directory "{new_directory}" does not exist.')
    except PermissionError:
        print(f'Error: Permission denied to change to "{new_directory}".')
    except Exception as e:
        print(f'An unexpected error occurred: {e}')
    # Add the scripts directory to the Python path
    sys.path.append(scripts_path)

    
change_directory(current_directory, new_directory, scripts_path)

Current directory: e:\airflow\airflow\notebooks
Current directory changed to: E:\airflow\airflow


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Imported Libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scripts.utils import *
from pycaret.classification import *
import mlflow
import subprocess

In [4]:
# Setting up all directory
root_folder = new_directory
database_path = root_folder+"/database/"
data_directory = root_folder+"/data/raw/"
data_profile_path = root_folder+"/data/profile_report/"
intermediate_data_path = root_folder+"/data/interim/"
final_processed_data_path = root_folder+"/data/processed/"


old_data_directory = root_folder+"/data/raw/"
new_data_directory = root_folder+"/data/new/"
intermediate_path = root_folder+"/data/interim/"

# Database
db_path = root_folder+"/database/"
db_file_name = "feature_store_v01.db"
drfit_db_name = "drift_db_name.db"
date_columns = ['registration_init_time','transaction_date_min','transaction_date_max','membership_expire_date_max','last_login']

 ### 1.2 Reading the merged data

In [5]:
%%time
interim_data = "final_train_data_interim_1729850876.csv" # set the data recieved from the previous notebook
dataset = load_data( [f"{intermediate_data_path}{interim_data}",
                            ]
                         )[0] #since we are only loading single data, we can access it with index 0, since it return multiple dfs in list
dataset.shape

CPU times: total: 969 ms
Wall time: 1.1 s


(324000, 24)

In [6]:
dataset.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,is_churn,payment_method_id,payment_plan_days,plan_list_price,...,is_cancel,transaction_date_max,membership_expire_date_max,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,/7XuYVGXYHPggWsdtok0JEurQ10CtUO1Y8dDgy1/B0M=,1,0,others,7,2016-12-23,0,41.0,30.0,149.0,...,0.0,20170222,20170322,0.0,0.0,0.0,0.0,2.289867,2.289867,7.807933
1,gB3/kawEQSauWFArU9Z0kZo+ikw9GqJv0rXqNbpVnTY=,1,0,others,7,2016-12-23,0,41.0,30.0,99.0,...,0.0,20170223,20170323,0.274653,0.0,0.0,0.0,2.845647,2.28193,8.362549
2,2aFAPs3QmxD+bNcCe8beuWcI7SZHg1k+1irALOxiw3k=,15,23,female,4,2016-12-24,0,40.0,30.0,149.0,...,0.0,20170227,20170326,0.0,0.0,0.0,0.0,4.708342,4.6837,10.238619
3,FjEZAhwFky8sWoaNGTp+p/r3/hH30WxLr396iSho3gs=,1,0,others,7,2016-12-25,0,41.0,30.0,99.0,...,0.0,20170224,20170324,0.621227,0.173287,0.0,0.346574,3.070758,2.640511,8.63034
4,C5PNTuQxUQmHOXPptQnokhqH1XQoAHHL8pMWIX0nAh0=,1,0,others,7,2016-12-25,0,41.0,30.0,99.0,...,0.0,20170224,20170324,0.0,0.0,0.0,0.0,1.595831,1.499937,7.084058


 ### 1.3 Splitting the data to seen and unseen

In [7]:
data_for_model, data_unseen = get_validation_unseen_set(dataset, validation_frac=0.05, sample=True, sample_frac=0.1)
print('Data for Modeling: ' + str(data_for_model.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (30780, 24)
Unseen Data For Predictions: (1620, 24)


In [8]:
data_for_model.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,is_churn,payment_method_id,payment_plan_days,plan_list_price,...,is_cancel,transaction_date_max,membership_expire_date_max,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,4lztQybZfBiBQdrzTq3LnHAxjFgNgAHLmt+Yt4r+jE4=,1,0,others,7,2015-05-31,0,41.0,30.0,105.666667,...,0.066667,20170222,20170322,2.258687,0.972275,0.68989,0.696475,2.231975,3.223953,7.972159
1,+3FRuyc6sgysGLbD2MLsAjsyNYb4UyLGYagsQl540p8=,11,34,female,7,2016-11-01,0,41.0,30.0,99.0,...,0.0,20170228,20170331,0.693147,0.0,0.274653,0.0,1.762314,1.958004,7.405536
2,CMJjkOQTdVlx3PCVyMMJS5za1HEWltkBTjEus5AF/g4=,10,23,male,9,2016-03-24,0,37.0,30.0,149.0,...,0.0,20170225,20170324,1.935601,0.346574,0.0,0.0,3.21486,3.35098,8.736635
3,XibaK346+VCuiYAQF7KzFRWBHM3JDOPYRDZvfayNuNA=,1,0,others,7,2016-01-08,1,41.0,30.0,99.0,...,0.071429,20170118,20170208,0.44794,0.0,0.0,0.0,2.900947,2.412986,8.357637
4,R+2+lc6ekPWCZLFwWMmGj2VqLAlIqSHd8g051rxjJ/A=,1,0,others,7,2016-03-05,0,41.0,30.0,99.0,...,0.0,20170204,20170304,0.346574,0.0,0.0,0.0,2.227646,2.042122,7.856441


In [9]:
data_for_model.columns

Index(['msno', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'is_churn', 'payment_method_id',
       'payment_plan_days', 'plan_list_price', 'actual_amount_paid',
       'is_auto_renew', 'transaction_date_min', 'membership_expire_date_min',
       'is_cancel', 'transaction_date_max', 'membership_expire_date_max',
       'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq',
       'total_secs'],
      dtype='object')

### 2 Setting up the sqlite database

In [5]:
create_sqlit_connection(database_path,r"mlflow_v01.db")
create_sqlit_connection(database_path,r"feature_store_v01.db")
create_sqlit_connection(database_path,r"drift_db_name.db")

2.6.0
2.6.0
2.6.0


In [None]:
ml_flow_model_path = root_folder+ "/mlruns/8/d62a87a0240f4cc8a01fe9d61b1e6426/artifacts/models/"
ml_flow_path = root_folder+ "/mlruns/8/d62a87a0240f4cc8a01fe9d61b1e6426"

In [14]:

mlflow.set_tracking_uri("http://Localhost:6006")

In [25]:
# do not go ahead unless you execute this step and mlflow is isntalled. 
 
#MAKE mlruns FOLDER on root folder
#run this on terminal where you are on root folder. 
# Makse sure to point the database to correct address. Assuming you have same folder structure you can use this

# mlflow server --backend-store-uri sqlite://///home/charliethomasctg/airflow/database/mlflow_v01.db --default-artifact-root /home/charliethomasctg/airflow/mlruns --port=6006 --host=0.0.0.0


### 3 Setting up Environment: 

The `setup()` function initializes the environment in pycaret and creates the transformation pipeline to prepare the data for modeling and deployment. `setup()`must be called before executing any other function in pycaret. 
* It takes two mandatory parameters: a pandas dataframe and the name of the target column. 
* All other parameters are optional and are used to customize the pre-processing pipeline (we will see them in later tutorials).

When `setup()` is executed, PyCaret's inference algorithm will automatically infer the data types for all features based on certain properties. The data type should be inferred correctly but this is not always the case. To account for this, PyCaret displays a table containing the features and their inferred data types after setup() is executed. If all of the data types are correctly identified enter can be pressed to continue or quit can be typed to end the expriment. Ensuring that the data types are correct is of fundamental importance in PyCaret as it automatically performs a few pre-processing tasks which are imperative to any machine learning experiment. These tasks are performed differently for each data type which means it is very important for them to be correctly configured.

In [12]:
# List of date columns
date_columns = ['registration_init_time', 'transaction_date_min', 'transaction_date_max', 'membership_expire_date_max']

# Convert to datetime
for col in date_columns:
    data_for_model[col] = pd.to_datetime(data_for_model[col])

In [15]:
# No Pre-Processing 
Baseline_model_exp01 = setup(data = data_for_model, target = 'is_churn', 
                   session_id = 42,fix_imbalance=True,ignore_features=['msno'],
                   date_features=['registration_init_time', 'transaction_date_min', 'transaction_date_max', 'membership_expire_date_max'],
                   n_jobs=-1,use_gpu=True,
                   log_experiment=True,experiment_name='Baseline_model_exp01',
                   log_plots=True, log_data=True,
                   verbose=True,
                   log_profile=False)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

Unnamed: 0,Description,Value
0,Session id,42
1,Target,is_churn
2,Target type,Binary
3,Original data shape,"(30780, 24)"
4,Transformed data shape,"(49442, 33)"
5,Transformed train set shape,"(40208, 33)"
6,Transformed test set shape,"(9234, 33)"
7,Ignore features,1
8,Numeric features,17
9,Date features,4


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

In [16]:
X_train, y_train, X_test, y_test = get_train_test_set_from_setup()
X_train.head()

Unnamed: 0,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,...,is_cancel,transaction_date_max,membership_expire_date_max,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
30221,4,27,male,7,2016-11-27,41.0,30.0,99.0,99.0,1.0,...,0.0,1970-01-01 00:00:00.020170227,1970-01-01 00:00:00.020170327,0.876249,0.298627,0.298627,0.346574,2.533617,2.471845,8.159374
9798,1,0,others,7,2016-09-19,41.0,30.0,129.0,129.0,1.0,...,0.0,1970-01-01 00:00:00.020170218,1970-01-01 00:00:00.020170318,0.699063,0.077016,0.154033,0.231049,3.97963,3.214958,9.518767
22292,1,0,others,7,2016-01-20,41.0,30.0,149.0,149.0,1.0,...,0.0,1970-01-01 00:00:00.020170219,1970-01-01 00:00:00.020170319,0.693147,0.0,0.0,0.0,0.0,0.693147,4.859812
9679,4,28,female,3,2015-09-06,38.444443,30.0,149.388885,141.111115,0.611111,...,0.0,1970-01-01 00:00:00.020170223,1970-01-01 00:00:00.020170322,0.899645,0.315654,0.184839,0.152145,3.146289,2.920103,8.643497
12771,1,0,others,7,2015-10-02,41.0,30.0,99.0,99.0,1.0,...,0.0,1970-01-01 00:00:00.020170202,1970-01-01 00:00:00.020170302,0.693147,0.0,0.0,0.0,0.486478,1.006338,4.348823


In [17]:
pipeline = get_transformation_pipeline_from_setup()
pipeline

### 4 Compare models: 

In [None]:
best_model = compare_models(fold = 5) 

* Two simple words of code (not even a line) have created over 15 models using 10 fold stratified cross validation and evaluated the 6 most commonly used classification metrics (Accuracy, AUC, Recall, Precision, F1, Kappa). 

* The score grid printed above highlights the highest performing metric for comparison purposes only. The grid by default is sorted using 'Accuracy' (highest to lowest) which can be changed by passing the sort parameter. For example compare_models(sort = 'Recall') will sort the grid by Recall instead of Accuracy. 

* If you want to change the fold parameter from the default value of 10 to a different value then you can use the fold parameter. For example compare_models(fold = 5) will compare all models on 5 fold cross validation. Reducing the number of folds will improve the training time.

In [None]:
#selecting the best model
lgbm  = create_model('lightgbm', fold = 5) 

In [None]:
lgbm

### 5 Analyzing the model performance

5.1 Learning Curve

In [22]:
%matplotlib inline

In [None]:
plot_model(lgbm, plot = 'learning')

5.2 ROC Curve

In [None]:
plot_model(lgbm, plot = 'auc')

5.3 Precision-recall Curve

In [None]:
plot_model(lgbm, plot = 'pr')

5.4 Confusion Matrix

In [None]:
plot_model(lgbm, plot = 'confusion_matrix', plot_kwargs = {'percent' : True})

5.5 Feature Importance

In [None]:
#top 10 features
plot_model(lgbm, plot='feature') #feature_all -> to check for all features 

5.6 Prediction class distribution

In [None]:
plot_model(lgbm, plot='error')

5.7 Model Interpretability

In [29]:
#pip install shap

In [None]:
# interpret model
interpret_model(lgbm)

In [None]:
interpret_model(lgbm,plot='correlation',feature='is_cancel')

In [None]:
interpret_model(lgbm,plot='reason',observation=0) # index of observation in test data

In [1]:
#pip install interpret

In [None]:
interpret_model(lgbm,plot='msa')

5.8 Model Evaluation

In [None]:
# Convert to datetime
for col in date_columns:
    data_unseen[col] = pd.to_datetime(data_unseen[col])
predict_model(lgbm, data_unseen)