In [33]:
import numpy as np
import pandas as pd
from mango.tuner import Tuner
from imblearn.under_sampling import NearMiss
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score,classification_report,roc_auc_score
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.model_selection import train_test_split, cross_validate

In [28]:
class DataSampling:
    """Split a labelled DataFrame into train/test sets and resample the training set.

    The split is performed once, at construction time. Every ``get_data_*``
    method returns a ``(X_train, X_test, y_train, y_test)`` tuple; only the
    training portion is ever resampled, the test portion is returned as split.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset: feature columns plus the target column.
    y_column : str
        Name of the target column in ``df``.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int or None
        Seed used for the train/test split and for the SMOTE and random
        over-samplers.
    stratify : bool, default True
        When truthy, the split is stratified on the target column.
    shuffle : bool, default True
        Forwarded to ``train_test_split``.
    """

    def __init__(self,df,y_column,test_size,random_state,stratify=True,shuffle=True):
        self.df = df
        self.y_column = y_column
        self.test_size = test_size
        self.random_state = random_state
        self.stratify = stratify
        self.shuffle = shuffle

        self._data_train_test_split()

    def _data_train_test_split(self):
        """Populate ``X``/``y`` and the train/test attributes from ``self.df``."""
        # drop() keeps the frame's own column order. Building the column list
        # from a set made the feature order depend on string hashing, which
        # can differ from one Python process to the next.
        self.X = self.df.drop(columns=[self.y_column])
        self.y = self.df[self.y_column]

        # random_state is forwarded so that the same seed yields the same split.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            shuffle=self.shuffle,
            stratify=self.y if self.stratify else None,
            random_state=self.random_state,
        )

    def get_data_wthout_sample(self):
        """Return the split as-is, with no resampling of the training set."""
        return self.X_train, self.X_test, self.y_train, self.y_test

    def get_data_smote(self):
        """Return the split with the training set resampled by ``SMOTE``."""
        sampler = SMOTE(random_state=self.random_state)
        X_sm, y_sm = sampler.fit_resample(self.X_train, self.y_train)
        return X_sm, self.X_test, y_sm, self.y_test

    def get_data_near_miss(self,sampling_strategy=0.1):
        """Return the split with the training set resampled by ``NearMiss``.

        ``sampling_strategy`` is forwarded unchanged to ``NearMiss``.
        """
        sampler = NearMiss(sampling_strategy=sampling_strategy)
        X_nm, y_nm = sampler.fit_resample(self.X_train, self.y_train)
        return X_nm, self.X_test, y_nm, self.y_test

    def get_data_random_over_sample(self):
        """Return the split with the training set resampled by ``RandomOverSampler``."""
        # Named `sampler` rather than `os` to avoid shadowing the stdlib module name.
        sampler = RandomOverSampler(random_state=self.random_state)
        X_os, y_os = sampler.fit_resample(self.X_train, self.y_train)
        return X_os, self.X_test, y_os, self.y_test

    def print_class_percentage(self,X_train,X_test,y_train,y_test):
        """Print absolute and relative class counts of ``y_train`` and ``y_test``.

        ``X_train`` and ``X_test`` are accepted so the method can be called
        with the tuple returned by the ``get_data_*`` methods; they are not used.
        """
        for header, labels in (
            ("----Train class count------", y_train),
            ("----Test class count------", y_test),
        ):
            print(header)
            counts = labels.value_counts()
            print(counts)
            print(counts / counts.sum())


In [29]:
# Load the data set and build a stratified split with 30% held out for testing.
df = pd.read_csv("creditcard.csv")
ds = DataSampling(df, "Class", test_size=0.30, random_state=42, stratify=True, shuffle=True)

# Training set under-sampled with NearMiss (the test split is returned as-is).
X_nm_train, X_test, y_nm_train, y_test = ds.get_data_near_miss()
# Training set without any resampling.
X_train, X_test, y_train, y_test = ds.get_data_wthout_sample()
# Alternative resamplers (each returns the same 4-tuple layout):
# X_os_train, X_test, y_os_train, y_test = ds.get_data_random_over_sample()
# X_sm_train, X_test, y_sm_train, y_test = ds.get_data_smote()

In [24]:
# Class balance of the NearMiss-resampled training set next to the test set.
ds.print_class_percentage(
    X_train=X_nm_train,
    X_test=X_test,
    y_train=y_nm_train,
    y_test=y_test,
)

----Train class count------
Class
0    3440
1     344
Name: count, dtype: int64
Class
0    0.909091
1    0.090909
Name: count, dtype: float64
----Test class count------
Class
0    85295
1      148
Name: count, dtype: int64
Class
0    0.998268
1    0.001732
Name: count, dtype: float64


In [47]:
class BayesianOpt:
    """Hyperparameter search for a classifier, driven by mango's ``Tuner``.

    The wrapped ``classifier`` must expose ``get_params``, ``set_params`` and
    ``fit``. It is configured and fitted in place: the object returned by
    ``optimize_fit`` is the same object that was passed to the constructor.

    Attributes
    ----------
    classifier
        The estimator being tuned.
    optimize_results
        Result of the most recent ``optimize`` call, or ``None`` before the
        first call.
    """

    def __init__(self,classifier):
        self.classifier = classifier
        self.optimize_results = None

    def _configure(self,params):
        """Apply the entries of ``params`` that the classifier accepts.

        Keys that are not hyperparameters of the classifier (for example the
        ``cv`` / ``scoring`` / ``n_jobs`` entries that travel in the same
        dict) are ignored. Returns the configured, not yet fitted, classifier.
        """
        clf = self.classifier
        accepted = clf.get_params().keys()
        clf.set_params(**{name: value for name, value in params.items() if name in accepted})
        return clf

    def _fit(self,X_train,y_train,params):
        """
        Internal helper: configure the classifier with ``params`` and fit it
        on ``(X_train, y_train)``. Returns whatever ``fit`` returns.
        """
        return self._configure(params).fit(X_train,y_train)

    def optimize(self,X_train,y_train,param_grid,conf_dict):
        """
        Search ``param_grid`` for the parameters with the highest mean
        cross-validated score on ``(X_train, y_train)``.

        Each sampled parameter dict may carry ``cv``, ``scoring`` and
        ``n_jobs`` entries, which are forwarded to ``cross_validate``; an
        entry that is absent is passed as ``None`` so ``cross_validate``
        applies its own default.

        ``param_grid`` and ``conf_dict`` are handed to ``Tuner`` unchanged.
        The value returned by ``Tuner.maximize()`` is stored in
        ``self.optimize_results`` and returned.
        """
        def objective(args_list):
            mean_scores = []
            for params in args_list:
                # Only configure the estimator here. cross_validate fits its
                # own copies per fold, so fitting on the full training set
                # first would be wasted work for every candidate.
                clf = self._configure(params)

                cv_result = cross_validate(
                    clf,
                    X_train,
                    y_train,
                    cv=params.get("cv"),
                    scoring=params.get("scoring"),
                    n_jobs=params.get("n_jobs"),
                )
                mean_scores.append(np.mean(cv_result["test_score"]))

            return mean_scores

        tuner_user = Tuner(param_grid, objective, conf_dict)
        self.optimize_results = tuner_user.maximize()
        return self.optimize_results

    def optimize_fit(self,X_train,y_train,param_grid,conf_dict):
        """
        Run ``optimize`` and then fit the classifier on the full
        ``(X_train, y_train)`` using the ``"best_params"`` entry of the result.
        Returns the fitted classifier.
        """
        optimized_results = self.optimize(X_train,y_train,param_grid,conf_dict)
        return self._fit(X_train,y_train,optimized_results["best_params"])

def publish_model_socres(X,y,cv_fit,plotting=True):
    """Print a classification report and the ROC AUC of ``cv_fit`` on ``(X, y)``.

    ``cv_fit`` must provide ``predict`` and ``predict_proba``. Column 1 of the
    predicted probabilities is the score handed to ``roc_auc_score``.
    ``plotting`` is accepted but not used by this function.
    """
    predicted_labels = cv_fit.predict(X)
    class_probabilities = cv_fit.predict_proba(X)

    print(classification_report(y, predicted_labels))

    positive_class_scores = class_probabilities[:, 1]
    print("ROC_AUC Score", roc_auc_score(y, positive_class_scores))

# Get the data

In [41]:
# Reload the data set and rebuild the stratified split (30% held out for
# testing); the training set is used without resampling.
df = pd.read_csv("creditcard.csv")
ds = DataSampling(df, "Class", test_size=0.30, random_state=42, stratify=True, shuffle=True)
X_train, X_test, y_train, y_test = ds.get_data_wthout_sample()


# Tune and train the model over the specified hyperparameter search space

In [52]:
# Search space. The first five entries are DecisionTreeClassifier
# hyperparameters; "cv", "scoring" and "n_jobs" are single-valued entries that
# BayesianOpt forwards to cross_validate.
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": np.arange(1, 15),
    "max_features": ["sqrt", "log2"],
    "min_samples_split": range(2, 21),
    "class_weight": [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 50}, "balanced"],
    "cv": [2],
    "scoring": ["f1_macro"],
    "n_jobs": [-1],
}

# Configuration passed to the mango Tuner.
conf_dict = {"num_iteration": 100}

# Run the search, then fit the decision tree with the best parameters found.
dt_opt = BayesianOpt(DecisionTreeClassifier(random_state=0))
best_model = dt_opt.optimize_fit(X_train, y_train, param_grid, conf_dict)

# Test the model on seen and unseen data

In [56]:
# Scores on the data the model was fitted on.
print("------Training model Results------")
publish_model_socres(X=X_train, y=y_train, cv_fit=best_model)

# Scores on the held-out test split.
print("------Testing model Result----------")
publish_model_socres(X=X_test, y=y_test, cv_fit=best_model)

------Training model Results------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199020
           1       0.89      0.72      0.80       344

    accuracy                           1.00    199364
   macro avg       0.95      0.86      0.90    199364
weighted avg       1.00      1.00      1.00    199364

ROC_AUC Score 0.9404008420329382
------Testing model Result----------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.90      0.66      0.76       148

    accuracy                           1.00     85443
   macro avg       0.95      0.83      0.88     85443
weighted avg       1.00      1.00      1.00     85443

ROC_AUC Score 0.9178098111007427
