# Warning: Long Run Time
### This notebook takes approximately 5 hours to run!

In [1]:
import warnings
# Ignore every warning raised from here on so that library notices
# (including deprecation warnings) do not clutter the cell outputs.
warnings.filterwarnings('ignore')

In [2]:
# import dependencies
import numpy as np
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline
from config import db_password
from sqlalchemy import create_engine

# stopwatch
import time

# ML
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb

## Connection to Database

In [3]:
# Build the SQLAlchemy engine for the local Postgres server; the password is
# taken from the imported config module instead of being written here.
db_string = f'postgresql://postgres:{db_password}@127.0.0.1:5432/Project Insights on the Beach'
engine = create_engine(db_string)

# Load the cleaned customer/marketing table into a DataFrame
vacay_df = pd.read_sql_query(
    sql='''SELECT*FROM cleaned_up_cust_marketing_table;''',
    con=engine,
)

# Offline fallback when the database is not reachable:
#vacay_df = pd.read_csv("../cleaned_up_cust_marketing_table.csv")

#vacay_df

## Preprocessing

#### Remove target and unrelated columns

In [4]:
# Build the feature frame by removing the target column ("prodtaken") and the
# columns that are not used as model inputs. DataFrame.drop returns a new
# frame, so vacay_df itself is left unchanged.
features_df = vacay_df.drop(
    columns=[
        "prodtaken",
        "customerid",
        "designation",
        "numberofpersonvisiting",
        "numberofchildrenvisiting",
    ]
)
features_df

Unnamed: 0,age,citytier,occupation,gender,preferredpropertystar,maritalstatus,numberoftrips,passport,owncar,monthlyincome,typeofcontact,durationofpitch,numberoffollowups,productpitched,pitchsatisfactionscore
0,41.000000,3,Salaried,Female,3.0,Single,1.0,1,1,20993.0,Self Enquiry,6.0,3.0,Deluxe,2
1,49.000000,1,Salaried,Male,4.0,Divorced,2.0,0,1,20130.0,Company Invited,14.0,4.0,Deluxe,3
2,37.000000,1,Free Lancer,Male,3.0,Single,7.0,1,0,17090.0,Self Enquiry,8.0,4.0,Basic,3
3,33.000000,1,Salaried,Female,3.0,Divorced,2.0,1,1,17909.0,Company Invited,9.0,3.0,Basic,5
4,37.622265,1,Small Business,Male,4.0,Divorced,1.0,0,1,18468.0,Self Enquiry,8.0,3.0,Basic,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,49.000000,3,Small Business,Male,4.0,Single,2.0,1,1,26576.0,Self Enquiry,9.0,5.0,Deluxe,1
4884,28.000000,1,Salaried,Male,3.0,Single,3.0,1,1,21212.0,Company Invited,31.0,5.0,Basic,3
4885,52.000000,3,Salaried,Female,4.0,Married,7.0,0,1,31820.0,Self Enquiry,17.0,4.0,Standard,1
4886,19.000000,3,Small Business,Male,3.0,Single,3.0,0,0,20289.0,Self Enquiry,16.0,4.0,Basic,5


In [None]:
# Collect the names of the object-dtype (string) columns: these are the
# categorical variables that need encoding
features_df_cat = [
    column for column, dtype in features_df.dtypes.items() if dtype == "object"
]

# Number of distinct values per categorical column
features_df[features_df_cat].nunique()

#### OneHotEncoder

In [None]:
# Create a OneHotEncoder instance.
# The encoder's default sparse output is used and densified below. This avoids
# the `sparse=` keyword, which newer scikit-learn releases renamed to
# `sparse_output=`, so the cell runs on both old and new versions.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list.
# The frame reuses features_df's index so that the index-based merge that
# follows lines up row for row even if the index is not 0..n-1.
encoded = enc.fit_transform(features_df[features_df_cat])
encode_df = pd.DataFrame(encoded.toarray(), index=features_df.index)

# Add the encoded variable names to the dataframe.
# get_feature_names() was superseded by get_feature_names_out() and later
# removed from scikit-learn; prefer the new method and fall back to the old
# one only on installations that do not have it yet.
feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
encode_df.columns = feature_namer(features_df_cat)
encode_df.head()

In [None]:
# Merge one-hot encoded features to features_df (rows are matched on the index)
features_df = features_df.merge(encode_df, left_index=True, right_index=True)

# Remove original unencoded columns.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally to DataFrame.drop was deprecated and is rejected by pandas 2.x.
features_df = features_df.drop(columns=features_df_cat)
features_df.head()

#### Scaling X, splitting test groups, and resampling with Naive Oversampling

In [None]:
# Target vector: the "prodtaken" column of the original table
y = vacay_df["prodtaken"]

# Feature matrix: an independent copy of the encoded feature frame
X = features_df.copy()

# Class balance of the target
y.value_counts()

In [None]:
# Standardise the features with StandardScaler: learn the per-column
# statistics first, then apply them.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so test rows contribute to the scaling
# statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Peek at the first scaled row
X_scaled[0:1]

In [None]:
# Hold out 25% of the rows (train_test_split's default share) as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.25,
    random_state=78,
)

# Naive random oversampling, applied to the training portion only
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
# Class counts in the train split (first line) and the test split (second line)
print(Counter(y_train), Counter(y_test), sep="\n")

## Random Forest Model

### Without Resampling

In [None]:
# Tune n_estimators with Stratified K-Fold Cross Validation (5 & 10-Fold).
# Only the training split is cross-validated so that X_test / y_test stay
# unseen until the final evaluation of the chosen model.
n_folds = [5, 10]

estimators = [100, 250, 500, 750, 1000]
accuracy_scores = []

for fold in n_folds:
    skf = StratifiedKFold(n_splits=fold)

    # The fold boundaries do not depend on n_estimators, so compute them once
    # per fold count instead of once per candidate value.
    cv_indices = list(skf.split(X_train, y_train))

    for e in estimators:

        # Instantiate random forest classifier and set results to 0 for each iteration
        brclf = BalancedRandomForestClassifier(random_state=1, n_estimators=e)
        results = 0

        for train_index, val_index in cv_indices:
            # X_train is a NumPy array, so it is indexed positionally; y_train
            # is a Series that kept its original row labels after the split,
            # so it has to be indexed by position with .iloc.
            X_t, X_val = X_train[train_index], X_train[val_index]
            y_t, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

            # fit on the training folds, predict the held-out fold
            brclf.fit(X_t, y_t)
            y_pred_k = brclf.predict(X_val)

            # accumulate the balanced accuracy of this fold
            results += balanced_accuracy_score(y_val, y_pred_k)

        # add mean of total result to accuracy score list
        accuracy_scores.append(results / fold)

        # Print results
        print(f'Acc Score with {fold} folds and {e} estimators: {accuracy_scores[-1]}')

In [None]:
# Train the forest with the estimator count picked from the CV results above
brclf = BalancedRandomForestClassifier(n_estimators=500, random_state=1).fit(X_train, y_train)

# Score the held-out test split
y_pred_brf = brclf.predict(X_test)
brf_acc_score = balanced_accuracy_score(y_test, y_pred_brf)
print(brf_acc_score)

In [None]:
# Confusion matrix of the test-set predictions
cm = confusion_matrix(y_test, y_pred_brf)

# Render it with ConfusionMatrixDisplay, labelled with the model's classes
display_brf = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=brclf.classes_)
display_brf.plot()

#Save Image
#plt.savefig("../Images/brf_cm.png")
plt.show()

# Classification report; kept in class_brf for the comparison section
class_brf = classification_report(y_test, y_pred_brf)
print(
    "Random Forest Classification Report Without Resampling",
    class_brf,
    "-----------------------------------",
    f'Accuracy Score: {brf_acc_score}',
    "-----------------------------------",
    sep="\n",
)

In [None]:
# Pair every importance with its column name and list them, largest first
by_features = sorted(zip(brclf.feature_importances_, X.columns), reverse=True)
for importance, feature_name in by_features:
    print(f"{feature_name}: ({importance})")

In [None]:
# Horizontal bar chart of the 20 most important features of the optimized
# Random Forest; the y-axis is inverted so the largest bar is drawn at the top
feat_importances = pd.Series(brclf.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot.barh(
    color=['blue', 'red', 'green', 'yellow', 'cyan']
).invert_yaxis()

### With Naive Oversampling

In [None]:
# Tune n_estimators with Stratified K-Fold Cross Validation (5 & 10-Fold),
# training on naively oversampled data.
#
# The oversampling is applied INSIDE each split, to the training folds only.
# Cross-validating data that was oversampled beforehand places copies of the
# same minority rows in both the training and the validation folds, which
# inflates the scores. Validation folds are therefore left un-resampled.
n_folds = [5, 10]

estimators = [100, 250, 500, 750, 1000]
accuracy_scores = []

for fold in n_folds:
    skf = StratifiedKFold(n_splits=fold)

    # The resampled folds do not depend on n_estimators, so build them once
    # per fold count. X_train is a NumPy array (positional indexing); y_train
    # kept its original row labels after the split, hence .iloc.
    cv_splits = []
    for train_index, val_index in skf.split(X_train, y_train):
        X_t, y_t = RandomOverSampler(random_state=1).fit_resample(
            X_train[train_index], y_train.iloc[train_index]
        )
        cv_splits.append((X_t, y_t, X_train[val_index], y_train.iloc[val_index]))

    for e in estimators:

        # Instantiate random forest classifier and set results to 0 for each iteration
        brclf = BalancedRandomForestClassifier(random_state=1, n_estimators=e)
        results = 0

        for X_t, y_t, X_val, y_val in cv_splits:
            # fit on the oversampled training folds, predict the untouched fold
            brclf.fit(X_t, y_t)
            y_pred_k = brclf.predict(X_val)

            # accumulate the balanced accuracy of this fold
            results += balanced_accuracy_score(y_val, y_pred_k)

        # add mean of total result to accuracy score list
        accuracy_scores.append(results / fold)

        # Print results
        print(f'Acc Score with {fold} folds and {e} estimators: {accuracy_scores[-1]}')

In [None]:
# Train the forest on the oversampled training data, using the estimator
# count picked from the CV results above
brclf = BalancedRandomForestClassifier(n_estimators=750, random_state=1).fit(X_resampled, y_resampled)

# Score the held-out (un-resampled) test split
y_pred_brfr = brclf.predict(X_test)
brfr_acc_score = balanced_accuracy_score(y_test, y_pred_brfr)
print(brfr_acc_score)

In [None]:
# Confusion matrix of the test-set predictions
cm = confusion_matrix(y_test, y_pred_brfr)

# Render it with ConfusionMatrixDisplay, labelled with the model's classes
display_brfr = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=brclf.classes_)
display_brfr.plot()

#Save Image
#plt.savefig("../Images/brf_cm.png")
plt.show()

# Classification report; kept in class_brfr for the comparison section
class_brfr = classification_report(y_test, y_pred_brfr)
print(
    "Random Forest Classification Report With Oversampling",
    class_brfr,
    "-----------------------------------",
    f'Accuracy Score: {brfr_acc_score}',
    "-----------------------------------",
    sep="\n",
)

In [None]:
# Pair every importance with its column name and list them, largest first
by_features = sorted(zip(brclf.feature_importances_, X.columns), reverse=True)
for importance, feature_name in by_features:
    print(f"{feature_name}: ({importance})")

In [None]:
# Horizontal bar chart of the 20 most important features of the optimized,
# resampled Random Forest; the y-axis is inverted so the largest bar is on top
feat_importances = pd.Series(brclf.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot.barh(
    color=['blue', 'red', 'green', 'yellow', 'cyan']
).invert_yaxis()

## Extreme Gradient Boosting (XGBoost)

### Without Resampling

In [None]:
# Grid search over XGBoost hyper-parameters with Stratified K-Fold CV.
# 6 estimators x 6 depths x 4 colsample x 3 gammas x 3 learning rates = 1,296
# combinations, each evaluated with 5 and with 10 folds (2,592 CV runs).
# Only the training split is cross-validated so that X_test / y_test stay
# unseen until the final evaluation of the selected model.

# start timer -- perf_counter measures elapsed wall-clock time, whereas
# process_time reports the CPU time consumed by the process
start_time = time.perf_counter()

# Define parameters and set accuracy score list to blank
n_folds = [5, 10]
estimators = [50, 150, 250, 350, 450, 550]
depths = [5, 6, 7, 8, 9, 10]
col_samples = [0.5, 0.55, 0.6, 0.65]
gammas = [0.2, 0.4, 0.6]
learn_rates = [0.2, 0.25, 0.3]

# Create dictionary to hold model with highest accuracy and the relative parameters
max_value_params = {"acc":0, "folds":0, "estimators":0, "depths":0, "colsample_bytree":0, "gamma":0, "learn":0}
accuracy_scores = []
iterations = 0

# Iterate through K-fold CV
for fold in n_folds:
    skf = StratifiedKFold(n_splits=fold)

    # The splits depend only on the fold count, so slice the data once here
    # instead of once per parameter combination. X_train is a NumPy array
    # (positional indexing); y_train kept its original row labels after the
    # train/test split, hence .iloc.
    cv_splits = [
        (X_train[train_index], y_train.iloc[train_index],
         X_train[val_index], y_train.iloc[val_index])
        for train_index, val_index in skf.split(X_train, y_train)
    ]

    # Nested for loops to fine-tune parameters
    for e in estimators:
        for d in depths:
            for c in col_samples:
                for g in gammas:
                    for l in learn_rates:

                        # set results = 0 for each iteration
                        results = 0

                        # Instantiate the XGB classifier for this combination.
                        # 'logloss' is the binary log-loss matching the
                        # binary:logistic objective ('mlogloss' is the
                        # multiclass variant).
                        xg_clf = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False,
                                                   eval_metric='logloss', learning_rate=l,
                                                   n_estimators=e, gamma=g, colsample_bytree=c,
                                                   max_depth=d, random_state=1)

                        for X_t, y_t, X_val, y_val in cv_splits:
                            # Fit on the training folds, predict the held-out fold
                            xg_clf.fit(X_t, y_t)
                            y_pred_xg = xg_clf.predict(X_val)

                            # Accumulate the balanced accuracy of this fold
                            results += balanced_accuracy_score(y_val, y_pred_xg)

                        # add mean of total result to accuracy score list
                        mean_score = results / fold
                        accuracy_scores.append(mean_score)

                        # Update max_value_params dict if new max accuracy score appears.
                        # Comparing the newest score is sufficient because 'acc'
                        # already holds the best of all earlier scores.
                        if mean_score > max_value_params['acc']:
                            max_value_params.update({
                                'acc': mean_score,
                                'folds': fold,
                                'estimators': e,
                                'depths': d,
                                'colsample_bytree': c,
                                'gamma': g,
                                'learn': l,
                            })

                        # Print results and iteration number to see progress
                        iterations = iterations + 1

                        if iterations % 50 == 0:
                            print(f'iteration {iterations}')

                        # Adjacent f-strings are concatenated; a backslash
                        # continuation inside the literal would embed the next
                        # line's indentation in the printed text.
                        print(f'{fold} folds, {e} estimators, {d} depths, colsample_bytree={c}, gamma={g}, learn={l}: '
                              f'{mean_score * 100:.3f}% accuracy')


# stop timer and print execution duration
end_time = time.perf_counter()
print(f"Elapsed time = {(end_time - start_time)}")

In [None]:
# Best model results: the highest mean cross-validated balanced accuracy found
# by the grid search above, with the fold count and parameters that produced it
print(f'Best model performance and corresponding parameters: \n {max_value_params}')

In [None]:
# Use best XGBoost parameters to predict
# Create model and set parameters from the grid-search winner.
# 'logloss' is the binary log-loss matching the binary:logistic objective
# ('mlogloss' is the multiclass variant).
xg_clf = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False,
                            eval_metric='logloss',
                            n_estimators = max_value_params['estimators'],
                            max_depth = max_value_params['depths'],
                            colsample_bytree = max_value_params['colsample_bytree'],
                            gamma = max_value_params['gamma'],
                            learning_rate = max_value_params['learn'], random_state=1)

# Fit the model
xg_clf.fit(X_train, y_train)

# Predict
y_pred_xg = xg_clf.predict(X_test)

# Extract accuracy.
# balanced_accuracy_score takes (y_true, y_pred) and is not symmetric, so the
# displayed value must come from the same correctly-ordered call as the stored
# one; show the stored score rather than recomputing with swapped arguments.
xg_acc_score = balanced_accuracy_score(y_test, y_pred_xg)
xg_acc_score

In [None]:
# Confusion matrix of the XGBoost test-set predictions
cm = confusion_matrix(y_test, y_pred_xg)

# Render it with ConfusionMatrixDisplay, labelled with the model's classes
display_xgb = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xg_clf.classes_)
display_xgb.plot()

# Classification report; kept in class_xg for the comparison section
class_xg = classification_report(y_test, y_pred_xg)
print(
    "XGBoost Classification Report Without Resampling",
    class_xg,
    "-----------------------------------",
    f'Accuracy Score: {xg_acc_score}',
    "-----------------------------------",
    sep="\n",
)

In [None]:
# Plot the XGBoost important features for the model fitted above.
# NOTE(review): the model was fitted on a NumPy array, so the bars are
# presumably labelled by position (f0, f1, ...) rather than by column name;
# map them back through X.columns when interpreting the chart -- confirm.
xgb.plot_importance(xg_clf)

### With Naive Oversampling

In [None]:
# Grid search over XGBoost hyper-parameters with Stratified K-Fold CV,
# training on naively oversampled data.
# 6 estimators x 6 depths x 4 colsample x 3 gammas x 3 learning rates = 1,296
# combinations, each evaluated with 5 and with 10 folds (2,592 CV runs).
#
# The oversampling is applied INSIDE each split, to the training folds only.
# Cross-validating data that was oversampled beforehand places copies of the
# same minority rows in both the training and the validation folds, which
# inflates the scores. Validation folds are therefore left un-resampled.

# start timer -- perf_counter measures elapsed wall-clock time, whereas
# process_time reports the CPU time consumed by the process
start_time = time.perf_counter()

# Define parameters and set accuracy score list to blank
n_folds = [5, 10]
estimators = [50, 150, 250, 350, 450, 550]
depths = [5, 6, 7, 8, 9, 10]
col_samples = [0.5, 0.55, 0.6, 0.65]
gammas = [0.2, 0.4, 0.6]
learn_rates = [0.2, 0.25, 0.3]

# Create dictionary to hold model with highest accuracy and the relative parameters
max_value_params = {"acc":0, "folds":0, "estimators":0, "depths":0, "colsample_bytree":0, "gamma":0, "learn":0}
accuracy_scores = []
iterations = 0

# Iterate through K-fold CV
for fold in n_folds:
    skf = StratifiedKFold(n_splits=fold)

    # The resampled folds depend only on the fold count, so build them once
    # here instead of once per parameter combination. X_train is a NumPy array
    # (positional indexing); y_train kept its original row labels after the
    # train/test split, hence .iloc.
    cv_splits = []
    for train_index, val_index in skf.split(X_train, y_train):
        X_t, y_t = RandomOverSampler(random_state=1).fit_resample(
            X_train[train_index], y_train.iloc[train_index]
        )
        cv_splits.append((X_t, y_t, X_train[val_index], y_train.iloc[val_index]))

    # Nested for loops to fine-tune parameters
    for e in estimators:
        for d in depths:
            for c in col_samples:
                for g in gammas:
                    for l in learn_rates:

                        # set results = 0 for each iteration
                        results = 0

                        # Instantiate the XGB classifier for this combination.
                        # 'logloss' is the binary log-loss matching the
                        # binary:logistic objective ('mlogloss' is the
                        # multiclass variant).
                        xg_clf = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False,
                                                   eval_metric='logloss', learning_rate=l,
                                                   n_estimators=e, gamma=g, colsample_bytree=c,
                                                   max_depth=d, random_state=1)

                        for X_t, y_t, X_val, y_val in cv_splits:
                            # Fit on the oversampled training folds, predict
                            # the untouched validation fold
                            xg_clf.fit(X_t, y_t)
                            y_pred_xgr = xg_clf.predict(X_val)

                            # Accumulate the balanced accuracy of this fold
                            results += balanced_accuracy_score(y_val, y_pred_xgr)

                        # add mean of total result to accuracy score list
                        mean_score = results / fold
                        accuracy_scores.append(mean_score)

                        # Update max_value_params dict if new max accuracy score appears.
                        # Comparing the newest score is sufficient because 'acc'
                        # already holds the best of all earlier scores.
                        if mean_score > max_value_params['acc']:
                            max_value_params.update({
                                'acc': mean_score,
                                'folds': fold,
                                'estimators': e,
                                'depths': d,
                                'colsample_bytree': c,
                                'gamma': g,
                                'learn': l,
                            })

                        # Print results and iteration number to see progress
                        iterations = iterations + 1

                        if iterations % 50 == 0:
                            print(f'iteration {iterations}')

                        # Adjacent f-strings are concatenated; a backslash
                        # continuation inside the literal would embed the next
                        # line's indentation in the printed text.
                        print(f'{fold} folds, {e} estimators, {d} depths, colsample_bytree={c}, gamma={g}, learn={l}: '
                              f'{mean_score * 100:.3f}% accuracy')


# stop timer and print execution duration
end_time = time.perf_counter()
print(f"Elapsed time = {(end_time - start_time)}")

In [None]:
# Confirmation of best model: the highest mean cross-validated balanced
# accuracy found by the oversampled grid search above, with the fold count
# and parameters that produced it
print(f'Best model performance and corresponding parameters: \n {max_value_params}')

In [None]:
# Use best XGBoost parameters to predict
# Create model and set parameters from the grid-search winner.
# 'logloss' is the binary log-loss matching the binary:logistic objective
# ('mlogloss' is the multiclass variant).
xg_clf = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False,
                            eval_metric='logloss',
                            n_estimators = max_value_params['estimators'],
                            max_depth = max_value_params['depths'],
                            colsample_bytree = max_value_params['colsample_bytree'],
                            gamma = max_value_params['gamma'],
                            learning_rate = max_value_params['learn'], random_state=1)

# Fit the model on the oversampled training data
xg_clf.fit(X_resampled, y_resampled)

# Predict on the held-out (un-resampled) test split
y_pred_xgr = xg_clf.predict(X_test)

# Extract accuracy.
# balanced_accuracy_score takes (y_true, y_pred) and is not symmetric: with
# the arguments swapped it averages over the predicted classes instead of the
# true ones, so the stored score would not be comparable with the other models.
xgr_acc_score = balanced_accuracy_score(y_test, y_pred_xgr)
xgr_acc_score

In [None]:
# Confusion matrix of the oversampled XGBoost model's test-set predictions
cm = confusion_matrix(y_test, y_pred_xgr)

# Render it with ConfusionMatrixDisplay, labelled with the model's classes
display_xgbr = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xg_clf.classes_)
display_xgbr.plot()

# Classification report; kept in class_xgr for the comparison section
class_xgr = classification_report(y_test, y_pred_xgr)
print(
    "XGBoost Classification Report (Oversampled)",
    class_xgr,
    "-----------------------------------",
    f'Accuracy Score: {xgr_acc_score}',
    "-----------------------------------",
    sep="\n",
)

In [None]:
# Plot the XGBoost important features for the oversampled model fitted above.
# NOTE(review): the model was fitted on a NumPy array, so the bars are
# presumably labelled by position (f0, f1, ...) rather than by column name;
# map them back through X.columns when interpreting the chart -- confirm.
xgb.plot_importance(xg_clf)

## Results Comparison

In [None]:
# Balanced Random Forest trained on the original training split
print(
    "-----------------Balanced Random Forest------------------",
    f'Random Forest Without Oversampling Accuracy: {brf_acc_score:.3f}',
    class_brf,
    " ",
    sep="\n",
)

# Balanced Random Forest trained on the oversampled training split
print(
    "-----------------Balanced Random Forest (Oversampled) ------------------",
    f'Random Forest With Oversampling Accuracy: {brfr_acc_score:.3f}',
    class_brfr,
    " ",
    sep="\n",
)

In [None]:
# XGBoost trained on the original training split
print(
    "----------------Extreme Gradient Boost (XGBoost)-------------------",
    f'XGBoost Without Oversampling Accuracy: {xg_acc_score:.3f}',
    class_xg,
    " ",
    sep="\n",
)

In [None]:
# XGBoost trained on the oversampled training split
print(
    "----------------Extreme Gradient Boost (Oversampled)-------------------",
    f'XGBoost With Oversampling Accuracy: {xgr_acc_score:.3f}',
    class_xgr,
    " ",
    "-----------------------------------",
    sep="\n",
)

### Results Comparison: Confusion Matrices

In [None]:
print("-----------------Balanced Random Forest------------------")
# Re-draw the confusion matrix held in display_brf (built in the Random Forest
# "Without Resampling" section) so the four models can be compared together
display_brf.plot()

In [None]:
print("-----------------Balanced Random Forest (Oversampled)------------------")
# Re-draw the confusion matrix held in display_brfr (built in the Random Forest
# "With Naive Oversampling" section) for the side-by-side comparison
display_brfr.plot()

In [None]:
print("----------------Extreme Gradient Boost (XGBoost)-------------------")
# Re-draw the confusion matrix held in display_xgb (built in the XGBoost
# "Without Resampling" section) for the side-by-side comparison
display_xgb.plot()

In [None]:
print("----------------Extreme Gradient Boost (Oversampled)-------------------")
# Re-draw the confusion matrix held in display_xgbr (built in the XGBoost
# "With Naive Oversampling" section) for the side-by-side comparison
display_xgbr.plot()