# Warning: Long Run Time
### This notebook takes more than 2 hours to run!

In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that a blanket 'ignore' is intended.
warnings.filterwarnings('ignore')

In [None]:
# import dependencies
import numpy as np
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline
from config import db_password
from sqlalchemy import create_engine

# stopwatch
import time

# ML
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb

## Connection to Database

In [None]:
# Option 1 (disabled): read the cleaned table directly from the Postgres DB.
# Uncomment the three statements below to use it.
# db_string =f'postgresql://postgres:{db_password}@127.0.0.1:5432/Project Insights on the Beach'
# engine = create_engine(db_string)
#vacay_df = pd.read_sql_query('''SELECT*FROM cleaned_up_cust_marketing_table;''',engine)

# Option 2 (active): read the data from a CSV file when not connected to the DB.
# Presumably this file is an export of the same table — verify before relying on it.
vacay_df = pd.read_csv("../cleaned_up_cust_marketing_table.csv")

# Show the loaded DataFrame as the cell output
vacay_df

## Preprocessing

#### Remove target and unrelated columns

In [None]:
# Build the feature frame from the loaded table, leaving out the target
# column ("prodtaken") and the columns that are not used for modelling.
# drop() returns a new DataFrame, so vacay_df itself is left unchanged.
features_df = vacay_df.drop(
    columns=[
        "prodtaken",
        "customerid",
        "designation",
        "numberofpersonvisiting",
        "numberofchildrenvisiting",
    ]
)
features_df

In [None]:
# Collect the names of all columns stored with the "object" dtype;
# these are treated as the categorical variables.
features_df_cat = [
    column for column, dtype in features_df.dtypes.items() if dtype == "object"
]

# Count the distinct values found in each categorical column
features_df[features_df_cat].nunique()

#### OneHotEncoder

In [None]:
# Create a OneHotEncoder instance that returns a dense array.
# The keyword requesting dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed later,
# so try the current spelling first and fall back for older releases.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# The encoded frame reuses the index of features_df so that a later
# index-based merge pairs every encoded row with its source row.
encode_df = pd.DataFrame(
    enc.fit_transform(features_df[features_df_cat]),
    index=features_df.index,
)

# Add the encoded variable names to the dataframe.
# get_feature_names() was replaced by get_feature_names_out(); use whichever
# the installed scikit-learn provides.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(features_df_cat)
else:
    encode_df.columns = enc.get_feature_names(features_df_cat)
encode_df.head()

In [None]:
# Merge one-hot encoded features to features_df (rows are matched on the index)
features_df = features_df.merge(encode_df, left_index=True, right_index=True)

# Remove original unencoded columns.
# The axis is spelled out through `columns=` because pandas 2.0 no longer
# accepts it as a second positional argument to drop().
features_df = features_df.drop(columns=features_df_cat)
features_df.head()

#### Scaling X, splitting into train and test sets, and resampling the training set with naive random oversampling

In [None]:
# Define the features set (a copy of the encoded feature frame).
X = features_df.copy()

# Define the target set: the "prodtaken" column of the loaded table.
y = vacay_df["prodtaken"]

# Check the balance of our target values (number of rows per class)
y.value_counts()

In [None]:
# Standardise the feature columns with StandardScaler: first learn the
# scaling statistics from X, then apply them to the same data.
# NOTE(review): the scaler is fitted on all rows, i.e. before the data is
# split into train and test sets.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Peek at the first scaled row
X_scaled[:1]

In [None]:
# Split the data into train and test sets (default split size, fixed seed).
# NOTE(review): no `stratify=` argument is passed, so the class proportions
# of the two sets are not forced to match — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=78)

# Resample the training data with the RandomOversampler (fixed seed).
# NOTE(review): the Balanced Random Forest cells in this notebook fit on
# X_train / y_train, not on the resampled arrays — verify where
# X_resampled / y_resampled are consumed.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
# Show the class counts of the training target, then of the test target,
# one Counter per line.
print(Counter(y_train), Counter(y_test), sep="\n")

## Random Forest Model

In [None]:
# Compare forest sizes using Stratified K-Fold Cross Validation (5 & 10-Fold).
# For every (fold count, estimator count) pair, the mean balanced accuracy
# over the validation folds is appended to accuracy_scores and printed.
n_folds = [5,10]

estimators = [100, 250, 500, 750, 1000]
accuracy_scores = []

for fold in n_folds:
    skf = StratifiedKFold(n_splits=fold)
    for e in estimators:

        # Instantiate random forest classifier and set results to 0 for each iteration.
        # n_jobs=-1 builds the trees on all available cores; this grid fits
        # many large forests and is otherwise slow.
        brclf = BalancedRandomForestClassifier(random_state=1, n_estimators=e, n_jobs=-1)
        results = 0

        # split the data in train and validation sets
        for train_index, test_index in skf.split(X_scaled, y):
            X_t = X_scaled[train_index]
            X_val = X_scaled[test_index]
            # split() yields row positions, so the target Series is indexed
            # by position (.iloc) rather than by label; plain y[...] is only
            # correct while y carries a default 0..n-1 index.
            y_t = y.iloc[train_index]
            y_val = y.iloc[test_index]

            # fit
            brclf = brclf.fit(X_t, y_t)

            # predict
            y_pred_k = brclf.predict(X_val)

            # accumulate the balanced accuracy of this validation fold
            results += balanced_accuracy_score(y_val, y_pred_k)

        # add mean of total result to accuracy score list
        accuracy_scores.append(results/fold)

        # Print results
        print(f'Acc Score with {fold} folds and {e} estimators: {accuracy_scores[-1]}')

In [None]:
# Build the final Balanced Random Forest with the chosen number of
# estimators (500) and train it on the training split in one step;
# fit() returns the fitted estimator itself.
brclf = BalancedRandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)

# Predict the test split and report the balanced accuracy
y_pred = brclf.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

In [None]:
# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Display the confusion matrix using ConfusionMatrixDisplay.
# The handle is called cm_display so that it does not shadow the display()
# helper that IPython provides in the notebook namespace.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=brclf.classes_)
cm_display.plot()

# Save Image
#plt.savefig("../Images/brf_cm.png")
plt.show()

# Print the classification report for the Balanced Random Forest predictions
print("Balanced Random Forest Classifier")
print(classification_report(y_test, y_pred))

In [None]:
# Pair every feature importance with its column name, rank the pairs from
# the largest importance to the smallest, and print one line per feature.
by_features = sorted(zip(brclf.feature_importances_, X.columns), reverse=True)
for importance, feature in by_features:
    print(f"{feature}: ({importance})")

In [None]:
# Horizontal bar chart of the 20 largest feature importances of the fitted
# random forest, indexed by feature name.
feat_importances = pd.Series(brclf.feature_importances_, index=X.columns)
(
    feat_importances
    .nlargest(20)
    .plot.barh(color=['blue', 'red', 'green', 'yellow', 'cyan'])
    # flip the y-axis so the largest importance is drawn at the top
    .invert_yaxis()
)