## Assignment 6
### Author: Emily McAfee
### Targeted Marketing Campaign

In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import category_encoders as ce

### 1. Read in, parse, and pre-process the data

In [None]:
# Load the bank CSV from the course dataset URL into a DataFrame
filename = "https://library.startlearninglabs.uw.edu/DATASCI420/2019/Datasets/Bank%20Data.csv"
bank = pd.read_csv(filepath_or_buffer = filename)

In [None]:
# Preview the first rows using the notebook's rich table rendering (instead of a
# plain-text print), then show each column's dtype as the cell output
from IPython.display import display

display(bank.head())
bank.dtypes

### 2. Perform a brief exploratory analysis

In [None]:
# Summary statistics of the data.
# describe is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the statistics table.
bank.describe()

In [None]:
# Print the frame's structural summary (columns, non-null counts, dtypes)
bank.info()

In [None]:
# Count the missing values in every column
bank.isna().sum()

In [None]:
# Distribution of the second column (male/female): histogram, then the exact counts
second_column = bank.iloc[:, 1]
plt.hist(second_column)
plt.show()
second_column.value_counts()

In [None]:
# Show the dtype of every column in the raw frame
bank.dtypes

In [None]:
# How many rows fall in each region
bank.loc[:, 'region'].value_counts()

In [None]:
# How many rows there are for each value of the children column
bank.loc[:, 'children'].value_counts()

In [None]:
# Income vs. age, colored by region.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally (older releases emit a FutureWarning for the positional form).
seaborn.scatterplot(x = 'age', y = 'income', hue = 'region', alpha = .7, data = bank)

# Another way to do it
#myplot = seaborn.FacetGrid(data = bank, hue = 'region', aspect = 2)
#myplot.map(plt.scatter, 'age', 'income').add_legend()

In [None]:
# Plot age/income as a factor of sex.
# x and y are given by keyword because recent seaborn releases reject positional
# data arguments for scatterplot.
seaborn.scatterplot(x = 'age', y = 'income', hue = 'sex', alpha = .7, data = bank)

### 3. Compare the performance between a decision tree and a random forest approach by assessing the accuracy of each model

#### Decision Tree model

In [None]:
# Check data: preview the first rows rather than rendering the whole frame
bank.head()

In [None]:
# One-hot encode the categorical columns of the frame (numeric columns pass through
# unchanged). drop_first = True drops the first level of each encoded column.
bank2 = pd.get_dummies(bank, drop_first = True)
# Preview only the first rows instead of rendering the whole encoded frame
bank2.head()

In [None]:
# Check data: dtypes of the encoded frame, in column order
bank2.dtypes

In [None]:
# Features are every column except the last; the target is the last column
x, y = bank2.iloc[:, :-1], bank2.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 1,
)




In [None]:
# Fit a Gini-criterion decision tree on the training split and predict the test split
from sklearn.tree import DecisionTreeClassifier

# gini is stated explicitly (it is also the library default). random_state is fixed
# because the tree builder shuffles candidate features while searching for splits,
# so an unseeded model (and the accuracy reported for it) can change between runs.
model_gini = DecisionTreeClassifier(
    criterion = 'gini',
    min_samples_leaf = 3,
    random_state = 1,
).fit(x_train, y_train)

y_gini_pred = model_gini.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows the decision tree labels correctly, as a percentage
print('Decision Tree Model Accuracy: ', 100 * accuracy_score(y_test, y_gini_pred))

#### Random forest model

In [None]:
# Train/test split for the random forest: same 80/20 ratio and seed as the
# decision tree's split
x1_traindata, x2_testdata, y1_traindata, y2_testdata = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 1,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters for the forest
nTrees, max_depth, min_node_size, verbose = 13, 5, 5, 0

# Build the forest, fit it on the training split, then print the per-feature importances
clf = RandomForestClassifier(
    n_estimators = nTrees,
    max_depth = max_depth,
    min_samples_leaf = min_node_size,
    random_state = 1,
    verbose = verbose,
)
clf.fit(x1_traindata, y1_traindata)
print(clf.feature_importances_)

In [None]:
# Label the held-out rows with the fitted forest
y_test_hat = clf.predict(x2_testdata)

# Share of test rows the forest labels correctly, as a percentage
print('Random Forest Model Accuracy: ', 100 * accuracy_score(y2_testdata, y_test_hat))

# Another way to do it
# Accuracy = [1 for i in range(len(y_test_hat)) if y2_testdata.iloc[i] == y_test_hat[i]]
# Accuracy = round(float(np.sum(Accuracy))/len(y_test_hat)*100,2)
# print("Accuracy on Testing Data = %.2f%%"%Accuracy)

In [None]:
# Print both test accuracies one after the other for a side-by-side comparison
for label, truth, predicted in (
    ('Decision Tree Model Accuracy: ', y_test, y_gini_pred),
    ('Random Forest Model Accuracy: ', y2_testdata, y_test_hat),
):
    print(label, accuracy_score(truth, predicted)*100)

### 4. Graph the structure of your decision tree(s)

#### Decision tree

In [None]:
# Render the fitted decision tree with Graphviz and display it inline
from subprocess import check_call

from IPython.display import Image
from sklearn.tree import export_graphviz

estimator = model_gini
feature_names = list(bank2.columns[:-1])

# class_names needs one name per class, in ascending class order. The bare column-name
# string used before is not a list of names: depending on the scikit-learn version it is
# either rejected or indexed character by character, mislabelling the classes.
target_name = bank2.columns[-1]
class_names = ['{}={}'.format(target_name, label) for label in model_gini.classes_]

# Export as dot file
export_graphviz(estimator, out_file='tree.dot',
                feature_names = feature_names,
                class_names = class_names,
                rounded = True, proportion = False,
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz). check_call raises when dot
# exits with an error, so a stale tree.png is not displayed by mistake.
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
Image(filename = 'tree.png')

#### Random Forest

In [None]:
# Render one member tree of the random forest with Graphviz and display it inline
from subprocess import check_call

from IPython.display import Image
from sklearn.tree import export_graphviz

estimator = clf.estimators_[5]
feature_names = list(bank2.columns[:-1])

# class_names needs one name per class, in ascending class order. The bare column-name
# string used before is not a list of names: depending on the scikit-learn version it is
# either rejected or indexed character by character, mislabelling the classes.
# The names are built from the forest's classes_ so they carry the target's own labels.
target_name = bank2.columns[-1]
class_names = ['{}={}'.format(target_name, label) for label in clf.classes_]

# Export as dot file (separate file names so the decision tree's tree.dot / tree.png
# written earlier in the notebook are not overwritten)
export_graphviz(estimator, out_file='forest_tree.dot',
                feature_names = feature_names,
                class_names = class_names,
                rounded = True, proportion = False,
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz). check_call raises when dot
# exits with an error, so a stale image is not displayed by mistake.
check_call(['dot', '-Tpng', 'forest_tree.dot', '-o', 'forest_tree.png', '-Gdpi=600'])

# Display in jupyter notebook
Image(filename = 'forest_tree.png')

### 5. Graph the confusion matrix of your random forest model

In [None]:
# Make confusion matrix out of df
from sklearn.metrics import confusion_matrix

# confusion_matrix orders rows (true) and columns (predicted) by ascending label, so the
# first row/column belongs to the 0/False class and the second to the 1/True class.
# NOTE(review): assumes the target dummy column uses 1/True for "customer"
# (e.g. a *_YES column) — confirm against bank2.dtypes.
pd.DataFrame(
    confusion_matrix(y2_testdata, y_test_hat),
    columns = ['Predicted No Customer', 'Predicted Customer'],
    index = ['True No Customer', 'True Customer']
)

In [None]:
# Just for completeness look at the decision tree model.
# confusion_matrix orders rows (true) and columns (predicted) by ascending label, so the
# first row/column belongs to the 0/False class and the second to the 1/True class.
# NOTE(review): assumes the target dummy column uses 1/True for "customer"
# (e.g. a *_YES column) — confirm against bank2.dtypes.
pd.DataFrame(
    confusion_matrix(y_test, y_gini_pred),
    columns = ['Predicted No Customer', 'Predicted Customer'],
    index = ['True No Customer', 'True Customer']
)

#### Summary
The random forest model (87.5% test accuracy) performed better than the decision tree classifier (85% test accuracy). The two models selected slightly different main features (see the visualizations of both trees above).