In [4]:
import pandas as pd
import numpy as np
import seaborn as sb
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from IPython.display import HTML
import statsmodels.formula.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import os

# Plotly credentials. The PLOTLY_API_KEY environment variable takes precedence
# so the key does not have to live in the notebook.
# NOTE(review): the literal fallback is a credential committed to source; it
# should be revoked and the fallback dropped once PLOTLY_API_KEY is configured.
apiKey = os.environ.get('PLOTLY_API_KEY', 'lQg24SpLGpWSL3Yn35VS')
plotly.tools.set_credentials_file(username='amcdonne', api_key=apiKey)
# Uploaded charts are private and not world readable by default.
plotly.tools.set_config_file(world_readable=False, sharing='private')

# Let pandas print up to 200 rows/columns and 1000 characters per line.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 1000)

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated in recent IPython releases.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

#### Random Forest
Ensemble ML method that combines a group of weak classifiers and assigns the class by majority vote
1. Standardize the variables
2. Split the training and testing data for all the features and the classifier

```python
# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# repeatable while experimenting.
split = train_test_split(features, labels, test_size=0.25, random_state=42)
train_features, test_features, train_labels, test_labels = split
# Print every partition's shape to confirm features and labels line up.
for caption, part in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, part.shape)
```

4. Baseline model: Generate one to see if the model beats this or not

```python
# Baseline prediction: the value of the 'average' feature column for each row.
# (The 2-D slice assumes test_features is a NumPy array - TODO confirm.)
average_col = feature_list.index('average')
baseline_preds = test_features[:, average_col]
# Absolute gap between the baseline and the true labels, then its mean.
baseline_errors = abs(baseline_preds - test_labels)
mean_baseline_error = round(np.mean(baseline_errors), 2)
print('Average baseline error: ', mean_baseline_error)
```

5. Run the random forest for training

```python
# Regressor flavour of the random forest
from sklearn.ensemble import RandomForestRegressor
# 1000 trees, seeded so repeated runs build the same forest
forest_params = {'n_estimators': 1000, 'random_state': 42}
rf = RandomForestRegressor(**forest_params)
# Fit on the training partition (trailing ';' presumably hides the notebook echo)
rf.fit(train_features, train_labels);
```

```python
# Fit a random forest classifier. The seed is fixed, like the other seeded
# snippets in this file, so the reported accuracies are repeatable.
RF = RandomForestClassifier(random_state=42)
RF.fit(xTrainFor, yTrainFor)
# Predict once on the held-out data and reuse the result below.
predicted = RF.predict(xTestFor)
train_accuracy = accuracy_score(yTrainFor, RF.predict(xTrainFor))
accuracy = accuracy_score(yTestFor, predicted)

# A large gap between the two numbers is a sign of overfitting.
print("Train Accuracy :: ", train_accuracy)
print("Test Accuracy  :: ", accuracy)
```

6. Make predictions
```python
# Score the held-out features with the fitted forest
predictions = rf.predict(test_features)
# Per-row absolute deviation from the true labels
errors = abs(predictions - test_labels)
# Report the mean absolute error (MAE), rounded to two decimals
mean_abs_error = round(np.mean(errors), 2)
print('Mean Absolute Error:', mean_abs_error, 'degrees.')
```
7. Calculate Errors:
```python
# Mean absolute percentage error (MAPE).
# The percentage error is undefined where the true value is 0 (the division
# yields inf/nan), so only rows with a non-zero label contribute. The
# denominator is the absolute label so negative labels cannot flip the sign.
actual = np.asarray(test_labels, dtype=float)
abs_errors = np.asarray(errors, dtype=float)
nonzero = actual != 0
# NOTE: mape holds one entry per non-zero label, not one per test row.
mape = 100 * (abs_errors[nonzero] / np.abs(actual[nonzero]))
# "Accuracy" here is 100 minus the mean percentage error; nan if no row qualifies.
accuracy = (100 - np.mean(mape)) if mape.size else float('nan')
print('Accuracy:', round(accuracy, 2), '%.')
```
8. Generate the Confusion matrix:

```python
from sklearn.metrics import confusion_matrix
# Option A: seaborn heatmap of the classifier's confusion matrix.
cm = pd.DataFrame(confusion_matrix(yTestFor, predicted))
# fmt='d' prints the integer counts as-is; the default '.2g' format would show
# large counts in scientific notation (e.g. 1.2e+03).
sb.heatmap(cm, annot=True, fmt='d')

# Option B: labelled plot via a plot_confusion_matrix helper.
# NOTE(review): plot_confusion_matrix is not defined in this section -
# presumably a helper defined elsewhere; verify before running.
cm = confusion_matrix(test_labels, predictions)
plot_confusion_matrix(cm, classes = ['Poor Health', 'Good Health'],
                      title = 'Health Confusion Matrix')
```

9. Visualize a tree from the random forest:
```python
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest (index 5 is an arbitrary pick)
tree = rf.estimators_[5]
# Export the tree to a Graphviz dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
# graph_from_dot_file returns a list; unpacking requires exactly one graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')
```

10. Smaller Tree version:
```python
# Ten trees capped at depth 3 so a single tree stays small enough to read
rf_small = RandomForestRegressor(max_depth=3, n_estimators=10)
rf_small.fit(train_features, train_labels)
# Take one of the shallow trees
tree_small = rf_small.estimators_[5]
# Dump it to a dot file, then render that file as a png
export_graphviz(
    tree_small,
    out_file='small_tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)
[graph] = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');
```

11. Getting Variables of Importance:
```python
# Numerical importance of every feature, in feature_list order
importances = list(rf.feature_importances_)
# Pair each feature name with its importance rounded to two decimals
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Most important feature first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print with a plain loop: a list comprehension used only for its print()
# side effect builds a throwaway list of None values.
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))
```

#### PCA Analysis
PCA projects the entire dataset onto a different feature (sub)space, and LDA tries to determine a suitable feature (sub)space in order to distinguish between patterns that belong to different classes.
1. Standardize the data
```python
# x holds the feature values only; the target y is kept separately
scaler = StandardScaler()
# Fit the scaler on x and replace x with the standardized values
x = scaler.fit_transform(x)
```
2.

#### Seaborn Data Visualization
1. Pairwise scatter plots: essentially a matrix of plots laid out like a covariance matrix, with distribution densities on the diagonal
```python
# Columns to compare against each other
cols = ['density', 'residual sugar', 'total sulfur dioxide', 'fixed acidity']
# seaborn is imported as `sb` in this file; the previous `sns` alias was never
# defined and raised NameError.
# NOTE(review): newer seaborn releases rename `size` to `height` and `shade`
# to `fill` - adjust if the installed version rejects these keywords.
pp = sb.pairplot(wines[cols], size=1.8, aspect=1.8,
                 plot_kws=dict(edgecolor="k", linewidth=0.5),
                 diag_kind="kde", diag_kws=dict(shade=True))
# Leave room at the top for the title and add horizontal spacing between panels
fig = pp.fig
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Wine Attributes Pairwise Plots', fontsize=14)
```


#### Plotly Charts
1. Large Scatter Plot Example:
```python
# Rows for the 'Super Prime 781-850+' bucket, ordered by interest rate
super_prime = dataDict['defaulted']['creditScore']['Super Prime 781-850+']
super_prime_by_apr = super_prime.sort_values(by=['originalInterestRatePercentage'])
# WebGL scatter trace of the sorted rates.
# To colour by a variable instead of a fixed colour, pass a series as `color`
# (e.g. defaulted['obligorCreditScore']) together with showscale=True,
# colorscale='Blues' and reversescale=True.
trace7 = go.Scattergl(
    x=super_prime_by_apr['originalInterestRatePercentage'],
    name='Super Prime 781-850+',
    mode='markers',
    opacity=0.7,
    marker=dict(size=6, color='#2ca02c'),
)
```

2. Multiple plots in one: Change the layout and add charts where needed:
```python
# One title per cell of the 3x2 grid, in row-major order.
# ('Prime 661-780' previously read 'Prime 661 780', missing the hyphen used by
# every other range label in this file.)
subplot_titles = (
    'Super Prime 781-850+', 'Prime 661-780',
    'Nonprime 601-660', 'Subprime 501-600',
    'Deep Subprime 0-500', 'No Score',
)
# The original call was missing its closing parenthesis (SyntaxError).
fig = tools.make_subplots(rows=3, cols=2, subplot_titles=subplot_titles)

# Place the traces row by row: (1,1), (1,2), (2,1), (2,2), (3,1), (3,2).
grid_traces = [trace7, trace8, trace9, trace10, trace11, trace12]
for position, trace in enumerate(grid_traces):
    row, col = divmod(position, 2)
    fig.append_trace(trace, row + 1, col + 1)

# Show every subplot's x axis as a whole-number percentage.
for axis_key in ('xaxis', 'xaxis2', 'xaxis3', 'xaxis4', 'xaxis5', 'xaxis6'):
    fig['layout'][axis_key].update(dict(tickformat='.0%'))
fig['layout'].update(showlegend=False, height=800, width=1300, title='Paid Down Loans: APR vs Credit Score')
```

3. Parallel Coordinates: Good for visualizing large multi-dimensional data:

```python
# APR axis: fixed 0-30% range, ticks shown as whole percentages
apr_dimension = dict(
    range=[.0, .3],
    label='APR',
    values=defaulted['originalInterestRatePercentage'],
    tickformat=".0%",
)
# Credit score axis: category codes 1-6 relabelled with their bucket names
credit_dimension = dict(
    tickvals=[1, 2, 3, 4, 5, 6],
    ticktext=['No Score','Deep Subprime 0-500','Subprime 501-600','Nonprime 601-660','Prime 661-780', 'Super Prime 781-850+'],
    label='Credit Score',
    values=defaulted['CreditScoreCat'],
)
# One semi-transparent red line per row across both axes
trace1 = go.Parcoords(
    line=dict(color='#d62728'),
    opacity=.5,
    dimensions=[apr_dimension, credit_dimension],
)
layout = go.Layout(title='Defaulted Loans', hovermode='closest')
fig = go.Figure(data=[trace1], layout=layout)
```

4. Overlay Histogram

```python
# Credit score columns of the two loan groups
paid_scores = paidDown['obligorCreditScore']
defaulted_scores = defaulted['obligorCreditScore']

# Paid-down loans: scores of 300 and above, in explicit 2-point bins
trace1 = go.Histogram(
    name='Paid Down',
    x=paid_scores[paid_scores >= 300],
    opacity=0.75,
    autobinx=False,
    xbins=dict(start=paidDownMin, end=paidDownMax, size=2),
)
# Defaulted loans: same filter and bin width, with their own start/end
trace2 = go.Histogram(
    name='Defaulted',
    x=defaulted_scores[defaulted_scores >= 300],
    opacity=0.75,
    autobinx=False,
    xbins=dict(start=defaultedMin, end=defaultedMax, size=2),
)
# barmode='overlay' draws both histograms on top of each other
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=[trace1, trace2], layout=layout)
py.iplot(fig, filename='PaidDown_Defaulted_Overlay_Histogram', sharing='private')
```