In [None]:
# If you're running this on Colab, you can uncomment the command below to
# install the pmlb library.
# !pip install pmlb

In [None]:
import altair as alt
import numpy as np
import pandas as pd
import pmlb
from itertools import combinations

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# If you're running this code locally, then you can uncomment this to automatically
# save the chart data in files, rather than including the data in the spec. 

# !mkdir -p data
# alt.data_transformers.enable('json', prefix='data/altair-data')

## Data Preparation and Exploration

For this lab, we'll be using a dataset about a telephone service provider's customers. Each instance is a customer. The target is whether or not the customer churns (i.e., switches providers). We load it from [Penn Machine Learning Benchmarks](https://epistasislab.github.io/pmlb/).

In [None]:
# Fetch the 'churn' dataset from PMLB; the class label lives in the 'target' column
# (it is split off from the features further down).
df = pmlb.fetch_data('churn')

In [None]:
# (rows, columns) of the full dataset, before any train/test split.
df.shape

In [None]:
# Preview the first few rows to see the available columns and their values.
df.head()

In preparation for modeling this dataset, we split the dataset into a train and test set and separate the instances from the labels.

In [None]:
# Hold out 25% of the customers for testing. The split is seeded so that the charts,
# the fitted model, and the confusion matrices below are the same on every
# "Restart & Run All".
df_train, df_test = train_test_split(df, test_size=0.25, random_state=0)

In [None]:
# Separate the feature columns (X_*) from the label column (y_*) for both splits.
TARGET = 'target'

X_train, X_test = (frame.drop(columns=[TARGET]) for frame in (df_train, df_test))
y_train, y_test = (frame[TARGET].values for frame in (df_train, df_test))

Before we train a model, let's explore the training dataset first.

**Exercise 1:** Create a visualization that compares the number of customers who churned vs. did not churn.

In [None]:
# Class balance: one bar per target value, bar height = number of training customers.
churn_counts = alt.Chart(df_train).mark_bar().encode(
    x=alt.X('target:O'),
    y=alt.Y('count()'),
)

churn_counts

**Exercise 2:** Create a histogram for each feature. Are there any improvements that could make the histograms more useful?

In [None]:
# Names of all feature columns (the target has already been removed from X_train).
features = X_train.columns.tolist()

In [None]:
# Build one binned histogram per feature, then lay them out side by side
# on a shared y scale so the counts are directly comparable.
histograms = [
    alt.Chart(df_train).mark_bar().encode(
        x=alt.X(column_name, bin=True),
        y=alt.Y('count()'),
    )
    for column_name in features
]

alt.hconcat(*histograms).resolve_scale(y='shared')

In [None]:
# Same histograms expressed with `repeat` instead of a Python loop,
# with the bars additionally colored by the churn label.
histogram_template = alt.Chart(df_train).mark_bar().encode(
    x=alt.X(alt.repeat('column'), type='quantitative', bin=True),
    y='count()',
    color=alt.Color('target:N'),
)

histogram_template.repeat(column=features)

**Exercise 3:** Create a [scatterplot matrix](https://observablehq.com/@d3/splom) for the subset of features selected below. What insights can you make from these plots?

In [None]:
# All "total day ..." features plus two hand-picked count features.
day_features = [name for name in features if 'total day' in name]
extra_features = ['number customer service calls', 'number vmail messages']

features_subset = day_features + extra_features
features_subset

In [None]:
# Scatterplot matrix: every pair of features in `features_subset`,
# one small panel per (row, column) combination, colored by churn label.
panel = alt.Chart(df_train).mark_point().encode(
    x=alt.X(alt.repeat('column'), type='quantitative'),
    y=alt.Y(alt.repeat('row'), type='quantitative'),
    color=alt.Color('target:N'),
).properties(width=120, height=120)

panel.repeat(row=features_subset, column=features_subset)

**Exercise 4**: Here we have one of the scatterplots from above. It is suffering from overplotting, making it hard to reason about how these two features impact the target. Try to improve it.

In [None]:
# Baseline scatterplot for the exercise; it is the overplotted starting point
# that the following cells try to improve on.
alt.Chart(df_train).mark_point().encode(
    x=alt.X('total day calls'),
    y=alt.Y('total day minutes'),
    color=alt.Color('target:N'),
)

In [None]:
# Improvement 1: give each class its own panel so the points of one class
# no longer hide the points of the other.
alt.Chart(df_train).mark_point().encode(
    x=alt.X('total day calls'),
    y=alt.Y('total day minutes'),
    color=alt.Color('target:N'),
    column=alt.Column('target:N'),
)

In [None]:
# Improvement 2: bin both axes and color each cell by the mean of the 0/1 target,
# i.e. the fraction of customers in that cell who churned.
coarse_bins = alt.Bin(maxbins=20)

alt.Chart(df_train).mark_rect().encode(
    x=alt.X('total day calls', bin=coarse_bins),
    y=alt.Y('total day minutes', bin=coarse_bins),
    color=alt.Color('mean(target)'),
)

**Exercise 5:** "total day minutes" and "total day charge" are perfectly correlated. Is the same true for "total eve minutes" and "total eve charge" and for "total night minutes" and "total night charge"?

In [None]:
# Evening charge against evening minutes, to check visually whether the two
# features are perfectly correlated.
alt.Chart(df_train).mark_point().encode(
    x='total eve charge',
    y='total eve minutes',
    color=alt.Color('target:N'),
)

In [None]:
# Night charge against night minutes, to check visually whether the two
# features are perfectly correlated.
alt.Chart(df_train).mark_point().encode(
    x='total night charge',
    y='total night minutes',
    color=alt.Color('target:N'),
)

## Modelling

To start, we'll remove the charge features from the dataset, since they are redundant.

In [None]:
# The charge columns duplicate the information in the matching minutes columns,
# so they are removed from both feature matrices.
charge_columns = ['total day charge', 'total eve charge', 'total night charge']

# Re-bind instead of mutating in place, and tolerate columns that are already gone,
# so that re-running this cell is harmless rather than raising a KeyError.
X_train = X_train.drop(columns=charge_columns, errors='ignore')
X_test = X_test.drop(columns=charge_columns, errors='ignore')

We will use [scikit-learn](https://scikit-learn.org/stable/) to train a random forest model on this dataset. Below is an example of performing a grid search and cross validation to find reasonable hyperparameters for our model.

In [None]:
# Hyperparameter values to try; every combination is evaluated by the grid search.
# Keys with a single value are held fixed.
param_grid = {
    'n_estimators': [100],
    'criterion': ['entropy'],
    'bootstrap': [True],
    'max_features': ['sqrt', 1.0],
    'max_depth': [6, 12],
    'min_samples_split': [2, 8],
    'class_weight': ['balanced', None]
}

# The forest is seeded so that the selected hyperparameters, the reported score, and
# the downstream predictions are reproducible across kernel restarts.
# Candidates are ranked by F1; n_jobs=-1 lets scikit-learn run the fits in parallel.
cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    scoring='f1',
    n_jobs=-1,
)

cv.fit(X_train, y_train)

We can see what the best parameters are and what the corresponding F1 score for those parameters is.

In [None]:
# Hyperparameter combination that scored best in the grid search.
cv.best_params_

In [None]:
# Score of the best combination; this is an F1 score because the grid search
# was configured with scoring='f1'.
cv.best_score_

As a basic sanity check, we can use the model's built-in feature importance score to check if the most important features match what we expect from our exploration and intuition.

In [None]:
# Estimator selected by the grid search; used for all predictions below.
model = cv.best_estimator_

In [None]:
# Pair each importance score with its feature name and list the pairs
# from most to least important.
importance_by_feature = zip(model.feature_importances_, X_train.columns)
sorted(importance_by_feature, reverse=True)

Next, we use our model to generate predictions on our test data.

In [None]:
# Predicted class label for every row of the held-out test features.
predictions = model.predict(X_test)

In [None]:
# Attach the predictions next to the true labels. `assign` builds a new frame instead of
# writing into the frame returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_test = df_test.assign(prediction=predictions)

## Confusion Matrices

sklearn has functions that let us [calculate](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) and [plot](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html#sklearn.metrics.ConfusionMatrixDisplay) a confusion matrix.

In [None]:
# Reference confusion matrix computed by scikit-learn from the true test labels
# and the model's predictions, rendered with scikit-learn's built-in display.
# NOTE: the name `cm` is re-bound to a pandas DataFrame in the cells that follow.
cm = confusion_matrix(y_test, predictions)
disp = ConfusionMatrixDisplay(cm)
disp.plot()

Rather than relying on those, let's compute and plot the confusion matrix on our own.

**Exercise 6:** Calculate the confusion matrix. We want a pandas dataframe that will look similar to this, except not hard-coded:

In [None]:
# Hard-coded illustration of the desired layout: one row per
# (target, prediction) combination together with its number of instances.
example_rows = [
    (0, 0, 1080),
    (0, 1, 12),
    (1, 0, 45),
    (1, 1, 113),
]
cm = pd.DataFrame(example_rows, columns=['target', 'prediction', 'size'])

cm

In [None]:
# Count the test instances for each (actual, predicted) pair; the counts end up
# in a 'size' column next to the 'target' and 'prediction' columns.
pair_counts = df_test.groupby(['target', 'prediction']).size()
cm = pair_counts.reset_index(name='size')

From this data, we can create a confusion matrix that looks similar to the standard one from [this paper by Hong Shen et al.](https://www.andrew.cmu.edu/user/hongs/files/CM_CSCW2020.pdf).

In [None]:
# replace the values 0 and 1 with negative and positive
cm_pn = cm.replace({0: 'negative', 1: 'positive'})

alt.Chart(cm_pn).mark_text().encode(
    x=alt.X('prediction:N', title='predicted class'),
    y=alt.Y('target:N', title='actual class'),
    text='size'
).properties(
    width=300,
    height=300
)

**Exercise 7**: Create a confusion matrix that looks similar to the contextualized confusion matrix from the above paper.

In [None]:
# Use domain terms for the classes. The mapping is restricted to the two label columns:
# a frame-wide replace would also rewrite any count in 'size' that equals 0 or 1.
class_names = {0: 'retained', 1: 'churned'}
cm_rc = cm.replace({'target': class_names, 'prediction': class_names})

# Shared position encodings for both layers.
base = alt.Chart(cm_rc).encode(
    x=alt.X('prediction:N', title='predicted class', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('target:N', title='actual class'),
).properties(
    width=300,
    height=300
)

# Background cells: hue shows the predicted class, opacity shows the number of instances.
rect = base.mark_rect().encode(
    color=alt.Color('prediction', legend=None),
    opacity=alt.Opacity('size', legend=None),
)

# Instance count printed on top of each cell.
text = base.mark_text().encode(
    text='size',
)

rect + text

**Exercise 8**: In class, we noted how this visualization could possibly be improved by using the size of the squares to encode the number of instances. Implement that below.

Tips:
- You'll want to change your mark from `mark_rect` to `mark_square`, so that you can more easily set the size.
- You'll want to change the width and height to be based on the step size of the scale. See the intro to Altair notebook for an example of how to do that.

In [None]:
# Use domain terms for the classes. The mapping is restricted to the two label columns:
# a frame-wide replace would also rewrite any count in 'size' that equals 0 or 1.
class_names = {0: 'retained', 1: 'churned'}
cm_rc = cm.replace({'target': class_names, 'prediction': class_names})

# Shared position encodings; width/height are given as a step so that every
# class gets a fixed 100px band on each axis.
base = alt.Chart(cm_rc).encode(
    x=alt.X('prediction:N', title='predicted class'),
    y=alt.Y('target:N', title='actual class'),
).properties(
    width={'step': 100},
    height={'step': 100}
)

# Squares whose area (and opacity) encode the number of instances in each cell.
rect = base.mark_square().encode(
    color=alt.Color('prediction', legend=None),
    opacity=alt.Opacity('size', legend=None),
    size='size:Q'
)

# Instance count printed on top of each square.
text = base.mark_text().encode(
    text='size',
)

rect + text

#### Faceting Layered Charts

Previously when we covered faceting, we used the "row", "column", or "facet" encodings. In order to facet layered charts, we need to do things a bit differently. We need to do the faceting after the layering. Let's make a faceted lollipop chart that compares the distribution of number of customer service calls for customers that did and did not churn.

Faceting before layering does not work:

In [None]:
# Base chart that already carries a facet ('column') encoding.
base = alt.Chart(df_train).encode(
    x='number customer service calls:O',
    y='count()',
    column='target'
)

# Layering charts that are already faceted is the failure being demonstrated here.
# Altair is expected to reject it with a ValueError; the error is caught and printed so
# the message is visible without aborting a "Restart & Run All".
try:
    layered = base.mark_circle() + base.mark_rule()
except ValueError as error:
    layered = None
    print(f'Layering faceted charts failed: {error}')

# Displays nothing when the layering was rejected above.
layered

Instead, we layer the charts first and then call `facet()` on the layered chart:

In [None]:
# Position encodings shared by both lollipop layers; no facet encoding here.
base = alt.Chart(df_train).encode(
    x=alt.X('number customer service calls:O'),
    y=alt.Y('count()'),
)

# Layer the stick (rule) and the head (circle) first, then facet the layered chart.
lollipop = alt.layer(base.mark_rule(), base.mark_circle())
lollipop.facet(column='target')

**Exercise 9:** Implement the bar chart confusion matrix design.

In [None]:
# Use domain terms for the classes. The mapping is restricted to the two label columns:
# a frame-wide replace would also rewrite any count in 'size' that equals 0 or 1.
class_names = {0: 'retained', 1: 'churned'}
cm_rc = cm.replace({'target': class_names, 'prediction': class_names})

# Shared x encoding: one bar position per predicted class.
base = alt.Chart(cm_rc).encode(
    x=alt.X('prediction:N', title='predicted class'),
).properties(
    width=100,
    height=100
)

# Bars whose height (and opacity) encode the number of instances; the y axis is hidden
# because the exact count is printed by the text layer.
rect = base.mark_bar().encode(
    color=alt.Color('prediction', legend=None),
    y=alt.Y('size', axis=None),
    opacity=alt.Opacity('size', legend=None),
)

# Count label sitting just above the top of each bar.
text = base.mark_text(baseline='bottom').encode(
    y='size',
    text='size',
)

# Facet after layering: one row of bars per actual class.
(rect + text).facet(
    row=alt.Row('target:N', title='actual class')
)