# Customer Churn Predictions: Imbalanced Data
# Scenario: Imagine you have trained and fine-tuned your model and used it to make predictions
=========================================

# Import Libraries and Load Data

## Customers that belong to class 0 are normal
## Customers that belong to class 1 require follow up

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import matplotlib.pyplot as plt

In [None]:
# FutureWarning messages are suppressed for the rest of the session
# so they do not clutter the cell output.
from warnings import simplefilter

simplefilter('ignore', FutureWarning)

In [None]:
# Load the saved predictions. Later cells read the columns
# 'Actual', 'Predicted' and 'Predicted_Proba_Class1' from this frame.
df = pd.read_csv(filepath_or_buffer='churn_preds.csv')

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

# Function to Plot Confusion Matrix

In [None]:
# Reference: 
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters:
        cm: square confusion matrix (array or nested sequence). Rows are
            drawn on the 'True label' axis and columns on the
            'Predicted label' axis.
        classes: tick labels, in the same order as the rows/columns of `cm`.
        normalize: when True, every row is divided by its row total so each
            cell shows a fraction of that true class. A row with no samples
            is shown as 0.00 rather than NaN.
        title: text placed above the plot.
        cmap: matplotlib colormap used for the cell shading.

    Returns:
        None. The caller is responsible for creating the figure beforehand
        and for showing/saving it afterwards.
    """
    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Only divide rows that contain samples; an empty class would
        # otherwise yield 0/0 = NaN cells and a RuntimeWarning.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than the midpoint get white text so the number stays legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

### Remember: Class 1 is the class of interest (customers that require follow up)


In [None]:
# Create confusion matrix
# labels=[1,0] puts the class of interest (1 = follow up) first, so the
# matrix is laid out as [[TP, FN],
#                        [FP, TN]]
cnf_matrix = confusion_matrix(df['Actual'], df['Predicted'],labels=[1,0])
(tp, fn), (fp, tn) = cnf_matrix

# Derive the rates from the matrix so the narrative below always matches
# the data that was actually loaded; guard against a class with no samples.
miss_rate = fn / (tp + fn) if (tp + fn) else 0.0
false_alarm_rate = fp / (fp + tn) if (fp + tn) else 0.0

# Plot confusion matrix (raw counts)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Followup','Normal'],
                      title='Confusion Matrix')
print(f'Among the positive customers, the model predicted {tp} of them correctly and mis-classified {fn} as normal')
print(f'Among the negative customers, the model predicted {tn} of them correctly and mis-classified {fp} as positive')
print('===============')
# Plot confusion matrix (each row as a fraction of its true class)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Followup','Normal'],
                      title='Confusion Matrix - Fraction', normalize=True)
print(f'When viewed as a percentage, the model missed {miss_rate:.0%} of positive customers and {false_alarm_rate:.0%} are false alarms')




## Plot Actual, Predicted and Probability values

In [None]:
# Split the rows by ground-truth class so each class is drawn as its own series.
normal = df.loc[df['Actual'] == 0]
followup = df.loc[df['Actual'] == 1]

plt.figure()

# Ground-truth labels (0 or 1) for every sample.
plt.scatter(df.index, df['Actual'], label='actual')

# Predicted class-1 probability, one series per true class.
for series_label, subset in (('normal', normal), ('followup', followup)):
    plt.scatter(subset.index, subset['Predicted_Proba_Class1'], label=series_label)

# Horizontal red line marking the 0.5 decision threshold.
x_span = [df.index.min(), df.index.max()]
plt.plot(x_span, [0.5, 0.5], color='r')

plt.xlim(left=0)
plt.title('Followup / Normal')
plt.xlabel('Sample')
plt.ylabel('Predicted Probability')
plt.legend(loc=0)
plt.show()

* The model has done a decent job with negative customers
* However, there are lots of positive customers with a probability < 0.5 and the model is classifying them as negative

#### One approach to improve Recall is to lower the threshold so we can identify more positive customers
* Let's say every customer that needs to be followed up has some cost associated with it (say ```$10```)
* We can ignore true negatives as no action is needed with them
* False negatives can be highly costly, as each one is a missed opportunity to address a concern or issue raised by the customer
* Let's say that cost is ```$50```


#### Finding the optimal cutoff

```txt
$50 * FN(C) + $0 * TN(C) + $10 * FP(C) + $10 * TP(C)
```

FN(C) means that the number of false negatives is a function of the cutoff, C, and similarly for TN, FP, and TP.  We need to find the cutoff, C, at which the value of the expression is smallest.

A straightforward way to do this is to simply run a simulation over a large number of possible cutoffs.  In the for loop below we test cutoffs from 0.10 to just under 0.90, in steps of 0.01.

In [None]:
# Outcome counts at the default 0.5 cutoff: rows are the actual class,
# columns are the class implied by the predicted probability.
predicted_at_half = np.where(df['Predicted_Proba_Class1'] > .5, 1, 0)
pd.crosstab(index=df['Actual'], columns=predicted_at_half)

In [None]:
# Dollar cost of each outcome, laid out like the crosstab built below:
# rows = actual class (0, 1), columns = predicted class (0, 1).
#   actual 0:  TN -> $0    FP -> $10
#   actual 1:  FN -> $50   TP -> $10
cost_matrix = np.array([[0, 10], [50, 10]])

cutoffs = np.arange(0.1, .9, 0.01)
costs = []
for c in cutoffs:
    predicted = np.where(df['Predicted_Proba_Class1'] > c, 1, 0)
    # Force a full 2x2 table. At an extreme cutoff every row can fall into a
    # single predicted class; the crosstab would then lose a column and no
    # longer line up with cost_matrix.
    outcome_counts = (
        pd.crosstab(index=df['Actual'], columns=predicted)
        .reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    )
    # Total cost at this cutoff = sum over outcomes of (count * unit cost).
    costs.append((cost_matrix * outcome_counts.to_numpy()).sum())

In [None]:
# Turn the per-cutoff costs into an array and locate the cheapest cutoff.
costs = np.array(costs)
cheapest = np.argmin(costs)

# Cost as a function of the decision cutoff.
plt.plot(cutoffs, costs)
plt.xlabel('Cutoff')
plt.ylabel('Cost')
plt.show()

print('Cost is minimized near a cutoff of:', cutoffs[cheapest], 'for a cost of:', costs[cheapest])

In [None]:
# Same scatter as before, but the threshold line sits at the cutoff
# with the lowest total cost instead of 0.5.
normal = df.loc[df['Actual'] == 0]
followup = df.loc[df['Actual'] == 1]
best_cutoff = cutoffs[np.argmin(costs)]

plt.figure()

# Ground-truth labels (0 or 1) for every sample.
plt.scatter(df.index, df['Actual'], label='actual')

# Predicted class-1 probability, one series per true class.
for series_label, subset in (('normal', normal), ('followup', followup)):
    plt.scatter(subset.index, subset['Predicted_Proba_Class1'], label=series_label)

# Thick red line at the chosen cutoff, drawn 50 samples past the last index.
x_span = [df.index.min(), df.index.max() + 50]
plt.plot(x_span, [best_cutoff, best_cutoff], color='r', linewidth=3)

plt.xlim(left=0)
plt.title('Followup / Normal')
plt.xlabel('Sample')
plt.ylabel('Predicted Probability')
plt.legend(loc=0)
plt.show()

In [None]:
# Re-label every customer with the lowest-cost cutoff, then rebuild the
# confusion matrix with class 1 (follow up) listed first.
best_cutoff = cutoffs[np.argmin(costs)]
preds_at_cutoff = np.where(df['Predicted_Proba_Class1'] > best_cutoff, 1, 0)
cnf_matrix = confusion_matrix(df['Actual'], preds_at_cutoff, labels=[1,0])

# Row-normalised plot, titled with the cutoff rounded to two decimals.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Followup','Normal'],
    title='Confusion Matrix at {0:0.2f}'.format(best_cutoff),
    normalize=True,
)

In [None]:
# Baseline for comparison: the same row-normalised confusion matrix,
# built with the default 0.5 cutoff and class 1 (follow up) listed first.
preds_at_half = np.where(df['Predicted_Proba_Class1'] > .5, 1, 0)
cnf_matrix = confusion_matrix(df['Actual'], preds_at_half, labels=[1,0])

plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Followup','Normal'],
    title='Confusion Matrix at 0.5',
    normalize=True,
)

### If you compare the two confusion matrices, we can now identify 79% of the positives compared to the 67% when cutoff was 0.5

### For a classifier, finding the optimal cutoff based on business cost is a great approach