 
## Week 3
### Model Risk

#### Import libraries

We use materials from **Week 2 - Benefit curve (credit)**

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix


from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score as f1

import matplotlib.colors


# Apply the plain white seaborn theme. This must run before the rcParams
# update below, because sns.set() itself rewrites matplotlib's rcParams.
sns.set(style="white")

# Seed NumPy's global random generator.
np.random.seed(2020)

#settings for plots: font sizes shared by every figure in the notebook
plt.rcParams.update({'font.size': 16,
                     'xtick.labelsize' : 14, 
                     'ytick.labelsize' : 14,
                     'axes.labelsize' : 16,
                     'axes.titlesize' : 20})

In [None]:
import warnings
# Silence every warning category for the rest of the session. This keeps the
# cell output clean, but it also hides any warning raised by the libraries
# used below.
warnings.filterwarnings("ignore")

Define some functions that will help us to plot graphs. 

In [None]:
def roc_auc_plot(model, X_train, y_train,
                X_oos, y_oos, X_oot, y_oot, names=('train', 'oos'),
                title='Model old'):
    """Draw ROC curves of ``model`` on the selected samples and print each Gini.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the score.
    X_train, y_train
        Features and labels of the train sample.
    X_oos, y_oos
        Features and labels of the out-of-sample data.
    X_oot, y_oot
        Features and labels of the out-of-time data.
    names
        Samples to draw, in plotting order. ``'train'`` and ``'oos'`` select
        the matching sample; any other value selects the out-of-time sample.
    title
        Title of the figure. Defaults to ``'Model old'``.

    Returns
    -------
    None
        The function only draws on a new matplotlib figure and prints.
    """
    plt.figure(figsize=(10, 8))
    colors = ['olivedrab', 'deepskyblue', 'salmon']

    for it, name in enumerate(names):

        # choose the data
        if name == 'train':
            X, y_ = X_train, y_train
        elif name == 'oos':
            X, y_ = X_oos, y_oos
        else:
            X, y_ = X_oot, y_oot

        y_hat = model.predict_proba(X)[:, 1]
        fpr, tpr, _ = roc_curve(y_, y_hat)
        # Gini coefficient expressed through the ROC AUC.
        gini = 2 * roc_auc_score(y_, y_hat) - 1

        # Cycle through the palette so more than three names cannot overflow it.
        plt.plot(fpr, tpr, label=name, color=colors[it % len(colors)], linewidth=2)
        print('Model', name, 'gini: ', np.round(gini, 2))

    # Diagonal reference line of a random classifier.
    plt.plot([0, 1], [0, 1], '--', color='grey', label='Random model')

    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(title)
    plt.grid()

    # Build the legend once, after every line (including the diagonal) exists.
    _ = plt.legend(loc=0, prop={'size': 16})

In [None]:
def benefit_plot(model, X_train, y_train,
                X_oos, y_oos, X_oot, y_oot, names=('train', 'oos'),
                title='Benefit curve for old model'):
    """Draw benefit-vs-acceptance-rate curves of ``model`` on the selected samples.

    For every cut-off in the module-level grid ``thr`` the clients whose
    predicted probability exceeds the cut-off are treated as rejected, and the
    benefit of the accepted ones is ``TN * e_fp - FN * e_fn``. ``thr``,
    ``e_fp`` and ``e_fn`` are read from the enclosing (notebook) namespace and
    must be defined before the call. The Gini and the maximum benefit of each
    sample are printed, and the maximum is marked with a star.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the score.
    X_train, y_train
        Features and labels of the train sample.
    X_oos, y_oos
        Features and labels of the out-of-sample data.
    X_oot, y_oot
        Features and labels of the out-of-time data.
    names
        Samples to draw, in plotting order. ``'train'`` and ``'oos'`` select
        the matching sample; any other value selects the out-of-time sample.
    title
        Title of the figure. Defaults to ``'Benefit curve for old model'``.

    Returns
    -------
    None
        The function only draws on a new matplotlib figure and prints.
    """
    plt.figure(figsize=(10, 8))
    colors = ['olivedrab', 'deepskyblue', 'salmon']

    for it, name in enumerate(names):

        # choose the data
        if name == 'train':
            X, y_ = X_train, y_train
        elif name == 'oos':
            X, y_ = X_oos, y_oos
        else:
            X, y_ = X_oot, y_oot

        y_hat = model.predict_proba(X)[:, 1]
        n_clients = len(y_hat)

        benefit = []
        c_acceptance_rate = []

        for t in thr:
            # 1.0 = rejected (score above the cut-off), 0.0 = accepted
            rejected = (y_hat > t) * 1.

            # Rows are true classes, columns are predicted classes.
            # NOTE(review): reading [0][0] as TN and [1][0] as FN assumes the
            # target is coded 0 = non-default, 1 = default - confirm.
            CM = confusion_matrix(y_, rejected)
            TN = CM[0][0]
            FN = CM[1][0]

            # acceptance rate = share of clients that are not rejected
            c_acceptance_rate.append((n_clients - np.sum(rejected)) / n_clients)

            # financial effect of the accepted clients
            benefit.append(TN * e_fp - FN * e_fn)

        gini = 2 * roc_auc_score(y_, y_hat) - 1
        print('Model', name, 'gini: ', np.round(gini, 2))
        print('Model', name, 'Max Benefit: ', np.max(benefit))

        best = np.argmax(benefit)
        # Cycle through the palette so more than three names cannot overflow it.
        color = colors[it % len(colors)]
        plt.plot(c_acceptance_rate, benefit, label=name, color=color, linewidth=2)
        # Star marks the acceptance rate with the highest benefit.
        plt.plot(c_acceptance_rate[best], benefit[best], color=color,
                 marker='*', markersize=10)

    plt.xlabel('Acceptance rate')
    plt.ylabel('Benefit')
    plt.title(title)
    plt.grid()

    # Build the legend once, after all curves have been drawn.
    _ = plt.legend(loc=0, prop={'size': 16})
    


__Consider a binary classification model $X \to Prob$, e.g. credit scoring__:

#### Load Data

Load **train**, **out-of-sample** and **out-of-time** samples

In [None]:
# Read the three samples used throughout the notebook: train, out-of-sample
# (oos) and out-of-time (oot). The paths are relative to the notebook location.
df_train = pd.read_csv('../data/benefit-curve/df_train.csv')
df_oos = pd.read_csv('../data/benefit-curve/df_oos.csv')
df_oot = pd.read_csv('../data/benefit-curve/df_oot.csv')
## based on kaggle https://www.kaggle.com/c/GiveMeSomeCredit
# NOTE(review): the column description in this notebook mentions an "LC loan";
# confirm that the link above is the actual source of these files.

In [None]:
# Preview the first rows of the train sample.
df_train.head()

__Column description:__

- `issue_d` - The month which the loan was funded
- `addr_state` - The state provided by the borrower in the loan application
- `emp_title` - The job title supplied by the Borrower when applying for the loan.
- `installment` - The monthly payment owed by the borrower if the loan originates.
- `dti` - A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
- `funded_amnt` - The total amount committed to that loan at that point in time.
- `annual_inc` - The self-reported annual income provided by the borrower during registration.
- `emp_length` - Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years. 
- `term` - The number of payments on the loan. Values are in months and can be either 36 or 60.
- `inq_last_6mths` - The number of inquiries in past 6 months (excluding auto and mortgage inquiries)
- `mths_since_recent_inq` - Months since most recent inquiry.
- `delinq_2yrs` - The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years
- `chargeoff_within_12_mths` - Number of charge-offs within 12 months
- `num_accts_ever_120_pd` - Number of accounts ever 120 or more days past due
- `num_tl_90g_dpd_24m` - Number of accounts 90 or more days past due in last 24 months
- `acc_open_past_24mths` - Number of trades opened in past 24 months.
- `avg_cur_bal` - Average current balance of all accounts
- `tot_hi_cred_lim` - Total high credit/credit limit
- `delinq_amnt` - The past-due amount owed for the accounts on which the borrower is now delinquent.

And all categorical variables are encoded:
- `sub_grade` - External assigned loan subgrade
- `purpose` - A category provided by the borrower for the loan request.
- `home_ownership` - The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER


__Target variable:__
- `def`

#### Modelling

##### *Define target*

In [None]:
# Every column except the target `def` is used as a model feature.
targ_cols = [col for col in df_train.columns if col != 'def']

# Feature matrices: the same columns are taken from each of the three samples.
X_train, X_oos, X_oot = (
    frame[targ_cols] for frame in (df_train, df_oos, df_oot)
)

# Target vectors: the `def` column of each sample.
y_train, y_oos, y_oot = (
    frame["def"] for frame in (df_train, df_oos, df_oot)
)

##### *Define FP and FN costs*

Financial result of model performance depends on FP and FN error

In [None]:
# Cost parameters of the benefit calculation (one loan size for every client).
S = 1000 # amount of loan
r = 0.03 # interest rate
lgd = 0.2 # losses in case of default, as a share of the loan amount
# Type I error (FP) cost: the interest r * S. The benefit curves below credit
# this amount for every accepted non-defaulter (TN).
e_fp = r * S # 1 type error cost
# Type II error (FN) cost: the loss lgd * S. The benefit curves below charge
# this amount for every accepted defaulter (FN).
e_fn = lgd * S # 2 type error cost

In [None]:
# Grid of 41 evenly spaced probability cut-offs from 0 to 1 (both included).
# The benefit-curve code below evaluates one (acceptance rate, benefit) point
# per cut-off.
thr = np.linspace(0,1,41) # 41 grid points

##### *Train old model*

In [None]:
# "Old" (current) model: L1-penalised logistic regression with C=0.02.
model_old = LogisticRegression(C=0.02, fit_intercept=True, max_iter=100,
          penalty='l1', random_state=123, solver = 'liblinear',
          tol=0.01)

# Fit exactly once. As the last expression of the cell, this call also makes
# the notebook display the fitted estimator.
model_old.fit(X_train, y_train)

**Plot roc-auc** in order to compare the quality of model on different samples

In [None]:
# ROC curves and Gini of the old model on the train and out-of-sample data.
# The out-of-time sample is passed in but not drawn, because 'oot' is not in
# the list of names.
roc_auc_plot(model_old, X_train, y_train,
                X_oos, y_oos, X_oot, y_oot, ['train', 'oos' ])

We see that the quality of the model on the out-of-sample (oos) data is lower than on the train data; the out-of-time (oot) sample is examined later in this notebook.

___
**Plot benefit curve** to see the dependence between the benefit and acceptance rate 

A simple threshold decision rule uses a level $a$: 
if $Prob > a$ then some action is undertaken, i.e. $\hat{Y}=1$, and otherwise $\hat{Y}=0$


**Acceptance rate** $c$ is the share of observations that satisfy the rule $Prob \le a$:
$$
c = \frac{\sum_{i=1}^{N} I\{Prob_i \le a\}}{N}
$$

We use the function that is defined above. For each acceptance rate we calculate the benefit:
1. Pick threshold level a. Calculate c (x axis)
2. Calculate FP and FN for given c
3. Weigh FP and FN with error costs (e_FP and e_FN) and plot on y axis
4. Reiterate 1-3 from $c = 0$ to $c = 1$

In [None]:
# Benefit curves of the old model on the train and out-of-sample data.
# The out-of-time sample is passed in but not drawn, because 'oot' is not in
# the list of names.
benefit_plot(model_old, X_train, y_train, X_oos, y_oos, 
                X_oot, y_oot,['train', 'oos' ])

##### *Train new model*

Define better model with optimized hyperparameters (outside this notebook)

You can also train your own more complex model (random forest or gradient-boosting models)

In [None]:
# "New" (better) model: the same L1-penalised logistic regression with C=2
# instead of C=0.02.
model_new = LogisticRegression(C=2, fit_intercept=True, max_iter=100,
          penalty='l1', random_state=123, solver = 'liblinear',
          tol=0.01)

# Fit exactly once. As the last expression of the cell, this call also makes
# the notebook display the fitted estimator.
model_new.fit(X_train, y_train)

**Plot roc-auc** in order to compare the quality of model on different samples

In [None]:
# ROC curves and Gini of the new model on the train and out-of-sample data.
roc_auc_plot(model_new, X_train, y_train,X_oos, y_oos, 
                X_oot, y_oot, ['train', 'oos'])
# roc_auc_plot titles its figure 'Model old'. The figure is still the current
# one here, so relabel it for the new model.
_ = plt.title('Model new')

**Plot benefit curve** to see the dependence between the benefit and acceptance rate 

In [None]:
# Benefit curves of the new model on the train and out-of-sample data.
benefit_plot(model_new, X_train, y_train, X_oos, y_oos, 
                X_oot, y_oot, ['train', 'oos'])
# benefit_plot titles its figure 'Benefit curve for old model'. The figure is
# still the current one here, so relabel it for the new model.
_ = plt.title('Benefit curve for new model')

In our example, the higher the model quality metrics are, the higher the financial result is (in general).

#### 1.Model non-optimal implementation

To calculate model risk we should find the difference between the benefit of the optimal implementation of the model (optimal cut-off) and the benefit of its current implementation (current cut-off):
$$Model Risk = Benefit (Optimal Model) - Benefit (Current Model) $$

Let's compare the benefit of using optimal cut-off and the one that equals 0.1

In [None]:
# Benefit curve of the current (old) model on the out-of-sample data, with the
# optimal cut-off and the cut-off currently in use marked on it.
plt.figure(figsize=(10,8))

# Scores are computed once and reused for the curve and for the current cut-off.
y_oos_ = model_old.predict_proba(X_oos)[:, 1]

val_oos = []
ar_oos = []

for t in thr:
    rejected = (y_oos_ > t)*1.
    CM_oos = confusion_matrix(y_oos, rejected)
    ar_oos.append((len(y_oos) - np.sum(rejected)) / len(y_oos_))
    val_oos.append(CM_oos[0][0] * e_fp - CM_oos[1][0] * e_fn)

plt.plot(ar_oos, val_oos, label = 'Current Model' , color = 'salmon', linewidth=2)

## OPTIMAL CUT-OFF
best = np.argmax(val_oos)
optimal_benefit = val_oos[best]
plt.plot(ar_oos[best], optimal_benefit, color = 'red', marker='*', markersize=15,
         linestyle='none', label = 'Optimal cut-off')

## CURRENT CUT-OFF
t = 0.1

rejected = (y_oos_ > t)*1.
CM_current = confusion_matrix(y_oos, rejected)
current_ar = (len(y_oos) - np.sum(rejected)) / len(y_oos_)
current_benefit = CM_current[0][0] * e_fp - CM_current[1][0] * e_fn
# Misspelled name of the same value, kept in case another cell still reads it.
cutrrent_benefit = current_benefit

plt.plot(current_ar, current_benefit, color = 'black', marker='*', markersize=12,
         linestyle='none', label = f'Current cut-off ({t})')

plt.xlabel('Acceptance rate')
plt.ylabel('Benefit')
plt.title('Benefit curve ')
plt.grid()

# Build the legend once, after the curve and both markers exist.
_ = plt.legend(loc= 0, prop= {'size': 16})

print('Model Risk = Benefit (Optimal Model) - Benefit (Current Model) = ', optimal_benefit - current_benefit) 


##### 2. Ignoring better model

Now we compare the benefit of using better model and the current one.
$$Model Risk = Benefit (Better Model) - Benefit (Current Model)$$

In [None]:
# Same calculation as in benefit_plot, here for two models on the
# out-of-sample data; the best benefit of each model is stored in max_val.
plt.figure(figsize=(10,8))

colors = ['salmon', 'olivedrab']
labels_m = ['New model', 'Old model']

max_val = []
for model, label, color in zip([model_new, model_old], labels_m, colors):
    val_oos = []
    ar_oos = []

    y_oos_ = model.predict_proba(X_oos)[:, 1]

    for t in thr:
        # 1.0 = rejected (score above the cut-off), 0.0 = accepted
        rejected = (y_oos_ > t)*1.
        CM_oos = confusion_matrix(y_oos, rejected)
        ar_oos.append((len(y_oos) - np.sum(rejected)) / len(y_oos_))
        val_oos.append(CM_oos[0][0] * e_fp - CM_oos[1][0] * e_fn)

    # Index of the cut-off with the highest benefit for this model.
    best = np.argmax(val_oos)

    plt.plot(ar_oos, val_oos, label = label + ' oos', color = color, linewidth=2)
    plt.plot(ar_oos[best], val_oos[best], color = color, marker='*', markersize=10)
    max_val.append(val_oos[best])

plt.xlabel('Acceptance rate')
plt.ylabel('Benefit')
plt.title('Benefit curve ')
plt.grid()

# Build the legend once, after both curves have been drawn.
_ = plt.legend(loc= 0, prop= {'size': 16})

# max_val[0] belongs to the new model and max_val[1] to the old one.
print('Model Risk = Benefit(Better Model) -  Benefit(Current Model)) = ', np.round(max_val[0] - max_val[1],1)) 

##### 3.*We have out-of-time sample to check model quality*

Now let's see how benefit depends on model degradation over time
$$Model Risk = Benefit (Current Model) - Benefit (Model Decay)$$

In [None]:
# ROC curves and Gini of the old model on the out-of-sample and out-of-time
# data. The train sample is passed in but not drawn, because 'train' is not in
# the list of names.
roc_auc_plot(model_old, X_train, y_train, X_oos, y_oos,
                X_oot, y_oot, ['oos', 'oot'])

In [None]:
# Benefit curves of the current (old) model on the out-of-sample data ("now")
# and on the out-of-time data ("in 1 year"). The stars mark the best cut-off
# on each sample.
plt.figure(figsize=(10,8))

val_oos = []
ar_oos = []
val_oot = []
ar_oot = []

y_oos_ = model_old.predict_proba(X_oos)[:, 1]
y_oot_ = model_old.predict_proba(X_oot)[:, 1]

for t in thr:
    # 1.0 = rejected (score above the cut-off), 0.0 = accepted
    rejected_oos = (y_oos_ > t)*1.
    CM_oos = confusion_matrix(y_oos, rejected_oos)
    ar_oos.append((len(y_oos) - np.sum(rejected_oos)) / len(y_oos_))
    val_oos.append(CM_oos[0][0] * e_fp - CM_oos[1][0] * e_fn)

    rejected_oot = (y_oot_ > t)*1.
    CM_oot = confusion_matrix(y_oot, rejected_oot)
    ar_oot.append((len(y_oot) - np.sum(rejected_oot)) / len(y_oot_))
    val_oot.append(CM_oot[0][0] * e_fp - CM_oot[1][0] * e_fn)

best_oos = np.argmax(val_oos)
best_oot = np.argmax(val_oot)

plt.plot(ar_oos, val_oos, label = 'Now', color = 'salmon', linewidth=2)
plt.plot(ar_oos[best_oos], val_oos[best_oos], color = 'salmon', marker='*', markersize=12)

plt.plot(ar_oot, val_oot, label = 'In 1 Year' , color = 'olivedrab', linewidth=2)
plt.plot(ar_oot[best_oot], val_oot[best_oot], color = 'olivedrab', marker='*', markersize=12)

plt.xlabel('Acceptance rate')
plt.ylabel('Benefit')
plt.title('Benefit curve ')
plt.grid()

# Build the legend once, after both curves have been drawn.
_ = plt.legend(loc= 0, prop= {'size': 16})

# Model risk is a difference of benefits (the y axis), not of acceptance rates.
# NOTE(review): each benefit is a total over its sample, so this difference is
# only comparable if the oos and oot samples have similar sizes - confirm.
print('Model Risk = Benefit(Current Model) -  Benefit(Decay Model)) = ',
      val_oos[best_oos] - val_oot[best_oot]) 

## Model risk

To sum up, we can see the components of model risk

In [None]:
# Summary figure with all model-risk components on one chart:
#   A - best point of the current (old) model on the out-of-sample data,
#   B - best point of the same model on the out-of-time data (model decay),
#   C - best point of the better (new) model on the out-of-sample data,
#   D - the current model at the cut-off currently in use (0.1).
plt.figure(figsize=(10,8))

val_oos = []
ar_oos = []
val_oot = []
ar_oot = []
val_oos_alt = []
ar_oos_alt = []

y_oos_ = model_old.predict_proba(X_oos)[:, 1]
y_oot_ = model_old.predict_proba(X_oot)[:, 1]
y_oos_alt_ = model_new.predict_proba(X_oos)[:, 1]

for t in thr:
    # 1.0 = rejected (score above the cut-off), 0.0 = accepted
    rejected_oos = (y_oos_ > t)*1.
    CM_oos = confusion_matrix(y_oos, rejected_oos)
    ar_oos.append((len(y_oos) - np.sum(rejected_oos)) / len(y_oos_))
    val_oos.append(CM_oos[0][0] * e_fp - CM_oos[1][0] * e_fn)

    rejected_oot = (y_oot_ > t)*1.
    CM_oot = confusion_matrix(y_oot, rejected_oot)
    ar_oot.append((len(y_oot) - np.sum(rejected_oot)) / len(y_oot_))
    val_oot.append(CM_oot[0][0] * e_fp - CM_oot[1][0] * e_fn)

    rejected_alt = (y_oos_alt_ > t)*1.
    CM_oos_alt = confusion_matrix(y_oos, rejected_alt)
    ar_oos_alt.append((len(y_oos) - np.sum(rejected_alt)) / len(y_oos_alt_))
    val_oos_alt.append(CM_oos_alt[0][0] * e_fp - CM_oos_alt[1][0] * e_fn)

best_oos = np.argmax(val_oos)
best_oot = np.argmax(val_oot)
best_alt = np.argmax(val_oos_alt)

plt.plot(ar_oos, val_oos, color = 'salmon', linewidth=2)
plt.plot(ar_oos[best_oos], val_oos[best_oos], color = 'salmon', marker='*', markersize=12,
         label = 'A - current model')

plt.plot(ar_oot, val_oot, color = 'olivedrab', linewidth=2)
plt.plot(ar_oot[best_oot], val_oot[best_oot], color = 'olivedrab', marker='*', markersize=12,
        label = 'B - model decay')

plt.plot(ar_oos_alt, val_oos_alt, color = 'deepskyblue', linewidth=2)
plt.plot(ar_oos_alt[best_alt], val_oos_alt[best_alt], color = 'deepskyblue', marker='*', markersize=12,
        label = 'C - better model')

## CURRENT CUT-OFF
t = 0.1

# The old model's scores are already in y_oos_; no need to predict again.
rejected_oos = (y_oos_ > t)*1.
CM_current = confusion_matrix(y_oos, rejected_oos)
current_ar = (len(y_oos) - np.sum(rejected_oos)) / len(y_oos_)
current_benefit = CM_current[0][0] * e_fp - CM_current[1][0] * e_fn
# Misspelled name of the same value, kept in case another cell still reads it.
cutrrent_benefit = current_benefit

# This point is the cut-off in use today, not the optimum (the optimum is A).
plt.plot(current_ar, current_benefit, color = 'black', marker='*', markersize=12,
        label = 'D - current cut-off')

plt.xlabel('Acceptance rate')
plt.ylabel('Benefit')
plt.title('Benefit curve ')
plt.grid()

# Build the legend once, after every marker has been drawn.
_ = plt.legend(loc= 0, prop= {'size': 16})

# Model Risk:
   ## Model Decay = A - B
   ## Better Model = C - A
   ## Non-optimal implementation (cut-off) = A - D