## Week 2
### Model quality and decision making. Benefit curve

In this Jupyter notebook we will learn how to calculate the profit gained from using a better model.

#### Import libraries

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix


from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score as f1

import matplotlib.colors


# Seaborn theme for every figure drawn in this notebook.
sns.set(style="white")

# Seed NumPy's global random generator.
np.random.seed(2020)

# Font sizes for general text, tick labels, axis labels and titles.
# Applied after sns.set(...) so these values are the last ones written.
plot_font_settings = {
    'font.size': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.labelsize': 16,
    'axes.titlesize': 20,
}
plt.rcParams.update(plot_font_settings)

In [None]:
import warnings
# Install an "ignore" filter with no category or module restriction, i.e. every warning is silenced
# from here on.
# NOTE(review): being a blanket filter, it also hides warnings raised by pandas / scikit-learn
# in the cells below.
warnings.filterwarnings("ignore")

Define some functions that will help us to plot graphs. 

In [None]:
def roc_auc_plot(model, X_train, y_train,
                X_oos, y_oos, X_oot, y_oot, names = ['train', 'oos' ],
                title = 'Model old'):
    """Draw the ROC curve of `model` on each requested sample and print its Gini.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``; the second column of
        its output is used as the score.
    X_train, y_train : features and target of the train sample.
    X_oos, y_oos : features and target of the out-of-sample data.
    X_oot, y_oot : features and target of the out-of-time data.
    names : samples to draw. ``'train'`` and ``'oos'`` select the matching
        sample; any other value selects the out-of-time sample.
    title : figure title. Defaults to ``'Model old'``, the previously
        hard-coded value, so existing calls render unchanged.

    Returns
    -------
    None. The figure is drawn on a new matplotlib figure and one line per
    sample, with the Gini (2 * ROC AUC - 1), is printed.
    """
    plt.figure(figsize=(10,8))
    colors = ['olivedrab', 'deepskyblue', 'salmon']

    # 'train' and 'oos' are looked up by name; everything else falls back to oot.
    samples = {'train': (X_train, y_train), 'oos': (X_oos, y_oos)}

    for it, name in enumerate(names):
        X, y_ = samples.get(name, (X_oot, y_oot))

        y_hat = model.predict_proba(X)[:, 1]
        fpr, tpr, _ = roc_curve(y_, y_hat)
        gini = 2 * roc_auc_score(y_, y_hat) - 1

        # Cycle through the palette so more than three names cannot raise IndexError.
        plt.plot(fpr, tpr, label = name, color = colors[it % len(colors)], linewidth=2)
        print('Model',name, 'gini: ', np.round(gini,6))

    # Diagonal of a random classifier, drawn before the legend so it is listed there.
    plt.plot([0,1], [0,1], '--', color='grey', label='Random model')

    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(title)
    plt.grid()

    # Single legend call: an earlier one would be replaced by this one anyway.
    plt.legend(loc= 0, prop= {'size': 16})

In [None]:
def benefit_plot(model, X_train, y_train,
                X_oos, y_oos, X_oot, y_oot, names = ['train', 'oos' ],
                title = 'Benefit curve for old model'):
    """Plot benefit against acceptance rate for each requested sample.

    For every threshold ``t`` in the notebook-level grid ``thr`` a client is
    rejected when its score is above ``t`` and accepted otherwise. The
    benefit at that threshold is ``TN * e_fp - FN * e_fn``, where TN / FN are
    the accepted clients with target 0 / 1 and ``e_fp`` / ``e_fn`` are the
    notebook-level error costs. ``thr``, ``e_fp`` and ``e_fn`` must therefore
    be defined before this function is called.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``; the second column of
        its output is used as the score.
    X_train, y_train : features and target of the train sample.
    X_oos, y_oos : features and target of the out-of-sample data.
    X_oot, y_oot : features and target of the out-of-time data.
    names : samples to draw. ``'train'`` and ``'oos'`` select the matching
        sample; any other value selects the out-of-time sample.
    title : figure title. Defaults to the previously hard-coded value, so
        existing calls render unchanged.

    Returns
    -------
    None. The curves are drawn on a new matplotlib figure, the maximum of each
    curve is marked with a star and printed.
    """
    plt.figure(figsize=(10,8))
    colors = ['olivedrab', 'deepskyblue', 'salmon']

    # 'train' and 'oos' are looked up by name; everything else falls back to oot.
    samples = {'train': (X_train, y_train), 'oos': (X_oos, y_oos)}

    for it, name in enumerate(names):
        X, y_ = samples.get(name, (X_oot, y_oot))
        y_hat = model.predict_proba(X)[:, 1]

        benefit = []
        c_acceptance_rate = []

        for t in thr:
            # 1 = rejected (score above the threshold), 0 = accepted.
            rejected = (y_hat > t).astype(int)

            # Acceptance rate: share of clients that are not rejected.
            c_acceptance_rate.append((len(y_hat) - rejected.sum()) / len(y_hat))

            # labels=[0, 1] keeps the matrix 2x2 even when a sample or a
            # threshold leaves only one class present.
            tn, _fp, fn, _tp = confusion_matrix(y_, rejected, labels=[0, 1]).ravel()

            # Financial effect at this threshold.
            benefit.append(tn * e_fp - fn * e_fn)

        print('Model',name, 'Max Benefit: ', np.max(benefit))

        color = colors[it % len(colors)]
        plt.plot(c_acceptance_rate, benefit, label = name, color = color, linewidth=2)
        plt.plot(c_acceptance_rate[np.argmax(benefit)], np.max(benefit),
                 color = color, marker='*', markersize=10)

    plt.xlabel('Acceptance rate')
    plt.ylabel('Benefit')
    plt.title(title)
    plt.grid()

    # Single legend call: an earlier one would be replaced by this one anyway.
    plt.legend(loc= 0, prop= {'size': 16})
    


__Consider a binary classification model $X \to Prob$, e.g. credit scoring:__

#### Load Data

Load **train**, **out-of-sample** and **out-of-time** samples

In [None]:
## based on kaggle https://www.kaggle.com/c/GiveMeSomeCredit
# Single definition of the data folder: the three files live side by side,
# and building the paths from one prefix avoids the doubled '//' separator.
DATA_DIR = '../../notebooks/data/w2/benefit-curve'

df_train = pd.read_csv(DATA_DIR + '/df_train.csv')
df_oos = pd.read_csv(DATA_DIR + '/df_oos.csv')
df_oot = pd.read_csv(DATA_DIR + '/df_oot.csv')

In [None]:
# Show the first rows of the train sample as the cell output.
df_train.head()

__Column description:__

- `issue_d` - The month which the loan was funded
- `addr_state` - The state provided by the borrower in the loan application
- `emp_title` - The job title supplied by the Borrower when applying for the loan.
- `installment` - The monthly payment owed by the borrower if the loan originates.
- `dti` - A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
- `funded_amnt` - The total amount committed to that loan at that point in time.
- `annual_inc` - The self-reported annual income provided by the borrower during registration.
- `emp_length` - Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years. 
- `term` - The number of payments on the loan. Values are in months and can be either 36 or 60.
- `inq_last_6mths` - The number of inquiries in past 6 months (excluding auto and mortgage inquiries)
- `mths_since_recent_inq` - Months since most recent inquiry.
- `delinq_2yrs` - The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years
- `chargeoff_within_12_mths` - Number of charge-offs within 12 months
- `num_accts_ever_120_pd` - Number of accounts ever 120 or more days past due
- `num_tl_90g_dpd_24m` - Number of accounts 90 or more days past due in last 24 months
- `acc_open_past_24mths` - Number of trades opened in past 24 months.
- `avg_cur_bal` - Average current balance of all accounts
- `tot_hi_cred_lim` - Total high credit/credit limit
- `delinq_amnt` - The past-due amount owed for the accounts on which the borrower is now delinquent.

And all categorical variables are encoded:
- `sub_grade` - External assigned loan subgrade
- `purpose` - A category provided by the borrower for the loan request.
- `home_ownership` - The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER


__Target variable:__
- `def`

#### Modelling

##### *1. Define target*

In [None]:
# Despite its name, `targ_cols` holds every column EXCEPT the target 'def',
# i.e. the columns used as model inputs.
targ_cols = [column for column in df_train.columns if column != 'def']

# Same column selection applied to the three samples, in train / oos / oot order.
X_train, X_oos, X_oot = (frame[targ_cols] for frame in (df_train, df_oos, df_oot))

# Target column of each sample, in the same order.
y_train, y_oos, y_oot = (frame["def"] for frame in (df_train, df_oos, df_oot))

##### *2. Define FP and FN costs*

The financial result of the model depends on the costs of its FP and FN errors.

In [None]:
# Loan parameters.
S = 900       # amount of loan
r = 0.035     # interest rate
lgd = 0.25    # losses in case of default, as a share of the loan

# Monetary value of each error type, per loan.
e_fp = S * r      # type I error cost: the interest r * S
e_fn = S * lgd    # type II error cost: the lost share lgd * S

In [None]:
# Grid of 41 evenly spaced thresholds on [0, 1]; the benefit functions
# compute one acceptance rate per threshold.
thr = np.linspace(start=0, stop=1, num=41)

##### *3. Train old model*

In [None]:
# "Old" model: L1-penalised logistic regression with C=0.02.
model_old = LogisticRegression(C=0.02, fit_intercept=True, max_iter=100,
          penalty='l1', random_state=123, solver = 'liblinear',
          tol=0.01)

# Fit once (the estimator used to be fitted twice on the same data). Keeping
# `fit` as the last expression still shows the fitted estimator as cell output.
model_old.fit(X_train, y_train)

**Plot roc-auc** in order to compare the quality of model on different samples

In [None]:
# ROC curves of the old model on the train and out-of-sample data.
roc_auc_plot(
    model_old,
    X_train, y_train,
    X_oos, y_oos,
    X_oot, y_oot,
    names=['train', 'oos'],
)

We see that the quality of the model on the oos sample is lower than on the train sample (add `'oot'` to `names` to include the out-of-time sample in the comparison).

___
**Plot benefit curve** to see the dependence between the benefit and acceptance rate 

A simple threshold decision uses a level $a$: 
if $Prob > a$ then some action is undertaken, i.e. $\hat{Y}=1$, and otherwise $\hat{Y}=0$


**Acceptance rate** $c$ is the share of observations that satisfy the rule $Prob \le a$, i.e.
$$
c = \frac{\sum_{i=1}^{N}I\{Prob_i\le a\}}{N}
$$

We use the function that is defined above. For each acceptance rate we calculate the benefit:
1. Pick threshold level a. Calculate c (x axis)
2. Calculate FP and FN for given c
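3. Weigh FP and FN with error costs (e_FP and e_FN) and plot on y axis. The code computes $TN \cdot e_{FP} - FN \cdot e_{FN}$, which equals $-(FP \cdot e_{FP} + FN \cdot e_{FN})$ plus a constant, because $TN$ is the number of non-defaulted clients minus $FP$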
4. Reiterate 1-3 from $c = 0$ to $c = 1$

In [None]:
# Benefit curves of the old model on the train and out-of-sample data.
benefit_plot(
    model_old,
    X_train, y_train,
    X_oos, y_oos,
    X_oot, y_oot,
    names=['train', 'oos'],
)

##### *4. Train new model*

Define better model with optimized hyperparameters (outside this notebook)

You can also train your own more complex model (random forest or gradient-boosting models)

In [None]:
# "New" model: same estimator settings as the old one except C=2 instead of C=0.02.
model_new = LogisticRegression(C=2, fit_intercept=True, max_iter=100,
          penalty='l1', random_state=123, solver = 'liblinear',
          tol=0.01)

# Fit once (the estimator used to be fitted twice on the same data). Keeping
# `fit` as the last expression still shows the fitted estimator as cell output.
model_new.fit(X_train, y_train)

**Plot roc-auc** in order to compare the quality of model on different samples

In [None]:
# ROC curves of the new model on the train and out-of-sample data.
roc_auc_plot(model_new, X_train, y_train, X_oos, y_oos,
             X_oot, y_oot, names=['train', 'oos'])

# The helper titles its figure 'Model old' by default; relabel the current
# axes so this plot is not mistaken for the old model's.
plt.title('Model new');

**Plot benefit curve** to see the dependence between the benefit and acceptance rate 

In [None]:
# Benefit curves of the new model on the train and out-of-sample data.
benefit_plot(model_new, X_train, y_train, X_oos, y_oos,
             X_oot, y_oot, names=['train', 'oos'])

# The helper titles its figure 'Benefit curve for old model' by default;
# relabel the current axes so this plot is not mistaken for the old model's.
plt.title('Benefit curve for new model');

In our example, the higher the model quality metrics are, the higher the financial result is (in general).

___

Below are some tasks based on the examples above.

### Task 1 
#### Calculate *gini metrics* for train and oos samples for both models

In [None]:
# Scores (second column of predict_proba) on train and oos, old model first.
y_hat_train_old, y_hat_oos_old = (
    model_old.predict_proba(sample)[:, 1] for sample in (X_train, X_oos)
)

# Same two samples scored by the new model.
y_hat_train_new, y_hat_oos_new = (
    model_new.predict_proba(sample)[:, 1] for sample in (X_train, X_oos)
)

In [None]:
# your code here

def _gini_score(y_true, y_score):
    """Gini of a score, computed as 2 * ROC AUC - 1."""
    return 2 * roc_auc_score(y_true, y_score) - 1


train_gini_old_model = _gini_score(y_train, y_hat_train_old)
oos_gini_old_model = _gini_score(y_oos, y_hat_oos_old)

train_gini_new_model = _gini_score(y_train, y_hat_train_new)
oos_gini_new_model = _gini_score(y_oos, y_hat_oos_new)


# your code here


In [None]:
# Print the four Gini values, one per line, in the original order.
gini_report = (
    ("oos_gini_old_model: ", oos_gini_old_model),
    ("train_gini_old_model", train_gini_old_model),
    ("train_gini_new_model", train_gini_new_model),
    ("oos_gini_new_model", oos_gini_new_model),
)
for label, value in gini_report:
    print(label, value)

### Task 2 
#### Calculate the difference between the max benefit on the train sample and the max benefit on the oos sample for the new model

In [None]:
# your code here
def benefit_calc(model, X_train, y_train,
                X_oos, y_oos, X_oot, y_oot, names = ['train', 'oos' ],
                thresholds = None, fp_cost = None, fn_cost = None):
    """Compute the benefit curve of `model` on each requested sample.

    For every threshold ``t`` a client is rejected when its score is above
    ``t`` and accepted otherwise. The benefit at that threshold is
    ``TN * fp_cost - FN * fn_cost``, where TN / FN are the numbers of
    accepted clients whose target is 0 / 1.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``; the second column of
        its output is used as the score.
    X_train, y_train : features and target of the train sample.
    X_oos, y_oos : features and target of the out-of-sample data.
    X_oot, y_oot : features and target of the out-of-time data.
    names : samples to evaluate. ``'train'`` and ``'oos'`` select the matching
        sample; any other value selects the out-of-time sample.
    thresholds : iterable of thresholds. ``None`` (default) uses the
        notebook-level grid ``thr``.
    fp_cost, fn_cost : money values applied to TN and FN. ``None`` (default)
        uses the notebook-level ``e_fp`` / ``e_fn``.

    Returns
    -------
    dict mapping each name to a DataFrame with columns ``acc_rate`` and
    ``benefit``, one row per threshold, in threshold order.
    """
    # Read the notebook-level settings only when nothing explicit is passed,
    # so existing positional calls behave exactly as before.
    if thresholds is None:
        thresholds = thr
    if fp_cost is None:
        fp_cost = e_fp
    if fn_cost is None:
        fn_cost = e_fn

    # 'train' and 'oos' are looked up by name; everything else falls back to oot.
    samples = {'train': (X_train, y_train), 'oos': (X_oos, y_oos)}

    dict_benefit = {}
    for name in names:
        X, y_ = samples.get(name, (X_oot, y_oot))

        y_true = np.asarray(y_)
        y_hat = np.asarray(model.predict_proba(X))[:, 1]

        # Class masks are threshold-independent, so build them once per sample.
        is_good = y_true == 0
        is_bad = y_true == 1

        c_acceptance_rate = []
        benefit = []
        for t in thresholds:
            accepted = ~(y_hat > t)

            # Acceptance rate: share of clients that are not rejected.
            c_acceptance_rate.append(accepted.sum() / len(y_hat))

            # Counting with masks stays valid when a class is absent, where
            # indexing an inferred confusion matrix raised IndexError.
            TN = np.sum(accepted & is_good)
            FN = np.sum(accepted & is_bad)

            # Financial effect at this threshold.
            benefit.append(TN * fp_cost - FN * fn_cost)

        dict_benefit[name] = pd.DataFrame({'acc_rate': c_acceptance_rate,
                                           'benefit': benefit})

    return dict_benefit

In [None]:
d_2 = benefit_calc(model_new, X_train, y_train, X_oos, y_oos, X_oot, y_oot)

# Highest benefit reached along the train and the oos curve.
benefit_train_new_max = d_2['train'].loc[:, 'benefit'].max()
benefit_oos_new_max = d_2['oos'].loc[:, 'benefit'].max()

for template, best in (("Max new train benefit {:.5f}", benefit_train_new_max),
                       ("Max new OOS benefit {:.5f}", benefit_oos_new_max)):
    print(template.format(best))


In [None]:
# your code here
# Task 2 answer: best benefit on train minus best benefit on oos, both for the new model.
benefit_diff = benefit_train_new_max - benefit_oos_new_max

# your code here


### Task 3 
__Rewrite the__ `benefit_plot` __function as a__ `benefit_calc` __function and find__
1. benefit that we get for oos sample for the new model at $acceptance\_rate = 0.62535$
2. the difference between the best benefit for oos sample for the new model and benefit from previous item (at $acceptance\_rate = 0.62535$)

In [None]:
# your code here
def benefit_calc(model, X_train, y_train,
                X_oos, y_oos, X_oot, y_oot, names = ['train', 'oos' ],
                thresholds = None, fp_cost = None, fn_cost = None):
    """Compute a finely sampled benefit curve of `model` on each requested sample.

    This redefinition replaces the Task 2 ``benefit_calc`` and uses a denser
    default grid (1000 thresholds on [0, 1]). Every requested sample is
    evaluated: the earlier version of this cell skipped everything except
    ``'oos'``, which left dead code behind and made the Task 2 cells fail on
    ``d_2['train']`` when re-run after this definition.

    For every threshold ``t`` a client is rejected when its score is above
    ``t`` and accepted otherwise. The benefit at that threshold is
    ``TN * fp_cost - FN * fn_cost``, where TN / FN are the numbers of
    accepted clients whose target is 0 / 1.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``; the second column of
        its output is used as the score.
    X_train, y_train : features and target of the train sample.
    X_oos, y_oos : features and target of the out-of-sample data.
    X_oot, y_oot : features and target of the out-of-time data.
    names : samples to evaluate. ``'train'`` and ``'oos'`` select the matching
        sample; any other value selects the out-of-time sample.
    thresholds : iterable of thresholds. ``None`` (default) uses
        ``np.linspace(0, 1, 1000)``.
    fp_cost, fn_cost : money values applied to TN and FN. ``None`` (default)
        uses the notebook-level ``e_fp`` / ``e_fn``.

    Returns
    -------
    dict mapping each name to a DataFrame with columns ``acc_rate`` and
    ``benefit``, one row per threshold, in threshold order.
    """
    if thresholds is None:
        thresholds = np.linspace(0,1, 1000)
    # Read the notebook-level costs only when nothing explicit is passed.
    if fp_cost is None:
        fp_cost = e_fp
    if fn_cost is None:
        fn_cost = e_fn

    # 'train' and 'oos' are looked up by name; everything else falls back to oot.
    samples = {'train': (X_train, y_train), 'oos': (X_oos, y_oos)}

    dict_benefit = {}
    for name in names:
        X, y_ = samples.get(name, (X_oot, y_oot))

        y_true = np.asarray(y_)
        y_hat = np.asarray(model.predict_proba(X))[:, 1]

        # Class masks are threshold-independent, so build them once per sample.
        is_good = y_true == 0
        is_bad = y_true == 1

        c_acceptance_rate = []
        benefit = []
        for t in thresholds:
            accepted = ~(y_hat > t)

            # Acceptance rate: share of clients that are not rejected.
            c_acceptance_rate.append(accepted.sum() / len(y_hat))

            # Counting with masks stays valid when a class is absent, where
            # indexing an inferred confusion matrix raised IndexError.
            TN = np.sum(accepted & is_good)
            FN = np.sum(accepted & is_bad)

            # Financial effect at this threshold.
            benefit.append(TN * fp_cost - FN * fn_cost)

        dict_benefit[name] = pd.DataFrame({'acc_rate': c_acceptance_rate,
                                           'benefit': benefit})

    return dict_benefit



In [None]:
d = benefit_calc(model_new, X_train, y_train, X_oos, y_oos, X_oot, y_oot)
df_oos = d['oos']

In [None]:
from sklearn import linear_model

my_data = df_oos.copy()
X = my_data[['acc_rate']]
X['acc_rate_2'] = X['acc_rate'] ** 2
# X['acc_rate_3'] = X['acc_rate'] ** 3
y = df_oos['benefit']
regression = linear_model.LinearRegression().fit(X, y)

In [None]:
linspace = np.linspace(0, 1, 10000)
linspace_data = pd.DataFrame(linspace, columns=['acc_rate'])
linspace_data['acc_rate_2'] = linspace_data['acc_rate'] ** 2
# linspace_data['acc_rate_3'] = linspace_data['acc_rate'] ** 3
y_hats = regression.predict(linspace_data)
plt.plot(linspace, y_hats, linewidth=2)

In [None]:
benefit_at_point = regression.predict([[0.62535, 0.62535**2]])[0]
diff_benefit = df_oos.benefit.max() - benefit_at_point

print("Benefit at a=0.62535 -> {:.3f}".format(benefit_at_point))
print("Diff benefit -> {:.3f}".format(diff_benefit))
# your code here