<a href="https://colab.research.google.com/github/EmilSeyfullayev/Credit_Risk_Modeling_in_Python/blob/main/Scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Model loading and data insertion

In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
import scipy.stats as stat
import pickle

In [152]:
class LogisticRegression_with_p_values:
    
    def __init__(self,*args,**kwargs):#,**kwargs):
        self.model = linear_model.LogisticRegression(*args,**kwargs)#,**args)

    def fit(self,X,y):
        self.model.fit(X,y)
        
        #### Get p-values for the fitted model ####
        denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
        denom = np.tile(denom,(X.shape[1],1)).T
        F_ij = np.dot((X / denom).T,X) ## Fisher Information Matrix
        Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates # z-score for eaach model coefficient
        p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores] ### two tailed test for p-values
        
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        #self.z_scores = z_scores
        self.p_values = p_values
        #self.sigma_estimates = sigma_estimates
        #self.F_ij = F_ij

loaded_model = pickle.load(
    open("/content/drive/MyDrive/Credit Risk Modeling/finalized_model.sav", 'rb')
)

lg = loaded_model

In [153]:
train = pd.read_csv('/content/drive/MyDrive/Credit Risk Modeling/train_features_selected.csv')
test = pd.read_csv("/content/drive/MyDrive/Credit Risk Modeling/test_features_selected.csv")
columns_needed_to_be_dropped = ['Unnamed: 0','since_issue_date:<87',
'since_issue_date:<90', 'since_issue_date:<92',
'since_issue_date:<94', 'since_issue_date:<99',
'since_issue_date:<103', 'since_issue_date:<112',
'since_issue_date:<142', 'since_issue_date:>=142',]
train.drop(columns_needed_to_be_dropped, axis=1, inplace=True)
test.drop(columns_needed_to_be_dropped, axis=1, inplace=True)
#train.columns.values

reference_variables = [
'grade:G',
'verification_status:Verified',
'addr_state:NE_IA_NV_AL_NM',
'purpose:educ__sm_b__wedd__ren_en__mov__house',
'initial_list_status:f',
'term: 60 months',
'emp_length:0',
'since_issue_date:>=142.1',
'interest_rate:>20.281',
'earl_cr_line:>=437',
'delinq_2yrs:>=12',
'inq_last_6mths:>6',
'open_acc:0',
'pub_rec:0',
'total_acc:<=3',
'acc_now_delinq:0',
'total_rev_hi_lim:<=5K',
'annual_inc:<20K',
'dti:>35',
'mths_since_last_delinq:0-3',
'mths_since_last_rec:0-2'
]
chp_train = train
chp_test = test
train.drop(reference_variables, axis=1, inplace=True)
test.drop(reference_variables, axis=1, inplace=True)

# ['open_acc', 'total_acc', 'delinq_2yrs', 'acc_now_delinq']

dropped_according_to_low_p_value = [
 'open_acc:1-3',
 'open_acc:4-9',
 'open_acc:10-12',
 'open_acc:13-17',
 'open_acc:18-22',
 'open_acc:23-27',
 'open_acc:28-35',
 'open_acc:>=36',
 'total_acc:4-15',
 'total_acc:16-21',
 'total_acc:22-27',
 'total_acc:28-48',
 'total_acc:49-63',
 'total_acc:64-78',
 'total_acc:>=79',
 'delinq_2yrs:0', 'delinq_2yrs:1-3', 'delinq_2yrs:4-11',
 'acc_now_delinq:>=1'
]
len(dropped_according_to_low_p_value)

train.drop(dropped_according_to_low_p_value, axis=1, inplace=True)
test.drop(dropped_according_to_low_p_value, axis=1, inplace=True)

intercept = lg.intercept_
coefficients = lg.coef_

summary_table = pd.DataFrame()
summary_table['feature_name'] = np.array(['intercept'])
summary_table['coefficients'] = intercept

X_train = train.drop('good_loan', axis=1)
y_train = train['good_loan']
X_test = test.drop('good_loan', axis=1)
y_test = test['good_loan']

# Beautiful code
# [x for x in train.columns.values if x.startswith('acc_now_delinq')]


temp = pd.DataFrame()
temp['feature_name'] = np.array(X_train.columns.values)
temp['coefficients'] = np.array(coefficients[0]) # coefs were in [[]], so we accessed first 
temp

summary_table = pd.concat([summary_table, temp])
summary_table.reset_index(inplace=True)

summary_table

Unnamed: 0,index,feature_name,coefficients
0,0,intercept,-1.107269
1,0,grade:A,1.199206
2,1,grade:B,0.960957
3,2,grade:C,0.767240
4,3,grade:D,0.570963
...,...,...,...
91,90,since_issue_date:<99.1,0.686175
92,91,since_issue_date:<103.1,0.508828
93,92,since_issue_date:<112.1,0.253833
94,93,since_issue_date:<142.1,-0.063938


### Creating scorecard

In [154]:
summary_table = summary_table[['feature_name', 'coefficients']]
summary_table['p_values'] = lg.p_values
summary_table

Unnamed: 0,feature_name,coefficients,p_values
0,intercept,-1.107269,
1,grade:A,1.199206,1.225368e-27
2,grade:B,0.960957,1.123348e-40
3,grade:C,0.767240,7.081904e-33
4,grade:D,0.570963,5.847956e-21
...,...,...,...
91,since_issue_date:<99.1,0.686175,9.108250e-61
92,since_issue_date:<103.1,0.508828,8.116011e-33
93,since_issue_date:<112.1,0.253833,1.049014e-09
94,since_issue_date:<142.1,-0.063938,1.000279e-01


In [155]:
del(temp)
temp = pd.DataFrame()
temp['feature_name'] = reference_variables
temp['coefficients'] = 0
temp['p_values'] = np.nan

In [156]:
temp

Unnamed: 0,feature_name,coefficients,p_values
0,grade:G,0,
1,verification_status:Verified,0,
2,addr_state:NE_IA_NV_AL_NM,0,
3,purpose:educ__sm_b__wedd__ren_en__mov__house,0,
4,initial_list_status:f,0,
5,term: 60 months,0,
6,emp_length:0,0,
7,since_issue_date:>=142.1,0,
8,interest_rate:>20.281,0,
9,earl_cr_line:>=437,0,


In [157]:
summary_table = pd.concat([summary_table, temp])
summary_table

Unnamed: 0,feature_name,coefficients,p_values
0,intercept,-1.107269,
1,grade:A,1.199206,1.225368e-27
2,grade:B,0.960957,1.123348e-40
3,grade:C,0.767240,7.081904e-33
4,grade:D,0.570963,5.847956e-21
...,...,...,...
16,total_rev_hi_lim:<=5K,0.000000,
17,annual_inc:<20K,0.000000,
18,dti:>35,0.000000,
19,mths_since_last_delinq:0-3,0.000000,


In [158]:
original_var_name = [x.split(":")[0] for x in summary_table['feature_name']]
summary_table['original_var_name'] = original_var_name
summary_table = summary_table[['original_var_name', 'feature_name', 'coefficients', 'p_values']]
summary_table

Unnamed: 0,original_var_name,feature_name,coefficients,p_values
0,intercept,intercept,-1.107269,
1,grade,grade:A,1.199206,1.225368e-27
2,grade,grade:B,0.960957,1.123348e-40
3,grade,grade:C,0.767240,7.081904e-33
4,grade,grade:D,0.570963,5.847956e-21
...,...,...,...,...
16,total_rev_hi_lim,total_rev_hi_lim:<=5K,0.000000,
17,annual_inc,annual_inc:<20K,0.000000,
18,dti,dti:>35,0.000000,
19,mths_since_last_delinq,mths_since_last_delinq:0-3,0.000000,


In [159]:
summary_table.reset_index(drop=True, inplace=True)

In [160]:
min_score = 300
max_score = 850

In [161]:
summary_table.groupby('original_var_name')['coefficients'].min()
# in perfect world min values shoul be zero
# but in real world we have some exceptions
# even those exceptions are close to zeros

original_var_name
acc_now_delinq            0.000000
addr_state                0.000000
annual_inc               -0.040339
delinq_2yrs               0.000000
dti                      -0.019125
earl_cr_line             -0.065808
emp_length                0.000000
grade                     0.000000
home_ownership           -0.079565
initial_list_status       0.000000
inq_last_6mths            0.000000
intercept                -1.107269
interest_rate             0.000000
mths_since_last_delinq    0.000000
mths_since_last_rec      -0.007604
open_acc                  0.000000
pub_rec                   0.000000
purpose                  -0.006576
since_issue_date         -0.063938
term                      0.000000
total_acc                 0.000000
total_rev_hi_lim          0.000000
verification_status      -0.009455
Name: coefficients, dtype: float64

In [162]:
sum_of_minimum_coefs = summary_table.groupby('original_var_name')['coefficients'].min().sum()
sum_of_minimum_coefs

-1.3996792540740468

In [163]:
sum_of_maximum_coefs = summary_table.groupby('original_var_name')['coefficients'].max().sum()
sum_of_maximum_coefs

6.179429516945129

In [164]:
difference_max_min = sum_of_maximum_coefs - sum_of_minimum_coefs
difference_max_min

7.579108771019176

In [165]:
summary_table['scores'] = summary_table['coefficients']*(max_score-min_score)/difference_max_min

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [169]:
summary_table

Unnamed: 0,original_var_name,feature_name,coefficients,p_values,scores
0,intercept,intercept,-1.107269,,-5429.415156
1,grade,grade:A,1.199206,1.225368e-27,87.023887
2,grade,grade:B,0.960957,1.123348e-40,69.734655
3,grade,grade:C,0.767240,7.081904e-33,55.677028
4,grade,grade:D,0.570963,5.847956e-21,41.433613
...,...,...,...,...,...
112,total_rev_hi_lim,total_rev_hi_lim:<=5K,0.000000,,0.000000
113,annual_inc,annual_inc:<20K,0.000000,,0.000000
114,dti,dti:>35,0.000000,,0.000000
115,mths_since_last_delinq,mths_since_last_delinq:0-3,0.000000,,0.000000


In [167]:
(summary_table['scores'][0] - sum_of_minimum_coefs) / (sum_of_maximum_coefs-sum_of_minimum_coefs) * (max_score-min_score) + min_score

-5429.415155731904

In [170]:
summary_table['scores'][0] = ((summary_table['coefficients'][0] 
                               - sum_of_minimum_coefs) / difference_max_min) * (max_score-min_score) + min_score

summary_table

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,original_var_name,feature_name,coefficients,p_values,scores
0,intercept,intercept,-1.107269,,321.219630
1,grade,grade:A,1.199206,1.225368e-27,87.023887
2,grade,grade:B,0.960957,1.123348e-40,69.734655
3,grade,grade:C,0.767240,7.081904e-33,55.677028
4,grade,grade:D,0.570963,5.847956e-21,41.433613
...,...,...,...,...,...
112,total_rev_hi_lim,total_rev_hi_lim:<=5K,0.000000,,0.000000
113,annual_inc,annual_inc:<20K,0.000000,,0.000000
114,dti,dti:>35,0.000000,,0.000000
115,mths_since_last_delinq,mths_since_last_delinq:0-3,0.000000,,0.000000
