### import packages

In [10]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from collections import Counter
from copy import deepcopy

import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
 
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, \
    AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier, \
    ExtraTreesClassifier
from sklearn.model_selection import train_test_split #to create validation data set
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from pandas.tools.plotting import scatter_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, Imputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_curve, auc, mean_squared_error, make_scorer
from sklearn import preprocessing, model_selection

import statsmodels.formula.api as sm

import utilities

from utilities.woe import woe_conversion, woe_graph, woe_analysis, mono_bin, char_bin
from utilities.scorecard import scorecard
from utilities.adverse_action import adverse_action

# pd.set_option('display.height', 500)
# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [11]:
# the following csv files are output from accept_reject_dataset.ipynb

# Read the accepted and rejected applicant files in one pass; the order of the
# paths matches the order of the names being assigned.
accept_df, reject_df = map(
    pd.read_csv,
    ('data/germancredit_accept.csv', 'data/germancredit_reject.csv'),
)

In [12]:
# Preview only the first rows: display.max_rows is raised to 500 in the setup
# cell, so a bare `accept_df` would dump hundreds of rows into the notebook.
accept_df.head(10)

Unnamed: 0,checking_status,duration,credit_history,purpose,loan_amt,savings_status,emp_length,rate_inc_perc,other_debtors,resid_length,...,external_credit,housing,internal_credit,job_type,dependent_count,has_phone,foreign_ind,target,female,married
0,0.0,6,current_other,radio/television,1169,,10,4,none,4,...,none,own,2,skilled,1,1,1,0,0,0
1,100.0,48,existing_current,radio/television,5951,100.0,4,2,none,2,...,none,own,1,skilled,1,0,1,1,1,1
2,,12,current_other,education,2096,100.0,7,2,none,3,...,none,own,1,unskilled_nonresid,2,0,1,0,0,0
3,,36,existing_current,education,9055,,4,2,none,4,...,none,for free,1,unskilled_nonresid,2,1,1,0,0,0
4,,24,existing_current,furniture/equipment,2835,1000.0,10,3,none,4,...,none,own,1,skilled,1,0,1,0,0,0
5,100.0,36,existing_current,car (used),6948,100.0,4,2,none,2,...,none,rent,1,management,1,1,1,0,0,0
6,,12,existing_current,radio/television,3059,1500.0,7,2,none,4,...,none,own,1,unskilled_nonresid,1,0,1,0,1,1
7,100.0,30,current_other,car (new),5234,100.0,0,4,none,2,...,none,own,2,management,1,0,1,1,0,1
8,100.0,12,existing_current,car (new),1295,100.0,1,3,none,1,...,none,rent,1,skilled,1,0,1,1,1,1
9,100.0,12,existing_current,radio/television,1567,100.0,4,1,none,1,...,none,own,1,skilled,1,1,1,0,1,1


### Reject inference techniques

From https://www.philadelphiafed.org/-/media/consumer-finance-institute/payment-cards-center/events/conferences/2002/ashmeester.pdf?la=en

1. No reject inference
2. Reclassification
3. Re-weighting / Parceling
4. Heckman's bias correction / Heckman (1979), Hand & Henley (1993)
5. Supplemental bureau data

From https://analytics.ncsu.edu/sesug/2008/ST-160.pdf

1. Manual estimation
2. Augmentation
3. Extrapolation (Parceling, Fuzzy Augmentation)

From https://support.sas.com/resources/papers/proceedings09/305-2009.pdf

1. Hard Cutoff
2. Parceling

predict p1, xb

The command above calculates the predicted value from the regression (equivalent to Ziγ in (2))

replace p1=-p1

Above calculates -Ziγ

generate phi = (1/sqrt(2*_pi))*exp(-(p1^2/2))

This is the normal distribution density function: phi is equivalent to φ(- Ziγ) in (11)

generate capphi = normal(p1)

This is the cumulative distribution function: capphi is equivalent to Φ(-Ziγ) in (11)

generate invmills1 = phi/(1-capphi)

This calculates Inverse Mills ratio λi(-Ziγ)

In [20]:
# Load the Stata-format file mroz.dta into a DataFrame.
# NOTE(review): presumably the example data for the Heckman correction steps
# described above — confirm the file's provenance and document it here.
df = pd.read_stata('mroz.dta')

In [21]:
# Preview only the first rows: display.max_rows is raised to 500 in the setup
# cell, so a bare `df` would dump hundreds of rows into the notebook.
df.head(10)

Unnamed: 0,inlf,hours,kidslt6,kidsge6,age,educ,wage,repwage,hushrs,husage,...,faminc,mtr,motheduc,fatheduc,unem,city,exper,nwifeinc,lwage,expersq
0,1,1610,1,0,32,12,3.3540,2.65,2708,34,...,16310.0,0.7215,12,7,5.0,0,14,10.910060,1.210154,196
1,1,1656,0,2,30,12,1.3889,2.65,2310,30,...,21800.0,0.6615,7,7,11.0,1,5,19.499981,0.328512,25
2,1,1980,1,3,35,12,4.5455,4.04,3072,40,...,21040.0,0.6915,12,7,5.0,0,15,12.039910,1.514138,225
3,1,456,0,3,34,12,1.0965,3.25,1920,53,...,7300.0,0.7815,7,7,5.0,0,6,6.799996,0.092123,36
4,1,1568,1,2,31,14,4.5918,3.60,2000,32,...,27300.0,0.6215,12,14,9.5,1,7,20.100058,1.524272,49
5,1,2032,0,0,54,12,4.7421,4.70,1040,57,...,19495.0,0.6915,14,7,7.5,1,33,9.859054,1.556480,1089
6,1,1440,0,2,37,16,8.3333,5.95,2670,37,...,21152.0,0.6915,14,7,5.0,0,11,9.152048,2.120260,121
7,1,1020,0,0,54,12,7.8431,9.98,4120,53,...,18900.0,0.6915,3,3,5.0,0,35,10.900038,2.059634,1225
8,1,1458,0,2,48,12,2.1262,0.00,1995,52,...,20405.0,0.7515,7,7,3.0,0,24,17.305000,0.754336,576
9,1,1600,0,2,39,12,4.6875,4.15,2100,43,...,20425.0,0.6915,7,7,5.0,0,21,12.925000,1.544899,441
