In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import statsmodels.formula.api as smf

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option("display.max_columns", 2000)
pd.set_option("display.max_rows", 200)

In [16]:
def print_def(variable_name):
    """
    Print the data-dictionary definition of an ORIGINAL dataset feature.

    The dictionary is read from "LC_definitions.csv" (resolved against the
    current working directory) the first time it is needed and cached in the
    module-level global ``definitions_`` so later calls do not re-read it.

    Parameters
    ----------
    variable_name : str
        Feature name to look up in the ``LoanStatNew`` column.

    Returns
    -------
    None
        Everything is reported via ``print``:
        * ``"<name> - <description>"`` when the feature is found,
        * a "New feature" notice when it is not in the dictionary,
        * a "Could not find" notice when the dictionary file cannot be loaded.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not import `os`.
    import os

    global definitions_
    if "definitions_" not in globals():
        try:
            definitions_ = pd.read_csv("LC_definitions.csv")
        except (OSError, ValueError):
            # OSError covers a missing/unreadable file; pandas' empty-data and
            # parser errors derive from ValueError.
            print(f"Could not find \"LC_definitions.csv\" in {os.getcwd()}.")
            # Without a dictionary there is nothing to look up, and claiming
            # the feature is "new" would be misleading.
            return

    try:
        description = definitions_.loc[
            definitions_.LoanStatNew == variable_name, "Description"
        ].iloc[0]
    except (AttributeError, KeyError, IndexError):
        # IndexError: no matching row; AttributeError/KeyError: the cached
        # frame lacks the expected columns. Keep the original best-effort
        # behaviour and report the feature as unknown.
        print("New feature, not in original dictionary.")
        return

    print(variable_name, "-", description)

In [28]:
# Either pass in a collection of columns to use via logit_cols_
# or pass in a collection of columns to drop via logit_drop_.
def smf_logit(df_, y_, logit_cols_=None, logit_drop_=None):
    """
    Fit a statsmodels formula-API logit of ``y_`` and print its summary.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Source data. It is not modified; rows containing NaN in any of the
        modelling columns are dropped from a derived frame before fitting.
    y_ : str
        Name of the response column.
    logit_cols_ : sequence of str or str, optional
        Predictor columns to use. ``y_`` is appended automatically (and is
        ignored if it is also listed here).
    logit_drop_ : sequence of str or str, optional
        Columns to remove; every remaining column except ``y_`` becomes a
        predictor. Takes precedence over ``logit_cols_`` when both are given.

    If neither option is given (or both are empty), every column of ``df_``
    other than ``y_`` is used as a predictor.

    Returns
    -------
    None
        The formula and the fitted model's summary are printed.

    Raises
    ------
    KeyError
        If ``y_`` is not among the columns left for modelling.
    """
    # Explicit None/len checks (instead of truthiness) so tuples, pandas
    # Index objects and arrays are accepted as well as lists.
    if logit_drop_ is not None and len(logit_drop_) > 0:
        sm_df = df_.drop(labels=logit_drop_, axis=1).dropna()
    elif logit_cols_ is not None and len(logit_cols_) > 0:
        if isinstance(logit_cols_, str):
            # A bare string means one column, not a sequence of characters.
            logit_cols_ = [logit_cols_]
        # Filter y_ out first so the response never appears twice.
        keep = [col for col in logit_cols_ if col != y_] + [y_]
        sm_df = df_.loc[:, keep].dropna()
    else:
        sm_df = df_.dropna()

    if y_ not in sm_df.columns:
        raise KeyError(f"Response column {y_!r} is not among the modelling columns.")

    predictors = [col for col in sm_df.columns if col != y_]
    # With no predictors left, fall back to an intercept-only model rather
    # than emitting a malformed formula.
    formula = f"{y_} ~ " + (" + ".join(predictors) or "1")

    print("Formula")
    print("-------", "\n", f"{formula}\n")
    sm_model = smf.logit(formula, data=sm_df).fit()
    print(sm_model.summary())

In [39]:
# Load the two preprocessed Lending Club CSV extracts into DataFrames.
df, df_biased = [
    pd.read_csv(csv_path)
    for csv_path in (
        "lending_club_sample_preprocessed.csv",
        "lending_club_biased_preprocessed.csv",
    )
]

In [40]:
# One-hot encode the listed categorical columns in both frames.
# NOTE(review): each frame is encoded independently, so the resulting dummy
# columns may differ if a category is absent from one of them — confirm.
object_cols = [
    "grade",
    "sub_grade",
    "home_ownership",
    "purpose",
    "addr_state",
    "issue_date_month",
]

df, df_biased = [
    pd.get_dummies(frame, columns=object_cols) for frame in (df, df_biased)
]

In [41]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Unnamed: 0,funded_amnt,installment,annual_inc,issue_d,loan_status,dti,delinq_2yrs,earliest_cr_line,fico_range_high,inq_last_6mths,open_acc,pub_rec,revol_bal,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,last_pymnt_amnt,last_fico_range_high,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_il_tl,num_op_rev_tl,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,target,term_36months,initial_list_status_w,individual,hardhship,employed_over_10yrs,emp_years,interest_rate,revolving_util,issue_date_year,years_since_earliest_cr_line,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,sub_grade_A1,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,addr_state_AK,addr_s
tate_AL,addr_state_AR,addr_state_AZ,addr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,issue_date_month_1,issue_date_month_2,issue_date_month_3,issue_date_month_4,issue_date_month_5,issue_date_month_6,issue_date_month_7,issue_date_month_8,issue_date_month_9,issue_date_month_10,issue_date_month_11,issue_date_month_12
0,32075.0,830.99,80000.0,2019-02-01,Charged Off,17.63,1.0,2004-04-01,729.0,5.0,19.0,0.0,15972.0,34.0,6614.17,6614.17,2746.05,3868.12,0.0,0.0,830.99,534.0,0.0,0.0,0.0,153469.0,63800.0,2.0,8077.0,26686.0,28.6,0.0,0.0,19.0,178.0,44.0,9.0,4.0,50.0,1.0,0.0,8.0,10.0,2.0,17.0,10.0,19.0,0.0,0.0,0.0,1.0,90.9,0.0,0.0,0.0,207800.0,34130.0,37400.0,20000.0,1,1,1,1,0,1,10.0,18.94,25.0,2019,15.0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,27200.0,840.11,70000.0,2019-11-01,Current,20.57,0.0,1997-06-01,714.0,0.0,12.0,0.0,30621.0,31.0,5030.05,5030.05,4189.85,840.2,0.0,0.0,840.11,759.0,0.0,0.0,0.0,63051.0,64600.0,3.0,5254.0,18559.0,30.2,0.0,0.0,90.0,126.0,12.0,12.0,1.0,12.0,,0.0,3.0,8.0,10.0,11.0,8.0,12.0,0.0,0.0,0.0,1.0,100.0,20.0,0.0,0.0,104070.0,63051.0,26600.0,39470.0,0,1,1,1,0,0,,7.02,47.4,2019,22.0,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,20000.0,461.13,60000.0,2017-09-01,Current,33.44,1.0,2001-06-01,704.0,0.0,9.0,0.0,27915.0,22.0,14801.46,14801.46,8988.45,5813.01,0.0,0.0,461.13,684.0,0.0,0.0,59.0,83456.0,40100.0,0.0,9273.0,7018.0,76.2,0.0,0.0,159.0,195.0,52.0,27.0,2.0,52.0,,0.0,2.0,4.0,14.0,4.0,4.0,9.0,0.0,0.0,0.0,0.0,95.5,50.0,0.0,0.0,123750.0,83456.0,29500.0,83650.0,0,1,1,0,0,0,0.0,13.59,69.6,2017,16.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,9600.0,307.48,38000.0,2016-09-01,Fully Paid,24.6,0.0,1984-11-01,694.0,0.0,18.0,0.0,12359.0,52.0,10455.217527,10455.22,9600.0,855.22,0.0,0.0,6770.52,714.0,0.0,0.0,0.0,86367.0,25900.0,14.0,4798.0,5887.0,60.8,0.0,0.0,76.0,382.0,1.0,1.0,1.0,1.0,11.0,7.0,4.0,9.0,16.0,14.0,9.0,18.0,0.0,0.0,0.0,3.0,82.7,40.0,0.0,0.0,111584.0,17484.0,15000.0,7751.0,0,1,0,1,0,1,10.0,9.49,47.7,2016,32.0,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
4,16000.0,404.3,48000.0,2013-07-01,Fully Paid,12.98,0.0,1996-03-01,664.0,1.0,12.0,2.0,6728.0,28.0,22309.3,22309.3,16000.0,6309.3,0.0,0.0,9166.1,689.0,0.0,0.0,1752.0,161875.0,8400.0,4.0,13490.0,1552.0,77.8,0.0,0.0,188.0,210.0,6.0,6.0,1.0,12.0,6.0,3.0,3.0,4.0,15.0,4.0,4.0,12.0,0.0,0.0,0.0,2.0,89.0,33.3,0.0,2.0,173047.0,29534.0,7000.0,29239.0,0,1,1,1,0,0,9.0,17.77,80.1,2013,17.0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False


In [43]:
# One dummy level from each encoded category set, followed by every
# datetime-typed column, collected so they are easy to drop together.
# NOTE(review): the frames are loaded with a plain read_csv, so date-like
# columns may still be object dtype and would not be matched here — confirm.
logit_drop_cols = [
    "grade_G",
    "sub_grade_G5",
    "home_ownership_NONE",
    "purpose_other",
    "addr_state_WY",
    "issue_date_month_12",
    *df.select_dtypes(include="datetime").columns,
]

# Left disabled: dropping in place would permanently alter `df`.
# df.drop(labels=logit_drop_cols, axis=1, inplace=True)

In [44]:
# Fit a baseline statsmodels logit of `target` on a hand-picked set of columns.
# NOTE(review): total_pymnt and total_pymnt_inv look nearly identical in the
# previewed rows, so their coefficients may be unstable — confirm.
use_cols = [
    "last_fico_range_high",
    "last_pymnt_amnt",
    "total_rec_prncp",
    "funded_amnt",
    "issue_date_year",
    "installment",
    "total_pymnt_inv",
    "total_pymnt",
]

smf_logit(df_=df, y_="target", logit_cols_=use_cols)

Formula
------- 
 target ~ last_fico_range_high + last_pymnt_amnt + total_rec_prncp + funded_amnt + issue_date_year + installment + total_pymnt_inv + total_pymnt

Optimization terminated successfully.
         Current function value: 0.109749
         Iterations 13
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                60000
Model:                          Logit   Df Residuals:                    59991
Method:                           MLE   Df Model:                            8
Date:                Wed, 09 Aug 2023   Pseudo R-squ.:                  0.7090
Time:                        11:49:11   Log-Likelihood:                -6584.9
converged:                       True   LL-Null:                       -22628.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
-------------