In [15]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import statsmodels.formula.api as smf

# Raise pandas' display limits so wide frames (this dataset has a couple of
# hundred columns) and long listings are shown without truncation.
pd.set_option("display.max_columns", 2000)
pd.set_option("display.max_rows", 200)

In [16]:
def print_def(variable_name):
    """
    Print the data-dictionary definition of an ORIGINAL dataset feature.

    The dictionary is read from "LC_definitions.csv" in the current working
    directory the first time it is needed and cached in the module-level
    global ``definitions_`` so later calls do not re-read the file.

    Parameters
    ----------
    variable_name : str
        Feature name to look up in the ``LoanStatNew`` column.

    Prints
    ------
    "<name> - <description>" when the feature is found,
    "New feature, not in original dictionary." when it is not, or a
    "Could not find ..." message when the definitions file is missing.
    """
    global definitions_

    # Lazily load and cache the dictionary on first use.
    if "definitions_" not in globals():
        try:
            definitions_ = pd.read_csv("LC_definitions.csv")
        except FileNotFoundError:
            # Report the real problem (missing file) instead of pretending the
            # feature is simply absent from the dictionary.
            print(f'Could not find "LC_definitions.csv" in {os.getcwd()}.')
            return

    matches = definitions_.loc[definitions_.LoanStatNew == variable_name, "Description"]
    if matches.empty:
        print("New feature, not in original dictionary.")
    else:
        print(variable_name, "-", matches.iloc[0])

In [28]:
# Either pass in a list of columns to use via logit_cols_ 
# or pass in a list of columns to drop via logit_drop_
def smf_logit(df_, y_, logit_cols_=None, logit_drop_=None):
    """
    Fit a statsmodels formula-API logistic regression and print its summary.

    Column selection (first matching rule wins):
      1. ``logit_drop_`` given  -> use every column of ``df_`` except these.
      2. ``logit_cols_`` given  -> use only these columns plus the target.
      3. neither                -> use every column of ``df_``.
    Rows containing any NaN in the selected columns are dropped before fitting.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Source data; it is not modified.
    y_ : str
        Name of the target column (left-hand side of the formula).
    logit_cols_ : list or tuple of str, optional
        Predictor columns to keep. The target may be included; it is not
        duplicated.
    logit_drop_ : list of str, optional
        Columns to remove. Takes precedence over ``logit_cols_``.

    Returns
    -------
    The fitted statsmodels results object, so callers can inspect or reuse
    it (e.g. for prediction) after the summary has been printed.
    """
    if logit_drop_:
        sm_df = df_.drop(labels=logit_drop_, axis=1).dropna()
    elif logit_cols_:
        # Accept any sequence and avoid a duplicated target column when the
        # caller already listed it among the predictors.
        keep = [column for column in logit_cols_ if column != y_] + [y_]
        sm_df = df_.loc[:, keep].dropna()
    else:
        sm_df = df_.dropna()

    predictors = [column for column in sm_df.columns if column != y_]
    # With no predictors, fall back to an intercept-only model rather than
    # producing a formula with nothing after the "~".
    rhs = " + ".join(predictors) if predictors else "1"
    formula = f"{y_} ~ {rhs}"

    print("Formula")
    print("-------", "\n", f"{formula}\n")
    sm_model = smf.logit(formula, data=sm_df).fit()
    print(sm_model.summary())
    return sm_model

In [45]:
# Load the four preprocessed Lending Club CSVs from the working directory.
# NOTE(review): the df_train.head() output below shows an "Unnamed: 0" column,
# presumably a row index written out with the CSV — confirm whether it should
# be read as the index or dropped before modelling.
df_train, df_test, df_validate, df_biased = (
    pd.read_csv(csv_path)
    for csv_path in (
        "lending_club_sample_preprocessed_train.csv",
        "lending_club_sample_preprocessed_test.csv",
        "lending_club_sample_preprocessed_validate.csv",
        "lending_club_biased_preprocessed.csv",
    )
)

In [48]:
# Remove "loan_status" (described by the author as a duplicate of the target)
# together with every object-dtype column found in the training frame (the
# author's note says these are date columns). The same list is applied to all
# four frames.
# Note: this cell rebinds the frames, so running it a second time raises a
# KeyError because "loan_status" has already been removed.
XG_drop_cols = ["loan_status", *df_train.select_dtypes(include="object").columns]

df_train, df_test, df_validate, df_biased = (
    frame.drop(columns=XG_drop_cols)
    for frame in (df_train, df_test, df_validate, df_biased)
)

In [49]:
# Preview the first rows of the training frame after the column drop.
df_train.head()

Unnamed: 0,funded_amnt,installment,annual_inc,dti,delinq_2yrs,fico_range_high,inq_last_6mths,open_acc,pub_rec,revol_bal,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,last_pymnt_amnt,last_fico_range_high,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_il_tl,num_op_rev_tl,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,term_36months,initial_list_status_w,individual,hardhship,employed_over_10yrs,emp_years,interest_rate,revolving_util,issue_date_year,years_since_earliest_cr_line,target,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,sub_grade_A1,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,addr_state_AK,addr_state_AL,addr_state_AR,addr_state_AZ,a
ddr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,issue_date_month_1,issue_date_month_2,issue_date_month_3,issue_date_month_4,issue_date_month_5,issue_date_month_6,issue_date_month_7,issue_date_month_8,issue_date_month_9,issue_date_month_10,issue_date_month_11,issue_date_month_12
0,8000.0,293.37,49355.8,32.47,0.0,704.0,1.0,12.0,0.0,2638.0,18.0,7396.24,7396.24,4858.69,2522.55,15.0,0.0,293.37,619.0,0.0,0.0,0.0,83307.0,13700.0,6.0,6942.0,8796.0,9.3,0.0,0.0,121.0,124.0,7.0,3.0,0.0,7.0,3.0,1.0,3.0,4.0,10.0,7.0,4.0,12.0,0.0,0.0,0.0,4.0,83.3,0.0,0.0,0.0,78243.0,83307.0,9700.0,64543.0,1,0,1,0,1,10.0,19.03,19.3,2017,10.0,1,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,15000.0,341.22,55000.0,23.91,0.0,704.0,0.0,13.0,0.0,17438.0,27.0,16586.15,16586.15,15000.0,1586.15,0.0,0.0,13515.17,744.0,0.0,0.0,0.0,60285.0,47100.0,6.0,4637.0,20782.0,45.2,0.0,0.0,72.0,134.0,0.0,0.0,0.0,12.0,17.0,0.0,6.0,7.0,5.0,11.0,7.0,13.0,0.0,0.0,0.0,2.0,100.0,28.6,0.0,0.0,103879.0,60285.0,37900.0,56779.0,1,0,1,0,1,10.0,12.99,37.0,2014,11.0,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
2,15000.0,524.48,75000.0,26.9,0.0,684.0,2.0,18.0,0.0,12550.0,28.0,14279.17,14279.17,10112.04,3491.92,0.0,675.21,524.48,619.0,1.0,0.0,407.0,167197.0,57700.0,12.0,9289.0,18777.0,31.5,0.0,0.0,51.0,95.0,1.0,1.0,2.0,15.0,1.0,0.0,4.0,9.0,8.0,15.0,9.0,18.0,0.0,0.0,0.0,2.0,100.0,16.7,0.0,0.0,247581.0,82085.0,27400.0,80381.0,1,1,1,0,0,8.0,15.61,21.8,2015,8.0,1,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
3,14000.0,357.42,45000.0,28.51,0.0,674.0,3.0,21.0,1.0,15463.0,29.0,10664.0,10664.0,4761.39,4888.9,86.48,927.23,357.42,669.0,0.0,0.0,0.0,35054.0,21500.0,4.0,1845.0,5282.0,72.5,0.0,0.0,172.0,162.0,3.0,3.0,0.0,11.0,1.0,0.0,8.0,13.0,6.0,17.0,13.0,21.0,0.0,0.0,0.0,3.0,100.0,70.0,1.0,0.0,55493.0,35054.0,19200.0,33993.0,1,0,1,0,0,8.0,18.25,71.9,2014,15.0,1,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
4,2400.0,78.48,55000.0,13.22,0.0,664.0,0.0,11.0,1.0,389.0,46.0,2529.608644,2529.61,2400.0,129.61,0.0,0.0,1481.7,669.0,0.0,0.0,989.0,64020.0,7400.0,9.0,5820.0,3447.0,6.8,0.0,0.0,136.0,144.0,4.0,4.0,1.0,6.0,5.0,0.0,2.0,3.0,19.0,8.0,3.0,11.0,0.0,0.0,0.0,6.0,100.0,0.0,1.0,0.0,64358.0,64020.0,3700.0,56958.0,1,1,1,0,1,10.0,10.91,5.3,2017,12.0,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [50]:
# One dummy column from each one-hot-encoded categorical group (presumably the
# baseline level, to avoid perfectly collinear dummies — confirm), plus any
# datetime-typed columns in the training frame. Collected in one list for ease
# of dropping, e.g. via smf_logit(..., logit_drop_=logit_drop_cols).
# Uses df_train: no frame named `df` is defined in this notebook.
logit_drop_cols = [
    "grade_G", "sub_grade_G5", "home_ownership_NONE",
    "purpose_other", "addr_state_WY", "issue_date_month_12",
] + list(df_train.select_dtypes(include="datetime").columns)

In [54]:
# Fit a simple statsmodels logistic regression on a hand-picked set of columns.
# NOTE(review): several of these (e.g. last_pymnt_amnt, total_rec_prncp,
# total_pymnt, last_fico_range_high) look like values recorded after the loan
# was issued, which would leak the outcome into the model — confirm before
# relying on the reported fit.
use_cols = [
    "last_fico_range_high",
    "last_pymnt_amnt",
    "total_rec_prncp",
    "funded_amnt",
    "issue_date_year",
    "installment",
    "total_pymnt_inv",
    "total_pymnt",
]

# Trailing ";" keeps the cell from echoing any return value.
smf_logit(df_=df_train, y_="target", logit_cols_=use_cols);

Formula
------- 
 target ~ last_fico_range_high + last_pymnt_amnt + total_rec_prncp + funded_amnt + issue_date_year + installment + total_pymnt_inv + total_pymnt

Optimization terminated successfully.
         Current function value: 0.110114
         Iterations 13
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                50000
Model:                          Logit   Df Residuals:                    49991
Method:                           MLE   Df Model:                            8
Date:                Wed, 09 Aug 2023   Pseudo R-squ.:                  0.7067
Time:                        13:28:26   Log-Likelihood:                -5505.7
converged:                       True   LL-Null:                       -18768.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
-------------