In [1]:
# Execute src/columns.py so that the names it defines become available in this notebook.
# NOTE(review): `columns_to_use`, used when loading the CSV files below, presumably comes from this script — confirm.
%run src/columns.py

In [3]:
from src.data_cleaning import *

In [4]:
# Loan statistics CSV exports, listed in the order they are handed to the loader.
csv_file_names = (
    # Files named with a 3a-3d suffix.
    'LoanStats3a_securev1.csv',
    'LoanStats3b_securev1.csv',
    'LoanStats3c_securev1.csv',
    'LoanStats3d_securev1.csv',
    # Quarterly files, 2016Q1 through 2020Q1.
    'LoanStats_securev1_2016Q1.csv',
    'LoanStats_securev1_2016Q2.csv',
    'LoanStats_securev1_2016Q3.csv',
    'LoanStats_securev1_2016Q4.csv',
    'LoanStats_securev1_2017Q1.csv',
    'LoanStats_securev1_2017Q2.csv',
    'LoanStats_securev1_2017Q3.csv',
    'LoanStats_securev1_2017Q4.csv',
    'LoanStats_securev1_2018Q1.csv',
    'LoanStats_securev1_2018Q2.csv',
    'LoanStats_securev1_2018Q3.csv',
    'LoanStats_securev1_2018Q4.csv',
    'LoanStats_securev1_2019Q1.csv',
    'LoanStats_securev1_2019Q2.csv',
    'LoanStats_securev1_2019Q3.csv',
    'LoanStats_securev1_2019Q4.csv',
    'LoanStats_securev1_2020Q1.csv',
)

# Read every file above, keeping only the columns named in `columns_to_use`
# (both the loader and `columns_to_use` are defined outside this cell).
df_raw = load_loan_data_from_local_machine(csv_file_names, columns_to_use)

In [5]:
# Summarize the raw frame (row count, column names and dtypes) before any cleaning.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2880954 entries, 0 to 105046
Data columns (total 89 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   id                              object 
 1   loan_amnt                       float64
 2   term                            object 
 3   int_rate                        object 
 4   installment                     float64
 5   grade                           object 
 6   emp_length                      object 
 7   home_ownership                  object 
 8   annual_inc                      float64
 9   verification_status             object 
 10  issue_d                         object 
 11  loan_status                     object 
 12  purpose                         object 
 13  zip_code                        object 
 14  addr_state                      object 
 15  dti                             float64
 16  delinq_2yrs                     float64
 17  earliest_cr_line            

In [6]:
def clean_and_prepare_raw_data_for_EDA(df):
    '''
    Run the raw loan dataframe through the cleaning steps needed for exploratory data analysis (EDA).

    The steps run in the order written in the body. Rows with a missing 'issue_d' are dropped before the
    date columns are fixed, the frame is then sorted by 'issue_d', and 'id' becomes the index of the result.

    The operations performed directly in this function (dropna, sort_values, set_index) return new frames
    instead of modifying their input in place, so this function itself does not alter the caller's dataframe
    and the cell calling it can be re-run on the same raw frame.
    NOTE(review): whether the helper functions copy or mutate their argument is not visible here — confirm.

    Args:
        df (dataframe): Dataframe of loans. It must contain an 'issue_d' column, and an 'id' column must
            still be present after the helper steps have run.

    Returns:
        Dataframe: Returns the loan dataframe, indexed by 'id', after all the data cleaning functions
            have been applied.

    TODO:
        The original note says this function relies on functions stored in feature-engineering.py. This is
        acceptable for working in the Jupyter notebook, but the code organization needs to change later on.
    '''
    df = drop_loan_status(df)
    df = drop_joint_applicant_loans(df)
    df = fix_rate_cols(df)

    # Rows without an issue date are removed before the date columns are fixed.
    df = df.dropna(subset=['issue_d'])
    df = fix_date_cols(df)
    df = df.sort_values(by='issue_d')

    df = exclude_loans_before_2010(df)
    df = clean_loan_term_col(df)
    df = only_include_36_month_loans(df)
    df = clean_employment_length(df)

    # I doubt we need missing data boolean columns for tree models.
    # Steps currently disabled for the EDA dataset:
    #df = add_issue_date_and_month(df) # Ditch this?
    #df = add_supplemental_rate_data(df)
    #df = create_rate_difference_cols(df)
    df = create_months_since_earliest_cl_col(df)
    #df = create_loan_life_months_col(df)
    df = change_data_types(df)
    #df = create_dummy_cols(df)
    df = drop_unnecessary_cols(df)

    # Index by loan id without touching the frame returned by the last helper.
    return df.set_index('id')

In [7]:
# Run the cleaning pipeline on the raw loans and preview the first rows of the result.
df_EDA = clean_and_prepare_raw_data_for_EDA(df_raw)
df_EDA.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,purpose,...,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,mths_since_earliest_cr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
477567,10000.0,14.61,344.76001,D,5.0,MORTGAGE,62400.0,Not Verified,2010-01-01,educational,...,,,,0,0,,,,,162
477585,15000.0,16.0,527.359985,D,7.0,RENT,96000.0,Not Verified,2010-01-01,debt_consolidation,...,,,,0,0,,,,,134
474990,20000.0,11.83,662.679993,B,5.0,MORTGAGE,105000.0,Not Verified,2010-01-01,debt_consolidation,...,,,,0,0,,,,,180
477531,15000.0,15.31,522.22998,D,10.0,MORTGAGE,90000.0,Not Verified,2010-01-01,debt_consolidation,...,,,,0,0,,,,,183
477639,5600.0,13.22,189.289993,C,8.0,RENT,75797.0,Not Verified,2010-01-01,other,...,,,,0,0,,,,,150


In [9]:
# Save the cleaned frame as a bz2-compressed pickle, using a path relative to the working directory.
df_EDA.to_pickle('df_EDA.pkl.bz2', compression='bz2')

In [11]:
# Read the pickle written above back into a new frame and summarize it.
# NOTE(review): `pd` is not imported in any visible cell; presumably it is provided by the
# star import from src.data_cleaning or by the %run of src/columns.py — confirm.
df_Test = pd.read_pickle('df_EDA.pkl.bz2', compression='bz2')
df_Test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1886735 entries, 477567 to 167717983
Data columns (total 80 columns):
 #   Column                          Dtype         
---  ------                          -----         
 0   loan_amnt                       float32       
 1   int_rate                        float32       
 2   installment                     float32       
 3   grade                           category      
 4   emp_length                      float32       
 5   home_ownership                  category      
 6   annual_inc                      float32       
 7   verification_status             category      
 8   issue_d                         datetime64[ns]
 9   purpose                         category      
 10  addr_state                      category      
 11  dti                             float32       
 12  delinq_2yrs                     uint8         
 13  fico_range_low                  float32       
 14  fico_range_high                 float32       
 