In [9]:
import os
from pathlib import Path

import pandas as pd


# Directory holding the raw CSV extracts. It defaults to the original absolute
# location; set the LENDCLUB_RAW_DATA_DIR environment variable to run the
# notebook against a different checkout without editing this cell.
RAW_DATA_DIR = Path(
    os.environ.get(
        "LENDCLUB_RAW_DATA_DIR",
        "/Users/macbook/Documents/GitHub/DSfinProject/dsfinlendclub/data/01_raw",
    )
)

# low_memory=False turns off chunked parsing, so each column's dtype is
# inferred from the whole file instead of chunk by chunk.
df_sample = pd.read_csv(RAW_DATA_DIR / "Loan_status_sample.csv", low_memory=False)
df_tot = pd.read_csv(RAW_DATA_DIR / "Loan_status_2007-2020Q3.csv", low_memory=False)

# Preview the first rows of the sample file.
df_sample.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,...,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag
0,1746494,25450,167338079,4000.0,4000.0,4000.0,36 months,13.08%,134.93,B,...,,,,,,,,,,N
1,2370717,81861,71016917,24000.0,24000.0,24000.0,60 months,9.16%,500.07,B,...,,,,,,,,,,N
2,2264870,397109,39589826,5000.0,5000.0,5000.0,36 months,10.49%,162.49,B,...,,,,,,,,,,N
3,595422,15492,134798709,24000.0,24000.0,24000.0,60 months,11.05%,522.42,B,...,,,,,,,,,,N
4,562657,90591,127097355,14000.0,14000.0,14000.0,60 months,13.59%,322.79,C,...,,,,,,,,,,N


In [2]:
# Dataset dimensions as (rows, columns).
print(df_sample.shape)

# Index, dtype and memory summary. DataFrame.info() writes its report straight
# to stdout and returns None, so it is called bare: wrapping it in print()
# appends a stray "None" line to the cell output.
df_sample.info()

(100000, 143)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 143 entries, Unnamed: 0.1 to debt_settlement_flag
dtypes: float64(106), int64(3), object(34)
memory usage: 109.1+ MB
None


### Step 1: Deduplication based on id field

Initial deduplication check. This step will later be wrapped in a function, `check_and_remove_duplicates`, and plugged into the pipeline.

In [10]:
# Remove rows that are exact duplicates of an earlier row.
# NOTE(review): the section heading mentions the `id` field, but no `subset`
# is passed, so rows are compared on every column — confirm which is intended.
n_rows_before = len(df_tot)

# drop_duplicates(inplace=True) returns None, so its result must not be
# assigned; the non-inplace form returns the de-duplicated frame.
df_deduped = df_tot.drop_duplicates()
n_duplicates_removed = n_rows_before - len(df_deduped)

# The following cells keep reading `df_tot`, so point it at the
# de-duplicated frame (the in-place call used to have the same effect).
df_tot = df_deduped

# Report from the actual row counts instead of relying on an exception.
if n_duplicates_removed == 0:
    print("No duplicates")
else:
    print(f"Removed {n_duplicates_removed} duplicate rows")

No duplicates


### Step 2: Handling missing values

In [11]:
# Checking for missing values
missing_percentage = (df_tot.isnull().sum() / len(df_tot)) * 100
missing_percentage[missing_percentage > 0].sort_values(ascending=False)

hardship_loan_status       95.097886
hardship_reason            95.090332
hardship_status            95.090229
hardship_dpd               95.090161
payment_plan_start_date    95.090127
                             ...    
last_pymnt_amnt             0.000034
last_fico_range_high        0.000034
last_fico_range_low         0.000034
policy_code                 0.000034
debt_settlement_flag        0.000034
Length: 140, dtype: float64

### Step 3. Remove unwanted columns.
After dataset exploration we will remove specific columns. We will use 'drop_unwanted_columns' method in the pipelines.
We will combine the columns into `columns_to_drop` list.
Columns will be selected according to:
- **business/problem relevance**: is the data relevant to the problem I am trying to solve (e.g. Do I need IDs?)
- **high % of missing values**: especially if the data is not easily recoverable or imputation might introduce significant bias.
- **little to no variance**: provide minimal information for the model and can be dropped.
- **high correlation**: if two or more columns are highly correlated, they contain redundant information. You might drop one of them to reduce 'multicollinearity'.
- **high cardinality**: Categorical columns with too many unique values (high cardinality) can lead to overfitting and increased computational cost. These can be dropped or encoded differently.

In [None]:
# Columns named "Unnamed: ..." — the first entries of the drop list.
delete_no_library_cols = ['Unnamed: 0.1', 'Unnamed: 0']

# Start the drop list as an independent copy, so growing it later leaves
# `delete_no_library_cols` untouched.
columns_to_drop = list(delete_no_library_cols)

In [14]:
# Columns whose share of missing values reaches this ratio are dropped.
missing_threshold = 0.8

# Fraction of missing values per column (0.0 - 1.0).
missing_ratio = df_tot.isnull().mean()

# Split the columns on the threshold.
columns_to_keep = missing_ratio[missing_ratio < missing_threshold].index.tolist()
high_missing_cols = missing_ratio[missing_ratio >= missing_threshold].index.tolist()

# The percentage in the messages is derived from the threshold, so the text
# cannot drift out of sync with the value above.
print(f"✅ Columns to keep (less than {missing_threshold:.0%} missing):")
print(columns_to_keep)

print(f"\n🚫 Columns to drop ({missing_threshold:.0%} or more missing):")
print(high_missing_cols)

# Add the high-missing columns to the drop list started in the earlier cell
# instead of overwriting it (overwriting silently discarded the "Unnamed"
# columns). dict.fromkeys keeps first-seen order and removes repeats, so
# re-running this cell does not pile up duplicates.
columns_to_drop = list(dict.fromkeys([*columns_to_drop, *high_missing_cols]))

✅ Columns to keep (less than 80% missing):
['Unnamed: 0', 'id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open

### Feature engineering

In [None]:
# Not all columns with a high share of NaNs are uninformative: the features
# below condense several of them into compact signals.

# Hardship flag: 1 when `hardship_status` is populated, 0 otherwise.
df_tot['has_hardship'] = df_tot['hardship_status'].notnull().astype(int)

# Joint-application flag. No visible cell creates `is_joint_app`, so derive it
# when it is absent; an existing column is left untouched.
# NOTE(review): assumes `application_type` marks joint loans with the value
# 'Joint App' — confirm against the data.
if 'is_joint_app' not in df_tot.columns:
    df_tot['is_joint_app'] = df_tot['application_type'].eq('Joint App').astype(int)
is_joint = df_tot['is_joint_app'].eq(1)

# For joint applications the *_joint value replaces the individual one;
# all other rows keep the individual value.
# NOTE(review): a joint row whose *_joint value is NaN becomes NaN in the
# combined column (same as the previous .loc assignment) — consider whether
# falling back to the individual value is preferable.
joint_overrides = {
    # combined column: (individual column, joint column)
    'annual_inc_final': ('annual_inc', 'annual_inc_joint'),
    'dti_final': ('dti', 'dti_joint'),
    'verification_status_final': ('verification_status', 'verification_status_joint'),
    'revol_bal_final': ('revol_bal', 'revol_bal_joint'),
}
for final_col, (individual_col, joint_col) in joint_overrides.items():
    # Series.mask swaps in the joint value wherever `is_joint` is True.
    df_tot[final_col] = df_tot[individual_col].mask(is_joint, df_tot[joint_col])

# 1 when `hardship_loan_status` contains 'Late', else 0 (missing counts as 0).
df_tot['was_late_before_hardship'] = df_tot['hardship_loan_status'].str.contains('Late', na=False).astype(int)

# Missing `hardship_dpd` becomes 0, on the assumption that loans without a
# hardship record have no days past due.
df_tot['hardship_dpd_filled'] = df_tot['hardship_dpd'].fillna(0)



In [15]:
# Columns queued for removal to reduce noise and redundancy: the
# individual/joint pairs and the hardship/deferral detail fields.
delete_flag_related = [
    'dti',
    'dti_joint',
    'annual_inc',
    'annual_inc_joint',
    'hardship_status',
    'hardship_type',
    'hardship_reason',
    'hardship_start_date',
    'hardship_end_date',
    'hardship_amount',
    'hardship_length',
    'deferral_term',
    'verification_status',
    'verification_status_joint',
    'revol_bal',
    'revol_bal_joint',
    'hardship_loan_status',
    'hardship_payoff_balance_amount',
    'hardship_last_payment_amount',
]

# Append them, in place, to the running drop list.
columns_to_drop += delete_flag_related

In [18]:
# Sanity check: show the dtype pandas stored for `hardship_dpd`.
df_tot.dtypes['hardship_dpd']


dtype('float64')