In [1]:
import sys

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from pprint import pprint as pp
import csv
from pathlib import Path
import seaborn as sns
from itertools import product
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN

import gensim
from gensim import corpora

In [4]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [5]:
pd.set_option('display.max_columns', None)

### Retrieving the data

In [6]:
# Specify the file path
file_path = 'data/lending_club_loans.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

In [7]:
df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011-12,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985-01,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015-01,171.62,,2016-09,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False,,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011-12,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999-04,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013-04,119.66,,2016-09,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False,,,,


In [8]:
df['issue_d'] = df['issue_d'].str[:4]
df['earliest_cr_line'] = df['earliest_cr_line'].str[:4]
df['last_pymnt_d'] = df['last_pymnt_d'].str[:4]
df['next_pymnt_d'] = df['next_pymnt_d'].str[:4]
df['last_credit_pull_d'] = df['last_credit_pull_d'].str[:4]

In [9]:
# Show unique values in the column
unique_values = df['total_il_high_credit_limit'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['total_il_high_credit_limit'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['total_il_high_credit_limit']))

Unique values: [nan]
Number of missing values: 42538
42538


In [10]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['total_il_high_credit_limit'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False,,,


In [11]:
# Show unique values in the column
unique_values = df['total_bc_limit'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['total_bc_limit'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['total_bc_limit']))

Unique values: [nan]
Number of missing values: 42538
42538


In [12]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['total_bc_limit'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False,,


In [13]:
# Show unique values in the column
unique_values = df['total_bal_ex_mort'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['total_bal_ex_mort'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['total_bal_ex_mort']))

Unique values: [nan]
Number of missing values: 42538
42538


In [14]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['total_bal_ex_mort'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False,


In [15]:
# Show unique values in the column
unique_values = df['tot_hi_cred_lim'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['tot_hi_cred_lim'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['tot_hi_cred_lim']))

Unique values: [nan]
Number of missing values: 42538
42538


In [16]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['tot_hi_cred_lim'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False


In [17]:
# Show unique values in the column
unique_values = df['pub_rec_bankruptcies'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['pub_rec_bankruptcies'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['pub_rec_bankruptcies']))

Unique values: [ 0.  1.  2. nan]
Number of missing values: 1368
42538


In [18]:
value_percentages = df['pub_rec_bankruptcies'].value_counts(normalize=True) * 100
print(value_percentages)

pub_rec_bankruptcies
0.0    95.496721
1.0     4.483847
2.0     0.019432
Name: proportion, dtype: float64


In [19]:
# Replace missing values with 3
# 3 symbolizing unknown
df['pub_rec_bankruptcies'] = df['pub_rec_bankruptcies'].fillna(3)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False


In [20]:
# Show unique values in the column
unique_values = df['percent_bc_gt_75'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['percent_bc_gt_75'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['percent_bc_gt_75']))

Unique values: [nan]
Number of missing values: 42538
42538


In [21]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['percent_bc_gt_75'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,False


In [22]:
# Show unique values in the column
unique_values = df['pct_tl_nvr_dlq'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['pct_tl_nvr_dlq'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['pct_tl_nvr_dlq']))

Unique values: [nan]
Number of missing values: 42538
42538


In [23]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['pct_tl_nvr_dlq'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,,0.0,False


In [24]:
# Show unique values in the column
unique_values = df['num_tl_op_past_12m'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_tl_op_past_12m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_tl_op_past_12m']))

Unique values: [nan]
Number of missing values: 42538
42538


In [25]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_tl_op_past_12m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,,0.0,False


In [26]:
# Show unique values in the column
unique_values = df['num_tl_90g_dpd_24m'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_tl_90g_dpd_24m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_tl_90g_dpd_24m']))

Unique values: [nan]
Number of missing values: 42538
42538


In [27]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_tl_90g_dpd_24m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,False


In [28]:
# Show unique values in the column
unique_values = df['num_tl_30dpd'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_tl_30dpd'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_tl_30dpd']))

Unique values: [nan]
Number of missing values: 42538
42538


In [29]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_tl_30dpd'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,,0.0,False


In [30]:
# Show unique values in the column
unique_values = df['num_tl_120dpd_2m'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_tl_120dpd_2m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_tl_120dpd_2m']))

Unique values: [nan]
Number of missing values: 42538
42538


In [31]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_tl_120dpd_2m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,,0.0,False


In [32]:
# Show unique values in the column
unique_values = df['num_sats'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_sats'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_sats']))

Unique values: [nan]
Number of missing values: 42538
42538


In [33]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_sats'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,,0.0,False


In [34]:
# Show unique values in the column
unique_values = df['num_rev_tl_bal_gt_0'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_rev_tl_bal_gt_0'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_rev_tl_bal_gt_0']))

Unique values: [nan]
Number of missing values: 42538
42538


In [35]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_rev_tl_bal_gt_0'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,,0.0,False


In [36]:
# Show unique values in the column
unique_values = df['num_rev_accts'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_rev_accts'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_rev_accts']))

Unique values: [nan]
Number of missing values: 42538
42538


In [37]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_rev_accts'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,,0.0,False


In [38]:
# Show unique values in the column
unique_values = df['num_op_rev_tl'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_op_rev_tl'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_op_rev_tl']))

Unique values: [nan]
Number of missing values: 42538
42538


In [39]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_op_rev_tl'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,,0.0,False


In [40]:
# Show unique values in the column
unique_values = df['num_il_tl'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_il_tl'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_il_tl']))

Unique values: [nan]
Number of missing values: 42538
42538


In [41]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_il_tl'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,,0.0,False


In [42]:
# Show unique values in the column
unique_values = df['num_bc_tl'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_bc_tl'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_bc_tl']))

Unique values: [nan]
Number of missing values: 42538
42538


In [43]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_bc_tl'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,,0.0,False


In [44]:
# Show unique values in the column
unique_values = df['num_bc_sats'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_bc_sats'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_bc_sats']))

Unique values: [nan]
Number of missing values: 42538
42538


In [45]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_bc_sats'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,,0.0,False


In [46]:
# Show unique values in the column
unique_values = df['num_actv_rev_tl'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_actv_rev_tl'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_actv_rev_tl']))

Unique values: [nan]
Number of missing values: 42538
42538


In [47]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_actv_rev_tl'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,,0.0,False


In [48]:
# Show unique values in the column
unique_values = df['num_actv_bc_tl'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_actv_bc_tl'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_actv_bc_tl']))

Unique values: [nan]
Number of missing values: 42538
42538


In [49]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_actv_bc_tl'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,,0.0,False


In [50]:
# Show unique values in the column
unique_values = df['num_accts_ever_120_pd'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['num_accts_ever_120_pd'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['num_accts_ever_120_pd']))

Unique values: [nan]
Number of missing values: 42538
42538


In [51]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['num_accts_ever_120_pd'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,,0.0,False


In [52]:
# Show unique values in the column
unique_values = df['mths_since_recent_revol_delinq'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mths_since_recent_revol_delinq'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mths_since_recent_revol_delinq']))

Unique values: [nan]
Number of missing values: 42538
42538


In [53]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mths_since_recent_revol_delinq'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,,0.0,False


In [54]:
# Show unique values in the column
unique_values = df['mths_since_recent_inq'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mths_since_recent_inq'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mths_since_recent_inq']))

Unique values: [nan]
Number of missing values: 42538
42538


In [55]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mths_since_recent_inq'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,,0.0,False


In [56]:
# Show unique values in the column
unique_values = df['mths_since_recent_bc_dlq'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mths_since_recent_bc_dlq'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mths_since_recent_bc_dlq']))

Unique values: [nan]
Number of missing values: 42538
42538


In [57]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mths_since_recent_bc_dlq'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,,0.0,False


In [58]:
# Show unique values in the column
unique_values = df['mths_since_recent_bc'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mths_since_recent_bc'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mths_since_recent_bc']))

Unique values: [nan]
Number of missing values: 42538
42538


In [59]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mths_since_recent_bc'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,,0.0,False


In [60]:
# Show unique values in the column
unique_values = df['mort_acc'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mort_acc'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mort_acc']))

Unique values: [nan]
Number of missing values: 42538
42538


In [61]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mort_acc'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,,0.0,False


In [62]:
# Show unique values in the column
unique_values = df['mo_sin_rcnt_tl'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mo_sin_rcnt_tl'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mo_sin_rcnt_tl']))

Unique values: [nan]
Number of missing values: 42538
42538


In [63]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mo_sin_rcnt_tl'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,,0.0,False


In [64]:
# Show unique values in the column
unique_values = df['mo_sin_rcnt_rev_tl_op'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mo_sin_rcnt_rev_tl_op'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mo_sin_rcnt_rev_tl_op']))

Unique values: [nan]
Number of missing values: 42538
42538


In [65]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mo_sin_rcnt_rev_tl_op'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,,0.0,False


In [66]:
# Show unique values in the column
unique_values = df['mo_sin_old_rev_tl_op'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mo_sin_old_rev_tl_op'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mo_sin_old_rev_tl_op']))

Unique values: [nan]
Number of missing values: 42538
42538


In [67]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mo_sin_old_rev_tl_op'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,,0.0,False


In [68]:
# Show unique values in the column
unique_values = df['mo_sin_old_il_acct'].unique()
print("Unique values:", unique_values)

# Count missing values in the column
missing_values_count = df['mo_sin_old_il_acct'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mo_sin_old_il_acct']))

Unique values: [nan]
Number of missing values: 42538
42538


In [69]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mo_sin_old_il_acct'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [70]:
# Show unique values in the column
unique_values = df['delinq_amnt'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['delinq_amnt'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['delinq_amnt']))

Unique values: [   0.   nan   27. 6053.]
4
Number of missing values: 32
42538


In [71]:
# Assuming df is your DataFrame and 'column_name' is the name of your categorical column
value_percentages = df['delinq_amnt'].value_counts(normalize=True) * 100
print(value_percentages)

delinq_amnt
0.0       99.995295
27.0       0.002353
6053.0     0.002353
Name: proportion, dtype: float64


In [72]:
# Replace missing values with 1 to signify unknown
df['delinq_amnt'] = df['delinq_amnt'].fillna(1)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [73]:
# Show unique values in the column
unique_values = df['chargeoff_within_12_mths'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['chargeoff_within_12_mths'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['chargeoff_within_12_mths']))

Unique values: [False nan]
2
Number of missing values: 148
42538


In [74]:
# Replace missing values with 3
df['chargeoff_within_12_mths'] = df['chargeoff_within_12_mths'].fillna(True)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [75]:
# Show unique values in the column
unique_values = df['bc_util'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['bc_util'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['bc_util']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [76]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['bc_util'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [77]:
# Show unique values in the column
unique_values = df['bc_open_to_buy'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['bc_open_to_buy'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['bc_open_to_buy']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [78]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['bc_open_to_buy'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [79]:
# Show unique values in the column
unique_values = df['avg_cur_bal'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['avg_cur_bal'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['avg_cur_bal']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [80]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['avg_cur_bal'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [81]:
# Show unique values in the column
unique_values = df['acc_open_past_24mths'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['acc_open_past_24mths'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['acc_open_past_24mths']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [82]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['acc_open_past_24mths'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [83]:
# Show unique values in the column
unique_values = df['inq_last_12m'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['inq_last_12m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['inq_last_12m']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [84]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['inq_last_12m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [85]:
# Show unique values in the column
unique_values = df['total_cu_tl'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['total_cu_tl'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['total_cu_tl']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [86]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['total_cu_tl'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,,False,0.0,0.0,False


In [87]:
# Show unique values in the column
unique_values = df['inq_fi'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['inq_fi'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['inq_fi']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [88]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['inq_fi'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,,False,0.0,0.0,False


In [89]:
# Show unique values in the column
unique_values = df['total_rev_hi_lim'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['total_rev_hi_lim'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['total_rev_hi_lim']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [90]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['total_rev_hi_lim'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,,False,0.0,0.0,False


In [91]:
# Show unique values in the column
unique_values = df['all_util'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['all_util'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['all_util']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [92]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['all_util'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,,False,0.0,0.0,False


In [93]:
# Show unique values in the column
unique_values = df['max_bal_bc'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['max_bal_bc'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['max_bal_bc']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [94]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['max_bal_bc'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,,False,0.0,0.0,False


In [95]:
# Show unique values in the column
unique_values = df['open_rv_24m'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['open_rv_24m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['open_rv_24m']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [96]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['open_rv_24m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,,False,0.0,0.0,False


In [97]:
# Show unique values in the column
unique_values = df['open_rv_12m'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['open_rv_12m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['open_rv_12m']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [98]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['open_rv_12m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,,False,0.0,0.0,False


In [99]:
# Show unique values in the column
unique_values = df['il_util'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['il_util'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['il_util']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [100]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['il_util'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,,False,0.0,0.0,False


In [101]:
# Show unique values in the column
unique_values = df['total_bal_il'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['total_bal_il'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['total_bal_il']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [102]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['total_bal_il'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,,False,0.0,0.0,False


In [103]:
# Show unique values in the column
unique_values = df['mths_since_rcnt_il'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['mths_since_rcnt_il'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mths_since_rcnt_il']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [104]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mths_since_rcnt_il'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,,False,0.0,0.0,False


In [105]:
# Show unique values in the column
unique_values = df['open_il_24m'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['open_il_24m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['open_il_24m']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [106]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['open_il_24m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,,False,0.0,0.0,False


In [107]:
# Show unique values in the column
unique_values = df['open_il_12m'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['open_il_12m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['open_il_12m']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [108]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['open_il_12m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,,False,0.0,0.0,False


In [109]:
# Show unique values in the column
unique_values = df['open_il_6m'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['open_il_6m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['open_il_6m']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [110]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['open_il_6m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,,False,0.0,0.0,False


In [111]:
# Show unique values in the column
unique_values = df['open_acc_6m'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['open_acc_6m'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['open_acc_6m']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [112]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['open_acc_6m'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,False,0.0,0.0,False


In [113]:
# Show unique values in the column
unique_values = df['acc_now_delinq'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['acc_now_delinq'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['acc_now_delinq']))

Unique values: [False nan True]
3
Number of missing values: 32
42538


In [114]:
# Assuming df is your DataFrame and 'column_name' is the name of your categorical column
value_percentages = df['acc_now_delinq'].value_counts(normalize=True) * 100
print(value_percentages)

acc_now_delinq
False    99.99059
True      0.00941
Name: proportion, dtype: float64


In [115]:
# Replace missing values with 3
df['acc_now_delinq'] = df['acc_now_delinq'].fillna(True)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,,False,0.0,0.0,False


In [116]:
# Show unique values in the column
unique_values = df['tot_cur_bal'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['tot_cur_bal'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['tot_cur_bal']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [117]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['tot_cur_bal'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,,False,0.0,0.0,False


In [118]:
# Show unique values in the column
unique_values = df['tot_coll_amt'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['tot_coll_amt'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['tot_coll_amt']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [119]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['tot_coll_amt'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,,False,False,0.0,0.0,False


In [120]:
# Show unique values in the column
unique_values = df['verification_status_joint'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['verification_status_joint'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['verification_status_joint']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [121]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['verification_status_joint'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,,False,False,0.0,0.0,False


In [122]:
# Show unique values in the column
unique_values = df['dti_joint'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['dti_joint'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['dti_joint']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [123]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['dti_joint'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,,False,False,0.0,0.0,False


In [124]:
# Show unique values in the column
unique_values = df['annual_inc_joint'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['annual_inc_joint'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['annual_inc_joint']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [125]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['annual_inc_joint'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,False,False,0.0,0.0,False


In [126]:
# Show unique values in the column
unique_values = df['application_type'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['application_type'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['application_type']))

Unique values: ['INDIVIDUAL' nan]
2
Number of missing values: 3
42538


In [127]:
# Replace missing values with 3
df['application_type'] = df['application_type'].fillna('Else')

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,False,False,0.0,0.0,False


In [128]:
# Show unique values in the column
unique_values = df['policy_code'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['policy_code'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['policy_code']))

Unique values: [True nan]
2
Number of missing values: 3
42538


In [129]:
# Replace missing values with 3
df['policy_code'] = df['policy_code'].fillna(False)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,,True,INDIVIDUAL,False,False,0.0,0.0,False


In [130]:
# Show unique values in the column
unique_values = df['mths_since_last_major_derog'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['mths_since_last_major_derog'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mths_since_last_major_derog']))

Unique values: [nan]
1
Number of missing values: 42538
42538


In [131]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['mths_since_last_major_derog'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [132]:
# Show unique values in the column
unique_values = df['collections_12_mths_ex_med'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['collections_12_mths_ex_med'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['collections_12_mths_ex_med']))

Unique values: [False nan]
2
Number of missing values: 148
42538


In [133]:
# Replace missing values with 3
df['collections_12_mths_ex_med'] = df['collections_12_mths_ex_med'].fillna(True)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [134]:
# Show unique values in the column
unique_values = df['last_fico_range_low'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['last_fico_range_low'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['last_fico_range_low']))

Unique values: [740.   0. 715. 600. 690. 675. 650. 685. 515. 730. 665. 615. 765. 635.
 790. 555. 640. 680. 530. 775. 505. 575. 595. 655. 560. 700. 780. 735.
 745. 825. 770. 590. 785. 610. 695. 710. 645. 705. 660. 625. 605. 725.
 835. 570. 750. 815. 540. 630. 550. 795. 670. 545. 720. 800. 565. 520.
 760. 755. 525. 620. 500. 810. 585. 535. 580. 805. 820. 510. 830. 840.
 845.  nan]
72
Number of missing values: 3
42538


In [135]:
# Replace missing values with 3
df['last_fico_range_low'] = df['last_fico_range_low'].fillna(1)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [136]:
# Show unique values in the column
unique_values = df['last_fico_range_high'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['last_fico_range_high'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['last_fico_range_high']))

Unique values: [744. 499. 719. 604. 694. 679. 654. 689. 519. 734. 669. 619. 769. 639.
 794. 559. 644. 684. 534. 779. 509. 579. 599. 659. 564. 704. 784. 739.
 749. 829. 774. 594. 789. 614. 699. 714. 649. 709. 664. 629. 609. 729.
 839. 574. 754. 819. 544. 634. 554. 799. 674. 549. 724. 804. 569. 524.
 764. 759. 529. 624. 504. 814. 589. 539. 584. 809. 824. 514. 834. 844.
   0. 850.  nan]
73
Number of missing values: 3
42538


In [137]:
# Replace missing values with 1
df['last_fico_range_high'] = df['last_fico_range_high'].fillna(1)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [138]:
# Show unique values in the column
unique_values = df['loan_status'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['loan_status'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['loan_status']))

Unique values: ['Fully Paid' 'Charged Off' 'Current' 'In Grace Period'
 'Late (31-120 days)' 'Late (16-30 days)' 'Default' nan
 'Does not meet the credit policy. Status:Fully Paid'
 'Does not meet the credit policy. Status:Charged Off']
10
Number of missing values: 3
42538


In [139]:
# Show unique values in the column
unique_values = df['member_id'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['member_id'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['member_id']))

Unique values: [1296599. 1314167. 1313524. ...   70868.   70735.   70681.]
42536
Number of missing values: 3
42538


In [140]:
df = df.dropna(subset=['member_id'])

In [141]:
df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [142]:
# Show unique values in the column
unique_values = df['emp_title'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['emp_title'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['emp_title']))

Unique values: [nan 'Ryder' 'AIR RESOURCES BOARD' ... 'Tanks Tavern'
 'Halping hands company inc.' 'Homemaker']
30659
Number of missing values: 2626
42535


In [143]:
# Replace missing values with 3
df['emp_title'] = df['emp_title'].fillna('Unknown')

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [144]:
# Show unique values in the column
unique_values = df['emp_length'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['emp_length'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['emp_length']))

Unique values: ['10+ years' '< 1 year' '1 year' '3 years' '8 years' '9 years' '4 years'
 '5 years' '6 years' '2 years' '7 years' nan]
12
Number of missing values: 1112
42535


In [145]:
# Replace missing values with 3
df['emp_length'] = df['emp_length'].fillna(0)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [146]:
# Show unique values in the column
unique_values = df['annual_inc'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['annual_inc'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['annual_inc']))

Unique values: [24000. 30000. 12252. ...  7280. 66624.  6500.]
5598
Number of missing values: 4
42535


In [147]:
# Replace missing values with 3
df['annual_inc'] = df['annual_inc'].fillna(0)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [148]:
# Show unique values in the column
unique_values = df['url'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['url'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['url']))

Unique values: ['https://lendingclub.com/browse/loanDetail.action?loan_id=1077501'
 'https://lendingclub.com/browse/loanDetail.action?loan_id=1077430'
 'https://lendingclub.com/browse/loanDetail.action?loan_id=1077175' ...
 'https://lendingclub.com/browse/loanDetail.action?loan_id=72176'
 'https://lendingclub.com/browse/loanDetail.action?loan_id=71623'
 'https://lendingclub.com/browse/loanDetail.action?loan_id=70686']
42535
Number of missing values: 0
42535


In [149]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['url'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [150]:
# Show unique values in the column
unique_values = df['desc'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['desc'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['desc']))

Unique values: ['  Borrower added on 12/22/11 > I need to upgrade my business technologies.<br>'
 '  Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike. I only need this money because the deal im looking at is to good to pass up.<br><br>  Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike.I only need this money because the deal im looking at is to good to pass up. I have finished college with an associates degree in business and its takingmeplaces<br>'
 nan ...
 'I need to pay $2,100 for fixing my Volvo :)  Any help appreciated!'
 "Hi,   I'm buying  a used car. Anybody on facebook wants to finance me?   Thanks"
 'I need to make several improvements around the house - fix garage, fix back fencing, and misc other.']
28963
Number of missing values: 13295
4253

In [151]:
# Replace missing values with 3
df['desc'] = df['desc'].fillna('No Description')

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [152]:
# Show unique values in the column
unique_values = df['title'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['title'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['title']))

Unique values: ['Computer' 'bike' 'real estate business' ... 'delight' 'Car repair bill'
 'Aroundthehouse']
21265
Number of missing values: 13
42535


In [153]:
# Replace missing values with 3
df['title'] = df['title'].fillna('No Title')

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [154]:
# Show unique values in the column
unique_values = df['zip_code'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['zip_code'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['zip_code']))

Unique values: ['860xx' '309xx' '606xx' '917xx' '972xx' '852xx' '280xx' '900xx' '958xx'
 '774xx' '853xx' '913xx' '245xx' '951xx' '641xx' '921xx' '067xx' '890xx'
 '770xx' '335xx' '799xx' '605xx' '103xx' '150xx' '326xx' '564xx' '141xx'
 '080xx' '330xx' '974xx' '934xx' '405xx' '946xx' '445xx' '850xx' '604xx'
 '292xx' '088xx' '180xx' '029xx' '700xx' '010xx' '441xx' '104xx' '061xx'
 '616xx' '947xx' '914xx' '765xx' '980xx' '017xx' '752xx' '787xx' '077xx'
 '540xx' '225xx' '440xx' '437xx' '559xx' '912xx' '325xx' '300xx' '923xx'
 '352xx' '013xx' '146xx' '074xx' '786xx' '937xx' '331xx' '115xx' '191xx'
 '114xx' '908xx' '902xx' '992xx' '750xx' '950xx' '329xx' '226xx' '614xx'
 '802xx' '672xx' '083xx' '100xx' '926xx' '931xx' '712xx' '060xx' '707xx'
 '342xx' '895xx' '430xx' '919xx' '996xx' '891xx' '935xx' '801xx' '928xx'
 '233xx' '927xx' '970xx' '211xx' '303xx' '070xx' '194xx' '263xx' '403xx'
 '301xx' '553xx' '993xx' '312xx' '432xx' '602xx' '216xx' '151xx' '971xx'
 '305xx' '334xx' '050xx' '129xx' '92

In [155]:
df['zip_code'] = df['zip_code'].str.replace('x', '')  # Remove 'x'
df['zip_code'] = pd.to_numeric(df['zip_code'])  # Convert to numeric column

In [156]:
df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [157]:
# Show unique values in the column
unique_values = df['delinq_2yrs'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['delinq_2yrs'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['delinq_2yrs']))

Unique values: [ 0.  2.  3.  1.  4.  6.  5.  8.  7.  9. 11. nan 13.]
13
Number of missing values: 29
42535


In [158]:
# Assuming df is your DataFrame and 'column_name' is the name of your categorical column
value_percentages = df['delinq_2yrs'].value_counts(normalize=True) * 100
print(value_percentages)

delinq_2yrs
0.0     88.860396
1.0      8.457630
2.0      1.813862
3.0      0.574037
4.0      0.169388
5.0      0.063520
6.0      0.030584
7.0      0.014116
8.0      0.007058
11.0     0.004705
9.0      0.002353
13.0     0.002353
Name: proportion, dtype: float64


In [159]:
# Replace missing values with 3
df['delinq_2yrs'] = df['delinq_2yrs'].fillna(15)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [160]:
# Show unique values in the column
unique_values = df['earliest_cr_line'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['earliest_cr_line'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['earliest_cr_line']))

Unique values: ['1985' '1999' '2001' '1996' '2004' '2005' '2007' '1998' '1989' '2003'
 '1991' '1993' '1997' '1983' '2002' '1984' '2006' '1987' '1981' '2000'
 '1994' '1995' '1992' '1986' '1990' '1988' '1980' '2008' '1973' '1979'
 '1982' '1978' '1971' '1972' '1970' '1969' '1975' '1976' '1977' '1962'
 '1968' '1974' '1964' '1967' '1965' '1963' '1954' '1966' '1959' '1956'
 '1946' '1950' '1961' nan]
54
Number of missing values: 29
42535


In [161]:
# Replace missing values with 3
df['earliest_cr_line'] = df['earliest_cr_line'].fillna(0)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [162]:
# Show unique values in the column
unique_values = df['inq_last_6mths'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['inq_last_6mths'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['inq_last_6mths']))

Unique values: [ 1.  5.  2.  0.  3.  4.  6.  7.  8.  9. 10. 11. 12. 15. 14. 33. 17. 32.
 24. 13. 18. 16. 31. 28. 25. 27. 20. 19. nan]
29
Number of missing values: 29
42535


In [163]:
# Replace missing values with 3
df['inq_last_6mths'] = df['inq_last_6mths'].fillna(50)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [164]:
# Show unique values in the column
unique_values = df['mths_since_last_delinq'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['mths_since_last_delinq'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mths_since_last_delinq']))

Unique values: [ nan  35.  38.  61.   8.  20.  18.  68.  45.  48.  41.  40.  74.  25.
  53.  39.  10.  26.  56.  77.  28.  52.  24.  16.  60.  54.  23.   9.
  11.  13.  65.  19.  80.  22.  59.  79.  44.  64.  57.  14.  63.  49.
  15.  73.  70.  29.  51.   5.  75.  55.   2.  30.  47.  33.  69.   4.
  43.  21.  27.  46.  81.  78.  82.  31.  76.  62.  72.  42.  50.   3.
  12.  67.  36.  34.  58.  17.  71.  66.  32.   6.  37.   7.   1.  83.
  86. 115.  96. 103. 120. 106.  89. 107.  85.  97.  95.   0.]
96
Number of missing values: 26926
42535


In [165]:
# Replace missing values with 500
df['mths_since_last_delinq'] = df['mths_since_last_delinq'].fillna(500)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [166]:
# Show unique values in the column
unique_values = df['mths_since_last_record'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['mths_since_last_record'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['mths_since_last_record']))

Unique values: [ nan 113. 105.  97.  33.  93.  52.  85.  90.  91. 114.  92. 117.  87.
  45.  83. 118.  38. 101. 100. 112. 110.  88.  79.  77. 107. 102.  98.
  95. 103.  96. 116. 111.  89. 108.  29. 106. 115.  53.  86.  57.  63.
  94. 109.  99. 104.  76.  61.  28.  23.  75.  47.  82.  21.  62.  44.
  80.  67. 119.  42.  34.  66.  58.  22.  56.  72.  64.  50.  69.  49.
  74.  35.  12.  26.  78.  54.  37.  73.  11.  31.  59.  32.  81.  68.
  55.  39.  51.  70.  30.  41.  71.  40.  43.  27.  65.  46.  19.  17.
  25.  13.  48.  36.   7.  60.  14.   6.  18.   0.  20. 120. 129.   5.
  24.  15.]
114
Number of missing values: 38884
42535


In [167]:
# Replace missing values with 500
df['mths_since_last_record'] = df['mths_since_last_record'].fillna(222)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [168]:
# Show unique values in the column
unique_values = df['open_acc'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['open_acc'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['open_acc']))

Unique values: [ 3.  2. 10. 15.  9.  7.  4. 11. 14. 12. 20.  8.  6. 17.  5. 13. 16. 30.
 21. 18. 19. 27. 23. 34. 25. 22. 24. 26. 32. 28. 29. 33. 31. 39. 35. 36.
 38. 44. 41. 42.  1. 46. 37. 47. nan]
45
Number of missing values: 29
42535


In [169]:
# Replace missing values with 3
df['open_acc'] = df['open_acc'].fillna(0)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [170]:
# Show unique values in the column
unique_values = df['pub_rec'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['pub_rec'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['pub_rec']))

Unique values: [ 0.  1.  2.  3.  4.  5. nan]
7
Number of missing values: 29
42535


In [171]:
# Replace missing values with 6
df['pub_rec'] = df['pub_rec'].fillna(6)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [172]:
# Show unique values in the column
unique_values = df['total_acc'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['total_acc'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['total_acc']))

Unique values: [ 9.  4. 10. 37. 38. 12. 11. 13.  3. 23. 34. 29. 28. 42. 14. 22. 21. 17.
  7. 31. 44. 26. 16.  6. 18. 27. 24. 25. 40. 35.  8. 20. 15. 19. 36. 51.
 32. 30. 33. 46.  5. 61. 56. 50. 41. 39. 79. 62. 43. 47. 53. 45. 60. 55.
 52. 58. 54. 57. 49. 63. 48. 59. 77. 87. 75. 72. 64. 67. 78. 76. 74. 66.
 81. 90. 80. 71. 69. 73. 70. 68. 65.  2.  1. nan]
84
Number of missing values: 29
42535


In [173]:
# Replace missing values with 3
df['total_acc'] = df['total_acc'].fillna(0)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,False,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,False,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [174]:
# Show unique values in the column
unique_values = df['initial_list_status'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['initial_list_status'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['initial_list_status']))

Unique values: [False]
1
Number of missing values: 0
42535


In [175]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['initial_list_status'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [176]:
# Show unique values in the column
unique_values = df['last_pymnt_d'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['last_pymnt_d'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['last_pymnt_d']))

Unique values: ['2015' '2013' '2014' '2016' '2012' nan '2011' '2010' '2009' '2008' '2007']
11
Number of missing values: 83
42535


In [177]:
# Replace missing values with 3
df['last_pymnt_d'] = df['last_pymnt_d'].fillna(0)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [178]:
# Show unique values in the column
unique_values = df['next_pymnt_d'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['next_pymnt_d'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['next_pymnt_d']))

Unique values: [nan '2016' '2013' '2014' '2012' '2011' '2015' '2010' '2009' '2008' '2007']
11
Number of missing values: 39239
42535


In [179]:
# Replace missing values with 3
df['next_pymnt_d'] = df['next_pymnt_d'].fillna(0)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [180]:
# Show unique values in the column
unique_values = df['last_credit_pull_d'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['last_credit_pull_d'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['last_credit_pull_d']))

Unique values: ['2016' '2014' '2015' '2013' '2012' '2011' '2010' nan '2007' '2009' '2008']
11
Number of missing values: 4
42535


In [181]:
# Replace missing values with 3
df['last_credit_pull_d'] = df['last_credit_pull_d'].fillna(0)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,True,INDIVIDUAL,False,False,0.0,0.0,False


In [182]:
# Show unique values in the column
unique_values = df['policy_code'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['policy_code'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['policy_code']))

Unique values: [ True]
1
Number of missing values: 0
42535


In [183]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['policy_code'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,INDIVIDUAL,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,INDIVIDUAL,False,False,0.0,0.0,False


In [184]:
# Show unique values in the column
unique_values = df['application_type'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['application_type'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['application_type']))

Unique values: ['INDIVIDUAL']
1
Number of missing values: 0
42535


In [185]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['application_type'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False


In [186]:
# Show unique values in the column
unique_values = df['chargeoff_within_12_mths'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['chargeoff_within_12_mths'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['chargeoff_within_12_mths']))

Unique values: [False  True]
2
Number of missing values: 0
42535


In [187]:
# Replace missing values with 3
df['chargeoff_within_12_mths'] = df['chargeoff_within_12_mths'].fillna(True)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False


In [188]:
# Show unique values in the column
unique_values = df['tax_liens'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['tax_liens'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['tax_liens']))

Unique values: [False nan True]
3
Number of missing values: 105
42535


In [189]:
# Replace missing values with 3
df['tax_liens'] = df['tax_liens'].fillna(True)

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10+ years,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False


In [190]:
# Define the substrings to remove
to_remove = ['+', 'years', 'year', '<', '>']

# Remove the substrings from the column
for substring in to_remove:
    df['emp_length'] = df['emp_length'].str.replace(substring, '')

In [191]:
num_columns = df.shape[1]
print("Number of columns:", num_columns)

Number of columns: 57


In [192]:
df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False


In [193]:
# Show unique values in the column
unique_values = df['tax_liens'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['tax_liens'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['tax_liens']))

Unique values: [False  True]
2
Number of missing values: 0
42535


In [194]:
df['term'] = df['term'].str.replace(' ', '')
df['grade'] = df['grade'].str.replace(' ', '')
df['sub_grade'] = df['sub_grade'].str.replace(' ', '')

In [210]:
df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False


In [213]:
df['loan_status'].unique()

array(['Fully Paid', 'Charged Off', 'Current', 'In Grace Period',
       'Late (31-120 days)', 'Late (16-30 days)', 'Default',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

In [214]:
# Define the conditions
conditions = [
    df['loan_status'].isin(['Fully Paid', 'Current', 'In Grace Period', 'Does not meet the credit policy. Status:Fully Paid'])
]

# Define the corresponding values
values = [1]

# Create the new 'target' column based on the conditions
df['target'] = np.where(conditions[0], values[0], 2)  # Assign 2 for all other cases

In [215]:
df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,Fully Paid,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,Charged Off,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,2


In [216]:
df['target'].value_counts(normalize=True) * 100

target
1    84.878335
2    15.121665
Name: proportion, dtype: float64

In [217]:
# Dropping the column, because it contains only missing values
df = df.drop(columns=['loan_status'])

df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,2


In [230]:
# Check which columns contain NaN values
columns_with_nan = df5.columns[df5.isnull().any()].tolist()

print("Columns containing NaN values:")
print(columns_with_nan)

Columns containing NaN values:
['emp_length', 'revol_util']


In [231]:
# Show unique values in the column
unique_values = df['emp_length'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['emp_length'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['emp_length']))

Unique values: ['10 ' ' 1 ' '1 ' '3 ' '8 ' '9 ' '4 ' '5 ' '6 ' '2 ' '7 ' nan]
12
Number of missing values: 1112
42535


In [233]:
# Replace missing values with 3
df['emp_length'] = df['emp_length'].fillna(0)

In [234]:
# Show unique values in the column
unique_values = df['revol_util'].unique()
print("Unique values:", unique_values)
print(len(unique_values))

# Count missing values in the column
missing_values_count = df['revol_util'].isnull().sum()
print("Number of missing values:", missing_values_count)

print(len(df['revol_util']))

Unique values: [0.837 0.094 0.985 ... 1.035 1.053 1.057]
1120
Number of missing values: 90
42535


In [235]:
# Replace missing values with 3
df['revol_util'] = df['revol_util'].fillna(222)

In [237]:
# Check which columns contain NaN values
columns_with_nan = df.columns[df.isnull().any()].tolist()

print("Columns containing NaN values:")
print(columns_with_nan)

Columns containing NaN values:
[]


## German Credit Dataset With Header Names

### Data Contains Encoded values (e.g. A11, A121) and numerical values for the rest [DF1]

In [238]:
df1 = df.copy()

In [239]:
df1.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,2


### Data Contains the dataset converted with Label Encoder [DF5] (contiued on the work of DF4)

In [240]:
df5 = df1.copy()

In [241]:
# Drop the column from the DataFrame
df5.drop(columns=['desc'], inplace=True)

In [242]:
df5.head(10)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,2
2,1077175,1313524.0,2400.0,2400.0,2400.0,36months,0.1596,84.33,C,C5,Unknown,10,RENT,12252.0,Not Verified,2011,False,small_business,real estate business,606,IL,8.72,0.0,2001,735.0,739.0,2.0,500.0,222.0,2.0,0.0,2956.0,0.985,10.0,0.0,0.0,3005.666844,3005.67,2400.0,605.67,0.0,0.0,0.0,2014,649.91,0,2016,719.0,715.0,False,False,False,0.0,0.0,False,1
3,1076863,1277178.0,10000.0,10000.0,10000.0,36months,0.1349,339.31,C,C1,AIR RESOURCES BOARD,10,RENT,49200.0,Source Verified,2011,False,other,personel,917,CA,20.0,0.0,1996,690.0,694.0,1.0,35.0,222.0,10.0,0.0,5598.0,0.21,37.0,0.0,0.0,12231.89,12231.89,10000.0,2214.92,16.97,0.0,0.0,2015,357.48,0,2016,604.0,600.0,False,False,False,0.0,0.0,False,1
4,1075358,1311748.0,3000.0,3000.0,3000.0,60months,0.1269,67.79,B,B5,University Medical Group,1,RENT,80000.0,Source Verified,2011,False,other,Personal,972,OR,17.94,0.0,1996,695.0,699.0,0.0,38.0,222.0,15.0,0.0,27783.0,0.539,38.0,270.78,270.78,3784.49,3784.49,2729.22,1055.27,0.0,0.0,0.0,2016,67.79,2016,2016,694.0,690.0,False,False,False,0.0,0.0,False,1
5,1075269,1311441.0,5000.0,5000.0,5000.0,36months,0.079,156.46,A,A4,Veolia Transportaton,3,RENT,36000.0,Source Verified,2011,False,wedding,My wedding loan I promise to pay back,852,AZ,11.2,0.0,2004,730.0,734.0,3.0,500.0,222.0,9.0,0.0,7963.0,0.283,12.0,0.0,0.0,5632.21,5632.21,5000.0,632.21,0.0,0.0,0.0,2015,161.03,0,2016,679.0,675.0,False,False,False,0.0,0.0,False,1
6,1069639,1304742.0,7000.0,7000.0,7000.0,60months,0.1596,170.08,C,C5,Southern Star Photography,8,RENT,47004.0,Not Verified,2011,False,debt_consolidation,Loan,280,NC,23.51,0.0,2005,690.0,694.0,1.0,500.0,222.0,7.0,0.0,17726.0,0.856,11.0,0.0,0.0,10137.840008,10137.84,7000.0,3137.84,0.0,0.0,0.0,2016,1313.76,0,2016,654.0,650.0,False,False,False,0.0,0.0,False,1
7,1072053,1288686.0,3000.0,3000.0,3000.0,36months,0.1864,109.43,E,E1,MKC Accounting,9,RENT,48000.0,Source Verified,2011,False,car,Car Downpayment,900,CA,5.35,0.0,2007,660.0,664.0,2.0,500.0,222.0,4.0,0.0,8221.0,0.875,4.0,0.0,0.0,3939.135294,3939.14,3000.0,939.14,0.0,0.0,0.0,2015,111.34,0,2014,689.0,685.0,False,False,False,0.0,0.0,False,1
8,1071795,1306957.0,5600.0,5600.0,5600.0,60months,0.2128,152.39,F,F2,Unknown,4,OWN,40000.0,Source Verified,2011,False,small_business,Expand Business & Buy Debt Portfolio,958,CA,5.55,0.0,2004,675.0,679.0,2.0,500.0,222.0,11.0,0.0,5210.0,0.326,13.0,0.0,0.0,646.02,646.02,162.02,294.94,0.0,189.06,2.09,2012,152.39,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,2
9,1071570,1306721.0,5375.0,5375.0,5350.0,60months,0.1269,121.45,B,B5,Starbucks,1,RENT,15000.0,Verified,2011,False,other,Building my credit history.,774,TX,18.08,0.0,2004,725.0,729.0,0.0,500.0,222.0,2.0,0.0,9279.0,0.365,3.0,0.0,0.0,1476.19,1469.34,673.48,533.42,0.0,269.29,2.52,2012,121.45,0,2016,519.0,515.0,False,False,False,0.0,0.0,False,2


In [243]:
mixed_type_columns = []
for col in df5.columns:
    if df5[col].apply(lambda x: isinstance(x, (int, float))).all():
        continue
    mixed_type_columns.append(col)

print("Columns with mixed data types:", mixed_type_columns)

Columns with mixed data types: ['id', 'term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'title', 'addr_state', 'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d']


In [244]:
# List to store the columns that cannot be converted to float
non_float_columns = []

# Convert each column to float
for col in mixed_type_columns:
    try:
        df5[col] = df5[col].astype(float)
    except ValueError:
        # If conversion fails, append the column name to non_float_columns list
        non_float_columns.append(col)

# Print the columns that cannot be converted to float
print("Columns that cannot be converted to float:", non_float_columns)

Columns that cannot be converted to float: ['term', 'grade', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'purpose', 'title', 'addr_state']


In [245]:
df5.dtypes

id                            float64
member_id                     float64
loan_amnt                     float64
funded_amnt                   float64
funded_amnt_inv               float64
term                           object
int_rate                      float64
installment                   float64
grade                          object
sub_grade                      object
emp_title                      object
emp_length                    float64
home_ownership                 object
annual_inc                    float64
verification_status            object
issue_d                       float64
pymnt_plan                     object
purpose                        object
title                          object
zip_code                        int64
addr_state                     object
dti                           float64
delinq_2yrs                   float64
earliest_cr_line              float64
fico_range_low                float64
fico_range_high               float64
inq_last_6mt

In [246]:
from sklearn.preprocessing import LabelEncoder

In [247]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode object columns
object_columns = df5.select_dtypes(include=['object']).columns
for col in object_columns:
    df5[col] = label_encoder.fit_transform(df5[col])

# Encode boolean columns
boolean_columns = df5.select_dtypes(include=['bool']).columns
for col in boolean_columns:
    df5[col] = df5[col].astype(int)

In [248]:
df5.head(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501.0,1296599.0,5000.0,5000.0,4975.0,0,0.1065,162.87,1,6,25275,10.0,4,24000.0,2,2011.0,0,1,3160,860,3,27.65,0.0,1985.0,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015.0,171.62,0.0,2016.0,744.0,740.0,0,0,0,0.0,0.0,0,1
1,1077430.0,1314167.0,2500.0,2500.0,2500.0,1,0.1527,59.83,2,13,20173,1.0,4,30000.0,1,2011.0,0,0,17558,309,10,1.0,0.0,1999.0,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013.0,119.66,0.0,2016.0,499.0,0.0,0,0,0,0.0,0.0,0,2
2,1077175.0,1313524.0,2400.0,2400.0,2400.0,0,0.1596,84.33,2,14,25275,10.0,4,12252.0,0,2011.0,0,11,20488,606,14,8.72,0.0,2001.0,735.0,739.0,2.0,500.0,222.0,2.0,0.0,2956.0,0.985,10.0,0.0,0.0,3005.666844,3005.67,2400.0,605.67,0.0,0.0,0.0,2014.0,649.91,0.0,2016.0,719.0,715.0,0,0,0,0.0,0.0,0,1
3,1076863.0,1277178.0,10000.0,10000.0,10000.0,0,0.1349,339.31,2,10,357,10.0,4,49200.0,1,2011.0,0,9,20372,917,4,20.0,0.0,1996.0,690.0,694.0,1.0,35.0,222.0,10.0,0.0,5598.0,0.21,37.0,0.0,0.0,12231.89,12231.89,10000.0,2214.92,16.97,0.0,0.0,2015.0,357.48,0.0,2016.0,604.0,600.0,0,0,0,0.0,0.0,0,1
4,1075358.0,1311748.0,3000.0,3000.0,3000.0,1,0.1269,67.79,1,9,25135,1.0,4,80000.0,1,2011.0,0,9,13366,972,36,17.94,0.0,1996.0,695.0,699.0,0.0,38.0,222.0,15.0,0.0,27783.0,0.539,38.0,270.78,270.78,3784.49,3784.49,2729.22,1055.27,0.0,0.0,0.0,2016.0,67.79,2016.0,2016.0,694.0,690.0,0,0,0,0.0,0.0,0,1


In [249]:
# Assuming df is your DataFrame
df5.to_csv('df5.csv')

In [None]:
sys.exit()

##### Data Contains the dataset converted with Label Encoder [DF5b] (continued on the work of DF3 - encoded and numerical values)

In [None]:
df5b = df3.copy()

In [None]:
#label encoding

from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is your DataFrame
# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate through each column in the DataFrame
for column in df5b.columns:
    # Check if the column is categorical
    if df5b[column].dtype == 'object':
        # Use LabelEncoder to encode the categorical column
        df5b[column] = label_encoder.fit_transform(df5b[column])

In [None]:
df5b.head()

##### Data Contains the dataset converted with Label Encoder [DF5c] (continued on the work of DF3 - labeled and numerical values)

In [None]:
df5c = df5b.copy()

In [None]:
# Drop columns containing 'coded' in their names
columns_to_drop = [col for col in df5c.columns if 'coded' in col or 'quantile' in col]
df5c.drop(columns=columns_to_drop, inplace=True)

df5c.head()

In [None]:
# Assuming df is your DataFrame
df5c.to_csv('df5c.csv')

### Data Contains the dataset converted with Label Encoder and modify it to be used by fasttext[DF7] (continued on the work of DF4)

In [264]:
df7 = df1.copy()

In [265]:
df7.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,2


In [266]:
# Assuming your target column is named "target"
df7['target'] = '__label__' + df7['target'].astype(str)

In [267]:
df7.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,__label__1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,__label__2


In [268]:
df7 = df7.drop(columns=['desc'])

In [269]:
# Select columns from the second column till the last column
selected_columns = df7.iloc[:, :-1]

In [270]:
selected_columns.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False


In [271]:
# Create a new column "content" by concatenating values from selected columns
df7['content'] = df7['target'] + ' ' + selected_columns.apply(lambda row: ' '.join(str(val) for val in row), axis=1)

In [272]:
df7.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target,content
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,__label__1,__label__1 1077501 1296599.0 5000.0 5000.0 497...
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,__label__2,__label__2 1077430 1314167.0 2500.0 2500.0 250...


In [273]:
# Assuming df is your DataFrame
df7.to_csv('df7.csv')

### Data Contains the dataset converted with quartiles for the numerical values and modify it to be used by fasttext[DF8] 

In [313]:
df8 = df1.copy()

In [314]:
df8.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,2


In [315]:
# Assuming your target column is named "target"
df8['target'] = '__label__' + df8['target'].astype(str)

In [316]:
df8.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,__label__1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,Borrower added on 12/22/11 > I plan to use t...,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,__label__2


In [317]:
df8 = df8.drop(columns=['desc'])

In [318]:
df8.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,target
0,1077501,1296599.0,5000.0,5000.0,4975.0,36months,0.1065,162.87,B,B2,Unknown,10,RENT,24000.0,Verified,2011,False,credit_card,Computer,860,AZ,27.65,0.0,1985,735.0,739.0,1.0,500.0,222.0,3.0,0.0,13648.0,0.837,9.0,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,2015,171.62,0,2016,744.0,740.0,False,False,False,0.0,0.0,False,__label__1
1,1077430,1314167.0,2500.0,2500.0,2500.0,60months,0.1527,59.83,C,C4,Ryder,1,RENT,30000.0,Source Verified,2011,False,car,bike,309,GA,1.0,0.0,1999,740.0,744.0,5.0,500.0,222.0,3.0,0.0,1687.0,0.094,4.0,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,2013,119.66,0,2016,499.0,0.0,False,False,False,0.0,0.0,False,__label__2


In [319]:
# First, select only the numerical columns excluding 'id' and 'member_id'
numerical_columns = df8.select_dtypes(include=['number']).drop(columns=['member_id'])

# Initialize an empty DataFrame to store the categorized values
categorized_df = pd.DataFrame()

# Iterate over each numerical column
for col in numerical_columns.columns:
    # Use qcut to categorize values into 10 quartiles
    quartile_categories = pd.qcut(df8[col], q=10, labels=False, duplicates='drop')
    
    # Create new category names based on column index and quartile
    category_names = 'C' + str(numerical_columns.columns.get_loc(col)) + "Q" + quartile_categories.astype(str)
    
    # Add the categorized values to the new DataFrame
    categorized_df[col] = category_names

# Concatenate the categorized DataFrame with non-numerical columns and 'id', 'member_id' columns from the original DataFrame
df8_categorized = pd.concat([df8.select_dtypes(exclude=['number']), categorized_df], axis=1)

In [320]:
# Drop the "target" column from the DataFrame
target_column = df8_categorized.pop('target')

# Add the "target" column back to the DataFrame as the last column
df8_categorized['target'] = target_column

In [321]:
df8_categorized.head(2)

Unnamed: 0,id,term,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,issue_d,pymnt_plan,purpose,title,addr_state,earliest_cr_line,last_pymnt_d,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,tax_liens,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,zip_code,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,last_fico_range_high,last_fico_range_low,delinq_amnt,pub_rec_bankruptcies,target
0,1077501,36months,B,B2,Unknown,10,RENT,Verified,2011,False,credit_card,Computer,AZ,1985,2015,0,2016,False,False,False,False,C0Q1,C1Q1,C2Q2,C3Q3,C4Q2,C5Q0,C6Q7,C7Q9,C8Q0,C9Q7,C10Q7,C11Q0,C12Q3,C13Q0,C14Q0,C15Q0,C16Q6,C17Q8,C18Q0,C19Q0,C20Q0,C21Q2,C22Q3,C23Q2,C24Q3,C25Q0,C26Q0,C27Q0,C28Q1,C29Q7,C30Q7,C31Q0,C32Q0,__label__1
1,1077430,60months,C,C4,Ryder,1,RENT,Source Verified,2011,False,car,bike,GA,1999,2013,0,2016,False,False,False,False,C0Q0,C1Q0,C2Q0,C3Q7,C4Q0,C5Q0,C6Q3,C7Q0,C8Q0,C9Q7,C10Q7,C11Q3,C12Q3,C13Q0,C14Q0,C15Q0,C16Q1,C17Q1,C18Q0,C19Q0,C20Q0,C21Q0,C22Q0,C23Q0,C24Q1,C25Q0,C26Q1,C27Q1,C28Q1,C29Q0,C30Q0,C31Q0,C32Q0,__label__2


In [323]:
# Select columns from the second column till the last column
selected_columns = df8_categorized.iloc[:, :-1]

In [324]:
# Create a new column "content" by concatenating values from selected columns
df8_categorized['content'] = df8_categorized['target'] + ' ' + selected_columns.apply(lambda row: ' '.join(str(val) for val in row), axis=1)

In [325]:
df8 = df8_categorized.copy()

In [326]:
df8.head(2)

Unnamed: 0,id,term,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,issue_d,pymnt_plan,purpose,title,addr_state,earliest_cr_line,last_pymnt_d,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,tax_liens,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,zip_code,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,last_fico_range_high,last_fico_range_low,delinq_amnt,pub_rec_bankruptcies,target,content
0,1077501,36months,B,B2,Unknown,10,RENT,Verified,2011,False,credit_card,Computer,AZ,1985,2015,0,2016,False,False,False,False,C0Q1,C1Q1,C2Q2,C3Q3,C4Q2,C5Q0,C6Q7,C7Q9,C8Q0,C9Q7,C10Q7,C11Q0,C12Q3,C13Q0,C14Q0,C15Q0,C16Q6,C17Q8,C18Q0,C19Q0,C20Q0,C21Q2,C22Q3,C23Q2,C24Q3,C25Q0,C26Q0,C27Q0,C28Q1,C29Q7,C30Q7,C31Q0,C32Q0,__label__1,__label__1 1077501 36months B B2 Unknown 10 R...
1,1077430,60months,C,C4,Ryder,1,RENT,Source Verified,2011,False,car,bike,GA,1999,2013,0,2016,False,False,False,False,C0Q0,C1Q0,C2Q0,C3Q7,C4Q0,C5Q0,C6Q3,C7Q0,C8Q0,C9Q7,C10Q7,C11Q3,C12Q3,C13Q0,C14Q0,C15Q0,C16Q1,C17Q1,C18Q0,C19Q0,C20Q0,C21Q0,C22Q0,C23Q0,C24Q1,C25Q0,C26Q1,C27Q1,C28Q1,C29Q0,C30Q0,C31Q0,C32Q0,__label__2,__label__2 1077430 60months C C4 Ryder 1 REN...


In [327]:
# Assuming df is your DataFrame
df8.to_csv('df8.csv')

### Data Contains the dataset converted with Label Encoder and modify it to be used by fasttext[DF7a] (continued on the work of DF2)

In [None]:
df7a = df2.copy()

In [None]:
df7a.head(2)

In [None]:
# Assuming your target column is named "target"
df7a['target'] = '__label__' + df7a['target'].astype(str)

In [None]:
df7a.head(2)

In [None]:
# Select columns from the second column till the last column
selected_columns = df7a.iloc[:, :-1]

In [None]:
selected_columns.head(2)

In [None]:
# Create a new column "content" by concatenating values from selected columns
df7a['content'] = df7a['target'] + ' ' + selected_columns.apply(lambda row: ' '.join(str(val) for val in row), axis=1)

In [None]:
df7a.head(2)

In [None]:
# Assuming df is your DataFrame
df7a.to_csv('df7a.csv')

### Data Contains the dataset converted to One Hot [DF6] (contiued on the work of DF4)

In [None]:
df6 = df4.copy()

In [None]:
# Separate numerical and categorical columns
numerical_cols = df6.select_dtypes(include=['int', 'float']).columns
categorical_cols = df6.select_dtypes(include=['object']).columns

# Perform one-hot encoding on categorical columns
one_hot_encoded_cols = pd.get_dummies(df6[categorical_cols])

# Concatenate the one-hot encoded columns with the original DataFrame
df6_hot = pd.concat([one_hot_encoded_cols, df5[numerical_cols]], axis=1)

In [None]:
df6_hot.head()

### DF7b ([DF7 dataset values replaced with vectors through word2vec])

In [None]:
df7b = df7.copy()

In [None]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
import numpy as np
np.random.seed(42)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [None]:
features = df7['content'].apply(lambda x: x.split(' ', 1)[1])  # Drop the target column to get the feature columns
target = df7['target'].apply(lambda x: int(x.split("__label__")[1]))  # Target column to predict

In [None]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

features = features.apply(preprocess)

In [None]:
sentences = [sentence.split() for sentence in features]

In [None]:
w2v_model = Word2Vec(sentences, vector_size=20, window=21, min_count=2, workers=4, seed=42) #size=100,

In [None]:
import numpy as np

In [None]:
# Assuming w2v_model is your Word2Vec model object
vocabulary = list(w2v_model.wv.key_to_index.keys())

# Print the vocabulary
print("Vocabulary size:", len(vocabulary))
print("Example words in the vocabulary:")
for word in vocabulary[:2]:
    print(word)


In [None]:
df7b = df4.copy()

In [None]:
df7b = df7b.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
df7b.head(2)

In [None]:
# Vectorize the values in the column using Word2Vec
#df7b['account_balance'] = df7b['account_balance'].apply(lambda words: np.mean([w2v_model.wv[word] for word in words.split() if word in w2v_model.wv], axis=0))

In [None]:
# Iterate over each column in df7b
for col in df7b.columns:
    # Check if the column contains strings
    if df7b[col].dtype == 'object':
        # Lowercase the words in the column
        df7b[col] = df7b[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
        # Vectorize the values in the column using Word2Vec
        df7b[col] = df7b[col].apply(lambda words: np.mean([w2v_model.wv[word] for word in words.split() if word in w2v_model.wv], axis=0) if isinstance(words, str) else np.nan)

In [None]:
df7b.head(2)

In [None]:
# Assuming df is your DataFrame
df7b.to_csv('df7b.csv')

### DF7C ([DF7 dataset values replaced with vectors through FastText])

In [None]:
from gensim.models import FastText

In [None]:
features = df7['content'].apply(lambda x: x.split(' ', 1)[1])  # Drop the target column to get the feature columns
target = df7['target'].apply(lambda x: int(x.split("__label__")[1]))  # Target column to predict

In [None]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

features = features.apply(preprocess)

In [None]:
sentences = [sentence.split() for sentence in features]

In [None]:
# Train the FastText model
fasttext_model = FastText(sentences, vector_size=20, window=21, min_count=2, workers=4, seed=42)

In [None]:
df7c = df4.copy()

In [None]:
df7c = df7c.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
df7c.head(2)

In [None]:
# Iterate over each column in df7b
for col in df7c.columns:
    # Check if the column contains strings
    if df7c[col].dtype == 'object':
        # Lowercase the words in the column
        df7c[col] = df7c[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
        # Vectorize the values in the column using FastText
        df7c[col] = df7c[col].apply(lambda words: np.mean([fasttext_model.wv[word] for word in words.split() if word in fasttext_model.wv], axis=0) if isinstance(words, str) else np.nan)

In [None]:
#fasttext_model.wv['a201']

In [None]:
df7c.head(2)

In [None]:
# Assuming df is your DataFrame
df7c.to_csv('df7c.csv')

# OLS Regression

### finding the importance of each attribute with target as dependent variable

In [None]:
# Assuming 'df' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5c.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5c['target']  # Target column to predict

# Splitting the dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [None]:
import statsmodels.api as sm

In [None]:
# Add a constant term to the features (intercept)
X_train_with_const = sm.add_constant(X_train)
X_test_with_const = sm.add_constant(X_test)

# Fit the OLS regression model
model = sm.OLS(y_train, X_train_with_const)
result = model.fit()