In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [14]:
df = pd.read_csv('../data/loan.csv', low_memory=False)

# 1. Exploring the Data

In [15]:
df.head(20)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
2,1077175,1313524,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,
3,1076863,1277178,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,
4,1075358,1311748,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,
5,1075269,1311441,5000.0,5000.0,5000.0,36 months,7.9,156.46,A,A4,...,,,,,,,,,,
6,1069639,1304742,7000.0,7000.0,7000.0,60 months,15.96,170.08,C,C5,...,,,,,,,,,,
7,1072053,1288686,3000.0,3000.0,3000.0,36 months,18.64,109.43,E,E1,...,,,,,,,,,,
8,1071795,1306957,5600.0,5600.0,5600.0,60 months,21.28,152.39,F,F2,...,,,,,,,,,,
9,1071570,1306721,5375.0,5375.0,5350.0,60 months,12.69,121.45,B,B5,...,,,,,,,,,,


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887379 entries, 0 to 887378
Data columns (total 74 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           887379 non-null  int64  
 1   member_id                    887379 non-null  int64  
 2   loan_amnt                    887379 non-null  float64
 3   funded_amnt                  887379 non-null  float64
 4   funded_amnt_inv              887379 non-null  float64
 5   term                         887379 non-null  object 
 6   int_rate                     887379 non-null  float64
 7   installment                  887379 non-null  float64
 8   grade                        887379 non-null  object 
 9   sub_grade                    887379 non-null  object 
 10  emp_title                    835917 non-null  object 
 11  emp_length                   842554 non-null  object 
 12  home_ownership               887379 non-null  object 
 13 

# 2. Filtering and Cleaning

In [17]:
# Filter for single person applications
df_indv = df[df["application_type"] != "JOINT"]

In [18]:
df_indv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 886868 entries, 0 to 887378
Data columns (total 74 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           886868 non-null  int64  
 1   member_id                    886868 non-null  int64  
 2   loan_amnt                    886868 non-null  float64
 3   funded_amnt                  886868 non-null  float64
 4   funded_amnt_inv              886868 non-null  float64
 5   term                         886868 non-null  object 
 6   int_rate                     886868 non-null  float64
 7   installment                  886868 non-null  float64
 8   grade                        886868 non-null  object 
 9   sub_grade                    886868 non-null  object 
 10  emp_title                    835467 non-null  object 
 11  emp_length                   842104 non-null  object 
 12  home_ownership               886868 non-null  object 
 13  annu

In [19]:
# Extract relevant columns to new df
df_indv.drop(columns=df_indv.columns.difference(['id','member_id','loan_amnt','funded_amnt','term','int_rate','installment',
                                         'grade','sub_grade','emp_length','home_ownership','annual_inc','verification_status','loan_status','pymnt_plan','purpose','addr_state','dti','delinq_2yrs','mths_since_last_delinq','total_acc','out_prncp', 'total_pymnt','total_rec_prncp','total_rec_interest','total_rec_late_fee','acc_now_delinq']), inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv.drop(columns=df_indv.columns.difference(['id','member_id','loan_amnt','funded_amnt','term','int_rate','installment',


# Dealing with Nulls

In [21]:
# Look at all the nulls 
df_indv.isnull().sum()

id                             0
member_id                      0
loan_amnt                      0
funded_amnt                    0
term                           0
int_rate                       0
installment                    0
grade                          0
sub_grade                      0
emp_length                 44764
home_ownership                 0
annual_inc                     4
verification_status            0
loan_status                    0
pymnt_plan                     0
purpose                        0
addr_state                     0
dti                            0
delinq_2yrs                   29
mths_since_last_delinq    454080
total_acc                     29
out_prncp                      0
total_pymnt                    0
total_rec_prncp                0
total_rec_late_fee             0
acc_now_delinq                29
dtype: int64

In [22]:
# Fill any int or float columns null with 0

for column in df_indv.columns:
    if df_indv[column].isnull().any() and df_indv[column].dtype in ['int64', 'float64']:
        df_indv[column].fillna(0, inplace=True)
df_indv.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv[column].fillna(0, inplace=True)


id                            0
member_id                     0
loan_amnt                     0
funded_amnt                   0
term                          0
int_rate                      0
installment                   0
grade                         0
sub_grade                     0
emp_length                44764
home_ownership                0
annual_inc                    0
verification_status           0
loan_status                   0
pymnt_plan                    0
purpose                       0
addr_state                    0
dti                           0
delinq_2yrs                   0
mths_since_last_delinq        0
total_acc                     0
out_prncp                     0
total_pymnt                   0
total_rec_prncp               0
total_rec_late_fee            0
acc_now_delinq                0
dtype: int64

In [23]:
emp_length_map = {
    '10+ years': 11,
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    np.nan: 0  
}

df_indv['emp_length'] = df_indv['emp_length'].map(emp_length_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv['emp_length'] = df_indv['emp_length'].map(emp_length_map)


In [24]:
df_indv['emp_length'].unique()

array([11,  0,  1,  3,  8,  9,  4,  5,  6,  2,  7])

In [25]:
df_indv.isnull().sum()

id                        0
member_id                 0
loan_amnt                 0
funded_amnt               0
term                      0
int_rate                  0
installment               0
grade                     0
sub_grade                 0
emp_length                0
home_ownership            0
annual_inc                0
verification_status       0
loan_status               0
pymnt_plan                0
purpose                   0
addr_state                0
dti                       0
delinq_2yrs               0
mths_since_last_delinq    0
total_acc                 0
out_prncp                 0
total_pymnt               0
total_rec_prncp           0
total_rec_late_fee        0
acc_now_delinq            0
dtype: int64

# Dealing with non numeric features

In [26]:
df_indv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 886868 entries, 0 to 887378
Data columns (total 26 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      886868 non-null  int64  
 1   member_id               886868 non-null  int64  
 2   loan_amnt               886868 non-null  float64
 3   funded_amnt             886868 non-null  float64
 4   term                    886868 non-null  object 
 5   int_rate                886868 non-null  float64
 6   installment             886868 non-null  float64
 7   grade                   886868 non-null  object 
 8   sub_grade               886868 non-null  object 
 9   emp_length              886868 non-null  int64  
 10  home_ownership          886868 non-null  object 
 11  annual_inc              886868 non-null  float64
 12  verification_status     886868 non-null  object 
 13  loan_status             886868 non-null  object 
 14  pymnt_plan              8

# Grade and Subgrade

In [27]:
df_indv['grade'].unique()

array(['B', 'C', 'A', 'E', 'F', 'D', 'G'], dtype=object)

In [28]:
sorted(df_indv['sub_grade'].unique())

['A1',
 'A2',
 'A3',
 'A4',
 'A5',
 'B1',
 'B2',
 'B3',
 'B4',
 'B5',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'E1',
 'E2',
 'E3',
 'E4',
 'E5',
 'F1',
 'F2',
 'F3',
 'F4',
 'F5',
 'G1',
 'G2',
 'G3',
 'G4',
 'G5']

In [29]:
grades = {
    'A' : 1,
    'B' : 2,
    'C' : 3,
    'D' : 4,
    'E' : 5,
    'F' : 6,
    'G' : 7
}
subgrades = {
    'A1': 1,
 'A2': 2,
 'A3': 3,
 'A4': 4,
 'A5': 5,
 'B1': 6,
 'B2': 7,
    'B3': 8,
 'B4': 9,
 'B5':10,
 'C1': 11,
 'C2': 12,
 'C3': 13,
 'C4': 14,
 'C5': 15,
 'D1': 16,
 'D2': 17,
 'D3': 18,
 'D4': 19,
 'D5': 20,
 'E1': 21,
 'E2': 22,
 'E3': 23,
 'E4': 24,
 'E5': 25,
 'F1': 26,
 'F2': 27,
 'F3': 28,
 'F4': 29,
 'F5': 30,
 'G1': 31,
 'G2': 32,
 'G3': 33,
 'G4': 34,
 'G5': 35
}

In [30]:
df_indv['grade'] = df_indv['grade'].map(grades)
df_indv['sub_grade'] = df_indv['sub_grade'].map(subgrades)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv['grade'] = df_indv['grade'].map(grades)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv['sub_grade'] = df_indv['sub_grade'].map(subgrades)


In [31]:
df_indv.head(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,...,addr_state,dti,delinq_2yrs,mths_since_last_delinq,total_acc,out_prncp,total_pymnt,total_rec_prncp,total_rec_late_fee,acc_now_delinq
0,1077501,1296599,5000.0,5000.0,36 months,10.65,162.87,2,7,11,...,AZ,27.65,0.0,0.0,9.0,0.0,5861.071414,5000.0,0.0,0.0
1,1077430,1314167,2500.0,2500.0,60 months,15.27,59.83,3,14,0,...,GA,1.0,0.0,0.0,4.0,0.0,1008.71,456.46,0.0,0.0
2,1077175,1313524,2400.0,2400.0,36 months,15.96,84.33,3,15,11,...,IL,8.72,0.0,0.0,10.0,0.0,3003.653644,2400.0,0.0,0.0
3,1076863,1277178,10000.0,10000.0,36 months,13.49,339.31,3,11,11,...,CA,20.0,0.0,35.0,37.0,0.0,12226.302212,10000.0,16.97,0.0
4,1075358,1311748,3000.0,3000.0,60 months,12.69,67.79,2,10,1,...,OR,17.94,0.0,38.0,38.0,766.9,3242.17,2233.1,0.0,0.0


# Term

In [32]:
df_indv['term']

0          36 months
1          60 months
2          36 months
3          36 months
4          60 months
             ...    
887374     36 months
887375     36 months
887376     60 months
887377     60 months
887378     36 months
Name: term, Length: 886868, dtype: object

In [33]:
df_indv['term'] = df_indv['term'].str.extract('(\d+)').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv['term'] = df_indv['term'].str.extract('(\d+)').astype(int)


In [34]:
df_indv['term']

0         36
1         60
2         36
3         36
4         60
          ..
887374    36
887375    36
887376    60
887377    60
887378    36
Name: term, Length: 886868, dtype: int64

# Homeownership

In [35]:
sorted(df_indv['home_ownership'].unique())

['ANY', 'MORTGAGE', 'NONE', 'OTHER', 'OWN', 'RENT']

In [36]:
homeownership = {
    'ANY': 0,
    'MORTGAGE': -1,
    'NONE': 0,
    'OTHER': 0,
    'OWN': 2,
    'RENT': 1
}

In [37]:
df_indv['home_ownership'] = df_indv['home_ownership'].map(homeownership)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv['home_ownership'] = df_indv['home_ownership'].map(homeownership)


In [38]:
df_indv.head(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,...,addr_state,dti,delinq_2yrs,mths_since_last_delinq,total_acc,out_prncp,total_pymnt,total_rec_prncp,total_rec_late_fee,acc_now_delinq
0,1077501,1296599,5000.0,5000.0,36,10.65,162.87,2,7,11,...,AZ,27.65,0.0,0.0,9.0,0.0,5861.071414,5000.0,0.0,0.0
1,1077430,1314167,2500.0,2500.0,60,15.27,59.83,3,14,0,...,GA,1.0,0.0,0.0,4.0,0.0,1008.71,456.46,0.0,0.0
2,1077175,1313524,2400.0,2400.0,36,15.96,84.33,3,15,11,...,IL,8.72,0.0,0.0,10.0,0.0,3003.653644,2400.0,0.0,0.0
3,1076863,1277178,10000.0,10000.0,36,13.49,339.31,3,11,11,...,CA,20.0,0.0,35.0,37.0,0.0,12226.302212,10000.0,16.97,0.0
4,1075358,1311748,3000.0,3000.0,60,12.69,67.79,2,10,1,...,OR,17.94,0.0,38.0,38.0,766.9,3242.17,2233.1,0.0,0.0


# Verification

In [39]:
sorted(df_indv['verification_status'].unique())

['Not Verified', 'Source Verified', 'Verified']

In [40]:
verification = {
    'Not Verified' : -1,
    'Source Verified':1,
    'Verified': 2
}

In [41]:
df_indv['verification_status'] = df_indv['verification_status'].map(verification)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv['verification_status'] = df_indv['verification_status'].map(verification)


In [42]:
df_indv.head(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,...,addr_state,dti,delinq_2yrs,mths_since_last_delinq,total_acc,out_prncp,total_pymnt,total_rec_prncp,total_rec_late_fee,acc_now_delinq
0,1077501,1296599,5000.0,5000.0,36,10.65,162.87,2,7,11,...,AZ,27.65,0.0,0.0,9.0,0.0,5861.071414,5000.0,0.0,0.0
1,1077430,1314167,2500.0,2500.0,60,15.27,59.83,3,14,0,...,GA,1.0,0.0,0.0,4.0,0.0,1008.71,456.46,0.0,0.0
2,1077175,1313524,2400.0,2400.0,36,15.96,84.33,3,15,11,...,IL,8.72,0.0,0.0,10.0,0.0,3003.653644,2400.0,0.0,0.0
3,1076863,1277178,10000.0,10000.0,36,13.49,339.31,3,11,11,...,CA,20.0,0.0,35.0,37.0,0.0,12226.302212,10000.0,16.97,0.0
4,1075358,1311748,3000.0,3000.0,60,12.69,67.79,2,10,1,...,OR,17.94,0.0,38.0,38.0,766.9,3242.17,2233.1,0.0,0.0


# Loan status -> default

In [43]:
sorted(df_indv['loan_status'].unique())

['Charged Off',
 'Current',
 'Default',
 'Does not meet the credit policy. Status:Charged Off',
 'Does not meet the credit policy. Status:Fully Paid',
 'Fully Paid',
 'In Grace Period',
 'Issued',
 'Late (16-30 days)',
 'Late (31-120 days)']

In [44]:
l_stat = {
    'Charged Off': 1,
    'Default':1,
    'Does not meet the credit policy. Status:Charged Off':0,
    'Late (16-30 days)':1,
    'Late (31-120 days)': 1,
 'Current': 0,
 'Does not meet the credit policy. Status:Fully Paid' : 0,
 'Fully Paid': 0,
 'In Grace Period':0,
 'Issued': 0,
}

In [45]:
df_indv['loan_status'] = df_indv['loan_status'].map(l_stat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv['loan_status'] = df_indv['loan_status'].map(l_stat)


In [46]:
df_indv.head(15)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,...,addr_state,dti,delinq_2yrs,mths_since_last_delinq,total_acc,out_prncp,total_pymnt,total_rec_prncp,total_rec_late_fee,acc_now_delinq
0,1077501,1296599,5000.0,5000.0,36,10.65,162.87,2,7,11,...,AZ,27.65,0.0,0.0,9.0,0.0,5861.071414,5000.0,0.0,0.0
1,1077430,1314167,2500.0,2500.0,60,15.27,59.83,3,14,0,...,GA,1.0,0.0,0.0,4.0,0.0,1008.71,456.46,0.0,0.0
2,1077175,1313524,2400.0,2400.0,36,15.96,84.33,3,15,11,...,IL,8.72,0.0,0.0,10.0,0.0,3003.653644,2400.0,0.0,0.0
3,1076863,1277178,10000.0,10000.0,36,13.49,339.31,3,11,11,...,CA,20.0,0.0,35.0,37.0,0.0,12226.302212,10000.0,16.97,0.0
4,1075358,1311748,3000.0,3000.0,60,12.69,67.79,2,10,1,...,OR,17.94,0.0,38.0,38.0,766.9,3242.17,2233.1,0.0,0.0
5,1075269,1311441,5000.0,5000.0,36,7.9,156.46,1,4,3,...,AZ,11.2,0.0,0.0,12.0,0.0,5631.377753,5000.0,0.0,0.0
6,1069639,1304742,7000.0,7000.0,60,15.96,170.08,3,15,8,...,NC,23.51,0.0,0.0,11.0,1889.15,8136.84,5110.85,0.0,0.0
7,1072053,1288686,3000.0,3000.0,36,18.64,109.43,5,21,9,...,CA,5.35,0.0,0.0,4.0,0.0,3938.144334,3000.0,0.0,0.0
8,1071795,1306957,5600.0,5600.0,60,21.28,152.39,6,27,4,...,CA,5.55,0.0,0.0,13.0,0.0,646.02,162.02,0.0,0.0
9,1071570,1306721,5375.0,5375.0,60,12.69,121.45,2,10,0,...,TX,18.08,0.0,0.0,3.0,0.0,1476.19,673.48,0.0,0.0


# payment plan

In [47]:
sorted(df_indv['pymnt_plan'].unique())

['n', 'y']

In [48]:
paym_plan = {
    'n' : 0,
    'y' : 1
}

In [49]:
df_indv['pymnt_plan'] = df_indv['pymnt_plan'].map(paym_plan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv['pymnt_plan'] = df_indv['pymnt_plan'].map(paym_plan)


# Purpose

In [50]:
sorted(df_indv['purpose'].unique())

['car',
 'credit_card',
 'debt_consolidation',
 'educational',
 'home_improvement',
 'house',
 'major_purchase',
 'medical',
 'moving',
 'other',
 'renewable_energy',
 'small_business',
 'vacation',
 'wedding']

In [51]:
for col in ["purpose", "addr_state"]:
    le = LabelEncoder()
    le.fit(df_indv[col])
    df_indv[col] = le.transform(df_indv[col])
df_indv.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv[col] = le.transform(df_indv[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv[col] = le.transform(df_indv[col])


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,...,addr_state,dti,delinq_2yrs,mths_since_last_delinq,total_acc,out_prncp,total_pymnt,total_rec_prncp,total_rec_late_fee,acc_now_delinq
0,1077501,1296599,5000.0,5000.0,36,10.65,162.87,2,7,11,...,3,27.65,0.0,0.0,9.0,0.0,5861.071414,5000.0,0.0,0.0
1,1077430,1314167,2500.0,2500.0,60,15.27,59.83,3,14,0,...,10,1.0,0.0,0.0,4.0,0.0,1008.71,456.46,0.0,0.0
2,1077175,1313524,2400.0,2400.0,36,15.96,84.33,3,15,11,...,14,8.72,0.0,0.0,10.0,0.0,3003.653644,2400.0,0.0,0.0
3,1076863,1277178,10000.0,10000.0,36,13.49,339.31,3,11,11,...,4,20.0,0.0,35.0,37.0,0.0,12226.302212,10000.0,16.97,0.0
4,1075358,1311748,3000.0,3000.0,60,12.69,67.79,2,10,1,...,37,17.94,0.0,38.0,38.0,766.9,3242.17,2233.1,0.0,0.0


In [52]:
df_indv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 886868 entries, 0 to 887378
Data columns (total 26 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      886868 non-null  int64  
 1   member_id               886868 non-null  int64  
 2   loan_amnt               886868 non-null  float64
 3   funded_amnt             886868 non-null  float64
 4   term                    886868 non-null  int64  
 5   int_rate                886868 non-null  float64
 6   installment             886868 non-null  float64
 7   grade                   886868 non-null  int64  
 8   sub_grade               886868 non-null  int64  
 9   emp_length              886868 non-null  int64  
 10  home_ownership          886868 non-null  int64  
 11  annual_inc              886868 non-null  float64
 12  verification_status     886868 non-null  int64  
 13  loan_status             886868 non-null  int64  
 14  pymnt_plan              8

In [53]:
df_indv.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'loan_status', 'pymnt_plan',
       'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'mths_since_last_delinq',
       'total_acc', 'out_prncp', 'total_pymnt', 'total_rec_prncp',
       'total_rec_late_fee', 'acc_now_delinq'],
      dtype='object')

In [57]:
df_indv.rename(columns={'loan_status': 'default'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_indv.rename(columns={'loan_status': 'default'}, inplace=True)


In [58]:
columns = ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'term', 'int_rate',
           'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
           'annual_inc', 'verification_status', 'pymnt_plan', 'purpose', 
           'addr_state', 'dti', 'delinq_2yrs', 'mths_since_last_delinq',
           'total_acc', 'out_prncp', 'total_pymnt', 'total_rec_prncp',
           'total_rec_late_fee', 'acc_now_delinq', 'default']

df_indv = df_indv[columns]

In [59]:
df_indv.to_csv("cleaned_data.csv")