In [11]:
import pandas as pd
import numpy as np
import random

In [12]:
# Read the data:

# We take the url to download our Dataset : 

Data = pd.read_csv("Loan_data.csv", low_memory=False)
Data.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000,5000,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500,2500,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,


In [30]:
# We need to check how is our dataset :
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466285 entries, 0 to 466284
Data columns (total 74 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           466285 non-null  int64  
 1   member_id                    466285 non-null  int64  
 2   loan_amnt                    466285 non-null  int64  
 3   funded_amnt                  466285 non-null  int64  
 4   funded_amnt_inv              466285 non-null  float64
 5   term                         466285 non-null  object 
 6   int_rate                     466285 non-null  float64
 7   installment                  466285 non-null  float64
 8   grade                        466285 non-null  object 
 9   sub_grade                    466285 non-null  object 
 10  emp_title                    438697 non-null  object 
 11  emp_length                   445277 non-null  object 
 12  home_ownership               466285 non-null  object 
 13 

In [6]:
# Checking for missing values
Data.isnull().sum()

id                       0
member_id                0
loan_amnt                0
funded_amnt              0
funded_amnt_inv          0
                     ...  
all_util            466285
total_rev_hi_lim     70276
inq_fi              466285
total_cu_tl         466285
inq_last_12m        466285
Length: 74, dtype: int64

In [13]:
# Dropping the missing values :

# I tried to erase rows with missing values but it erases every columns.
# Maybe we could decide to remove COLUMNS with missing values instead of rows.
# It is 'less professional' but it is also an easy way to remove features as we have too many of them.

Data.dropna(axis = 1, thresh = Data.shape[0]*0.9, inplace = True) # axis=1 is to remove comlumns instead of rows
Data.dropna(axis = 0, inplace = True) # axis0 is to remove rows which contain missing values
Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437710 entries, 1 to 466283
Data columns (total 49 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          437710 non-null  int64  
 1   member_id                   437710 non-null  int64  
 2   loan_amnt                   437710 non-null  int64  
 3   funded_amnt                 437710 non-null  int64  
 4   funded_amnt_inv             437710 non-null  float64
 5   term                        437710 non-null  object 
 6   int_rate                    437710 non-null  float64
 7   installment                 437710 non-null  float64
 8   grade                       437710 non-null  object 
 9   sub_grade                   437710 non-null  object 
 10  emp_title                   437710 non-null  object 
 11  emp_length                  437710 non-null  object 
 12  home_ownership              437710 non-null  object 
 13  annual_inc    

In [60]:
# Checking for missing values again :
Data.isnull().sum()
Data.info()

# We can see that we have less features and we keep all our observations.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437710 entries, 1 to 466283
Data columns (total 38 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   loan_amnt                   437710 non-null  int64  
 1   funded_amnt                 437710 non-null  int64  
 2   funded_amnt_inv             437710 non-null  float64
 3   term                        437710 non-null  int32  
 4   int_rate                    437710 non-null  float64
 5   installment                 437710 non-null  float64
 6   grade                       437710 non-null  object 
 7   emp_length                  437710 non-null  int64  
 8   home_ownership              437710 non-null  object 
 9   annual_inc                  437710 non-null  float64
 10  verification_status         437710 non-null  object 
 11  purpose                     437710 non-null  object 
 12  title                       437710 non-null  object 
 13  addr_state    

In [40]:
# We compute the base rate to check if our dataset is balanced :
Data.grade.value_counts()/Data.shape[0]

# We observe a huge difference between the grade so we will have to change that.

B    0.293774
C    0.268865
D    0.164465
A    0.160718
E    0.076649
F    0.028402
G    0.007126
Name: grade, dtype: float64

In [41]:
Data.grade.value_counts()

# We see that the smaller group has 3119 values. The idea is to delete enough observations for the other group to have 
# a balanced dataset.

B    128588
C    117685
D     71988
A     70348
E     33550
F     12432
G      3119
Name: grade, dtype: int64

In [14]:
#We remove columns that are not useful for our study
Data.drop(columns = ['id', 'member_id', 'sub_grade', 'emp_title', 'loan_status', 'pymnt_plan',
                          'zip_code', 'initial_list_status', 'application_type', 'url'], inplace = True)

In [15]:
#We change the way dates are given in the dataset
Data['term'] = Data['term'].str.replace(' months','')
Data['term'] = Data['term'].astype(int)

In [16]:
def emp_length_converter(df, column):
    df[column] = df[column].str.replace('\+ years', '')
    df[column] = df[column].str.replace('< 1 year', str(0))
    df[column] = df[column].str.replace(' years', '')
    df[column] = df[column].str.replace(' year', '')
    df[column] = pd.to_numeric(df[column])

In [17]:
#We change the type and the format of emp_length
emp_length_converter(Data,'emp_length')
Data['emp_length']

  df[column] = df[column].str.replace('\+ years', '')


1          0
3         10
4          1
5          3
6          8
          ..
466279     4
466280     4
466281    10
466282     7
466283     3
Name: emp_length, Length: 437710, dtype: int64

In [18]:
def date_columns(df, column):
    # store current month
    today_date = pd.to_datetime('2022-04-01')
    # convert to datetime format
    df[column] = pd.to_datetime(df[column], format = "%b-%y")
    # calculate the difference in months and add to a new column
    df['mths_since_' + column] = round(pd.to_numeric((today_date - df[column]) / np.timedelta64(1, 'M')))
    # make any resulting -ve values to be equal to the max date
    df['mths_since_' + column] = df['mths_since_' + column].apply(lambda x: df['mths_since_' + column].max() if x < 0 else x)
    # drop the original date column
    df.drop(columns = [column], inplace = True)

In [19]:
date_columns(Data, 'earliest_cr_line')
date_columns(Data, 'issue_d')
date_columns(Data, 'last_pymnt_d')
date_columns(Data, 'last_credit_pull_d')

In [25]:
#Verification of the modified dates in the dataset
Data[['mths_since_earliest_cr_line', 'mths_since_issue_d', 'mths_since_last_pymnt_d', 'mths_since_last_credit_pull_d']]

Unnamed: 0,mths_since_earliest_cr_line,mths_since_issue_d,mths_since_last_pymnt_d,mths_since_last_credit_pull_d
1,276.0,124.0,108.0,103.0
3,314.0,124.0,87.0,87.0
4,315.0,124.0,75.0,75.0
5,209.0,124.0,87.0,79.0
6,201.0,124.0,75.0,75.0
...,...,...,...,...
466279,272.0,99.0,75.0,75.0
466280,228.0,99.0,75.0,75.0
466281,298.0,99.0,88.0,75.0
466282,244.0,99.0,75.0,76.0


In [27]:
# The smaller group has 3322 observations and we want to remove enough rows in other groups to have the same number in each
# group

# We define NbA, NbB, NbC, NbD, NbE, NbF and NbG the numbers of rows in each group for the moment :

NbA = 70348
NbB = 128588
NbC = 117685
NbD = 71988
NbE = 33550
NbF = 12432
NbG = 3119

ListNbRows = [NbA, NbB, NbC, NbD, NbE, NbF]

# We define ListNbRowsToDelete the list of the numbers of rows we need to delete for each group :

ListNbRowsToDelete = []
for i in ListNbRows:
    ListNbRowsToDelete.append(i - NbG)

In [28]:
# We divide our dataset in different groups, one for each grade. Then we could delete the number of rows we want for each group.

groups = Data.groupby(Data.grade)

Data_A = groups.get_group('A')
Data_B = groups.get_group('B')
Data_C = groups.get_group('C')
Data_D = groups.get_group('D')
Data_E = groups.get_group('E')
Data_F = groups.get_group('F')
Data_G = groups.get_group('G')

# To make it easier for after we put them in a list :

listGroup = [Data_A, Data_B, Data_C, Data_D, Data_E, Data_F]

In [None]:
# We randomly remove the overpart of each group :

In [103]:
# Firt we try to remove rows one by one, it is the best way to be random :

#for i in range(6):
#    while  ListNbRowsToDelete[i] > 0:
#        n = random.randrange(0, len(listGroup[i].index))
#        listGroup[i] = listGroup[i].drop([listGroup[i].index[n]])
#        ListNbRowsToDelete[i] = ListNbRowsToDelete[i] - 1
        
# Unfortunately this code is very long to run (approximately 3h)

In [29]:
# We try to remove rows ten by ten to obtain a quicker result and it works, it is around 15 min to run!
# It is almost the same code as before :

for i in range(6):
    while  ListNbRowsToDelete[i] > 0:
        n = random.randrange(0, len(listGroup[i].index)-9)
        listGroup[i] = listGroup[i].drop([listGroup[i].index[n], listGroup[i].index[n+1], listGroup[i].index[n+2],
                                         listGroup[i].index[n+3], listGroup[i].index[n+4], listGroup[i].index[n+5],
                                         listGroup[i].index[n+6], listGroup[i].index[n+7], listGroup[i].index[n+8],
                                         listGroup[i].index[n+9]])
        ListNbRowsToDelete[i] = ListNbRowsToDelete[i] - 10
        
# It took 14 min to run
# Attention it does not change Data_A, Data_B etc. It modifies only the list listGroup!

In [30]:
# Now we can concatenate the new DataFrame : 

DataNew = pd.concat(listGroup + [Data_G])

DataNew.shape

(21810, 39)

In [31]:
# As we delete 10 by 10, we do not delete the exact number of rows for each grade
# It is not important but it is to explain why each grade do not have the same rate
print(ListNbRowsToDelete)

[-1, -1, -4, -1, -9, -7]


In [32]:
DataNew.grade.value_counts()/DataNew.shape[0]

G    0.143008
D    0.142962
A    0.142962
B    0.142962
C    0.142824
F    0.142687
E    0.142595
Name: grade, dtype: float64

In [33]:
# In order to work quicker we save the clean dataframe to csv
# We don't have to run the code to clean before each session, we just need to import the following dataframe :
#DataNew.to_csv("cleanData.csv")

In [34]:
# To continue we just open the dataset we saved before :

#DataNew = pd.read_csv("cleanData.csv", low_memory=False)
DataNew.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,...,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,policy_code,acc_now_delinq,mths_since_earliest_cr_line,mths_since_issue_d,mths_since_last_pymnt_d,mths_since_last_credit_pull_d
5,5000,5000,5000.0,36,7.9,156.46,A,3,RENT,36000.0,...,0.0,0.0,161.03,0.0,1,0.0,209.0,124.0,87.0,79.0
17,3600,3600,3600.0,36,6.03,109.57,A,10,MORTGAGE,110000.0,...,0.0,0.0,583.45,0.0,1,0.0,344.0,124.0,107.0,95.0
19,9200,9200,9200.0,36,6.03,280.01,A,6,RENT,77385.19,...,0.0,0.0,8061.1,0.0,1,0.0,255.0,124.0,117.0,117.0
31,31825,31825,31825.0,36,7.9,995.82,A,5,MORTGAGE,75000.0,...,0.0,0.0,16966.7,0.0,1,0.0,485.0,124.0,104.0,78.0
83,4500,4500,4500.0,36,6.03,136.96,A,4,RENT,53000.0,...,0.0,0.0,138.95,0.0,1,0.0,258.0,124.0,87.0,88.0


In [20]:
# We have a new column "Unnamed" with the old indexes. We delete that as well :
#Quand on réouvre le document csv
DataNew.drop(columns = ['Unnamed: 0'], inplace = True)

In [37]:
#We divide our dataset in two parts with our target variable and the other ones
from sklearn.model_selection import train_test_split
X = DataNew.drop('grade', axis = 1)
y = DataNew['grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [38]:
#We divide our training data into categorical and numerical subsets
X_train_cat = X_train.select_dtypes(include = 'object').copy()
X_train_num = X_train.select_dtypes(include = 'number').copy()

In [39]:
X_train_num

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,...,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,policy_code,acc_now_delinq,mths_since_earliest_cr_line,mths_since_issue_d,mths_since_last_pymnt_d,mths_since_last_credit_pull_d
195597,10000,10000,10000.0,36,16.29,353.01,0,34800.0,20.07,0.0,...,0.0,0.0,356.11,0.0,1,0.0,315.0,115.0,78.0,79.0
50314,14125,14125,14125.0,36,22.90,546.04,5,55000.0,8.28,0.0,...,0.0,0.0,625.00,0.0,1,0.0,413.0,100.0,85.0,75.0
310922,15000,15000,15000.0,36,8.39,472.75,2,36500.0,12.46,0.0,...,0.0,0.0,472.75,0.0,1,0.0,403.0,91.0,75.0,75.0
125875,16000,16000,16000.0,60,23.76,458.07,8,85000.0,30.52,0.0,...,0.0,0.0,458.07,0.0,1,0.0,257.0,106.0,75.0,75.0
190483,20000,20000,19950.0,60,22.95,563.24,3,55000.0,18.58,1.0,...,0.0,0.0,563.24,0.0,1,0.0,278.0,114.0,76.0,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231109,7475,7475,7475.0,36,24.99,297.17,10,25000.0,16.54,0.0,...,0.0,0.0,6331.96,0.0,1,0.0,165.0,88.0,78.0,77.0
13774,3900,3900,3900.0,36,5.99,118.63,6,73909.0,12.71,0.0,...,0.0,0.0,2373.44,0.0,1,0.0,293.0,130.0,114.0,75.0
148251,22000,22000,22000.0,60,22.95,619.56,2,50280.0,15.69,0.0,...,0.0,0.0,619.56,0.0,1,0.0,267.0,108.0,75.0,75.0
383915,25000,25000,25000.0,36,14.99,866.52,0,100000.0,14.50,0.0,...,0.0,0.0,866.52,0.0,1,0.0,258.0,95.0,75.0,75.0


In [None]:
#Define an empty dictionary to store chi-squared test results
chi2_check = {}

In [None]:
#loop over each column in the training set to calculate chi-statistic with the target variable
for column in X_train_cat:
    chi, p, dof, ex = chi2_contingency(pd.crosstab(y_train, X_train_cat[column]))
    chi2_check.setdefault('Feature',[]).append(column)
    chi2_check.setdefault('p-value',[]).append(round(p, 10))