In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the rejected-application rows (presumably already stripped of rows with
# missing values, judging by the file name — the null check below confirms none remain).
rejected_drop = pd.read_csv('Rejected_DropNAs.csv')

In [3]:
rejected_drop.shape

(8992195, 5)

In [4]:
rejected_drop.isnull().sum()                 #No Null Values

Risk_Score           0
DI                   0
Loan Type            0
Employment Length    0
Accepted             0
dtype: int64

In [5]:
rejected_drop.dtypes

Risk_Score           float64
DI                   float64
Loan Type             object
Employment Length     object
Accepted               int64
dtype: object

In [6]:
# Load the accepted-application rows (presumably already stripped of rows with
# missing values, judging by the file name — the null check below confirms none remain).
accepted_drop = pd.read_csv('Accepted_DropNAs.csv')

In [7]:
accepted_drop.shape

(2113644, 5)

In [8]:
accepted_drop.isnull().sum()                 #No Null Values

Risk_Score           0
DI                   0
Loan Type            0
Employment Length    0
Accepted             0
dtype: int64

In [9]:
accepted_drop.dtypes

Risk_Score           float64
DI                   float64
Loan Type             object
Employment Length     object
Accepted               int64
dtype: object

In [10]:
# Stack the accepted rows on top of the rejected rows and renumber the index from 0.
frames = [accepted_drop, rejected_drop]
combo = pd.concat(frames, ignore_index=True)

In [11]:
combo.shape

(11105839, 5)

In [12]:
combo

Unnamed: 0,Risk_Score,DI,Loan Type,Employment Length,Accepted
0,677.0,0.0591,Debt Consolidation,10+ years,1
1,717.0,0.1606,Business Loan,10+ years,1
2,697.0,0.1078,Home Improvement,10+ years,1
3,787.0,0.1706,Debt Consolidation,10+ years,1
4,697.0,0.2537,Major Purchase,3 years,1
...,...,...,...,...,...
11105834,681.0,0.5515,Debt Consolidation,< 1 year,0
11105835,531.0,0.3131,Debt Consolidation,< 1 year,0
11105836,590.0,0.4126,Debt Consolidation,< 1 year,0
11105837,686.0,0.1026,Other,< 1 year,0


In [13]:
combo.dtypes

Risk_Score           float64
DI                   float64
Loan Type             object
Employment Length     object
Accepted               int64
dtype: object

In [14]:
# The 17 most frequent loan types cover about 99% of the rows, so every category
# outside the top 17 is dropped.
# More rows could be retained later by consolidating rare categories at an earlier stage.

# Count once and reuse: value_counts() scans the whole column.
top17_rows = combo['Loan Type'].value_counts().head(17).sum()
print(top17_rows)
print(len(combo))
top17_rows/len(combo)

11008224
11105839


0.9912104794603992

In [15]:
combo['Loan Type'].value_counts().head(17)

Debt Consolidation         5357784
Credit Card Refinancing    1610200
Other                      1520787
Home Improvement            545814
Car Financing               488781
Major Purchase              305554
Medical Expenses            299137
Moving And Relocation       268143
Business Loan               257856
Home Purchase               183072
Vacation                    114960
Green Loan                   25056
Wedding                      20288
Personal Loan                 5206
Student Loan                  4913
Freedom                        411
Motorcycle Loan                262
Name: Loan Type, dtype: int64

In [64]:
# Keep only rows whose loan type is one of the most frequent categories.
# The category list is derived from the data instead of a hard-coded row-count
# cutoff, so the filter stays correct if the input files change.
TOP_N_LOAN_TYPES = 17
top_loan_types = combo['Loan Type'].value_counts().head(TOP_N_LOAN_TYPES).index
# Boolean indexing already returns a new frame; reset_index renumbers it from 0.
sub_combo = combo[combo['Loan Type'].isin(top_loan_types)].reset_index(drop=True)
sub_combo

Unnamed: 0,Risk_Score,DI,Loan Type,Employment Length,Accepted
0,677.0,0.0591,Debt Consolidation,10+ years,1
1,717.0,0.1606,Business Loan,10+ years,1
2,697.0,0.1078,Home Improvement,10+ years,1
3,787.0,0.1706,Debt Consolidation,10+ years,1
4,697.0,0.2537,Major Purchase,3 years,1
...,...,...,...,...,...
11008219,681.0,0.5515,Debt Consolidation,< 1 year,0
11008220,531.0,0.3131,Debt Consolidation,< 1 year,0
11008221,590.0,0.4126,Debt Consolidation,< 1 year,0
11008222,686.0,0.1026,Other,< 1 year,0


In [65]:
sub_combo.dtypes

Risk_Score           float64
DI                   float64
Loan Type             object
Employment Length     object
Accepted               int64
dtype: object

In [66]:
#Convert Employment Length to Ordinal Feature:

In [67]:
sorted(sub_combo['Employment Length'].unique())

['1 year',
 '10+ years',
 '2 years',
 '3 years',
 '4 years',
 '5 years',
 '6 years',
 '7 years',
 '8 years',
 '9 years',
 '< 1 year']

In [68]:
# Employment-length labels listed from shortest to longest tenure; a label's
# position in this list is the ordinal code it is encoded to.
emp_ordering = (
    ['< 1 year', '1 year']
    + ['{} years'.format(n_years) for n_years in range(2, 10)]
    + ['10+ years']
)

In [69]:
def ordinal_encode(df, column, ordering):
    """Replace the values of ``df[column]`` in place with their rank in ``ordering``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[column]`` is overwritten with integer codes.
    column : str
        Name of the column to encode.
    ordering : sequence
        Category values listed from lowest to highest rank. Each value in the
        column is replaced by its first position in this sequence.

    Returns
    -------
    None
        The frame is mutated in place.

    Raises
    ------
    ValueError
        If the column holds a value that does not appear in ``ordering``
        (this includes a column that has already been encoded). The frame is
        left unchanged in that case.
    """
    # Build the label -> code lookup once; setdefault keeps the first position
    # for a repeated label, matching list.index semantics.
    codes = {}
    for position, label in enumerate(ordering):
        codes.setdefault(label, position)

    # Series.map leaves NaN where a value has no entry in the lookup.
    encoded = df[column].map(codes)
    unmapped = encoded.isnull()
    if unmapped.any():
        unknown = df[column][unmapped].unique().tolist()
        raise ValueError(
            "values in column {!r} not present in ordering: {!r}".format(column, unknown)
        )

    df[column] = encoded.astype('int64')

In [70]:
# Encode 'Employment Length' in place using the tenure ordering defined above.
# NOTE: this cell is not re-runnable — on a second run the column already holds
# integers, none of which appear in emp_ordering, so a ValueError is raised.
ordinal_encode(sub_combo, 'Employment Length', emp_ordering)

In [71]:
sub_combo

Unnamed: 0,Risk_Score,DI,Loan Type,Employment Length,Accepted
0,677.0,0.0591,Debt Consolidation,10,1
1,717.0,0.1606,Business Loan,10,1
2,697.0,0.1078,Home Improvement,10,1
3,787.0,0.1706,Debt Consolidation,10,1
4,697.0,0.2537,Major Purchase,3,1
...,...,...,...,...,...
11008219,681.0,0.5515,Debt Consolidation,0,0
11008220,531.0,0.3131,Debt Consolidation,0,0
11008221,590.0,0.4126,Debt Consolidation,0,0
11008222,686.0,0.1026,Other,0,0


In [72]:
sub_combo.dtypes

Risk_Score           float64
DI                   float64
Loan Type             object
Employment Length      int64
Accepted               int64
dtype: object

In [73]:
#Dummify Loan Type:

In [74]:
def onehot_encode(df, column):
    """Return a copy of ``df`` where ``column`` is replaced by indicator columns.

    One indicator column is appended per distinct value of ``df[column]``
    (named after that value), and the original column is removed. The input
    frame itself is not modified.
    """
    indicator_cols = pd.get_dummies(df[column])
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(column, axis=1)

In [75]:
# One-hot encode 'Loan Type' into a new frame; onehot_encode returns a new
# frame, so sub_combo keeps its original 'Loan Type' column.
new_combo = onehot_encode(sub_combo, 'Loan Type')
new_combo

Unnamed: 0,Risk_Score,DI,Employment Length,Accepted,Business Loan,Car Financing,Credit Card Refinancing,Debt Consolidation,Freedom,Green Loan,...,Home Purchase,Major Purchase,Medical Expenses,Motorcycle Loan,Moving And Relocation,Other,Personal Loan,Student Loan,Vacation,Wedding
0,677.0,0.0591,10,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,717.0,0.1606,10,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,697.0,0.1078,10,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,787.0,0.1706,10,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,697.0,0.2537,3,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11008219,681.0,0.5515,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11008220,531.0,0.3131,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11008221,590.0,0.4126,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11008222,686.0,0.1026,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [76]:
sub_combo

Unnamed: 0,Risk_Score,DI,Loan Type,Employment Length,Accepted
0,677.0,0.0591,Debt Consolidation,10,1
1,717.0,0.1606,Business Loan,10,1
2,697.0,0.1078,Home Improvement,10,1
3,787.0,0.1706,Debt Consolidation,10,1
4,697.0,0.2537,Major Purchase,3,1
...,...,...,...,...,...
11008219,681.0,0.5515,Debt Consolidation,0,0
11008220,531.0,0.3131,Debt Consolidation,0,0
11008221,590.0,0.4126,Debt Consolidation,0,0
11008222,686.0,0.1026,Other,0,0


In [77]:
new_combo.dtypes

Risk_Score                 float64
DI                         float64
Employment Length            int64
Accepted                     int64
Business Loan                uint8
Car Financing                uint8
Credit Card Refinancing      uint8
Debt Consolidation           uint8
Freedom                      uint8
Green Loan                   uint8
Home Improvement             uint8
Home Purchase                uint8
Major Purchase               uint8
Medical Expenses             uint8
Motorcycle Loan              uint8
Moving And Relocation        uint8
Other                        uint8
Personal Loan                uint8
Student Loan                 uint8
Vacation                     uint8
Wedding                      uint8
dtype: object

In [78]:
#new_combo.describe()

In [79]:
# Separate the feature matrix from the binary 'Accepted' target.
target_col = 'Accepted'
X = new_combo.drop(columns=[target_col])
y = new_combo[target_col]

In [80]:
X

Unnamed: 0,Risk_Score,DI,Employment Length,Business Loan,Car Financing,Credit Card Refinancing,Debt Consolidation,Freedom,Green Loan,Home Improvement,Home Purchase,Major Purchase,Medical Expenses,Motorcycle Loan,Moving And Relocation,Other,Personal Loan,Student Loan,Vacation,Wedding
0,677.0,0.0591,10,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,717.0,0.1606,10,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,697.0,0.1078,10,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,787.0,0.1706,10,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,697.0,0.2537,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11008219,681.0,0.5515,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
11008220,531.0,0.3131,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
11008221,590.0,0.4126,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
11008222,686.0,0.1026,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [81]:
X.shape

(11008224, 20)

In [82]:
X.columns

Index(['Risk_Score', 'DI', 'Employment Length', 'Business Loan',
       'Car Financing', 'Credit Card Refinancing', 'Debt Consolidation',
       'Freedom', 'Green Loan', 'Home Improvement', 'Home Purchase',
       'Major Purchase', 'Medical Expenses', 'Motorcycle Loan',
       'Moving And Relocation', 'Other', 'Personal Loan', 'Student Loan',
       'Vacation', 'Wedding'],
      dtype='object')

In [83]:
# Standardize every column of X to zero mean and unit variance
# (StandardScaler is imported with the other imports at the top of the notebook).
# NOTE(review): X also contains the one-hot loan-type indicator columns, and the
# scaler is fit on the full data set. Confirm both are intended — if a train/test
# split follows downstream, the scaler should be fit on the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [84]:
X_scaled

array([[ 0.41261569, -0.00823819,  2.6859197 , ..., -0.02113059,
        -0.10272931, -0.04296962],
       [ 0.87577502, -0.00759484,  2.6859197 , ..., -0.02113059,
        -0.10272931, -0.04296962],
       [ 0.64419535, -0.00792951,  2.6859197 , ..., -0.02113059,
        -0.10272931, -0.04296962],
       ...,
       [-0.59475586, -0.00599754, -0.48452155, ..., -0.02113059,
        -0.10272931, -0.04296962],
       [ 0.51682654, -0.00796247, -0.48452155, ..., -0.02113059,
        -0.10272931, -0.04296962],
       [ 0.49366857, -0.00794218, -0.48452155, ..., -0.02113059,
        -0.10272931, -0.04296962]])

In [85]:
# Wrap the scaled array back into a labelled frame. Carrying over X's index
# (not only its column names) keeps the rows aligned with y, which matters for
# the column-wise concat with y further down.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X

Unnamed: 0,Risk_Score,DI,Employment Length,Business Loan,Car Financing,Credit Card Refinancing,Debt Consolidation,Freedom,Green Loan,Home Improvement,Home Purchase,Major Purchase,Medical Expenses,Motorcycle Loan,Moving And Relocation,Other,Personal Loan,Student Loan,Vacation,Wedding
0,0.412616,-0.008238,2.685920,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,-0.130045,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297
1,0.875775,-0.007595,2.685920,6.456885,-0.215556,-0.413925,-0.973759,-0.00611,-0.047763,-0.228405,-0.130045,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297
2,0.644195,-0.007930,2.685920,-0.154873,-0.215556,-0.413925,-0.973759,-0.00611,-0.047763,4.378179,-0.130045,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297
3,1.686304,-0.007531,2.685920,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,-0.130045,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297
4,0.644195,-0.007005,0.466611,-0.154873,-0.215556,-0.413925,-0.973759,-0.00611,-0.047763,-0.228405,-0.130045,5.918370,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11008219,0.458932,-0.005117,-0.484522,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,-0.130045,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297
11008220,-1.277916,-0.006628,-0.484522,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,-0.130045,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297
11008221,-0.594756,-0.005998,-0.484522,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,-0.130045,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297
11008222,0.516827,-0.007962,-0.484522,-0.154873,-0.215556,-0.413925,-0.973759,-0.00611,-0.047763,-0.228405,-0.130045,-0.168965,-0.167132,-0.004879,-0.158008,2.497700,-0.021752,-0.021131,-0.102729,-0.04297


In [86]:
y.shape

(11008224,)

In [87]:
y

0           1
1           1
2           1
3           1
4           1
           ..
11008219    0
11008220    0
11008221    0
11008222    0
11008223    0
Name: Accepted, Length: 11008224, dtype: int64

In [90]:
# Put the target back next to the scaled features, as the last column.
scaled_combo = pd.concat(objs=[X, y], axis='columns')
scaled_combo

Unnamed: 0,Risk_Score,DI,Employment Length,Business Loan,Car Financing,Credit Card Refinancing,Debt Consolidation,Freedom,Green Loan,Home Improvement,...,Major Purchase,Medical Expenses,Motorcycle Loan,Moving And Relocation,Other,Personal Loan,Student Loan,Vacation,Wedding,Accepted
0,0.412616,-0.008238,2.685920,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,...,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297,1
1,0.875775,-0.007595,2.685920,6.456885,-0.215556,-0.413925,-0.973759,-0.00611,-0.047763,-0.228405,...,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297,1
2,0.644195,-0.007930,2.685920,-0.154873,-0.215556,-0.413925,-0.973759,-0.00611,-0.047763,4.378179,...,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297,1
3,1.686304,-0.007531,2.685920,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,...,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297,1
4,0.644195,-0.007005,0.466611,-0.154873,-0.215556,-0.413925,-0.973759,-0.00611,-0.047763,-0.228405,...,5.918370,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11008219,0.458932,-0.005117,-0.484522,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,...,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297,0
11008220,-1.277916,-0.006628,-0.484522,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,...,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297,0
11008221,-0.594756,-0.005998,-0.484522,-0.154873,-0.215556,-0.413925,1.026948,-0.00611,-0.047763,-0.228405,...,-0.168965,-0.167132,-0.004879,-0.158008,-0.400368,-0.021752,-0.021131,-0.102729,-0.04297,0
11008222,0.516827,-0.007962,-0.484522,-0.154873,-0.215556,-0.413925,-0.973759,-0.00611,-0.047763,-0.228405,...,-0.168965,-0.167132,-0.004879,-0.158008,2.497700,-0.021752,-0.021131,-0.102729,-0.04297,0


In [91]:
# Write the scaled features plus target to disk; index=False keeps the row
# index out of the file.
scaled_combo.to_csv('Combo_4_DropNAs.csv', index=False)