In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# allow plots to appear within the notebook
%matplotlib inline



In [2]:
# Load the training and test sets; the 'id' column becomes the row index of each frame
train_data = pd.read_csv("train.csv", index_col='id')
test_data = pd.read_csv('test.csv', index_col='id')

In [3]:
# Preview the first 10 rows of the training frame (features plus the loan_status label)
train_data.head(10)

Unnamed: 0_level_0,loan_amnt,emp_length,home_ownership,zip_code,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,purpose,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1418165,17000,,MORTGAGE,321xx,48000.0,21.28,0,0,8,0,10265,76.6,15,0.0,0,,,,debt_consolidation,Fully Paid
1032352,30000,6 years,MORTGAGE,950xx,100000.0,1.19,0,0,3,1,8378,89.1,4,0.0,0,,,,debt_consolidation,charged off
436240,12000,5 years,MORTGAGE,025xx,85000.0,6.58,0,0,7,0,0,0.0,30,0.0,0,,,,home_improvement,Fully Paid
3255664,12000,< 1 year,MORTGAGE,223xx,55000.0,7.86,0,0,6,0,11253,55.4,15,0.0,0,124.0,146441.0,20300.0,debt_consolidation,charged off
8126043,8000,8 years,RENT,967xx,45000.0,27.13,0,1,11,0,12856,59.8,15,0.0,0,0.0,49976.0,21500.0,debt_consolidation,charged off
3921481,20950,1 year,OWN,852xx,47710.0,21.45,0,3,14,0,25144,78.1,25,0.0,0,0.0,197663.0,32200.0,credit_card,Fully Paid
417536,8000,2 years,RENT,926xx,45000.0,11.17,0,0,8,0,1150,13.9,17,0.0,0,,,,educational,Fully Paid
6534871,5000,10+ years,RENT,967xx,86400.0,28.1,0,0,9,0,6666,88.9,24,0.0,0,0.0,160916.0,7500.0,other,charged off
498498,3000,3 years,RENT,900xx,44000.0,0.46,0,3,2,0,384,96.0,11,0.0,0,,,,educational,charged off
12995831,15075,8 years,MORTGAGE,661xx,36000.0,19.0,0,0,6,0,12793,66.3,18,0.0,0,0.0,67403.0,19300.0,debt_consolidation,charged off


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
train_data.describe()

Unnamed: 0,loan_amnt,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
count,99198.0,99198.0,99198.0,99198.0,99198.0,99198.0,99198.0,99198.0,99105.0,99198.0,99185.0,99198.0,76837.0,76837.0,76837.0
mean,13904.074427,70980.02,17.15093,0.26308,0.876217,11.038821,0.148854,15119.25315,55.42643,24.963921,0.007904,0.00374,280.8686,131887.6,28922.58
std,8240.729748,54554.73,7.953368,0.766506,1.084135,4.958767,0.447266,17901.507503,24.599037,11.791805,0.094823,0.067772,33070.09,147285.0,28391.97
min,700.0,4080.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,7750.0,44000.0,11.23,0.0,0.0,8.0,0.0,5912.25,37.6,16.0,0.0,0.0,0.0,27784.0,13000.0
50%,12000.0,60000.0,16.85,0.0,1.0,10.0,0.0,10980.5,57.1,23.0,0.0,0.0,0.0,72653.0,21800.0
75%,19200.0,85000.0,22.74,0.0,1.0,14.0,0.0,19073.0,75.0,32.0,0.0,0.0,0.0,198254.0,35800.0
max,35000.0,8706582.0,39.99,21.0,8.0,55.0,12.0,867528.0,148.0,150.0,3.0,5.0,9152545.0,4772549.0,1035000.0


In [5]:
# Class balance of the raw target labels
train_data['loan_status'].value_counts()

Fully Paid     60432
charged off    38766
Name: loan_status, dtype: int64

# Encoding data

In [6]:
# Encode the target as integers: 1 = 'Fully Paid', 0 = 'charged off'
loan_status_dict = {
    'Fully Paid': 1,
    'charged off': 0,
}
# Labels absent from the mapping become NaN, so run this cell only once per data load
train_data['loan_status'] = train_data['loan_status'].map(loan_status_dict)

In [7]:
# Confirm the encoding kept the class counts (every label was mapped)
train_data['loan_status'].value_counts()

1    60432
0    38766
Name: loan_status, dtype: int64

In [8]:
# Separate the label from the features
target = train_data['loan_status']
# axis is passed by keyword: the positional form drop(label, 1) was deprecated
# in pandas 1.3 and raises TypeError from pandas 2.0 onwards
train_data = train_data.drop('loan_status', axis=1)

In [9]:
# Feature frame after the loan_status column has been removed
train_data.head(5)

Unnamed: 0_level_0,loan_amnt,emp_length,home_ownership,zip_code,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,purpose
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1418165,17000,,MORTGAGE,321xx,48000.0,21.28,0,0,8,0,10265,76.6,15,0.0,0,,,,debt_consolidation
1032352,30000,6 years,MORTGAGE,950xx,100000.0,1.19,0,0,3,1,8378,89.1,4,0.0,0,,,,debt_consolidation
436240,12000,5 years,MORTGAGE,025xx,85000.0,6.58,0,0,7,0,0,0.0,30,0.0,0,,,,home_improvement
3255664,12000,< 1 year,MORTGAGE,223xx,55000.0,7.86,0,0,6,0,11253,55.4,15,0.0,0,124.0,146441.0,20300.0,debt_consolidation
8126043,8000,8 years,RENT,967xx,45000.0,27.13,0,1,11,0,12856,59.8,15,0.0,0,0.0,49976.0,21500.0,debt_consolidation


In [10]:
# Keep only the first two characters of each masked zip code (e.g. '321xx' -> '32')
train_data['zip_code'] = train_data['zip_code'].str.slice(0, 2)
test_data['zip_code'] = test_data['zip_code'].str.slice(0, 2)

In [11]:
# Inspect the truncated zip codes (still strings, so leading zeros are kept)
train_data['zip_code']

id
1418165     32
1032352     95
436240      02
3255664     22
8126043     96
3921481     85
417536      92
6534871     96
498498      90
12995831    66
12686540    55
3644164     60
680672      18
1059041     94
776165      94
1331373     11
8981068     44
2445677     27
4105541     33
1462418     55
12616243    59
5197445     94
8665321     08
1156067     45
61422068    10
878629      93
15400104    10
889248      93
1307376     92
15309561    96
            ..
1597723     60
19937825    55
6826647     90
30705889    18
45344377    28
19157649    11
33161921    89
5555383     95
18815071    88
8607735     65
5648300     32
1866189     86
17734063    75
7356142     89
1536001     94
472809      07
22273847    21
1568053     79
745633      90
25857374    96
2380167     93
1515823     98
12267064    48
1704892     70
3484668     33
1158699     33
6286913     30
14858848    75
39599246    63
34693535    79
Name: zip_code, dtype: object

In [12]:
# Preview the frame with the shortened zip_code column
train_data.head(5)

Unnamed: 0_level_0,loan_amnt,emp_length,home_ownership,zip_code,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,purpose
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1418165,17000,,MORTGAGE,32,48000.0,21.28,0,0,8,0,10265,76.6,15,0.0,0,,,,debt_consolidation
1032352,30000,6 years,MORTGAGE,95,100000.0,1.19,0,0,3,1,8378,89.1,4,0.0,0,,,,debt_consolidation
436240,12000,5 years,MORTGAGE,2,85000.0,6.58,0,0,7,0,0,0.0,30,0.0,0,,,,home_improvement
3255664,12000,< 1 year,MORTGAGE,22,55000.0,7.86,0,0,6,0,11253,55.4,15,0.0,0,124.0,146441.0,20300.0,debt_consolidation
8126043,8000,8 years,RENT,96,45000.0,27.13,0,1,11,0,12856,59.8,15,0.0,0,0.0,49976.0,21500.0,debt_consolidation


In [13]:
# One-hot encoding of the categorical features
# ('home_ownership', 'purpose', 'zip_code' and 'emp_length')
def one_hot_dataframe(data, cols, replace=False, vec=None):
    """Encode the given columns of a dataframe with a DictVectorizer.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to encode. It is not modified.
    cols : list of str
        Names of the columns to encode.
    replace : bool, default False
        When true, the returned frame has ``cols`` dropped and the encoded
        columns joined on (by index) in their place.
    vec : fitted vectorizer, optional
        An already-fitted vectorizer to reuse (for example the one returned
        for the training frame). When given it is only used to ``transform``,
        so the output columns match those it was fitted with. When omitted a
        new ``DictVectorizer`` is fitted on ``data``.

    Returns
    -------
    tuple
        ``(data, vecData, vec)``: the (optionally replaced) frame, the encoded
        columns as a DataFrame indexed like ``data``, and the vectorizer used.
    """
    cols = list(cols)
    # One dict per row; to_dict avoids a slow Python-level apply over every row
    records = data[cols].to_dict(orient='records')

    if vec is None:
        vec = DictVectorizer()
        encoded = vec.fit_transform(records)
    else:
        encoded = vec.transform(records)

    # A vectorizer built with sparse=False already returns a dense array
    dense = encoded.toarray() if hasattr(encoded, 'toarray') else encoded

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on older releases that lack get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = list(vec.get_feature_names())

    vecData = pd.DataFrame(dense, columns=feature_names, index=data.index)

    if replace:
        data = data.drop(cols, axis=1).join(vecData)
    return (data, vecData, vec)

# Encode the categorical columns of the training frame; `vec` is the vectorizer fitted on it
train_data, vecTrainData, vec = one_hot_dataframe(
    data=train_data,
    cols=['home_ownership', 'purpose','zip_code','emp_length'],
    replace=True,
)

In [14]:
# Encode the same categorical columns of the test frame.
# NOTE: this fits a fresh vectorizer on the test rows and rebinds `vec`, so the
# encoded columns can differ from the training frame's; they are aligned in a later cell.
test_data, vecTestData, vec = one_hot_dataframe(
    data=test_data,
    cols=['home_ownership', 'purpose','zip_code','emp_length'],
    replace=True,
)

In [15]:
# Preview the training frame after one-hot encoding
train_data.head()

Unnamed: 0_level_0,loan_amnt,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,...,zip_code=90,zip_code=91,zip_code=92,zip_code=93,zip_code=94,zip_code=95,zip_code=96,zip_code=97,zip_code=98,zip_code=99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1418165,17000,48000.0,21.28,0,0,8,0,10265,76.6,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1032352,30000,100000.0,1.19,0,0,3,1,8378,89.1,4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
436240,12000,85000.0,6.58,0,0,7,0,0,0.0,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3255664,12000,55000.0,7.86,0,0,6,0,11253,55.4,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8126043,8000,45000.0,27.13,0,1,11,0,12856,59.8,15,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
# Row count, column count and dtypes of the encoded training frame
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99198 entries, 1418165 to 34693535
Columns: 146 entries, loan_amnt to zip_code=99
dtypes: float64(138), int64(8)
memory usage: 111.3 MB


In [17]:
# Same summary for the encoded test frame; compare its column count with the training frame's
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48860 entries, 5716507 to 27702218
Columns: 142 entries, loan_amnt to zip_code=99
dtypes: float64(134), int64(8)
memory usage: 53.3 MB


In [18]:
# Columns present in the training set but absent from the test set
missing_cols = set(train_data.columns).difference(test_data.columns)
# reindex adds each missing column filled with 0, drops test-only columns and
# puts the remaining columns in the same order as the training set
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [19]:
# Verify the test frame now has the same number of columns as the training frame
test_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 48860 entries, 5716507 to 27702218
Columns: 146 entries, loan_amnt to zip_code=99
dtypes: float64(134), int64(12)
memory usage: 54.8 MB


In [20]:
# Per-column flag: True where the column contains at least one missing value
train_data.isnull().any()

loan_amnt                     False
annual_inc                    False
dti                           False
delinq_2yrs                   False
inq_last_6mths                False
open_acc                      False
pub_rec                       False
revol_bal                     False
revol_util                     True
total_acc                     False
collections_12_mths_ex_med     True
acc_now_delinq                False
tot_coll_amt                   True
tot_cur_bal                    True
total_rev_hi_lim               True
emp_length=1 year             False
emp_length=10+ years          False
emp_length=2 years            False
emp_length=3 years            False
emp_length=4 years            False
emp_length=5 years            False
emp_length=6 years            False
emp_length=7 years            False
emp_length=8 years            False
emp_length=9 years            False
emp_length=< 1 year           False
emp_length=n/a                False
home_ownership=MORTGAGE     

In [21]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    target,
    test_size=0.20,
    random_state=7,
)

In [22]:
# (rows, columns) of the training part of the split
X_train.shape

(79358, 146)

In [23]:
# Number of labels; must equal the row count of X_train
y_train.shape

(79358,)

# Fit model

In [24]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score





In [25]:
# Booster parameters for the native xgb.train API.
# 'n_estimators' is deliberately not listed: it is an argument of the
# scikit-learn wrapper and is ignored by xgb.train, which takes the number of
# boosting rounds from its num_boost_round argument instead.
params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.05,
    'gamma': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.3,
    'min_child_weight': 3,
    'max_depth': 6,
    'seed': 0,
}

In [26]:
# Convert the split feature frames to float arrays for xgboost
train_X = np.array(X_train, dtype='float')
test_X = np.array(X_test, dtype='float')

In [27]:
# Training labels as a plain array
train_y = np.array(y_train)

In [28]:
# Wrap the training features and labels in xgboost's DMatrix container
xgtrain = xgb.DMatrix(data=train_X, label=train_y)

In [29]:
# Hold-out features only; the labels stay in y_test for scoring
xgtest = xgb.DMatrix(data=test_X)

In [30]:

# Number of boosting rounds passed to xgb.train
num_rounds = 1000
# Parameters as a list of (name, value) pairs
plst = [(name, value) for name, value in params.items()]


In [31]:
# Fit the booster on the training part of the split
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds)

In [32]:
# Predicted scores for the hold-out rows (probabilities under 'binary:logistic')
pred_test_y = model.predict(xgtest)
# Vectorized 0.5 threshold instead of a Python-level round() per element.
# A score of exactly 0.5 maps to class 0, as it did with round().
test_predictions = (pred_test_y > 0.5).astype(int)


In [33]:
# ROC AUC on the hold-out split, computed from the predicted scores
test_y = np.array(y_test)
roc_auc_score(test_y, pred_test_y)


0.69305526326699263

In [34]:
# Accuracy of the thresholded class predictions on the hold-out split
accuracy_score(test_y,test_predictions)

0.66043346774193545

# Final model: train on all labelled data and predict the test set

In [35]:
# Rebuild the arrays from the full labelled set and the competition test set.
# NOTE: train_X, train_y and test_X replace the hold-out arrays of the same names.
train_X = np.array(train_data, dtype='float')
test_X = np.array(test_data, dtype='float')
train_y = np.array(target)

In [36]:
# DMatrix containers for the full training data and the unlabelled test data
xgtest = xgb.DMatrix(data=test_X)
xgtrain = xgb.DMatrix(data=train_X, label=train_y)

In [37]:
# (rows, columns) of the full training frame used for the final fit
train_data.shape

(99198, 146)

In [38]:
# Booster parameters for the final fit with the native xgb.train API.
# 'n_estimators' is deliberately not listed: it is an argument of the
# scikit-learn wrapper and is ignored by xgb.train, which takes the number of
# boosting rounds from its num_boost_round argument instead.
params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.05,
    'gamma': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.3,
    'min_child_weight': 3,
    'max_depth': 6,
    'seed': 0,
}

In [39]:
# Number of boosting rounds passed to xgb.train
num_rounds = 1000
# Parameters as a list of (name, value) pairs
plst = [(name, value) for name, value in params.items()]

In [40]:
# Refit the booster on every labelled row
model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds)

In [41]:
# Scores for the test rows, in the same row order as test_data
predictions_y = model.predict(xgtest)

In [42]:

# Test-set scores as a single-column frame named 'loan_status'
Y_test = pd.DataFrame({'loan_status': predictions_y})

In [43]:
# The submission template is read only for its list of ids
submission = pd.read_csv("submission_file.csv")
Loan_Id = submission['id']
# predictions_y is in test_data row order, so label each score with its test
# id and look the template ids up by value. Pairing the two by position would
# silently mis-assign scores if the files were ordered differently.
preds = pd.Series(predictions_y, index=test_data.index)
final_preds = pd.DataFrame(
    {
        'id': Loan_Id.values,
        'loan_status': preds.reindex(Loan_Id.values).values,
    },
    columns=['id', 'loan_status'],
)
# NOTE: this overwrites the template file that was read above
final_preds.to_csv('submission_file.csv', index=False)