In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns

%matplotlib inline
plt.style.use('seaborn-notebook')
plt.rcParams["figure.figsize"] = (20, 3)
pd.options.display.float_format = '{:20,.4f}'.format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
sns.set(context="paper", font="monospace")

from sklearn.preprocessing import LabelEncoder,StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import log_loss, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('SampleSubmission.csv')

In [4]:
train.drop(['form_field40', 'form_field31','form_field41','form_field11', 'form_field45'],axis=1, inplace = True)
test.drop(['form_field40', 'form_field31','form_field41','form_field11', 'form_field45'],axis=1, inplace = True)            

In [5]:
test['default_status'] = 'test'
data = pd.concat([train, test]).reset_index(drop=True)

In [6]:
missing_col = data.columns[data.isna().any()]
missing_col

Index(['form_field1', 'form_field2', 'form_field3', 'form_field4',
       'form_field5', 'form_field6', 'form_field7', 'form_field8',
       'form_field9', 'form_field10', 'form_field12', 'form_field13',
       'form_field15', 'form_field16', 'form_field17', 'form_field18',
       'form_field19', 'form_field20', 'form_field21', 'form_field22',
       'form_field23', 'form_field24', 'form_field25', 'form_field26',
       'form_field27', 'form_field28', 'form_field29', 'form_field30',
       'form_field32', 'form_field33', 'form_field34', 'form_field35',
       'form_field36', 'form_field37', 'form_field38', 'form_field39',
       'form_field42', 'form_field43', 'form_field44', 'form_field46',
       'form_field48', 'form_field49', 'form_field50'],
      dtype='object')

In [7]:
for col in missing_col:
    
    data[col].fillna(data[col].median(), inplace = True)

In [8]:
data.isnull().sum()

Applicant_ID      0
form_field1       0
form_field2       0
form_field3       0
form_field4       0
form_field5       0
form_field6       0
form_field7       0
form_field8       0
form_field9       0
form_field10      0
form_field12      0
form_field13      0
form_field14      0
form_field15      0
form_field16      0
form_field17      0
form_field18      0
form_field19      0
form_field20      0
form_field21      0
form_field22      0
form_field23      0
form_field24      0
form_field25      0
form_field26      0
form_field27      0
form_field28      0
form_field29      0
form_field30      0
form_field32      0
form_field33      0
form_field34      0
form_field35      0
form_field36      0
form_field37      0
form_field38      0
form_field39      0
form_field42      0
form_field43      0
form_field44      0
form_field46      0
form_field47      0
form_field48      0
form_field49      0
form_field50      0
default_status    0
dtype: int64

In [9]:
data.describe(include='all')

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,form_field12,form_field13,form_field14,form_field15,form_field16,form_field17,form_field18,form_field19,form_field20,form_field21,form_field22,form_field23,form_field24,form_field25,form_field26,form_field27,form_field28,form_field29,form_field30,form_field32,form_field33,form_field34,form_field35,form_field36,form_field37,form_field38,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
count,80000,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000,80000.0,80000.0,80000.0,80000
unique,80000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,3
top,Apcnt_1123552,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,charge,,,,no
freq,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,51840,,,,42285
mean,,3491.5809,0.5331,1.05,0.8488,2.0118,504236.3583,6462628.16,2238848.3195,12158555.3064,11810270.711,2487520.2115,5541622.9248,76279711.7706,102246169.9499,0.178,0.2135,0.2218,0.9456,1.0218,87.4888,6390.3615,6001.6298,94.5028,2584.921,9856.2579,6014.317,17300.527,16506.2011,5421.4318,908.0282,13.8583,0.2286,0.1164,12.7318,8.2616,8.5567,0.084,0.3651,6.6023,0.5667,0.0688,,190919.7083,1.047,499430.9013,
std,,184.7323,0.7968,2.1577,3.2238,10.7592,1276940.5801,17488905.8669,3513809.2259,19416348.255,26058050.8968,4239615.6898,21112009.0022,3068578796.1942,91473037.3532,0.554,0.6486,0.6585,1.7271,1.7731,45.2371,4535.6731,4111.7536,393.8159,1866.3231,8214.9892,4941.4527,26946.6527,9638.2655,2319.3139,1001.9172,15.1587,0.6777,0.4471,10.3914,8.939,7.9066,0.4039,0.4089,6.3435,0.1871,0.2959,,1367927.4625,1.8056,5437941.8276,
min,,2986.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,539543.0,0.0,0.0,0.0,0.0,0.0,0.255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1666,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0485,0.0,,0.0,0.0,0.0,
25%,,3366.0,0.08,0.0,0.0,0.0,35010.0,811843.0,319758.0,1831023.0,452407.0,404949.0,295640.0,16576457.0,71317315.0,0.0,0.0,0.0,0.0,0.0,60.1613,3734.475,5550.0,26.8913,1335.0,3924.0,2494.0,60.0,9855.0,5170.0,226.0,2.5,0.0,0.0,6.0,2.0,4.0,0.0,0.0,2.02,0.4291,0.0,,192.6443,0.0,0.0672,
50%,,3484.0,0.2695,0.06,0.0,0.0,115533.0,2710163.0,963942.0,5512519.0,3692582.5,1158247.5,1591010.0,28141427.0,84083128.0,0.0,0.0,0.0,0.0,0.0,89.535,5961.6375,5550.0,60.66,2208.0,8214.0,5110.0,486.0,15057.0,5658.0,600.0,7.8334,0.0,0.0,10.0,6.0,6.0,0.0,0.22,5.05,0.5896,0.0,,316.7076,0.0,0.1698,
75%,,3612.0,0.6792,1.2746,0.0,0.0,306532.0,6354023.25,2402075.0,14121283.5,13135849.25,2684878.0,4786839.5,48187375.0,99274064.25,0.0,0.0,0.0,1.0,1.0,115.53,8228.9775,5550.0,97.7738,3343.5,12774.0,7666.0,60772.0,21717.0,6022.0,1230.0,21.0,0.0,0.0,16.0,12.0,12.0,0.0,0.6111,10.1,0.7399,0.0,,513.522,1.174,0.379,


In [10]:
from scipy.stats import skew,norm  # for some statistics

In [11]:
numeric = data.select_dtypes(include='number').columns
skew_features = data[numeric].apply(lambda x: skew(x)).sort_values(ascending=True)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features

There are 41 numerical features with Skew > 0.5 :


form_field44                -0.5029
form_field30                -0.1981
form_field1                  0.1248
form_field29                 0.9388
form_field28                 0.9911
form_field22                 1.1656
form_field42                 1.2129
form_field21                 1.5684
form_field33                 1.5974
form_field26                 1.6516
form_field43                 1.6586
form_field27                 1.6741
form_field25                 1.8046
form_field37                 2.1489
form_field23                 2.5311
form_field32                 2.5638
form_field49                 2.5952
form_field36                 2.6840
form_field38                 2.7596
form_field20                 3.1420
form_field19                 3.2760
form_field3                  4.2203
form_field2                  4.5311
form_field16                 4.5474
form_field8                  4.5749
form_field18                 5.1708
form_field17                 5.2843
form_field35                

In [12]:
# Normalize skewed features using log transformation
for column in skew_index:
    data[column] = np.log1p(data[column])

In [13]:
cat_features = data.select_dtypes(exclude=np.number).columns.drop('default_status')

In [14]:
cat_features

Index(['Applicant_ID', 'form_field47'], dtype='object')

In [15]:
data.drop('Applicant_ID',inplace=True,axis=1)

In [16]:
dissected = pd.get_dummies(data[['form_field47']])

In [17]:
data = data.join(dissected)

In [18]:
data.drop(['form_field47'], inplace=True, axis=1)

In [19]:
data.head(2)

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,form_field12,form_field13,form_field14,form_field15,form_field16,form_field17,form_field18,form_field19,form_field20,form_field21,form_field22,form_field23,form_field24,form_field25,form_field26,form_field27,form_field28,form_field29,form_field30,form_field32,form_field33,form_field34,form_field35,form_field36,form_field37,form_field38,form_field39,form_field42,form_field43,form_field44,form_field46,form_field48,form_field49,form_field50,default_status,form_field47_charge,form_field47_lending
0,3436.0,0.2508,0.9768,0.0,0.0,0.0,16.1848,12.4375,15.2678,16.2432,14.649,15.8793,18.3838,18.5356,0.6931,0.6931,0.6931,1.3863,1.3863,4.9101,8.6933,8.6217,4.9101,8.2944,9.7448,9.3393,4.1109,9.7448,5962.0,6.9594,2.1972,1.0986,0.0,2.5649,2.5649,1.6094,0.0,0.3314,1.1053,0.7116,0.0,5.7611,0.7559,0.0434,no,1,0
1,3456.0,0.5152,0.2104,0.0,0.0,0.0,13.709,13.1174,16.0209,14.745,15.1975,14.577,17.1528,17.6259,0.6931,0.6931,0.6931,1.0986,1.0986,4.7026,9.3402,9.5878,4.4738,8.3338,9.5041,9.0986,5.7203,9.5041,5780.0,7.8458,2.9178,0.0,0.0,2.8332,3.2189,2.1972,0.0,0.2733,2.2061,0.1836,0.0,5.8602,0.9634,0.2795,no,1,0


In [20]:
train_df = data.loc[( data['default_status'] != 'test')].reset_index(drop = True)
test_df = data.loc[( data['default_status'] == 'test')].reset_index(drop = True)

In [21]:
train_df.shape, test_df.shape

((56000, 47), (24000, 47))

In [22]:
X = train_df.drop('default_status', axis = 1)
y = train_df['default_status']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify = y,  random_state=42)

In [24]:
lr = LogisticRegression()
rf = RandomForestClassifier()
gbm = GradientBoostingClassifier()
decision_tree = DecisionTreeClassifier()

In [25]:
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
gbm.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [26]:
print('Logistic Regression Log loss: {}'.format(roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])))
print('RandomForestClassifier Log loss: {}'.format(roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])))
print('GradientBoostingClassifier Log loss: {}'.format(roc_auc_score(y_test, gbm.predict_proba(X_test)[:,1])))
print('DecisionTreeClassifier Log loss: {}'.format(roc_auc_score(y_test, decision_tree.predict_proba(X_test)[:,1])))

Logistic Regression Log loss: 0.802175117000884
RandomForestClassifier Log loss: 0.8276666536884967
GradientBoostingClassifier Log loss: 0.8351402017895926
DecisionTreeClassifier Log loss: 0.6416058461852343


In [27]:
accuracy = gbm.score(X_test, y_test)

In [28]:
accuracy

0.8110119047619048

In [29]:
test_x = test_df.drop('default_status', axis = 1)

In [30]:
prediction = gbm.predict(test_x)

In [31]:
sub['default_status'] = prediction
sub.to_csv('MySubmission.csv', index =False)

In [32]:
sub

Unnamed: 0,Applicant_ID,default_status
0,Apcnt_1000032,no
1,Apcnt_1000048,no
2,Apcnt_1000052,no
3,Apcnt_1000076,yes
4,Apcnt_1000080,no
5,Apcnt_1000084,no
6,Apcnt_1000104,no
7,Apcnt_1000116,yes
8,Apcnt_1000128,no
9,Apcnt_1000156,no
