In [316]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler

In [407]:
train = pd.read_csv('~/train_file.csv')
test = pd.read_csv('~/test_file.csv')

In [408]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [409]:
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [410]:
train.shape, test.shape

((614, 13), (367, 12))

In [411]:
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [412]:
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [413]:
train.Gender.value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

In [414]:
train.Married.value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [415]:
train.Dependents.value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [416]:
train.Self_Employed.value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [417]:
def data_preprocess(train_df, test_df):
    '''
    This function is used to preprocess the data befor it is feed to modeling
    
    Args:
        train_df: Train data to be processed(type:DataFrame)
        test_df: Test data to be processed(type:DataFrame) 
    
    Returns: Returns Processed Train, Test along with the Status
    '''
    
    Status = train_df['Loan_Status']
    Status = [0 if x in ['N']
             else 1 for x in Status]
    
    train_df.drop('Loan_Status', axis=1, inplace=True)
    entire_df =  train_df.append(test_df)
    
    #filling NA's with mode of Gender
    entire_df.Gender = [1 if x in ['Male']
                       else 0 if x in ['Female']
                       else 1 for x in entire_df.Gender]
    
    #filling NA's with mode of Married status
    entire_df.Married = [0 if x in ['No']
                       else 1 if x in ['Yes']
                       else 1 for x in entire_df.Married]
    
    #filling NA's with mode of Self_Employed
    entire_df.Self_Employed = [0 if x in ['No']
                       else 1 if x in ['Yes']
                       else 0 for x in entire_df.Self_Employed]
    
    #Imputing NA's with Median(), since Mean() gets a hit with outliers.
    entire_df.LoanAmount.fillna(entire_df.LoanAmount.median(), inplace=True)
    
    #I don't prefer imputing the credit history as this completely depends on users history. 
    entire_df.Credit_History = ["credit_history_0" if x in [0]
                       else "credit_history_1" if x in [1]
                       else "credit_history_unknown" for x in entire_df.Credit_History]
    
    entire_df.Education = [1 if x in ['Graduate']
                       else 0 for x in entire_df.Education]
    entire_df.dti = (entire_df.ApplicantIncome + entire_df.CoapplicantIncome)/entire_df.LoanAmount
    
    entire_df.Dependents = ["Single" if x in ['0']
                            else "micro_family" if x in ['1']
                            else "mini_family" if x in ['2']
                            else "macro_family" if x in ['3+']
                            else "unknow_family" for x in entire_df.Dependents]
    
    entire_df.Loan_Amount_Term = ['le5' if x<=60
                                else "5-15" if x>60 and x<=180 
                                else "15-25" if x>180 and x<=300  
                                else "gt25" for x in entire_df.Loan_Amount_Term]
    
    
    #Preparing dummies for caegorical variables
    entire_df = pd.get_dummies(entire_df, columns=['Loan_Amount_Term','Dependents','Credit_History','Property_Area'])
    
    entire_df = entire_df.set_index('Loan_ID')
    #scaling continuous variables
    scaler = MinMaxScaler()
    entire_final_df = pd.DataFrame(scaler.fit_transform(entire_df), columns=entire_df.columns)
    
    train_data = entire_final_df.iloc[:614,:]
    test_data = entire_final_df.iloc[614:,:]
    
    return train_data, test_data, Status
    

In [418]:
train_set, test_set, y_train= data_preprocess(train,test)



In [419]:
train_set.shape, test_set.shape

((614, 22), (367, 22))

In [420]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, log_loss, auc

In [430]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=1, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=10,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=24,
       silent=True)

In [431]:
xgb_model.fit(train_set, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=1, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=24, silent=True, subsample=1)

In [432]:
xgb_model.score(train_set,y_train)

  if diff:


0.8094462540716613

In [434]:
submission_file = pd.DataFrame()
submission_file['Loan_ID']=test['Loan_ID']
submission_file['Loan_Status']=xgb_model.predict(test_set)

  if diff:


In [435]:
submission_file['Loan_Status_mod'] = ['Y' if x in [1]
                                     else 'N' for x in submission_file['Loan_Status']]

In [436]:
submission_final = submission_file[['Loan_ID','Loan_Status_mod']]

In [437]:
submission_final.columns = ['Loan_ID','Loan_Status']

In [438]:
submission_final.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y


In [439]:
submission_final.to_csv('submission_file_26Sep.csv',index=False)

In [440]:
%pwd

'C:\\Users\\Saleha Sayyad'

#### Modeling with entire train data

In [None]:
xgb_final = xgb_model.fit(final_train_data, y)

In [None]:
xgb_final.predict(final_test_data)

In [None]:
xgb_final.score(final_train_data, y)

In [None]:
submission_file_new = pd.DataFrame()
submission_file_new['Loan_ID']=test_file['Loan_ID']
submission_file_new['Loan_Status']=xgb_model.predict(final_test_data)

In [None]:
submission_file_new['Loan_Status_mod'] = ['Y' if x in [1]
                                     else 'N' for x in submission_file_new['Loan_Status']]

In [None]:
submission_file_new = submission_file_new[['Loan_ID','Loan_Status_mod']]
submission_file_new.columns = ['Loan_ID','Loan_Status']

In [None]:
submission_file_new.to_csv('submission_file_SEP25_xgb_1.csv',index=False)

In [None]:
submission_file_check['v2'] = submission_file_new.Loan_Status

#### Neural Net

In [516]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [547]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD

In [552]:
classifier = Sequential()
# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 64, init = 'uniform', activation = 'relu', input_dim = X_train.shape[1]))
# Adding the second hidden layer
classifier.add(Dropout(0.2))
classifier.add(Dense(units = 64, init = 'uniform', activation = 'relu'))
classifier.add(Dropout(0.2))
# Adding the output layer
classifier.add(Dense(units = 1, init = 'uniform', activation = 'sigmoid'))

  This is separate from the ipykernel package so we can avoid doing imports until
  
  if __name__ == '__main__':


In [553]:
sgd = SGD(lr=0.01, momentum=0.8, decay=0.0, nesterov=False)

In [554]:
# Compiling Neural Network
classifier.compile(optimizer = sgd, loss = 'binary_crossentropy', metrics = ['accuracy'])

In [561]:
# Fitting our model 
classifier.fit(X_train, y_train, batch_size = , epochs = 100, validation_data=[X_test,y_test])

Train on 491 samples, validate on 123 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

<keras.callbacks.History at 0x3c674cc0>

In [562]:
# Predicting the Test set results
y_pred = classifier.predict_classes(final_test_data)

In [563]:
submission_file_new = pd.DataFrame()
submission_file_new['Loan_ID']=test_file['Loan_ID']
submission_file_new['Loan_Status']=y_pred

In [564]:
submission_file_new['Loan_Status_mod'] = ['Y' if x in [1]
                                     else 'N' for x in submission_file_new['Loan_Status']]

In [565]:
submission_file_new = submission_file_new[['Loan_ID','Loan_Status_mod']]
submission_file_new.columns = ['Loan_ID','Loan_Status']

In [566]:
submission_file_new.to_csv('submission_file_new_SEP25_neural.csv',index=False)