<a id="1"></a>
# Import data

In [1]:
# Spark Session, Pipeline, Functions, and Metrics
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import rand
from pyspark.mllib.evaluation import MulticlassMetrics

# Keras / Deep Learning
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras import optimizers, regularizers
from keras.optimizers import Adam

# Elephas for Deep Learning on Spark
from elephas.ml_model import ElephasEstimator

from pyspark.sql.functions import col,isnan, when, count
from pyspark.sql.functions import col, create_map, lit
from itertools import chain



In [2]:
# Spark Session
conf = SparkConf().setAppName('Predict Loan Payback').setMaster('local[6]') # 6 cores
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

In [3]:
# sc.stop()
sc

In [4]:
# Load Data to Spark Dataframe
df = sql_context.read.csv("./data/accepted_2007_to_2018Q4.csv", header=True, inferSchema=True)

In [5]:
# View Schema
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: double (nullable = true)
 |-- funded_amnt: double (nullable = true)
 |-- funded_amnt_inv: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- url: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: string 

<a id="2"></a>
# Preprocessing

In [6]:
df.limit(5).toPandas()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


In [7]:
df.groupBy("loan_status").count().toPandas()

Unnamed: 0,loan_status,count
0,Fully Paid,1076751
1,Default,40
2,,33
3,In Grace Period,8436
4,Does not meet the credit policy. Status:Fully ...,1988
5,Charged Off,268558
6,Oct-2015,1
7,Late (31-120 days),21467
8,Current,878317
9,Does not meet the credit policy. Status:Charge...,761


In [8]:
df = df.filter('loan_status in ("Fully Paid", "Charged Off")')

In [9]:
df.groupBy("loan_status").count().toPandas()

Unnamed: 0,loan_status,count
0,Fully Paid,1076751
1,Charged Off,268558


In [10]:
df = df.drop('id', 'pymnt_plan', 'hardship_flag', 'out_prncp', 'out_prncp_inv', 'policy_code')

In [11]:
def spark_df_shape(df):
    return df.count(), len(df.columns)

In [12]:
print(spark_df_shape(df))

(1345309, 145)


let's drop these columns, as it will not contribute to our goal.

In [13]:
to_drop = df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas()

In [14]:
to_drop

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,1345309,0,0,0,0,0,0,0,0,85785,...,1339554,1339554,226,171,1311942,1311957,1311977,1311998,1312012,1312016


In [15]:
to_drop.sum().sort_values(ascending=False).head(50)

member_id                                     1345309
next_pymnt_d                                  1345086
orig_projected_additional_accrued_interest    1341548
hardship_last_payment_amount                  1339554
hardship_payoff_balance_amount                1339554
hardship_dpd                                  1339550
hardship_loan_status                          1339548
hardship_start_date                           1339547
hardship_end_date                             1339546
hardship_length                               1339545
payment_plan_start_date                       1339543
hardship_amount                               1339540
deferral_term                                 1339532
hardship_status                               1339521
hardship_reason                               1339513
hardship_type                                 1339504
sec_app_mths_since_last_major_derog           1338662
sec_app_revol_util                            1327004
sec_app_num_rev_accts       

In [16]:
to_drop.sum().sort_values(ascending=False)[50:].head(50)

open_rv_12m                   807699
open_rv_24m                   807693
total_bal_il                  807685
open_il_24m                   807678
open_il_12m                   807673
open_act_il                   807645
open_acc_6m                   807631
mths_since_last_delinq        678599
mths_since_recent_inq         174049
num_tl_120dpd_2m              117400
mo_sin_old_il_acct            105485
emp_title                      85785
emp_length                     78511
pct_tl_nvr_dlq                 67681
avg_cur_bal                    67546
num_tl_30dpd                   67527
num_bc_tl                      67526
num_rev_accts                  67526
num_tl_op_past_12m             67525
num_actv_rev_tl                67525
num_tl_90g_dpd_24m             67525
num_rev_tl_bal_gt_0            67525
total_rev_hi_lim               67522
num_il_tl                      67521
num_op_rev_tl                  67520
num_actv_bc_tl                 67518
num_accts_ever_120_pd          67516
t

The first 95 columns has lots of nulls so I will drop them

In [17]:
to_drop = list(to_drop.sum().sort_values(ascending=False).head(95).index)

In [18]:
print(to_drop)

['member_id', 'next_pymnt_d', 'orig_projected_additional_accrued_interest', 'hardship_last_payment_amount', 'hardship_payoff_balance_amount', 'hardship_dpd', 'hardship_loan_status', 'hardship_start_date', 'hardship_end_date', 'hardship_length', 'payment_plan_start_date', 'hardship_amount', 'deferral_term', 'hardship_status', 'hardship_reason', 'hardship_type', 'sec_app_mths_since_last_major_derog', 'sec_app_revol_util', 'sec_app_num_rev_accts', 'sec_app_open_act_il', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'sec_app_open_acc', 'sec_app_mort_acc', 'sec_app_inq_last_6mths', 'sec_app_fico_range_low', 'revol_bal_joint', 'sec_app_earliest_cr_line', 'sec_app_fico_range_high', 'verification_status_joint', 'dti_joint', 'annual_inc_joint', 'settlement_term', 'settlement_percentage', 'settlement_amount', 'settlement_date', 'settlement_status', 'debt_settlement_flag_date', 'desc', 'mths_since_last_record', 'mths_since_recent_bc_dlq', 'mths_since_last_major_derog',

In [19]:
df = df.drop(*to_drop)
print((df.count(), len(df.columns)))

(1345309, 50)


In [20]:
df = df.na.drop()

In [21]:
print(spark_df_shape(df))

(1340812, 50)


In [22]:
to_drop = ['fico_range_low', 'funded_amnt_inv', 'funded_amnt', 'total_pymnt_inv', 'total_pymnt', 'installment', 'collection_recovery_fee', 'total_rec_prncp', 'last_fico_range_low']
df = df.drop(*to_drop)

In [23]:
print(spark_df_shape(df))

(1340812, 41)


In [24]:
mapping = {'Fully Paid': 1, 'Charged Off': 0}

mapping_expr = create_map([lit(x) for x in chain(*mapping.items())])

df = df.withColumn("loan_is_paid", mapping_expr[col("loan_status")])

In [25]:
df.groupBy("loan_is_paid").count().toPandas()

Unnamed: 0,loan_is_paid,count
0,1,1074961
1,0,265851


In [26]:
df = df.drop('loan_status')

In [27]:
distinct_terms = [x.term for x in df.select('term').distinct().collect()]

In [28]:
distinct_terms

[' 36 months', ' 60 months']

In [29]:
mapping = dict(zip(distinct_terms, [36, 60]))
mapping_expr = create_map([lit(x) for x in chain(*mapping.items())])
df = df.withColumn("term_months", mapping_expr[col("term")])

In [30]:
df.select("term_months").distinct().show()

+-----------+
|term_months|
+-----------+
|         60|
|         36|
+-----------+



In [31]:
df = df.drop('term')

In [32]:
df.printSchema()

root
 |-- loan_amnt: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- url: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- earliest_cr_line: string (nullable = true)
 |-- fico_range_high: string (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- open_acc: string (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- revol_bal: string (nullable = true)
 |-- revol_util: string (nullable = true)
 |-- total_acc: string (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- total_rec_int: string (n

In [33]:
df.groupBy('home_ownership').count().toPandas()

Unnamed: 0,home_ownership,count
0,OWN,144179
1,RENT,532381
2,MORTGAGE,663782
3,ANY,283
4,OTHER,142
5,NONE,45


In [34]:
df = df.replace('NONE', 'ANY', 'home_ownership')

In [35]:
df.groupBy('home_ownership').count().toPandas()

Unnamed: 0,home_ownership,count
0,OWN,144179
1,RENT,532381
2,MORTGAGE,663782
3,ANY,328
4,OTHER,142


In [36]:
to_drop=['grade', 'issue_d', 'url', 'last_pymnt_d', 'last_credit_pull_d', 'zip_code', 'addr_state', 'earliest_cr_line']
df = df.drop(*to_drop)

In [37]:
df.printSchema()

root
 |-- loan_amnt: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- dti: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- fico_range_high: string (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- open_acc: string (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- revol_bal: string (nullable = true)
 |-- revol_util: string (nullable = true)
 |-- total_acc: string (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- total_rec_int: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- recoveries: string (nullable = true)
 |-- last_pymnt_amnt: string (nullable = true)
 |-- last_fico_range_high: string (nullable = true)
 |-- collections_12_mths_ex_med: s

In [38]:
cols = ['annual_inc', 'dti', 'delinq_2yrs', 'fico_range_high', 'inq_last_6mths', 
        'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'total_rec_int', 
        'total_rec_late_fee', 'recoveries', 'last_pymnt_amnt', 'last_fico_range_high', 
        'collections_12_mths_ex_med', 'acc_now_delinq']
for col_name in cols:
    df = df.withColumn(col_name, col(col_name).cast('float'))

In [39]:
df.printSchema()

root
 |-- loan_amnt: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- dti: float (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- fico_range_high: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- open_acc: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- revol_bal: float (nullable = true)
 |-- revol_util: float (nullable = true)
 |-- total_acc: float (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- total_rec_int: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- recoveries: float (nullable = true)
 |-- last_pymnt_amnt: float (nullable = true)
 |-- last_fico_range_high: float (nullable = true)
 |-- collections_12_mths_ex_med: float (nullable 

In [40]:
df.limit(5).toPandas()

Unnamed: 0,loan_amnt,int_rate,sub_grade,home_ownership,annual_inc,verification_status,purpose,dti,delinq_2yrs,fico_range_high,...,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,disbursement_method,debt_settlement_flag,loan_is_paid,term_months
0,3600.0,13.99,C4,MORTGAGE,55000.0,Not Verified,debt_consolidation,5.91,0.0,679.0,...,Individual,0.0,0.0,0.0,0.0,0.0,Cash,N,1,36
1,24700.0,11.99,C1,MORTGAGE,65000.0,Not Verified,small_business,16.059999,1.0,719.0,...,Individual,0.0,0.0,0.0,0.0,0.0,Cash,N,1,36
2,20000.0,10.78,B4,MORTGAGE,63000.0,Not Verified,home_improvement,10.78,0.0,699.0,...,Joint App,0.0,0.0,0.0,0.0,0.0,Cash,N,1,60
3,10400.0,22.45,F1,MORTGAGE,104433.0,Source Verified,major_purchase,25.370001,1.0,699.0,...,Individual,0.0,0.0,0.0,0.0,0.0,Cash,N,1,60
4,11950.0,13.44,C3,RENT,34000.0,Source Verified,debt_consolidation,10.2,0.0,694.0,...,Individual,0.0,0.0,0.0,0.0,0.0,Cash,N,1,36


In [41]:
# Spark Pipeline
cat_features = ['sub_grade', 'verification_status', 'application_type', 'initial_list_status', 
                'purpose', 'home_ownership', 'disbursement_method', 'debt_settlement_flag']
assembler_inputs = list(set(df.columns) - set(cat_features)) + [feature + "_class_vec" for feature in cat_features]
assembler_inputs.remove('loan_is_paid')
assembler_inputs

['delinq_2yrs',
 'total_rec_int',
 'pub_rec',
 'total_acc',
 'inq_last_6mths',
 'revol_bal',
 'annual_inc',
 'collections_12_mths_ex_med',
 'chargeoff_within_12_mths',
 'term_months',
 'loan_amnt',
 'last_fico_range_high',
 'dti',
 'tax_liens',
 'last_pymnt_amnt',
 'int_rate',
 'fico_range_high',
 'recoveries',
 'acc_now_delinq',
 'pub_rec_bankruptcies',
 'open_acc',
 'revol_util',
 'delinq_amnt',
 'total_rec_late_fee',
 'sub_grade_class_vec',
 'verification_status_class_vec',
 'application_type_class_vec',
 'initial_list_status_class_vec',
 'purpose_class_vec',
 'home_ownership_class_vec',
 'disbursement_method_class_vec',
 'debt_settlement_flag_class_vec']

In [42]:
# Pipeline Stages List
stages = []

# Loop for StringIndexer and OHE for Categorical Variables
for features in cat_features:
    # Index Categorical Features
    string_indexer = StringIndexer(inputCol=features, outputCol=features + "_index")
    # One Hot Encode Categorical Features
    encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()], outputCols=[features + "_class_vec"])
    # Append Pipeline Stages
    stages += [string_indexer, encoder]
    
assembler_final = VectorAssembler(inputCols=assembler_inputs, outputCol="features") 

stages += [assembler_final]

In [43]:
stages

[StringIndexer_08ebfc408b9c,
 OneHotEncoder_5d27316ab1da,
 StringIndexer_70f3772530f6,
 OneHotEncoder_5b5e72be0fc1,
 StringIndexer_ad77d5d9db24,
 OneHotEncoder_4fe6a2e30042,
 StringIndexer_2ac75b52c687,
 OneHotEncoder_15952299a546,
 StringIndexer_746f476e4534,
 OneHotEncoder_9714416b81bb,
 StringIndexer_5ac5385ac9f2,
 OneHotEncoder_2a8cd8326554,
 StringIndexer_fe5f22f11a6e,
 OneHotEncoder_2eafbf779c70,
 StringIndexer_5f531be3727f,
 OneHotEncoder_944056b8ad6d,
 VectorAssembler_c4a389ebe69b]

In [44]:
# Set Pipeline
pipeline = Pipeline(stages=stages)

In [45]:
# Fit Pipeline to Data
pipeline_model = pipeline.fit(df)

In [46]:
# Transform Data using Fitted Pipeline
df = pipeline_model.transform(df)

In [47]:
# Preview Newly Transformed Data
df.limit(5).toPandas()

Unnamed: 0,loan_amnt,int_rate,sub_grade,home_ownership,annual_inc,verification_status,purpose,dti,delinq_2yrs,fico_range_high,...,initial_list_status_class_vec,purpose_index,purpose_class_vec,home_ownership_index,home_ownership_class_vec,disbursement_method_index,disbursement_method_class_vec,debt_settlement_flag_index,debt_settlement_flag_class_vec,features
0,3600.0,13.99,C4,MORTGAGE,55000.0,Not Verified,debt_consolidation,5.91,0.0,679.0,...,(1.0),0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",0.0,(1.0),0.0,(1.0),"(0.0, 821.719970703125, 0.0, 13.0, 1.0, 2765.0..."
1,24700.0,11.99,C1,MORTGAGE,65000.0,Not Verified,small_business,16.059999,1.0,719.0,...,(1.0),6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",0.0,(1.0),0.0,(1.0),"(1.0, 979.6599731445312, 0.0, 38.0, 4.0, 21470..."
2,20000.0,10.78,B4,MORTGAGE,63000.0,Not Verified,home_improvement,10.78,0.0,699.0,...,(1.0),2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",0.0,(1.0),0.0,(1.0),"(0.0, 2705.919921875, 0.0, 18.0, 0.0, 7869.0, ..."
3,10400.0,22.45,F1,MORTGAGE,104433.0,Source Verified,major_purchase,25.370001,1.0,699.0,...,(1.0),4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0)",0.0,(1.0),0.0,(1.0),"(1.0, 1340.5, 0.0, 35.0, 3.0, 21929.0, 104433...."
4,11950.0,13.44,C3,RENT,34000.0,Source Verified,debt_consolidation,10.2,0.0,694.0,...,(1.0),0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0)",0.0,(1.0),0.0,(1.0),"(0.0, 1758.949951171875, 0.0, 6.0, 0.0, 8822.0..."


In [48]:
df.printSchema()

root
 |-- loan_amnt: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- dti: float (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- fico_range_high: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- open_acc: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- revol_bal: float (nullable = true)
 |-- revol_util: float (nullable = true)
 |-- total_acc: float (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- total_rec_int: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- recoveries: float (nullable = true)
 |-- last_pymnt_amnt: float (nullable = true)
 |-- last_fico_range_high: float (nullable = true)
 |-- collections_12_mths_ex_med: float (nullable 

In [49]:
df = df.select('features','loan_is_paid')
df.limit(20).toPandas()

Unnamed: 0,features,loan_is_paid
0,"(0.0, 821.719970703125, 0.0, 13.0, 1.0, 2765.0...",1
1,"(1.0, 979.6599731445312, 0.0, 38.0, 4.0, 21470...",1
2,"(0.0, 2705.919921875, 0.0, 18.0, 0.0, 7869.0, ...",1
3,"(1.0, 1340.5, 0.0, 35.0, 3.0, 21929.0, 104433....",1
4,"(0.0, 1758.949951171875, 0.0, 6.0, 0.0, 8822.0...",1
5,"(0.0, 1393.800048828125, 0.0, 27.0, 0.0, 87329...",1
6,"(1.0, 1538.510009765625, 0.0, 15.0, 0.0, 826.0...",1
7,"(0.0, 998.969970703125, 1.0, 23.0, 1.0, 10464....",1
8,"(0.0, 939.5800170898438, 0.0, 18.0, 0.0, 7034....",1
9,"(0.0, 175.16000366210938, 0.0, 24.0, 0.0, 3782...",1


In [50]:
# Shuffle Data
df = df.orderBy(rand())

In [51]:
# Split Data into Train / Test Sets
train_data, test_data = df.randomSplit([.8, .2], seed=42)

In [52]:
print(spark_df_shape(df))

(1340812, 2)


In [53]:
# Number of Inputs or Input Dimensions
input_dim = len(train_data.select("features").first()[0])
input_dim

81

In [54]:
print(spark_df_shape(train_data))

(1073352, 2)


In [55]:
print(spark_df_shape(test_data))

(267460, 2)


In [56]:
def create_model():
    # Set up Deep Learning Model / Architecture
    model = Sequential()
    model.add(Dense(units=78, input_shape=(input_dim,), activation='relu'))
    model.add(Dense(units=39,activation='relu'))
    model.add(Dense(units=19,activation='relu'))
    model.add(Dense(units=8,activation='relu'))
    model.add(Dense(units=4,activation='relu'))
    model.add(Dense(units=1,activation='sigmoid'))
    return model

def compile_model(model):
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [57]:
model = create_model()
model = compile_model(model)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 78)                6396      
_________________________________________________________________
dense_1 (Dense)              (None, 39)                3081      
_________________________________________________________________
dense_2 (Dense)              (None, 19)                760       
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 160       
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 5         
Total params: 10,438
Trainable params: 10,438
Non-trainable params: 0
____________________________________________________

In [73]:
# adam = optimizers.Adam(lr=0.01)
# opt_conf = optimizers.serialize(adam)

# # Initialize SparkML Estimator and set all relevant properties
# estimator = ElephasEstimator()
# estimator.setFeaturesCol("features")
# estimator.setLabelCol("loan_is_paid")
# estimator.set_keras_model_config(model.to_yaml()) # Provide serialized Keras model
# estimator.set_num_workers(1)                      # We just use one worker here. Feel free to adapt it.
# estimator.set_epochs(10)
# estimator.set_batch_size(512)
# estimator.set_verbosity(1)
# estimator.set_validation_split(0.1)
# estimator.set_categorical_labels(True)
# estimator.set_nb_classes(2)
# estimator.set_optimizer_config(opt_conf)
# estimator.set_mode("synchronous")
# estimator.set_loss("binary_crossentropy")
# estimator.set_metrics(['acc'])


# # from elephas.spark_model import SparkModel

# # spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
# # spark_model.fit(rdd, epochs=5, batch_size=512, verbose=1, validation_split=0.1)
epochs = 40
batch_size = 512
nb_classes = 2
estimator = ElephasEstimator(epochs=epochs, batch_size=batch_size, frequency='batch', mode='asynchronous', categorical=True, nb_classes=nb_classes)
# estimator = ElephasEstimator(epochs=epochs, batch_size=batch_size, frequency='batch', mode='asynchronous', categorical=True)

estimator.setFeaturesCol("features")
estimator.setLabelCol("loan_is_paid")
estimator.set_keras_model_config(model.to_yaml())
estimator.set_verbosity(1)
estimator.set_validation_split(0.1)
adam = optimizers.Adam(lr=0.01)
opt_conf = optimizers.serialize(adam)

estimator.set_loss("binary_crossentropy")
estimator.set_metrics(['acc'])

ElephasEstimator_e7c423d315f4

In [59]:
train_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- loan_is_paid: integer (nullable = true)



In [60]:
test_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- loan_is_paid: integer (nullable = true)



In [74]:
dl_pipeline = Pipeline(stages=[estimator])

In [75]:
fit_dl_pipeline = dl_pipeline.fit(train_data)

ValueError: Could not interpret optimizer identifier: None

In [None]:
pred_test = fit_dl_pipeline.transform(test_data)

In [None]:
pred_test.limit(10).toPandas()

In [None]:
pnl_test = pred_test.select("loan_is_paid", "prediction")

In [None]:
pred_and_label_test = pnl_test.rdd.map(lambda row: (row["loan_is_paid"], row['prediction']))

In [None]:
metrics_test = MulticlassMetrics(pred_and_label_test)

In [None]:
print("\nTest Data Accuracy: {}".format(metrics_test.weightedPrecision))

In [None]:
print("Test Data Confusion Matrix")
display(pnl_test.crosstab('loan_is_paid', 'prediction').toPandas())