# <span style='color:IndianRed'> Distributed Deep Learning Pipelines with PySpark and Keras.</span>

__Objective__: Build a distributed deep learning pipeline for tabular data using PySpark ML Pipelines, Keras, and Elephas.

In [1]:
# Import Libraries. 

# PySpark Session.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler
from pyspark.sql.functions import rand
from pyspark.mllib.evaluation import MulticlassMetrics

# Keras Deep Learning. 
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras import optimizers, regularizers
from keras.optimizers import Adam

# Elephas for Deep Learning-Spark.
from elephas.ml_model import ElephasEstimator

In [2]:
# Start (or reuse) the Spark session for this notebook. The application is
# named "Spark DL Tabular Pipeline" and uses the local master URL "local[2]".
spark = (
    SparkSession.builder
    .appName("Spark DL Tabular Pipeline")
    .master("local[2]")
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook displays it.
spark

In [3]:
# Load bank.csv into a Spark DataFrame. The first row supplies the column
# names, and column types are inferred from the data.
df = spark.read.csv(
    path="bank.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Print the DataFrame schema (column names, types, and nullability).
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [5]:
# Preview 5 rows of the DataFrame, converted to pandas for rich display.
df.limit(5).toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [6]:
# Drop unnecessary columns (day and month).
# DataFrame.drop returns a new DataFrame instead of modifying `df` in place,
# so the result must be assigned back; otherwise every later cell would still
# see the `day` and `month` columns. Dropping a column that is already absent
# is a no-op, so this cell is safe to re-run.
df = df.drop("day", "month")

# Display the resulting DataFrame so the remaining columns are visible.
df

DataFrame[age: int, job: string, marital: string, education: string, default: string, balance: int, housing: string, loan: string, contact: string, duration: int, campaign: int, pdays: int, previous: int, poutcome: string, deposit: string]

## <span style='color:mediumseagreen'> Helper Function to Select Features.</span>

In [None]:
# Helper function to select features to scale given their skew.