### import packages

In [2]:
# from pyspark.context import SparkContext
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, StandardScaler, VectorAssembler 
from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import DoubleType

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras import optimizers, regularizers
from keras.optimizers import Adam

from elephas.ml_model import ElephasEstimator

#### set up spark context and increase available memory

In [3]:
# Spark configuration: run locally on all available cores and allow
# Spark SQL cross joins.
conf = (
    SparkConf()
    .setAppName('Spark DL Tabular Pipeline')
    .setMaster('local[*]')
    .set("spark.sql.crossJoin.enabled", "true")
)

# Raise the executor memory setting; it is applied as a system property
# before the SparkContext is instantiated.
SparkContext.setSystemProperty('spark.executor.memory', '6g')
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

# List the effective configuration to verify the settings above.
sc._conf.getAll()

[('spark.executor.memory', '6g'),
 ('spark.driver.port', '54406'),
 ('spark.driver.host', 'DESKTOP-RE98QFK'),
 ('spark.sql.crossJoin.enabled', 'true'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.id', 'local-1622810173134'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'Spark DL Tabular Pipeline')]

In [4]:
# Load the hourly weather observations; the CSV has a header row and the
# column types are inferred by Spark.
df_weather_temp = (
    sql_context.read
    .options(header=True, inferSchema=True)
    .csv('datasets/hourly_weather.csv')
)

# Derive `Hour` from the two characters at (1-based) positions 12-13 of the
# `date` value, cast to double.
df_weather = df_weather_temp.withColumn(
    'Hour',
    df_weather_temp.date.substr(12, 2).cast('double'),
)

# Load the accident records the same way and drop their `Hour` column
# (presumably so only the weather-derived `Hour` survives the join).
df_accidents_temp = (
    sql_context.read
    .options(header=True, inferSchema=True)
    .csv('datasets/accidents.csv')
)
df_accidents = df_accidents_temp.drop('Hour')

# Join the two datasets.
# NOTE(review): no join keys are supplied, so this left join has no condition
# and appears to pair every weather row with every accident row — confirm
# that this is intended and supply `on=` if it is not.
df = df_weather.join(df_accidents, how='left')

# Remove every column that is used neither as a feature nor as the label.
df = df.drop(
    'AccidentLocation_CHLV95_N', 'AccidentLocation_CHLV95_E', 'RoadType',
    'AccidentInvolvingMotorcycle', 'AccidentInvolvingBicycle', 'AccidentInvolvingPedestrian',
    'AccidentType', 'date', '_c0',
)

# Print the schema of the joined DataFrame.
df.printSchema()

root
 |-- air_temperature: double (nullable = true)
 |-- water_temperature: double (nullable = true)
 |-- wind_gust_max_10min: double (nullable = true)
 |-- wind_speed_avg_10min: double (nullable = true)
 |-- wind_force_avg_10min: double (nullable = true)
 |-- wind_direction: integer (nullable = true)
 |-- windchill: double (nullable = true)
 |-- barometric_pressure_qfe: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- dew_point: double (nullable = true)
 |-- global_radiation: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- water_level: double (nullable = true)
 |-- Hour: double (nullable = true)
 |-- AccidentSeverityCategory: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- WeekDay: string (nullable = true)



In [5]:
#df_accidents.limit(5).toPandas()

In [6]:
#df_weather.limit(5).toPandas()

## Spark pipeline creation
- Create spark pipeline to:
    - Apply one hot encoding
    - Define X matrix and y label
    - Add all stages to the pipeline
    
Note: fitting the StringIndexer stages is very slow on this dataset.

In [7]:
# Feature-engineering pipeline: index the string columns, one-hot encode the
# week day and the hour, then assemble all inputs into one 'features' vector.
stages = []

# Numeric input columns: everything in df except the two string columns.
# 'AccidentSeverityCategory' is the source of the label and must not leak into
# the features; 'WeekDay' is represented by its one-hot vector instead.
# VectorAssembler does not accept string columns.
X = [name for name in df.columns
     if name not in ('AccidentSeverityCategory', 'WeekDay')]

# Index accident severity (the label) and week day.
severity_indexer = StringIndexer(inputCol='AccidentSeverityCategory', outputCol='severityIndex')
day_indexer = StringIndexer(inputCol='WeekDay', outputCol='weekDayIndex')
stages += [severity_indexer, day_indexer]

# One-hot encode the indexed week day and the hour of day.
ohe_weekday = OneHotEncoder(inputCol='weekDayIndex', outputCol='weekday_vec')
ohe_hour = OneHotEncoder(inputCol='Hour', outputCol='hour_vec')

# Final feature list (numeric columns + one-hot vectors) and label column.
X += ['weekday_vec', 'hour_vec']
y = 'severityIndex'

assembler_final = VectorAssembler(inputCols=X, outputCol='features')

# The encoders must run after the indexers, and the assembler last.
stages += [ohe_weekday, ohe_hour, assembler_final]

# Create pipeline and pass all stages
pipeline = Pipeline(stages=stages)

In [8]:
# Display the pipeline stages in the order they will be executed.
stages

[StringIndexer_5f17e8875dc4,
 StringIndexer_86799480d0e4,
 OneHotEncoder_0c46231a04fe,
 OneHotEncoder_3d7ee2486ba4,
 VectorAssembler_27b746a699d5]

In [None]:
# Fit every stage of the feature pipeline on the joined DataFrame
pipeline_model = pipeline.fit(df)

# Apply the fitted stages to the same DataFrame; this adds the indexed,
# one-hot encoded and assembled 'features' columns
df_transform = pipeline_model.transform(df)

In [None]:
# Inspect the first five rows of the transformed data as a pandas DataFrame
df_transform.limit(5).toPandas()

In [None]:
# Keep only the assembled feature vector and the indexed label for modelling
df_transform_fin = df_transform.select('features', 'severityIndex')
# Preview the first five rows
df_transform_fin.limit(5).toPandas()

In [None]:
# Shuffle the rows. The random ordering is seeded so that repeated runs see
# the same row order; an unseeded shuffle would make any seeded sampling or
# splitting performed on this DataFrame afterwards non-reproducible.
df_transform_fin = df_transform_fin.orderBy(rand(seed=42))

In [None]:
# Train / test split: 80% / 20% with a fixed seed
train_data, test_data = df_transform_fin.randomSplit([.8, .2], seed=42)
# Persist the training split so it can be reused without being recomputed
train_data.persist()


In [None]:
# Number of output classes for the network.
# Hard-coded because counting the distinct labels was too computationally
# expensive:
# nb_classes = train_data.select('severityIndex').distinct().count()
# NOTE(review): must equal the number of distinct 'severityIndex' values — confirm.
nb_classes = 4

In [None]:
#train_data.select('features').limit(5).toPandas()

In [None]:
#df.select('AccidentSeverityCategory').limit(5).toPandas()

In [None]:
# Input dimension: length of the assembled 'features' vector.
# Hard-coded instead of reading it from the first training row:
#input_dim = len(train_data.select('features').first()[0])
# NOTE(review): must match the size of the VectorAssembler output — confirm.
input_dim = 44

In [None]:
# Feed-forward classifier: two hidden blocks (Dense(256) -> ReLU -> Dropout)
# followed by a softmax output with one unit per class.
model = Sequential()

# First hidden block. Only the first layer declares the input shape; the
# L2 activity regularizer is applied to the Dense output before the ReLU.
model.add(Dense(256, input_shape=(input_dim,), activity_regularizer=regularizers.l2(0.01)))
model.add(Activation('relu'))
model.add(Dropout(rate=0.3))

# Second hidden block. Its input shape is inferred from the previous layer,
# so no input_shape argument is passed here.
model.add(Dense(256, activity_regularizer=regularizers.l2(0.01)))
model.add(Activation('relu'))
model.add(Dropout(rate=0.3))

# Output layer: class probabilities over nb_classes.
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Print the layer-by-layer summary of the network
model.summary()

In [None]:
# Set up the optimizer and serialize it so it can be handed to the estimator
optimizer_conf = optimizers.Adam(lr=0.01)
opt_conf = optimizers.serialize(optimizer_conf)

# Initialize the Spark ML estimator and configure it
estimator = ElephasEstimator()
# Input columns: assembled feature vector and indexed label
estimator.setFeaturesCol('features')
estimator.setLabelCol('severityIndex')
# Hand the network architecture over as a serialized (YAML) model config
estimator.set_keras_model_config(model.to_yaml())
# Labels are categorical, with nb_classes distinct classes
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
# Training settings
estimator.set_num_workers(1)
estimator.set_epochs(2)
estimator.set_batch_size(64)
estimator.set_verbosity(1)
estimator.set_validation_split(0.1)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode('synchronous')
estimator.set_loss('categorical_crossentropy')
estimator.set_metrics(['acc'])

In [None]:
# Create the deep learning pipeline; the Elephas estimator is its only stage
dl_pipeline = Pipeline(stages=[estimator])

In [None]:
# Helper function for fitting, transforming and predicting train and test
def _report_split(fitted_pipeline, data, label, accuracy_msg, matrix_msg):
    """Score one data split and print its accuracy and confusion matrix."""
    pred_and_label_df = fitted_pipeline.transform(data).select(label, 'prediction')

    # MulticlassMetrics expects an RDD of (prediction, label) pairs, in that order.
    pred_and_label = pred_and_label_df.rdd.map(
        lambda row: (row['prediction'], row[label]))
    metrics = MulticlassMetrics(pred_and_label)

    # Overall accuracy; the no-argument precision() call is not available in
    # newer Spark releases.
    print(accuracy_msg.format(round(metrics.accuracy, 4)))
    print(matrix_msg)
    # Cross-tabulate using the caller-supplied label column.
    display(pred_and_label_df.crosstab(label, 'prediction').toPandas())


def dl_pip_fit_score_res(dl_pipeline=dl_pipeline, train_data=train_data,
                        test_data=test_data, label='labelIndex'):
    """Fit the pipeline on the training data and report train/test quality.

    The pipeline is fitted on ``train_data`` and then used to predict both
    ``train_data`` and ``test_data``. For each split the overall accuracy and
    a label-vs-prediction cross table are printed/displayed.

    Args:
        dl_pipeline: Pipeline to fit; its fitted model must add a
            'prediction' column when transforming.
        train_data: DataFrame used for fitting and for the training report.
        test_data: DataFrame used for the test report.
        label: Name of the label column present in both DataFrames.

    Returns:
        None. Results are printed and displayed.
    """
    fit_dl_pipeline = dl_pipeline.fit(train_data)

    _report_split(fit_dl_pipeline, train_data, label,
                  'Train data accuracy: {}', 'Training Data Confusion Matrix')
    _report_split(fit_dl_pipeline, test_data, label,
                  'Test data accuracy: {}', 'Test Data Confusion Matrix')
    

In [None]:
# Fit and evaluate the deep learning pipeline. The label column in
# train_data / test_data is 'severityIndex'; they contain no 'labelIndex' column.
dl_pip_fit_score_res(dl_pipeline=dl_pipeline, train_data=train_data,
                     test_data=test_data, label='severityIndex')