In [1]:
# from pyspark.context import SparkContext
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, StandardScaler, VectorAssembler 
from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.mllib.evaluation import MulticlassMetrics

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras import optimizers, regularizers
from keras.optimizers import Adam

from elephas.ml_model import ElephasEstimator



In [2]:
conf = SparkConf().setAppName('Spark DL Tabular Pipeline').setMaster('local[6]')

sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

#spark = SparkSession(sc)

In [3]:
# Read the weather and the accident df

df_weather = sql_context.read\
    .options(header=True, inferSchema=True)\
    .csv('datasets/hourly_weather.csv')
df_accidents = sql_context.read\
    .options(header=True, inferSchema=True)\
    .csv('datasets/accidents.csv')

In [4]:
df = df_weather.join(df_accidents, how='left')
df.show(1)
df.printSchema()

+---+-------------------+-----------------+-----------------+-------------------+--------------------+--------------------+--------------+------------------+-----------------------+-------------+------------------+----------------+-----------------+-----------+---+-------------------+------------+------------------------+---------------------------+------------------------+---------------------------+--------+-------------------------+-------------------------+-----+--------+----+
|_c0|               date|  air_temperature|water_temperature|wind_gust_max_10min|wind_speed_avg_10min|wind_force_avg_10min|wind_direction|         windchill|barometric_pressure_qfe|precipitation|         dew_point|global_radiation|         humidity|water_level|_c0|               date|AccidentType|AccidentSeverityCategory|AccidentInvolvingPedestrian|AccidentInvolvingBicycle|AccidentInvolvingMotorcycle|RoadType|AccidentLocation_CHLV95_E|AccidentLocation_CHLV95_N|Month| WeekDay|Hour|
+---+-------------------+---

In [5]:
df = df.drop(*['AccidentLocation_CHLV95_N', 'AccidentLocation_CHLV95_E', 'RoadType',\
               'AccidentInvolvingMotorcycle', 'AccidentInvolvingBicycle', 'AccidentInvolvingPedestrian',\
              'AccidentSeverityCategory', 'date', '_c0'])
df.printSchema()

root
 |-- air_temperature: double (nullable = true)
 |-- water_temperature: double (nullable = true)
 |-- wind_gust_max_10min: double (nullable = true)
 |-- wind_speed_avg_10min: double (nullable = true)
 |-- wind_force_avg_10min: double (nullable = true)
 |-- wind_direction: integer (nullable = true)
 |-- windchill: double (nullable = true)
 |-- barometric_pressure_qfe: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- dew_point: double (nullable = true)
 |-- global_radiation: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- water_level: double (nullable = true)
 |-- AccidentType: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- WeekDay: string (nullable = true)
 |-- Hour: double (nullable = true)



In [6]:
def sel_feature_scale(df=df, lower_skew=-2, upper_skew=2, dtypes='int32', drop_cols=['']):
    
    #empty sel features
    selected_features = []
    
    # Select features to scale based on inputs
    feature_list = list(df.toPandas().select_dtypes(include=[dtypes]).columns.drop(drop_cols))
    
    # Loop through feature list to select features based on Kurtosis / Skew
    for feature in feature_list:
        if df.toPandas()[feature].kurtosis() < -2 or df.toPandas()[feature].kurtosis() > 2:
            selected_features.append(feature)
    # Return feature list to scale
    return selected_features

In [7]:
# Create a spark pipeline


cat_features = ['WeekDay']

cat_indexed_features = ['Month', 'Hour']

num_features = ['air_temperature', 'water_temperature', 'wind_gust_max_10min',\
                   'wind_speed_avg_10min', 'wind_force_avg_10min', 'wind_direction',\
                   'windchill', 'barometric_pressure_qfe', 'precipitation',\
                   'dew_point', 'global_radiation', 'humidity', 'water_level']

label = 'AccidentType'


# Pipeline stages list
stages = []

# Loop for stringindexer and ohe for categorical variables
for feature in cat_features:
    
    # Index categorical features
    string_indexer = StringIndexer(inputCol=feature, outputCol=feature + '_index')
    
    # Apply one hot encoding of categorical variables
    encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()],
                                    outputCols=[feature + '_class_vec'])
    
    # Append pipeline stages
    stages += [string_indexer, encoder]
    
for feature in cat_indexed_features:
    
    # Apply one hot encoding of categorical variables
    encoder = OneHotEncoder(inputCol=feature, outputCol=feature + '_class_vec')
    
    # Append pipeline stages
    stages += [encoder]

# Index label feature
label_str_index = StringIndexer(inputCol=label, outputCol='label_index')

# Scale feature: select the features to scale using the helper funcntion and standardize
# unscaled_features = sel_feature_scale(df=df, lower_skew=-2, upper_skew=2, dtypes='integer', drop_cols=[])

# unscaled_assembler = VectorAssembler(inputCols=unscaled_features, outputCol='unscaled_features')
# scaler = StandardScaler(inputCol='unscaled_features', outputCol='scaled_features')

# stages += [unscaled_assembler, scaler]

# Create list of Numeric Features that are not being scaled
# num_unscaled_diff_list = list(set(num_features) - set(unscaled_features))
num_unscaled_diff_list = list(set(num_features))

# Assemble or concat the categorical features and numeric features
assembler_inputs = [feature + '_class_vec' for feature in cat_features] + [feature + '_class_vec' for feature in cat_indexed_features] + num_unscaled_diff_list

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol='assembled_inputs')

stages += [label_str_index, assembler]

# Assemble Final training data of scaled, numeric and categorical engeneered features
assembler_final = VectorAssembler(inputCols=['scaled_features', 'assembled_inputs'], outputCol='features')

stages += [assembler_final]

In [8]:
stages

[StringIndexer_899179010f3f,
 OneHotEncoder_4d38b474f2b6,
 OneHotEncoder_8f951ed0aea2,
 OneHotEncoder_94a870d32884,
 StringIndexer_b0df2f1672f3,
 VectorAssembler_baf4ff7ea461,
 VectorAssembler_b1b16cf94178]

In [9]:
# set pipeline
pipeline = Pipeline(stages=stages)

# Fit pipeline to data
pipeline_model = pipeline.fit(df)

#tranform data using fitted pipeline
df_transform = pipeline_model.tranform(df)

Py4JJavaError: An error occurred while calling o49.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 12.0 failed 1 times, most recent failure: Lost task 0.0 in stage 12.0 (TID 18) (192.168.1.26 executor driver): java.lang.NullPointerException: Value at index 0 is null
	at org.apache.spark.sql.Row.getAnyValAs(Row.scala:523)
	at org.apache.spark.sql.Row.getDouble(Row.scala:270)
	at org.apache.spark.sql.Row.getDouble$(Row.scala:270)
	at org.apache.spark.sql.catalyst.expressions.GenericRow.getDouble(rows.scala:166)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.$anonfun$getOutputAttrGroupFromData$3(OneHotEncoder.scala:520)
	at scala.runtime.java8.JFunction1$mcDI$sp.apply(JFunction1$mcDI$sp.java:23)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.$anonfun$getOutputAttrGroupFromData$2(OneHotEncoder.scala:520)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:219)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:219)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1429)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2297)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1183)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1177)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1222)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.getOutputAttrGroupFromData(OneHotEncoder.scala:521)
	at org.apache.spark.ml.feature.OneHotEncoder.fit(OneHotEncoder.scala:196)
	at org.apache.spark.ml.feature.OneHotEncoder.fit(OneHotEncoder.scala:128)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException: Value at index 0 is null
	at org.apache.spark.sql.Row.getAnyValAs(Row.scala:523)
	at org.apache.spark.sql.Row.getDouble(Row.scala:270)
	at org.apache.spark.sql.Row.getDouble$(Row.scala:270)
	at org.apache.spark.sql.catalyst.expressions.GenericRow.getDouble(rows.scala:166)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.$anonfun$getOutputAttrGroupFromData$3(OneHotEncoder.scala:520)
	at scala.runtime.java8.JFunction1$mcDI$sp.apply(JFunction1$mcDI$sp.java:23)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.$anonfun$getOutputAttrGroupFromData$2(OneHotEncoder.scala:520)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:219)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:219)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1429)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
# Inspect transformed data
df_transfrom.limit(5).toPandas()

In [None]:
# Select only features and label index for the final dataframe
df_transform_fin = df_transform.select('features', 'label_index')
df_transform_fin.limit(5).toPandas()

In [None]:
#spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [None]:
# Read the weather and the accident df

df_weather = spark.read\
    .options(header=True, inferSchema=True)\
    .csv('datasets/hourly_weather.csv')
df_accidents = spark.read\
    .options(header=True, inferSchema=True)\
    .csv('datasets/accidents.csv')

In [None]:
df_accidents = df_accidents.drop('AccidentUID', 'AccidentType_de', 'AccidentType_fr', 'AccidentType_it',\
                                  'AccidentType_en',\
                                  'AccidentSeverityCategory_de', 'AccidentSeverityCategory_fr',\
                                  'AccidentSeverityCategory_it', 'AccidentSeverityCategory_en',\
                                  'RoadType_de', 'RoadType_fr', 'RoadType_it', 'RoadType_en',\
                                  'AccidentLocation_CHLV95_E', 'AccidentLocation_CHLV95_N', 'CantonCode',\
                                  'MunicipalityCode', 'AccidentYear', 'AccidentMonth', 'AccidentMonth_de',\
                                  'AccidentMonth_fr', 'AccidentMonth_it', 'AccidentMonth_en', 'AccidentWeekDay',\
                                  'AccidentWeekDay_de', 'AccidentWeekDay_fr', 'AccidentWeekDay_it',\
                                  'AccidentWeekDay_en', 'AccidentHour', 'AccidentHour_text', 'day', 'RoadType',\
                                  'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle')

In [None]:
df_accidents.show(5)

In [None]:
# index accident type, accident severity and week day

type_indexer = StringIndexer(inputCol='AccidentType', outputCol='typeIndex')
severity_indexer = StringIndexer(inputCol='AccidentSeverityCategory', outputCol='severityIndex')
# day_indexer = StringIndexer(inputCol='WeekDay', outputCol='weekDayIndex')

# One-hot encoder for accident type and severity

ohe_accident_type = OneHotEncoder(inputCol='typeIndex', outputCol='type_vec')
ohe_accident_severity = OneHotEncoder(inputCol='severityIndex', outputCol='severity_vec')

# Create pipeline and pass all stages
pipeline = Pipeline(stages=[type_indexer,
                           severity_indexer,
                           ohe_accident_type,
                           ohe_accident_severity])

In [None]:
# Apply pipeline

df_accidents_transformed = pipeline.fit(df_accidents).transform(df_accidents)
df_accidents_transformed.show(5)

In [None]:
df_weather.printSchema()

In [None]:
df_weather_transformed = df_weather.withColumnRenamed('air_temperature', 'air_temp')\
    .withColumnRenamed('water_temperature', 'water_temp')\
    .withColumnRenamed('wind_gust_max_10min', 'wind_gust')\
    .withColumnRenamed('wind_speed_avg_10min', 'avg_wind_speed')\
    .withColumnRenamed('wind_force_avg_10min', 'avg_wind_force')\
    .withColumnRenamed('barometric_pressure_qfe', 'pressure_qfe')
# df_weather_transformed.printSchema()

df_weather_transformed.show(5)

In [None]:
df = df_weather_transformed.join(df_accidents_transformed, how='left')
df.show(1)

In [None]:
df.printSchema()

In [None]:
df.filter('WeekDay is NULL').show(10)

In [None]:
#accidents_rdd.take(1)

In [None]:
df_accidents.columns

In [None]:
df_accidents_2.columns