In [15]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline

In [None]:
# Turn on eager evaluation so a DataFrame that is the last expression of a
# cell is rendered as a table. `conf.set` requires both a key and a value;
# the original call passed only the key and raised a TypeError.
# NOTE: this needs `spark` to exist, so run it after the cell that creates
# the Spark session.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [2]:
# Create (or reuse) a Spark session running on a local master.
# `getOrCreate()` keeps this cell safe to re-run: constructing
# `SparkContext('local')` a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` available for code that talks to the SparkContext directly.
sc = spark.sparkContext

In [3]:
# Load the hourly weather and the accident CSV files. For both files the
# first line is treated as the header and column types are inferred.
df_weather = spark.read.csv(
    'datasets/hourly_weather.csv', header=True, inferSchema=True
)
df_accidents = spark.read.csv(
    'datasets/accidents.csv', header=True, inferSchema=True
)

In [4]:
df_accidents = df_accidents.drop('AccidentUID', 'AccidentType_de', 'AccidentType_fr', 'AccidentType_it',\
                                  'AccidentType_en',\
                                  'AccidentSeverityCategory_de', 'AccidentSeverityCategory_fr',\
                                  'AccidentSeverityCategory_it', 'AccidentSeverityCategory_en',\
                                  'RoadType_de', 'RoadType_fr', 'RoadType_it', 'RoadType_en',\
                                  'AccidentLocation_CHLV95_E', 'AccidentLocation_CHLV95_N', 'CantonCode',\
                                  'MunicipalityCode', 'AccidentYear', 'AccidentMonth', 'AccidentMonth_de',\
                                  'AccidentMonth_fr', 'AccidentMonth_it', 'AccidentMonth_en', 'AccidentWeekDay',\
                                  'AccidentWeekDay_de', 'AccidentWeekDay_fr', 'AccidentWeekDay_it',\
                                  'AccidentWeekDay_en', 'AccidentHour', 'AccidentHour_text', 'day', 'RoadType',\
                                  'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle')

In [16]:
# Accident type: index the category strings, then one-hot encode the index.
type_indexer = StringIndexer(inputCol='AccidentType', outputCol='typeIndex')
ohe_accident_type = OneHotEncoder(inputCol='typeIndex', outputCol='type_vec')

# Accident severity: the same two-step encoding.
severity_indexer = StringIndexer(
    inputCol='AccidentSeverityCategory',
    outputCol='severityIndex',
)
ohe_accident_severity = OneHotEncoder(
    inputCol='severityIndex',
    outputCol='severity_vec',
)

# Both indexers come first because each encoder reads the index column
# produced by its indexer.
encoding_stages = [
    type_indexer,
    severity_indexer,
    ohe_accident_type,
    ohe_accident_severity,
]
pipeline = Pipeline(stages=encoding_stages)

In [17]:
# Fit the encoding pipeline on the accident data, then apply the fitted
# model to that same data to add the index and one-hot vector columns.
accident_pipeline_model = pipeline.fit(df_accidents)
df_accidents_transformed = accident_pipeline_model.transform(df_accidents)

# Preview the first five rows of the result.
df_accidents_transformed.show(5)

+---+------------+------------------------+-------------------+---------+-------------+--------------+-------------+
|_c0|AccidentType|AccidentSeverityCategory|               date|typeIndex|severityIndex|      type_vec| severity_vec|
+---+------------+------------------------+-------------------+---------+-------------+--------------+-------------+
|  0|         at0|                     as4|2011-01-01 00:30:00|      0.0|          0.0|(10,[0],[1.0])|(3,[0],[1.0])|
|  1|         at0|                     as3|2011-01-01 01:30:00|      0.0|          1.0|(10,[0],[1.0])|(3,[1],[1.0])|
|  2|         at0|                     as4|2011-01-01 02:30:00|      0.0|          0.0|(10,[0],[1.0])|(3,[0],[1.0])|
|  3|         at5|                     as3|2011-01-01 02:30:00|      7.0|          1.0|(10,[7],[1.0])|(3,[1],[1.0])|
|  4|         at0|                     as4|2011-01-01 03:30:00|      0.0|          0.0|(10,[0],[1.0])|(3,[0],[1.0])|
+---+------------+------------------------+-------------------+-

In [18]:
# Display the column names of the weather DataFrame as loaded from the CSV.
df_weather.columns

['_c0',
 'date',
 'air_temperature',
 'water_temperature',
 'wind_gust_max_10min',
 'wind_speed_avg_10min',
 'wind_force_avg_10min',
 'wind_direction',
 'windchill',
 'barometric_pressure_qfe',
 'precipitation',
 'dew_point',
 'global_radiation',
 'humidity',
 'water_level']

In [21]:
# Shorter names for the verbose weather columns (original name -> new name).
weather_column_renames = {
    'air_temperature': 'air_temp',
    'water_temperature': 'water_temp',
    'wind_gust_max_10min': 'wind_gust',
    'wind_speed_avg_10min': 'avg_wind_speed',
    'wind_force_avg_10min': 'avg_wind_force',
    'barometric_pressure_qfe': 'pressure_qfe',
}

# Apply the renames one at a time; `df_weather` itself is left untouched.
df_weather_transformed = df_weather
for original_name, short_name in weather_column_renames.items():
    df_weather_transformed = df_weather_transformed.withColumnRenamed(
        original_name, short_name
    )

# Preview the first five rows with the new column names.
df_weather_transformed.show(5)

+---+-------------------+-----------------+-----------------+---------+------------------+------------------+--------------+------------------+-----------------+-------------+------------------+----------------+-----------------+-----------+
|_c0|               date|         air_temp|       water_temp|wind_gust|    avg_wind_speed|    avg_wind_force|wind_direction|         windchill|     pressure_qfe|precipitation|         dew_point|global_radiation|         humidity|water_level|
+---+-------------------+-----------------+-----------------+---------+------------------+------------------+--------------+------------------+-----------------+-------------+------------------+----------------+-----------------+-----------+
|  0|2011-01-01 00:30:00|2.233333333333334|              5.2|      2.4|1.2166666666666668|1.2166666666666668|          1785|2.2000000000000006|974.5500000000001|          0.0|1.6166666666666665|             0.5|95.83333333333333|     67.635|
|  1|2011-01-01 01:30:00|       

In [None]:
# Convert both DataFrames to RDDs whose elements are plain tuples.
# NOTE(review): these are built from `df_weather` and `df_accidents`, not the
# `*_transformed` frames; confirm that is intended.
weather_rdd, accidents_rdd = (
    frame.rdd.map(tuple) for frame in (df_weather, df_accidents)
)

# Quick check (uncomment to inspect the first record):
# weather_rdd.take(1)

In [None]:
#accidents_rdd.take(1)

In [None]:
# Display the column names currently held by the accident DataFrame.
df_accidents.columns

In [None]:
# Display the column names of the encoded accident DataFrame.
# NOTE(review): the original referenced `df_accidents_2`, which is not
# defined anywhere in this notebook and raises NameError on a fresh run;
# `df_accidents_transformed` is assumed to be the intended frame. Confirm.
df_accidents_transformed.columns