In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import types
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [2]:
# Spark configuration for a local run: loads the GCS connector jar and turns on
# service-account authentication using the JSON key file below.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('test')
    .set("spark.jars", "/usr/local/Cellar/apache-spark/3.2.1/libexec/jars/gcs-connector-hadoop3-latest.jar")
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", "/Users/amey/google/credentials/google_credentials.json")
)

In [3]:
sc = SparkContext(conf=conf)

# Register the GCS filesystem implementations and the service-account
# credentials on the context's Hadoop configuration.
_hadoop_conf = sc._jsc.hadoopConfiguration()
for _key, _value in (
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.auth.service.account.json.keyfile", "/Users/amey/google/credentials/google_credentials.json"),
    ("fs.gs.auth.service.account.enable", "true"),
):
    _hadoop_conf.set(_key, _value)

In [4]:
# Build (or reuse) a SparkSession that carries the running context's configuration.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

In [5]:
# Crashes dataset: read the raw parquet files from GCS and keep only the fields used downstream.
crashes_columns = [
    'CRASH_RECORD_ID',
    'RD_NO',
    'CRASH_DATE',
    'POSTED_SPEED_LIMIT',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'FIRST_CRASH_TYPE',
    'STREET_NO',
    'STREET_DIRECTION',
    'STREET_NAME',
    'INJURIES_TOTAL',
    'INJURIES_FATAL',
]
df_crashes_parquet = (
    spark.read
    .parquet('gs://dtc_capstone_data_quiet-rigging-347402/raw/crash/*.parquet')
    .select(crashes_columns)
)

In [6]:
# Vehicles dataset: read the raw parquet files from GCS and keep only the fields used downstream.
vehicles_columns = [
    'CRASH_UNIT_ID',
    'CRASH_RECORD_ID',
    'RD_NO',
    'CRASH_DATE',
    'UNIT_NO',
    'UNIT_TYPE',
    'NUM_PASSENGERS',
    'VEHICLE_ID',
    'CMRC_VEH_I',
    'MAKE',
    'MODEL',
]
df_vehicles_parquet = (
    spark.read
    .parquet('gs://dtc_capstone_data_quiet-rigging-347402/raw/vehicles/*.parquet')
    .select(vehicles_columns)
)

In [7]:
# Number the units of each crash 1, 2, 3, ... (ordered by CRASH_UNIT_ID within a
# CRASH_RECORD_ID) and store the result in the veh_seq_nbr column.
windowSpec = (
    Window
    .partitionBy("CRASH_RECORD_ID")
    .orderBy("CRASH_UNIT_ID")
)
df_vehicles_parquet = df_vehicles_parquet.withColumn(
    "veh_seq_nbr",
    row_number().over(windowSpec),
)

In [8]:
# Expose both DataFrames as temp views so the following transformations can be
# written in Spark SQL.
for _view_name, _frame in (
    ('vehicles', df_vehicles_parquet),
    ('crashes', df_crashes_parquet),
):
    _frame.createOrReplaceTempView(_view_name)

In [9]:
# Summarize the vehicles data to ONE row per crash (CRASH_RECORD_ID, RD_NO,
# CRASH_DATE) so it can be joined onto the crashes dataset.
#
# Output columns:
#   Cnt_Entities_Accid       - number of units (rows) recorded for the crash
#   Cnt_MotorVehicles_Accid  - number of those units that carry a VEHICLE_ID
#   VEHICLEn_ID/MAKE/MODEL   - CRASH_UNIT_ID, make and model of the unit whose
#                              veh_seq_nbr is n (n = 1..4); NULL if there is no
#                              such unit
#
# Implementation notes:
# * A single conditional aggregation replaces the previous four self-joins and
#   two-level GROUP BY. The old inner query counted per unit and the outer query
#   then grouped BY those counts, so the counts were not per-crash totals and a
#   crash could come out as several rows.
# * The old joins also required a.VEHICLE_ID = b.VEHICLE_ID, which never matches
#   when VEHICLE_ID is NULL, so units without a vehicle id lost their slot.
#   Slots are now filled purely by veh_seq_nbr.
# * MAKE / MODEL fall back to UNIT_TYPE when missing. Blank strings are treated
#   as missing too, because the sample rows in this notebook show MAKE='' and
#   MODEL='' for units without a vehicle.
df_vehicle_summ = spark.sql("""
SELECT
    CRASH_RECORD_ID,
    RD_NO,
    CRASH_DATE,

    COUNT(CRASH_UNIT_ID) AS Cnt_Entities_Accid,
    COUNT(VEHICLE_ID) AS Cnt_MotorVehicles_Accid,

    MAX(CASE WHEN veh_seq_nbr = 1 THEN CRASH_UNIT_ID END) AS VEHICLE1_ID,
    MAX(CASE WHEN veh_seq_nbr = 1 THEN
            CASE WHEN MAKE IS NULL OR TRIM(MAKE) = '' THEN UNIT_TYPE ELSE MAKE END
        END) AS VEHICLE1_MAKE,
    MAX(CASE WHEN veh_seq_nbr = 1 THEN
            CASE WHEN MODEL IS NULL OR TRIM(MODEL) = '' THEN UNIT_TYPE ELSE MODEL END
        END) AS VEHICLE1_MODEL,

    MAX(CASE WHEN veh_seq_nbr = 2 THEN CRASH_UNIT_ID END) AS VEHICLE2_ID,
    MAX(CASE WHEN veh_seq_nbr = 2 THEN
            CASE WHEN MAKE IS NULL OR TRIM(MAKE) = '' THEN UNIT_TYPE ELSE MAKE END
        END) AS VEHICLE2_MAKE,
    MAX(CASE WHEN veh_seq_nbr = 2 THEN
            CASE WHEN MODEL IS NULL OR TRIM(MODEL) = '' THEN UNIT_TYPE ELSE MODEL END
        END) AS VEHICLE2_MODEL,

    MAX(CASE WHEN veh_seq_nbr = 3 THEN CRASH_UNIT_ID END) AS VEHICLE3_ID,
    MAX(CASE WHEN veh_seq_nbr = 3 THEN
            CASE WHEN MAKE IS NULL OR TRIM(MAKE) = '' THEN UNIT_TYPE ELSE MAKE END
        END) AS VEHICLE3_MAKE,
    MAX(CASE WHEN veh_seq_nbr = 3 THEN
            CASE WHEN MODEL IS NULL OR TRIM(MODEL) = '' THEN UNIT_TYPE ELSE MODEL END
        END) AS VEHICLE3_MODEL,

    MAX(CASE WHEN veh_seq_nbr = 4 THEN CRASH_UNIT_ID END) AS VEHICLE4_ID,
    MAX(CASE WHEN veh_seq_nbr = 4 THEN
            CASE WHEN MAKE IS NULL OR TRIM(MAKE) = '' THEN UNIT_TYPE ELSE MAKE END
        END) AS VEHICLE4_MAKE,
    MAX(CASE WHEN veh_seq_nbr = 4 THEN
            CASE WHEN MODEL IS NULL OR TRIM(MODEL) = '' THEN UNIT_TYPE ELSE MODEL END
        END) AS VEHICLE4_MODEL

FROM vehicles
GROUP BY CRASH_RECORD_ID, RD_NO, CRASH_DATE
""")

In [10]:
# Register the vehicles summary DataFrame as the temp view 'vehicles_summ' so the
# next Spark SQL query can join it to the 'crashes' temp view.
df_vehicle_summ.createOrReplaceTempView('vehicles_summ')

In [11]:
# Build the final dataset: every crash record, enriched with calendar fields
# derived from CRASH_DATE and left-joined to the per-crash vehicle summary
# ('vehicles_summ'). SELECT DISTINCT removes fully identical output rows (the
# same effect as the former GROUP BY over all 31 output columns).
#
# Notes on the derived date fields:
# * CRASH_TS is a parsed timestamp used only for the derivations; CRASH_DATE is
#   output and joined unchanged. The sample vehicles rows in this notebook show
#   CRASH_DATE as a 'MM/dd/yyyy hh:mm:ss a' string; this assumes the crashes
#   data may use the same representation (TODO confirm). The COALESCE falls
#   back to a plain cast for any other representation.
# * DAY_OF_WEEK uses dayofweek(), which Spark documents as 1 = Sunday ...
#   7 = Saturday. The former CASE compared EXTRACT(DOW ...) against 0..6, which
#   shifted every label by one day and left Saturday NULL.
# * YEAR_MONTH is formatted as 'yyyy_MM' (e.g. 2020_08) instead of concatenating
#   two full truncated timestamps.
df_final_summ = spark.sql("""
WITH crashes_ts AS (
    SELECT
        *,
        COALESCE(to_timestamp(CRASH_DATE, 'MM/dd/yyyy hh:mm:ss a'),
                 CAST(CRASH_DATE AS TIMESTAMP)) AS CRASH_TS
    FROM crashes
)
SELECT DISTINCT
    A.CRASH_RECORD_ID,
    A.RD_NO,
    A.CRASH_DATE,
    date_trunc('month', A.CRASH_TS) AS CRASH_MONTH,
    date_trunc('year', A.CRASH_TS) AS CRASH_YEAR,
    date_format(A.CRASH_TS, 'yyyy_MM') AS YEAR_MONTH,
    CASE dayofweek(A.CRASH_TS)
         WHEN 1 THEN 'SUNDAY'
         WHEN 2 THEN 'MONDAY'
         WHEN 3 THEN 'TUESDAY'
         WHEN 4 THEN 'WEDNESDAY'
         WHEN 5 THEN 'THURSDAY'
         WHEN 6 THEN 'FRIDAY'
         WHEN 7 THEN 'SATURDAY'
         END AS DAY_OF_WEEK,
    EXTRACT(HOUR FROM A.CRASH_TS) AS HOUR_OF_DAY,
    A.POSTED_SPEED_LIMIT,
    A.WEATHER_CONDITION,
    A.LIGHTING_CONDITION,
    A.FIRST_CRASH_TYPE,
    A.STREET_NO,
    A.STREET_DIRECTION,
    A.STREET_NAME,
    A.INJURIES_TOTAL,
    A.INJURIES_FATAL,
    B.CNT_ENTITIES_ACCID,
    B.Cnt_MOTORVEHICLES_ACCID,
    B.VEHICLE1_ID,
    B.VEHICLE1_MAKE,
    B.VEHICLE1_MODEL,
    B.VEHICLE2_ID,
    B.VEHICLE2_MAKE,
    B.VEHICLE2_MODEL,
    B.VEHICLE3_ID ,
    B.VEHICLE3_MAKE,
    B.VEHICLE3_MODEL ,
    B.VEHICLE4_ID ,
    B.VEHICLE4_MAKE,
    B.VEHICLE4_MODEL

FROM crashes_ts A

LEFT JOIN vehicles_summ B
    ON A.CRASH_RECORD_ID = B.CRASH_RECORD_ID
    and A.CRASH_DATE = B.CRASH_DATE
    and A.RD_NO = B.RD_NO

""")

In [13]:
# Save the final summary as a local parquet dataset so it can be re-read by the
# Spark session that uploads it to BigQuery.
# mode("overwrite") makes this cell re-runnable: the default save mode raises an
# error when the output path already exists (i.e. on every run after the first).
df_final_summ.write.mode("overwrite").parquet("/Users/Amey/Data_Engineering/dtc_capstone_proj/output/crashesdata.parquet")

In [14]:
# Stop the SparkContext created above before a new session is configured below.
sc.stop()

SyntaxError: invalid syntax (<ipython-input-15-21859f82d531>, line 1)

In [19]:
#Establishing connection with BQ
#
# NOTE(review): 'spark.jars' appears to be applied to the driver classpath only
# when the Spark JVM is launched. If this cell runs in a kernel where a
# SparkContext was already started (even if it has since been stopped), the
# connector classes may not be visible, which matches the
# "Failed to find data source: bigquery" error. Run this cell in a freshly
# restarted kernel (TODO confirm). The import is kept local so the cell works
# on its own after a restart.
#
# Jars:
# * The BigQuery connector must match Spark's Scala version. The install path
#   indicates Spark 3.2.1, whose standard builds use Scala 2.12, so the
#   `_2.12` artifact is used instead of `_2.11` (make sure this jar exists at
#   the path below).
# * The GCS connector stays on the classpath because the BigQuery write further
#   down stages its data in a GCS bucket (the temporaryGcsBucket option).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('1.2. BigQuery Storage & Spark SQL - Python')
    .config(
        'spark.jars',
        ','.join([
            '/usr/local/Cellar/apache-spark/3.2.1/libexec/jars/spark-bigquery-with-dependencies_2.12-0.24.2.jar',
            '/usr/local/Cellar/apache-spark/3.2.1/libexec/jars/gcs-connector-hadoop3-latest.jar',
        ]),
    )
    # Same GCS filesystem / service-account settings as the first Spark context,
    # passed through the spark.hadoop.* prefix.
    .config('spark.hadoop.fs.AbstractFileSystem.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS')
    .config('spark.hadoop.fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
    .config('spark.hadoop.google.cloud.auth.service.account.enable', 'true')
    .config('spark.hadoop.google.cloud.auth.service.account.json.keyfile', '/Users/amey/google/credentials/google_credentials.json')
    .getOrCreate()
)

# Re-load the summary that was written to local parquet by the earlier cell.
df_final_summ = spark.read.parquet("/Users/Amey/Data_Engineering/dtc_capstone_proj/output/crashesdata.parquet")

    

In [22]:
# Update to your GCS bucket (used by the connector as the temporary staging bucket)
gcs_bucket = 'dtc_capstone_data_quiet-rigging-347402'

# Update to your BigQuery dataset name you created
bq_dataset = 'crashes_summary'

# Enter BigQuery table name you want to create or overwrite.
# If the table does not exist it will be created when you run the write function
bq_table = 'crashes_summary_clean'

# mode("overwrite") implements the "create or overwrite" behavior described
# above; without it the default save mode raises an error when the table
# already exists.
(
    df_final_summ.write
    .format("bigquery")
    .mode("overwrite")
    .option("table", "{}.{}".format(bq_dataset, bq_table))
    .option("temporaryGcsBucket", gcs_bucket)
    .save()
)

Py4JJavaError: An error occurred while calling o339.save.
: java.lang.ClassNotFoundException: 
Failed to find data source: bigquery. Please find packages at
http://spark.apache.org/third-party-projects.html
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedToFindDataSourceError(QueryExecutionErrors.scala:443)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:670)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:720)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:852)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:256)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.ClassNotFoundException: bigquery.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:471)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:589)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:656)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:656)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:656)
	... 16 more


Row(CRASH_UNIT_ID=829999, CRASH_RECORD_ID='24ddf9fd8542199d832e1c223cc474e5601b356f1d77a648bb792285d3438a8d5bd177787c5cc51d347da7b1effd5920cb5ff39ce97615a184266b0a7cb615ce', RD_NO='JD124535', CRASH_DATE='01/22/2020 06:25:00 AM', UNIT_NO=1, UNIT_TYPE='DRIVER', NUM_PASSENGERS=None, VEHICLE_ID=796949, CMRC_VEH_I='', MAKE='INFINITI', MODEL='UNKNOWN', veh_seq_nbr=1)

[Row(CRASH_RECORD_ID='26201e24dadcfecc3cfe69f10b7224a4df15f0baecf2e955ef8f8a161eedf196888c395ce9dd5fac43cd1d06f2111cd516400b90912744bd7bfaa0cb67ae3641', num_vehicles=2),
 Row(CRASH_RECORD_ID='cb4362a84a3105a0b6b758fb5272e58c85a58f01ee2e180765100255ca8f845e36f024605e97ab9dd35c7bd6b53169f48b2702da1c62289aa5cf047a507428e0', num_vehicles=4),
 Row(CRASH_RECORD_ID='dddf911ff30ead410b91b853eeea6950051afad37014ac3ea029729b6e8af868b904db194c3fa16a1a918b882772e60166d5c26196211597b006bcc9fb7d67a4', num_vehicles=2),
 Row(CRASH_RECORD_ID='c3b6d574ef99fd6240597d05f803a89dc0b202619d5b788ddf99eed81073ca2f6cada9d8359efacaac664afcd11598339ddd7b134e7cadb0c5810f826ae5f740', num_vehicles=2),
 Row(CRASH_RECORD_ID='07cf72f5dc21e8251606dcebc1fd7ff511d6a1808f97f9d260203c2fe4428ec0c27e73cafb17c2addd3b35af40129524d046d71754c1ecab7318746841f56667', num_vehicles=2),
 Row(CRASH_RECORD_ID='71cf510bf81b49034a82a35b8156c79afbbc2011a1233f334ac71456d57767bd424f1ae151f6121edf164223c55b8c06829d52417bd8feca0985bfcaa40de680

[Row(CRASH_UNIT_ID=940084, CRASH_RECORD_ID='cb4362a84a3105a0b6b758fb5272e58c85a58f01ee2e180765100255ca8f845e36f024605e97ab9dd35c7bd6b53169f48b2702da1c62289aa5cf047a507428e0', RD_NO='JD334931', CRASH_DATE='08/16/2020 06:30:00 PM', UNIT_NO=1, UNIT_TYPE='BICYCLE', NUM_PASSENGERS=None, VEHICLE_ID=None, CMRC_VEH_I='', MAKE='', MODEL='', veh_seq_nbr=1),
 Row(CRASH_UNIT_ID=940086, CRASH_RECORD_ID='cb4362a84a3105a0b6b758fb5272e58c85a58f01ee2e180765100255ca8f845e36f024605e97ab9dd35c7bd6b53169f48b2702da1c62289aa5cf047a507428e0', RD_NO='JD334931', CRASH_DATE='08/16/2020 06:30:00 PM', UNIT_NO=2, UNIT_TYPE='DRIVER', NUM_PASSENGERS=None, VEHICLE_ID=891177, CMRC_VEH_I='', MAKE='FORD', MODEL='MUSTANG', veh_seq_nbr=2),
 Row(CRASH_UNIT_ID=957468, CRASH_RECORD_ID='cb4362a84a3105a0b6b758fb5272e58c85a58f01ee2e180765100255ca8f845e36f024605e97ab9dd35c7bd6b53169f48b2702da1c62289aa5cf047a507428e0', RD_NO='JD334931', CRASH_DATE='08/16/2020 06:30:00 PM', UNIT_NO=2, UNIT_TYPE='BICYCLE', NUM_PASSENGERS=None, VEHIC

[Row(CRASH_RECORD_ID='cb4362a84a3105a0b6b758fb5272e58c85a58f01ee2e180765100255ca8f845e36f024605e97ab9dd35c7bd6b53169f48b2702da1c62289aa5cf047a507428e0', RD_NO='JD334931', CRASH_DATE='08/16/2020 06:30:00 PM', VEHICLE1_ID=None, VEHICLE1_MAKE=None, VEHICLE1_MODEL=None, VEHICLE2_ID=940086, VEHICLE2_MAKE='FORD', VEHICLE2_MODEL='MUSTANG', VEHICLE3_ID=None, VEHICLE3_MAKE=None, VEHICLE3_MODEL=None, VEHICLE4_ID=957469, VEHICLE4_MAKE='FORD', VEHICLE4_MODEL='MUSTANG')]