## Ingest raw files to table

In [3]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# conf = SparkConf().setMaster('local')
# sc = SparkContext(conf = conf)
# spark = SparkSession(sc)
spark = SparkSession.builder \
    .appName("view_creation") \
    .config("spark.sql.catalogImplementation", "hive") \
    .getOrCreate()
# spark = SparkSession.builder \ 
#     .master('loacl') \
#Sets the Spark master URL to connect to, “local” to run locally
#     .appName("view_creation") \ 
#Sets a name for the application, which will be shown in the Spark web UI.
#     .config("spark.sql.catalogImplementation", "hive") \ 
#config(SparkConf conf) :Sets a list of config options based on the given SparkConf.
#config(String key, boolean value) :Sets a config option.
#     .getOrCreate()
#Gets an existing SparkSession or, if there is no existing one, creates a new one based on the options set in this builder.

In [4]:
spark.sql('CREATE DATABASE IF NOT EXISTS f1_raw')

DataFrame[]

### csv

In [5]:
# spark.sql('DROP TABLE f1_raw.circuits')
spark.sql('CREATE TABLE IF NOT EXISTS f1_raw.circuits(circuitId INT,circuitRef STRING,name STRING,location STRING,country STRING,lat DOUBLE,lng DOUBLE,alt INT,url STRING) USING csv OPTIONS (path "file:/E:/unused/Udemy/Spark_practice/raw/raw_files/circuits.csv", header true)')

DataFrame[]

In [6]:
spark.sql('SELECT * FROM f1_raw.circuits;').show(2)

+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|circuitId| circuitRef|                name|    location|  country|     lat|    lng|alt|                 url|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|http://en.wikiped...|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|http://en.wikiped...|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
only showing top 2 rows



In [7]:
# spark.sql('DROP TABLE f1_raw.races')
spark.sql('CREATE TABLE IF NOT EXISTS f1_raw.races(raceId INT, year INT, round INT, circuitId INT, name STRING, date DATE, time STRING, url STRING) USING csv OPTIONS (path "file:/E:/unused/Udemy/Spark_practice/raw/raw_files/races.csv", header true)')

DataFrame[]

In [8]:
spark.sql('SELECT * FROM f1_raw.races;').show(2)

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
only showing top 2 rows



### single line json

In [9]:
# spark.sql('DROP TABLE IF EXISTS f1_raw.constructors')
spark.sql('CREATE TABLE IF NOT EXISTS f1_raw.constructors( constructorId INT, constructorRef STRING, name STRING, nationality STRING, url STRING) USING json OPTIONS(path "file:/E:/unused/Udemy/Spark_practice/raw/raw_files/constructors.json")')

DataFrame[]

In [10]:
spark.sql('SELECT * FROM f1_raw.constructors').show(2)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|                 url|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber|BMW Sauber|     German|http://en.wikiped...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 2 rows



### complex json (nested object)

In [11]:
# spark.sql('DROP TABLE IF EXISTS f1_raw.drivers')
spark.sql('CREATE TABLE IF NOT EXISTS f1_raw.drivers( driverId INT, driverRef STRING, number INT, code STRING, name STRUCT<forename: STRING, surname: STRING>, dob DATE, nationality STRING, url STRING) USING json OPTIONS (path "file:/E:/unused/Udemy/Spark_practice/raw/raw_files/drivers.json")')

DataFrame[]

In [12]:
spark.sql('SELECT * FROM f1_raw.drivers').show(2)

+--------+---------+------+----+-----------------+----------+-----------+--------------------+
|driverId|driverRef|number|code|             name|       dob|nationality|                 url|
+--------+---------+------+----+-----------------+----------+-----------+--------------------+
|       1| hamilton|    44| HAM|{Lewis, Hamilton}|1985-01-07|    British|http://en.wikiped...|
|       2| heidfeld|  null| HEI| {Nick, Heidfeld}|1977-05-10|     German|http://en.wikiped...|
+--------+---------+------+----+-----------------+----------+-----------+--------------------+
only showing top 2 rows



### single line json

In [13]:
# spark.sql('DROP TABLE IF EXISTS f1_raw.results')
spark.sql('CREATE TABLE IF NOT EXISTS f1_raw.results( resultId INT, raceId INT, driverId INT, constructorId INT, number INT,grid INT, position INT, positionText STRING, positionOrder INT, points INT, laps INT, time STRING, milliseconds INT, fastestLap INT, rank INT, fastestLapTime STRING, fastestLapSpeed FLOAT, statusId STRING) USING json OPTIONS(path "file:/E:/unused/Udemy/Spark_practice/raw/raw_files/results.json")')

DataFrame[]

In [14]:
spark.sql('SELECT * FROM f1_raw.results').show(2)

+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|resultId|raceId|driverId|constructorId|number|grid|position|positionText|positionOrder|points|laps|       time|milliseconds|fastestLap|rank|fastestLapTime|fastestLapSpeed|statusId|
+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|       1|    18|       1|            1|    22|   1|       1|           1|            1|    10|  58|1:34:50.616|     5690616|        39|   2|      1:27.452|          218.3|       1|
|       2|    18|       2|            2|     3|   5|       2|           2|            2|     8|  58|     +5.478|     5696094|        41|   3|      1:27.739|        217.586|       1|
+--------+------+--------+-------------+------+----+--------+------------+-------------+--

### multiline JSON

In [15]:
# spark.sql('DROP TABLE IF EXISTS f1_raw.pit_stops')
spark.sql('CREATE TABLE IF NOT EXISTS f1_raw.pit_stops( driverId INT, duration STRING, lap INT, milliseconds INT, raceId INT, stop INT, time STRING) USING json OPTIONS(path "file:/E:/unused/Udemy/Spark_practice/raw/raw_files/pit_stops.json", multiLine true) ')

DataFrame[]

In [16]:
spark.sql('SELECT * FROM f1_raw.pit_stops').show(2)

+--------+--------+---+------------+------+----+--------+
|driverId|duration|lap|milliseconds|raceId|stop|    time|
+--------+--------+---+------------+------+----+--------+
|     153|  26.898|  1|       26898|   841|   1|17:05:23|
|      30|  25.021|  1|       25021|   841|   1|17:05:52|
+--------+--------+---+------------+------+----+--------+
only showing top 2 rows



In [17]:
# spark.sql('DROP TABLE IF EXISTS f1_raw.lap_times;')
spark.sql('CREATE TABLE IF NOT EXISTS f1_raw.lap_times( raceId INT, driverId INT, lap INT, position INT, time STRING, milliseconds INT ) USING csv OPTIONS (path "file:/E:/unused/Udemy/Spark_practice/raw/raw_files/lap_times")')

DataFrame[]

In [18]:
spark.sql('SELECT * FROM f1_raw.lap_times').show(2)

+------+--------+---+--------+--------+------------+
|raceId|driverId|lap|position|    time|milliseconds|
+------+--------+---+--------+--------+------------+
|   841|      20|  1|       1|1:38.109|       98109|
|   841|      20|  2|       1|1:33.006|       93006|
+------+--------+---+--------+--------+------------+
only showing top 2 rows



In [19]:
# spark.sql('DROP TABLE IF EXISTS f1_raw.qualifying;')
spark.sql('CREATE TABLE IF NOT EXISTS f1_raw.qualifying( constructorId INT, driverId INT, number INT, position INT, q1 STRING, q2 STRING, q3 STRING, qualifyId INT, raceId INT) USING json OPTIONS (path "file:/E:/unused/Udemy/Spark_practice/raw/raw_files/qualifying", multiLine true)')

DataFrame[]

In [20]:
spark.sql('SELECT * FROM f1_raw.qualifying').show(2)

+-------------+--------+------+--------+--------+--------+--------+---------+------+
|constructorId|driverId|number|position|      q1|      q2|      q3|qualifyId|raceId|
+-------------+--------+------+--------+--------+--------+--------+---------+------+
|            1|       1|    22|       1|1:26.572|1:25.187|1:26.714|        1|    18|
|            2|       9|     4|       2|1:26.103|1:25.315|1:26.869|        2|    18|
+-------------+--------+------+--------+--------+--------+--------+---------+------+
only showing top 2 rows



In [21]:
spark.sql("DESCRIBE EXTENDED f1_raw.qualifying").show(truncate=False)

+----------------------------+-------------------------------------------------------------+-------+
|col_name                    |data_type                                                    |comment|
+----------------------------+-------------------------------------------------------------+-------+
|constructorId               |int                                                          |null   |
|driverId                    |int                                                          |null   |
|number                      |int                                                          |null   |
|position                    |int                                                          |null   |
|q1                          |string                                                       |null   |
|q2                          |string                                                       |null   |
|q3                          |string                                                       

In [22]:
spark.sql('SHOW TABLES IN f1_raw').show(truncate = False)

+---------+------------+-----------+
|namespace|tableName   |isTemporary|
+---------+------------+-----------+
|f1_raw   |circuits    |false      |
|f1_raw   |constructors|false      |
|f1_raw   |drivers     |false      |
|f1_raw   |lap_times   |false      |
|f1_raw   |pit_stops   |false      |
|f1_raw   |qualifying  |false      |
|f1_raw   |races       |false      |
|f1_raw   |results     |false      |
+---------+------------+-----------+

