In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Build one SparkConf and pass it to the session builder, so the app name, master
# and catalog setting are all supplied through a single entry point.
# Previously a SparkContext named 'DB_creation' was started first and the builder
# then requested a different app name ("example"); an already-running context
# keeps its original name, so the two settings disagreed. Going through
# getOrCreate() also lets this cell be re-run in a live kernel without
# attempting to start a second SparkContext.
conf = SparkConf().setAppName('DB_creation').setMaster('local')
spark = (
    SparkSession.builder
    .config(conf=conf)
    # Select the Hive catalog implementation for the databases/tables used below.
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
)
# Keep `sc` available under the name the notebook already uses.
sc = spark.sparkContext

In [2]:
# Create the `demo` database unless it is already present, then print the
# result DataFrame returned by the statement.
create_db_result = spark.sql('CREATE DATABASE IF NOT EXISTS demo')
create_db_result.show()

++
||
++
++



In [3]:
# List the databases known to the current catalog.
databases_df = spark.sql('SHOW DATABASES')
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|     demo|
+---------+



In [4]:
# Print the extended description (name, comment, location, owner, properties)
# of the `demo` database.
demo_db_info = spark.sql('DESCRIBE DATABASE EXTENDED demo')
demo_db_info.show()

+--------------+--------------------+
|     info_name|          info_value|
+--------------+--------------------+
|Namespace Name|                demo|
|       Comment|                    |
|      Location|file:/E:/unused/U...|
|         Owner|                user|
|    Properties|                    |
+--------------+--------------------+



In [5]:
# Load the race-result dataset from parquet. Parquet files carry their own
# schema, so the CSV-style `inferSchema` option that used to be passed here had
# no meaning for this reader and has been dropped.
race_result_df = spark.read.parquet(r'E:\unused\Udemy\Spark_practice\raw\presentation\race_result')
# Preview the first two rows only.
race_result_df.show(2)

+---------+-----------------+-------------------+----------------+--------------+-------------+------------------+-------+----+-----------+---------+------+--------+
|race_year|        race_name|          race_date|circuit_location|   driver_name|driver_number|driver_nationality|   team|grid|fastest_lap|race_time|points|position|
+---------+-----------------+-------------------+----------------+--------------+-------------+------------------+-------+----+-----------+---------+------+--------+
|     1974|German Grand Prix|1974-08-04 00:00:00|         Nürburg|Clay Regazzoni|         null|             Swiss|Ferrari|   2|       null|1:41:35.0|   9.0|       1|
|     1974|German Grand Prix|1974-08-04 00:00:00|         Nürburg|Jody Scheckter|         null|     South African|Tyrrell|   4|       null|    +50.7|   6.0|       2|
+---------+-----------------+-------------------+----------------+--------------+-------------+------------------+-------+----+-----------+---------+------+--------+
only showing top 2 rows

### create managed/internal table

In [6]:
# Save the DataFrame as a parquet table in the `demo` database, replacing any
# existing copy. No explicit path is supplied for this table.
managed_writer = race_result_df.write.format('parquet').mode('overwrite')
managed_writer.saveAsTable('demo.race_result_tb_py')

In [7]:
# Make `demo` the current database for subsequent statements.
spark.sql('USE demo')
# Print the column definitions followed by the extended table metadata.
table_details_df = spark.sql('DESCRIBE TABLE EXTENDED demo.race_result_tb_py ')
table_details_df.show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|           race_year|                 int|   null|
|           race_name|              string|   null|
|           race_date|           timestamp|   null|
|    circuit_location|              string|   null|
|         driver_name|              string|   null|
|       driver_number|                 int|   null|
|  driver_nationality|              string|   null|
|                team|              string|   null|
|                grid|                 int|   null|
|         fastest_lap|                 int|   null|
|           race_time|              string|   null|
|              points|               float|   null|
|            position|                 int|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|                demo|       |
|           

In [8]:
# Create a second table via SQL (CREATE TABLE ... AS SELECT) from the table
# written with the DataFrame API. IF NOT EXISTS keeps the cell re-runnable when
# the table is already registered, matching the CREATE DATABASE IF NOT EXISTS
# statement used for the database.
spark.sql('CREATE TABLE IF NOT EXISTS demo.race_result_tb_sql AS SELECT * FROM demo.race_result_tb_py').show()

++
||
++
++



### create external table

In [9]:
# Save the DataFrame as a parquet table whose data lives at an explicit path
# (an external table, per this section's heading).
# mode('overwrite') mirrors the managed-table write: without it, saveAsTable
# uses its default save mode and raises once the table already exists, so the
# cell could not be re-run against a persisted metastore.
(
    race_result_df.write
    .mode('overwrite')
    .format('parquet')
    .option('path', r'E:\unused\Udemy\Spark_practice\raw\presentation\ext_tb_py')
    .saveAsTable('demo.race_result_tb_py_ext')
)

In [10]:
# Print each column's name, type and nullability for the loaded DataFrame.
race_result_df.printSchema()

root
 |-- race_year: integer (nullable = true)
 |-- race_name: string (nullable = true)
 |-- race_date: timestamp (nullable = true)
 |-- circuit_location: string (nullable = true)
 |-- driver_name: string (nullable = true)
 |-- driver_number: integer (nullable = true)
 |-- driver_nationality: string (nullable = true)
 |-- team: string (nullable = true)
 |-- grid: integer (nullable = true)
 |-- fastest_lap: integer (nullable = true)
 |-- race_time: string (nullable = true)
 |-- points: float (nullable = true)
 |-- position: integer (nullable = true)



In [11]:
# Register a parquet table at an explicit LOCATION using SQL DDL; the column
# list mirrors the DataFrame schema.
# IF NOT EXISTS keeps the cell re-runnable if the table is already registered,
# in line with the CREATE DATABASE IF NOT EXISTS statement used for the database.
# Only metadata is created for the table here.
spark.sql("""
    CREATE TABLE IF NOT EXISTS demo.race_result_tb_sql_ext (
        race_year INT,
        race_name STRING,
        race_date TIMESTAMP,
        circuit_location STRING,
        driver_name STRING,
        driver_number INT,
        driver_nationality STRING,
        team STRING,
        grid INT,
        fastest_lap INT,
        race_time STRING,
        points FLOAT,
        position INT
    )
    USING parquet
    LOCATION 'file:/E:/unused/Udemy/Spark_practice/raw/presentation/ext_tb_sql'
""")

DataFrame[]

In [12]:
# List the tables currently registered in the `demo` database.
demo_tables_df = spark.sql('SHOW TABLES IN demo')
demo_tables_df.show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|     demo|   race_result_tb_py|      false|
|     demo|race_result_tb_py...|      false|
|     demo|  race_result_tb_sql|      false|
|     demo|race_result_tb_sq...|      false|
+---------+--------------------+-----------+



In [13]:
# Fill the SQL-defined external table with the 2020 rows from the
# DataFrame-created external table.
# INSERT OVERWRITE (rather than INSERT INTO) keeps the cell idempotent: the
# files at an external table's location are not deleted when the table is
# dropped, so after a drop/re-create a plain INSERT INTO would append a second
# copy of the rows on every re-run.
spark.sql('INSERT OVERWRITE TABLE demo.race_result_tb_sql_ext SELECT * FROM demo.race_result_tb_py_ext WHERE race_year=2020')

DataFrame[]

In [14]:
# Count the rows now present in the SQL-defined external table.
ext_row_count_df = spark.sql('SELECT count(*) FROM demo.race_result_tb_sql_ext')
ext_row_count_df.show()

+--------+
|count(1)|
+--------+
|     340|
+--------+



### drop tables

In [15]:
# Drop the external table. IF EXISTS lets the cell be re-run after the table is
# already gone instead of failing.
spark.sql('DROP TABLE IF EXISTS demo.race_result_tb_sql_ext')
# Confirm the table no longer appears in the database.
spark.sql('SHOW TABLES IN demo').show()
# For an external table the table is dropped but its files are not deleted.

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|     demo|   race_result_tb_py|      false|
|     demo|race_result_tb_py...|      false|
|     demo|  race_result_tb_sql|      false|
+---------+--------------------+-----------+



<div style="max-width:1400px;margin: 0 auto">
<img src="images/external table.png" alt="External table: files remain after DROP TABLE" width="600"/>
</div>

In [17]:
# Drop the managed (internal) table. IF EXISTS lets the cell be re-run after
# the table is already gone instead of failing.
spark.sql('DROP TABLE IF EXISTS demo.race_result_tb_sql')
# Confirm the table no longer appears in the database.
spark.sql('SHOW TABLES IN demo').show()
# For an internal table the table is dropped and its files are deleted too.

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|     demo|   race_result_tb_py|      false|
|     demo|race_result_tb_py...|      false|
+---------+--------------------+-----------+



<div style="max-width:1400px;margin: 0 auto">
<img src="images/managed table.png" alt="Managed table: files are deleted by DROP TABLE" width="600"/>
</div>

In [2]:
# List the databases known to the current catalog.
all_databases_df = spark.sql('SHOW DATABASES')
all_databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|     demo|
|   f1_raw|
+---------+

