**Using the Spark SQL API to run SQL queries**:

I defined two functions. The first, `spark_df_reader`, connects to the MySQL database through the JDBC driver and reads the requested tables into a dictionary called `dfs`, whose keys are the table names and whose values are the corresponding Spark dataframes. The second, `temporary_view_registrator`, registers each of those Spark dataframes (one per table in the database) as a temporary view, so that SQL queries can be passed in directly.


In [7]:
# findspark.init is called with the local Spark installation path before
# pyspark is imported (presumably pyspark is not otherwise importable here).
import findspark
findspark.init('/home/danial/spark-3.4.0-bin-hadoop3')
import pyspark 
import os
# The MySQL password is read from the environment rather than hard-coded;
# os.environ.get returns None when MYSQL_PASSWORD is not set.
password = os.environ.get('MYSQL_PASSWORD')

In [8]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new
# session with the application name "MySQL Session".
spark = (
    SparkSession.builder
    .appName("MySQL Session")
    .getOrCreate()
)

In [9]:
def spark_df_reader(database_name, table_names):
    """Read MySQL tables into Spark DataFrames over JDBC.

    Connects to ``database_name`` on ``localhost:3306`` as user ``root``,
    using the notebook-level ``spark`` session and ``password`` value.

    Args:
        database_name: Name of the MySQL database to connect to.
        table_names: List of table names to read from that database.

    Returns:
        A dict mapping each table name to the DataFrame read from it.
    """
    jdbc_url = f"jdbc:mysql://localhost:3306/{database_name}"
    connection_properties = {
        "user": "root",
        "password": password,
        "driver": "com.mysql.jdbc.Driver",
    }

    # One JDBC read per requested table, keyed by the table's own name.
    return {
        table: spark.read.jdbc(
            url=jdbc_url,
            table=table,
            properties=connection_properties,
        )
        for table in table_names
    }

In [10]:
# I need to register my Spark dataframes (one per table in the database) as a temporary view to be able to pass in direct SQL queries 

def temporary_view_registrator(dfs):
    """Register every DataFrame in ``dfs`` as a Spark temporary view.

    Each view is created (or replaced, if a view with that name already
    exists) under the same name as its table, so SQL passed to
    ``spark.sql`` can refer to the tables directly.

    Args:
        dfs: Dict whose keys are table names and whose values are the
            corresponding DataFrames.

    Returns:
        None. The function is called only for its side effect of creating
        the temporary views.
    """
    for table_name, df in dfs.items():
        df.createOrReplaceTempView(table_name)

### 1757 Recyclable and Low Fat Products

In [11]:
# Read the Products table of the Leetcode_Q_1757 database; dfs maps table name -> DataFrame.
dfs = spark_df_reader('Leetcode_Q_1757', ['Products'])

In [12]:
# Print the rows of the Products DataFrame held in dfs.
dfs['Products'].show()

+----------+--------+----------+
|product_id|low_fats|recyclable|
+----------+--------+----------+
|         0|       Y|         N|
|         1|       Y|         Y|
|         2|       N|         Y|
|         3|       Y|         Y|
|         4|       N|         N|
+----------+--------+----------+



In [13]:
# Register each DataFrame in dfs as a temp view named after its table, so spark.sql can query it.
temporary_view_registrator(dfs)

In [14]:
# Products that are both low fat and recyclable.
# The flags are stored as upper-case 'Y'/'N', and Spark SQL compares strings
# case-sensitively, so both literals must be upper-case. A lower-case 'y'
# presumably only matches when the filter is pushed down to MySQL and
# evaluated under a case-insensitive collation.
spark.sql("""
SELECT product_id
FROM Products
WHERE low_fats = 'Y' AND recyclable = 'Y'
""").show()

+----------+
|product_id|
+----------+
|         1|
|         3|
+----------+



### 1350 Students With Invalid Departments

In [16]:
# Read the Departments and Students tables of Leetcode_Q_1350 and register
# each one as a temp view named after its table.
dfs = spark_df_reader('Leetcode_Q_1350', ['Departments', 'Students'])
temporary_view_registrator(dfs)

In [31]:
# Students whose department_id has no matching row in Departments: the
# LEFT JOIN keeps every student, and `d.id IS NULL` keeps only the unmatched.
spark.sql("""
SELECT 
    s.id, s.name
FROM Students s 
LEFT JOIN Departments d
    ON d.id = s.department_id
WHERE d.id IS NULL
""").show()

+---+-------+
| id|   name|
+---+-------+
|  4|Jasmine|
|  7| Daiana|
|  2|   John|
|  3|  Steve|
+---+-------+



In [30]:
# Alternative solution: anti-join expressed with NOT EXISTS.
# NOT EXISTS is used instead of NOT IN because NOT IN is null-unsafe: a NULL
# department_id (or a NULL id in Departments) makes the predicate evaluate to
# NULL and silently drops rows, which would disagree with the LEFT JOIN
# version of this query.
spark.sql("""
SELECT s.id, s.name
FROM Students s
WHERE NOT EXISTS (
    SELECT 1
    FROM Departments d
    WHERE d.id = s.department_id
)
""").show()

+---+-------+
| id|   name|
+---+-------+
|  2|   John|
|  4|Jasmine|
|  3|  Steve|
|  7| Daiana|
+---+-------+



### 1303 Find the Team Size

In [32]:
# Read the Employee table of Leetcode_Q_1303 and register it as a temp view
# named after the table.
dfs = spark_df_reader('Leetcode_Q_1303', ['Employee'])
temporary_view_registrator(dfs)

In [40]:
# Team size per employee: the window COUNT over team_id attaches the number
# of rows in the employee's team to every row, without collapsing the rows.
spark.sql("""
SELECT 
    employee_id,
    COUNT(employee_id) OVER(PARTITION BY team_id)AS team_size
FROM Employee
""").show()

+-----------+---------+
|employee_id|team_size|
+-----------+---------+
|          4|        1|
|          1|        3|
|          2|        3|
|          3|        3|
|          5|        2|
|          6|        2|
+-----------+---------+



### 1741 Find Total Time Spent by Each Employee

In [41]:
# Read the Employees table of Leetcode_Q_1741 and register it as a temp view
# named after the table.
dfs = spark_df_reader('Leetcode_Q_1741', ['Employees'])
temporary_view_registrator(dfs)

In [48]:
# Total time each employee spent in the office per day.
# A plain GROUP BY yields one row per (event_day, emp_id) directly, instead
# of computing a window SUM on every row and de-duplicating with DISTINCT.
spark.sql("""
SELECT
    event_day AS day,
    emp_id,
    SUM(out_time - in_time) AS total_time
FROM Employees
GROUP BY event_day, emp_id
""").show()

+----------+------+----------+
|       day|emp_id|total_time|
+----------+------+----------+
|2020-12-09|     2|        27|
|2020-11-28|     1|       173|
|2020-11-28|     2|        30|
|2020-12-03|     1|        41|
+----------+------+----------+

