**To leverage Spark SQL API to make SQL queries**:

I defined two functions: one is spark_df_reader to connect to the MySQL database through jdbc Driver and read the tables as a dictionary called dfs whose keys are the names of the tables and the values are the corresponding dataframes. The second function is temporary_view_registrator to register the Spark dataframes (one per table in the database) as a temporary view to be able to pass in direct SQL queries.


In [1]:
import findspark
findspark.init('/home/danial/spark-3.4.0-bin-hadoop3')
import pyspark 
import os
password = os.environ.get('MYSQL_PASSWORD')

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("MySQL Session").getOrCreate()

In [4]:
def spark_df_reader(database_name, table_names ):
    
    # table_names is a list of table names in the database that I want to connect to 
    
    mysql_url = f"jdbc:mysql://localhost:3306/{database_name}"
    
    mysql_properties = {
    "user": "root",
    "password": password,
    "driver": "com.mysql.jdbc.Driver"
    }
    
    dfs = {}
    for one_table in table_names:
        
        df = spark.read.jdbc(url=mysql_url, table=one_table, properties=mysql_properties)

        dfs[one_table] = df
        
    return dfs 

In [5]:
# I need to register my Spark dataframes (one per table in the database) as a temporary view to be able to pass in direct SQL queries 

def temporary_view_registrator(dfs):
    
    # dfs is a dictionary whose keys are the name of tables and values are the corresponding dfs
    
    tem_views = []
    for one_table in list(dfs.keys()):
        dfs[one_table].createOrReplaceTempView(f"{one_table}")
        tem_views.append(f"{one_table}")
        
    return None # this functions returns nothing but creates temporary views with the same name as the tables

### 1757 Recyclable and Low Fat Products

In [11]:
dfs = spark_df_reader('Leetcode_Q_1757', ['Products'])

In [12]:
dfs['Products'].show()

+----------+--------+----------+
|product_id|low_fats|recyclable|
+----------+--------+----------+
|         0|       Y|         N|
|         1|       Y|         Y|
|         2|       N|         Y|
|         3|       Y|         Y|
|         4|       N|         N|
+----------+--------+----------+



In [13]:
temporary_view_registrator(dfs)

In [14]:
spark.sql("""
SELECT product_id
FROM Products
WHERE low_fats = 'y' AND recyclable = 'Y'

""").show()

+----------+
|product_id|
+----------+
|         1|
|         3|
+----------+



### 1350 Students With Invalid Departments

In [16]:
dfs = spark_df_reader('Leetcode_Q_1350', ['Departments', 'Students'])
temporary_view_registrator(dfs)

In [31]:
spark.sql("""
SELECT 
    s.id, s.name
FROM Students s 
LEFT JOIN Departments d
    ON d.id = s.department_id
WHERE d.id IS NULL
""").show()

+---+-------+
| id|   name|
+---+-------+
|  4|Jasmine|
|  7| Daiana|
|  2|   John|
|  3|  Steve|
+---+-------+



In [30]:
# better solution

spark.sql("""
SELECT id, name
FROM Students
WHERE department_id NOT IN (SELECT id FROM Departments)
""").show()

+---+-------+
| id|   name|
+---+-------+
|  2|   John|
|  4|Jasmine|
|  3|  Steve|
|  7| Daiana|
+---+-------+



### 1303 Find the Team Size

In [32]:
dfs = spark_df_reader('Leetcode_Q_1303', ['Employee'])
temporary_view_registrator(dfs)

In [40]:
spark.sql("""
SELECT 
    employee_id,
    COUNT(employee_id) OVER(PARTITION BY team_id)AS team_size
FROM Employee
""").show()

+-----------+---------+
|employee_id|team_size|
+-----------+---------+
|          4|        1|
|          1|        3|
|          2|        3|
|          3|        3|
|          5|        2|
|          6|        2|
+-----------+---------+



### 1741 Find Total Time Spent by Each Employee

In [41]:
dfs = spark_df_reader('Leetcode_Q_1741', ['Employees'])
temporary_view_registrator(dfs)

In [48]:
spark.sql("""
SELECT 
    DISTINCT event_day AS day,
    emp_id,
    SUM(out_time - in_time) OVER(PARTITION BY emp_id, event_day) AS total_time    
FROM Employees

""").show()

+----------+------+----------+
|       day|emp_id|total_time|
+----------+------+----------+
|2020-12-09|     2|        27|
|2020-11-28|     1|       173|
|2020-11-28|     2|        30|
|2020-12-03|     1|        41|
+----------+------+----------+



### 1821 Find Customers With Positive Revenue this Year

In [6]:
dfs = spark_df_reader('Leetcode_Q_1821', ['Customers'])
temporary_view_registrator(dfs)

In [7]:
spark.sql("""
SELECT customer_id
FROM Customers
WHERE revenue > 0 AND year = 2021
""").show()

+-----------+
|customer_id|
+-----------+
|          1|
|          4|
+-----------+



### 1571 Warehouse Manager

In [8]:
dfs = spark_df_reader('Leetcode_Q_1571', ['Warehouse', 'Products'])
temporary_view_registrator(dfs)

In [34]:
spark.sql("""
SELECT 
    DISTINCT name AS warehouse_name,
    SUM(Width * Length * Height * units) OVER(PARTITION BY name) AS volume
FROM Warehouse w
JOIN Products p
    USING (product_id)
""").show()

+--------------+------+
|warehouse_name|volume|
+--------------+------+
|      LCHouse2| 20250|
|      LCHouse1| 12250|
|      LCHouse3|   800|
+--------------+------+



### 2356 Number of Unique Subjects Taught by Each Teacher

In [35]:
dfs = spark_df_reader('Leetcode_Q_2356', ['Teacher'])
temporary_view_registrator(dfs)

In [42]:
spark.sql("""
SELECT 
    teacher_id,
    COUNT(DISTINCT subject_id) AS cnt
FROM Teacher
GROUP BY teacher_id
""").show()

+----------+---+
|teacher_id|cnt|
+----------+---+
|         1|  2|
|         2|  4|
+----------+---+



### 1693 Daily Leads and Partners


In [43]:
dfs = spark_df_reader('Leetcode_Q_1693', ['DailySales'])
temporary_view_registrator(dfs)

In [45]:
spark.sql("""
SELECT 
    date_id, 
    make_name,
    COUNT(DISTINCT lead_id) AS unique_leads,
    COUNT(DISTINCT partner_id) AS unique_partners
FROM DailySales
GROUP BY date_id, make_name
""").show()

+----------+---------+------------+---------------+
|   date_id|make_name|unique_leads|unique_partners|
+----------+---------+------------+---------------+
|2020-12-07|    honda|           3|              2|
|2020-12-08|   toyota|           2|              3|
|2020-12-08|    honda|           2|              2|
|2020-12-07|   toyota|           1|              2|
+----------+---------+------------+---------------+



### 2339 All the Matches of the League

In [7]:
dfs = spark_df_reader('Leetcode_Q_2339', ['Teams'])
temporary_view_registrator(dfs)

In [10]:
spark.sql("""
SELECT t.team_name AS home_team, tt.team_name AS away_team 
FROM Teams t
CROSS JOIN Teams tt 
WHERE t.team_name <> tt.team_name
""").show()

+-----------+-----------+
|  home_team|  away_team|
+-----------+-----------+
|Leetcode FC|    Ahly SC|
|Leetcode FC|Real Madrid|
|    Ahly SC|Leetcode FC|
|    Ahly SC|Real Madrid|
|Real Madrid|Leetcode FC|
|Real Madrid|    Ahly SC|
+-----------+-----------+



### 1683 Invalid Tweets

In [11]:
dfs = spark_df_reader('Leetcode_Q_1683', ['Tweets'])
temporary_view_registrator(dfs)

In [12]:
spark.sql("""
SELECT tweet_id
FROM Tweets
WHERE LENGTH(content) > 15

""").show()

+--------+
|tweet_id|
+--------+
|       2|
+--------+



### 1853 Convert Date Format


In [13]:
dfs = spark_df_reader('Leetcode_Q_1853', ['Days'])
temporary_view_registrator(dfs)

In [68]:
spark.sql("""
SELECT 
    DATE_FORMAT(day, 'E, M d, y') AS day
FROM Days
""").show()

+---------------+
|            day|
+---------------+
|Tue, 4 12, 2022|
| Mon, 8 9, 2021|
|Fri, 6 26, 2020|
+---------------+



### 1378 Replace Employee ID With The Unique Identifier


In [8]:
dfs = spark_df_reader('Leetcode_Q_1378', ['Employees', 'EmployeeUNI'])
temporary_view_registrator(dfs)

In [10]:
spark.sql("""
SELECT 
    ee.unique_id,
    e.name
FROM Employees e
LEFT JOIN EmployeeUNI ee
    USING (id)
""").show()

+---------+--------+
|unique_id|    name|
+---------+--------+
|     null|   Alice|
|        1|Jonathan|
|     null|     Bob|
|        3| Winston|
|        2|    Meir|
+---------+--------+



### 1623 All Valid Triplets That Can Represent a Country

In [7]:
dfs = spark_df_reader('Leetcode_Q_1623', ['SchoolA', 'SchoolB', 'SchoolC'])
temporary_view_registrator(dfs)

In [21]:
spark.sql("""

SELECT 
    a.student_name AS member_A,
    b.student_name AS member_B,
    c.student_name AS member_C
FROM SchoolA a
JOIN SchoolB b
    ON a.student_id <> b.student_id AND a.student_name <> b.student_name 
JOIN SchoolC c
    ON a.student_id <> c.student_id AND a.student_name <> c.student_name  AND
       c.student_id <> b.student_id AND c.student_name <> b.student_name 
""").show() 

+--------+--------+--------+
|member_A|member_B|member_C|
+--------+--------+--------+
|   Alice|     Tom|   Jerry|
|     Bob|     Tom|   Alice|
+--------+--------+--------+



### 1587 Bank Account Summary II

In [23]:
dfs = spark_df_reader('Leetcode_Q_1587', ['Users', 'Transactions'])
temporary_view_registrator(dfs)

In [29]:
spark.sql("""
SELECT 
    DISTINCT u.name,
    SUM(amount) AS balance
FROM Users u
JOIN Transactions t
    USING (account)
GROUP BY u.name
HAVING balance > 10000
""").show()

+-----+-------+
| name|balance|
+-----+-------+
|Alice|  11000|
+-----+-------+



### 2026 Low-Quality Problems

In [31]:
dfs = spark_df_reader('Leetcode_Q_2026', ['Problems'])
temporary_view_registrator(dfs)

In [42]:
spark.sql("""
SELECT 
    problem_id
FROM Problems
WHERE likes / (likes + dislikes) * 100 < 60 
GROUP BY problem_id
ORDER BY problem_id

""").show()

+----------+
|problem_id|
+----------+
|         7|
|        10|
|        11|
|        13|
+----------+



### 627 Swap Salary

In [43]:
dfs = spark_df_reader('Leetcode_Q_627', ['Salary'])
temporary_view_registrator(dfs)

In [49]:
spark.sql("""
SELECT id, name, CASE
                    WHEN sex = 'f' THEN 'm' ELSE 'f'
                END AS sex, salary
FROM Salary

""").show()

+---+----+---+------+
| id|name|sex|salary|
+---+----+---+------+
|  1|   A|  f|  2500|
|  2|   B|  m|  1500|
|  3|   C|  f|  5500|
|  4|   D|  m|   500|
+---+----+---+------+



### 1421 NPV Queries

In [8]:
dfs = spark_df_reader('Leetcode_Q_1421', ['NPV', 'Queries'])
temporary_view_registrator(dfs)

In [12]:
spark.sql("""
SELECT 
    q.id, q.year, IFNULL(n.npv, 0) AS npv
FROM Queries q
LEFT JOIN NPV n
USING (id, year)
""").show()


+---+----+---+
| id|year|npv|
+---+----+---+
|  7|2018|  0|
|  7|2020| 30|
|  2|2008|121|
|  1|2019|113|
|  7|2019|  0|
| 13|2019| 40|
|  3|2009| 21|
+---+----+---+



### 1777 Product's Price for Each Store

In [13]:
dfs = spark_df_reader('Leetcode_Q_1777', ['Products'])
temporary_view_registrator(dfs)

In [22]:
spark.sql("""
SELECT 
    product_id, 
    MAX(IF (store = 'store1', price, null)) AS store1,
    MAX(IF (store = 'store2', price, null)) AS store2,
    MAX(IF (store = 'store3', price, null)) AS store3
FROM Products
GROUP BY product_id 
""").show()

+----------+------+------+------+
|product_id|store1|store2|store3|
+----------+------+------+------+
|         1|    70|  null|    80|
|         0|    95|   100|   105|
+----------+------+------+------+



### 1565 Unique Orders and Customers Per Month

In [23]:
dfs = spark_df_reader('Leetcode_Q_1565', ['Orders'])
temporary_view_registrator(dfs)

In [30]:
spark.sql("""
SELECT
    LEFT(order_date, 7) AS month,
    COUNT(DISTINCT order_id) AS order_count,
    COUNT(DISTINCT customer_id) AS customer_count
FROM Orders
WHERE invoice > 20
GROUP BY month

""").show()

+-------+-----------+--------------+
|  month|order_count|customer_count|
+-------+-----------+--------------+
|2020-12|          2|             1|
|2020-09|          2|             2|
|2021-01|          1|             1|
|2020-10|          1|             1|
+-------+-----------+--------------+



### 1173 Immediate Food Delivery I

In [31]:
dfs = spark_df_reader('Leetcode_Q_1173', ['Delivery'])
temporary_view_registrator(dfs)

In [41]:
spark.sql("""
SELECT 
        ROUND(SUM(CASE 
                    WHEN order_date = customer_pref_delivery_date THEN 1 ELSE 0 END)/COUNT(*) * 100, 2) AS immediate_percentage
FROM Delivery
""").show()

+--------------------+
|immediate_percentage|
+--------------------+
|               33.33|
+--------------------+



### 613 Shortest Distance in a Line

In [7]:
dfs = spark_df_reader('Leetcode_Q_613 ', ['Point'])
temporary_view_registrator(dfs)

In [8]:
spark.sql("""
SELECT 
    MIN(ABS(p.x - pp.x)) AS shortest
FROM Point p
JOIN Point pp
    ON p.x <> pp.x
""").show()


+--------+
|shortest|
+--------+
|       1|
+--------+



### 2082 The Number of Rich Customers

In [9]:
dfs = spark_df_reader('Leetcode_Q_2082', ['Store'])
temporary_view_registrator(dfs)

In [10]:
spark.sql("""
SELECT 
    COUNT(DISTINCT customer_id)AS rich_count
FROM Store
WHERE amount > 500
""").show()

+----------+
|rich_count|
+----------+
|         2|
+----------+



### 1179 Reformat Department Table


In [16]:
dfs = spark_df_reader('Leetcode_Q_1179', ['Department'])
temporary_view_registrator(dfs)

In [30]:
spark.sql("""
SELECT 
    id,
    SUM(IF(month = 'Jan', revenue, NULL)) AS Jan_Revenue,
    SUM(IF(month = 'Feb', revenue, NULL)) AS Feb_Revenue,
    SUM(IF(month = 'Mar', revenue, NULL)) AS Mar_Revenue,
    SUM(IF(month = 'Apr', revenue, NULL)) AS Apr_Revenue,
    SUM(IF(month = 'May', revenue, NULL)) AS May_Revenue,
    SUM(IF(month = 'Jun', revenue, NULL)) AS Jun_Revenue,
    SUM(IF(month = 'Jul', revenue, NULL)) AS Jul_Revenue,
    SUM(IF(month = 'Aug', revenue, NULL)) AS Aug_Revenue,
    SUM(IF(month = 'Sep', revenue, NULL)) AS Sep_Revenue,
    SUM(IF(month = 'Oct', revenue, NULL)) AS Oct_Revenue,
    SUM(IF(month = 'Nov', revenue, NULL)) AS Nov_Revenue,
    SUM(IF(month = 'Dec', revenue, NULL)) AS Dec_Revenue
FROM Department
GROUP BY id
""").show()


+---+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
| id|Jan_Revenue|Feb_Revenue|Mar_Revenue|Apr_Revenue|May_Revenue|Jun_Revenue|Jul_Revenue|Aug_Revenue|Sep_Revenue|Oct_Revenue|Nov_Revenue|Dec_Revenue|
+---+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|  1|       8000|       7000|       6000|       null|       null|       null|       null|       null|       null|       null|       null|       null|
|  3|       null|      10000|       null|       null|       null|       null|       null|       null|       null|       null|       null|       null|
|  2|       9000|       null|       null|       null|       null|       null|       null|       null|       null|       null|       null|       null|
+---+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------

### 1581 Customer Who Visited but Did Not Make Any Transactions

In [7]:
dfs = spark_df_reader('Leetcode_Q_1581', ['Visits', 'Transactions'])
temporary_view_registrator(dfs)

In [15]:
spark.sql("""
SELECT customer_id,
        SUM(CASE 
            WHEN amount IS NULL THEN 1 ELSE 0 END ) AS count_no_trans
FROM Visits
LEFT JOIN Transactions
    USING (visit_id)
GROUP BY customer_id
HAVING count_no_trans <> 0 
""").show()

+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
|         54|             2|
|         96|             1|
|         30|             1|
+-----------+--------------+



### 2377 Sort the Olympic Table

In [7]:
dfs = spark_df_reader('Leetcode_Q_2377 ', ['Olympic'])
temporary_view_registrator(dfs)

In [8]:
spark.sql("""
SELECT *
FROM Olympic
ORDER BY 
  gold_medals DESC, 
  silver_medals DESC, 
  bronze_medals DESC, 
  country
""").show()

+-----------+-----------+-------------+-------------+
|    country|gold_medals|silver_medals|bronze_medals|
+-----------+-----------+-------------+-------------+
|      China|         10|           10|           20|
|        USA|         10|           10|           20|
|     Israel|          2|            2|            3|
|      Egypt|          2|            2|            2|
|South Sudan|          0|            0|            1|
+-----------+-----------+-------------+-------------+



### 1484 Group Sold Products By The Date


In [9]:
dfs = spark_df_reader('Leetcode_Q_1484', ['Activities'])
temporary_view_registrator(dfs)

In [19]:
spark.sql("""
SELECT 
    sell_date, 
    COUNT(DISTINCT product) AS num_sold, 
    CONCAT_WS(', ', SORT_ARRAY(COLLECT_LIST(DISTINCT product))) AS products
FROM Activities
GROUP BY sell_date
ORDER BY sell_date  
""").show()


# SELECT 
#     sell_date, 
#     COUNT(DISTINCT product) AS num_sold, 
#     GROUP_CONCAT(DISTINCT product) AS products
# FROM Activities
# GROUP BY sell_date
# ORDER BY sell_date 

# Apache Spark  doesn't natively support the GROUP_CONCAT() function that's available in MySQL. 
# In Spark SQL, I can achieve similar results using the collect_list() or collect_set() functions 
# along with the concat_ws() function. 

+----------+--------+--------------------+
| sell_date|num_sold|            products|
+----------+--------+--------------------+
|2020-05-30|       3|Basketball, Headp...|
|2020-06-01|       2|       Bible, Pencil|
|2020-06-02|       1|                Mask|
+----------+--------+--------------------+



### 1890 The Latest Login in 2020


In [22]:
dfs = spark_df_reader('Leetcode_Q_1890', ['Logins'])
temporary_view_registrator(dfs)

In [23]:
spark.sql("""
SELECT
    user_id,
    MAX(time_stamp) AS last_stamp 
FROM Logins
WHERE YEAR(time_stamp) = 2020
GROUP BY user_id
""").show()

+-------+-------------------+
|user_id|         last_stamp|
+-------+-------------------+
|      6|2020-06-30 15:06:07|
|      8|2020-12-30 00:46:50|
|      2|2020-01-16 02:49:50|
+-------+-------------------+



### 1251 Average Selling Price


In [24]:
dfs = spark_df_reader('Leetcode_Q_1251', ['Prices', 'UnitsSold'])
temporary_view_registrator(dfs)

In [26]:
spark.sql("""
SELECT
    p.product_id,
    ROUND(SUM(p.price * u.units) / SUM(units), 2) AS average_price
FROM Prices p
JOIN UnitsSold u
    ON p.product_id = u.product_id AND
    purchase_date BETWEEN start_date AND end_date
GROUP BY p.product_id""").show()

+----------+-------------+
|product_id|average_price|
+----------+-------------+
|         1|         6.96|
|         2|        16.96|
+----------+-------------+



### 1435 Create a Session Bar Chart

In [7]:
dfs = spark_df_reader('Leetcode_Q_1435', ['Sessions'])
temporary_view_registrator(dfs)

In [8]:
spark.sql("""
SELECT 
    '[0-5>' AS bin,
    SUM(CASE
        WHEN duration/60 >= 0 AND duration/60 < 5 THEN 1 ELSE 0
    END) AS total
FROM Sessions
UNION
SELECT 
    '[5-10>' AS bin,
    SUM(CASE
        WHEN duration/60 >= 5 AND duration/60 < 10 THEN 1 ELSE 0
    END) AS total
FROM Sessions
UNION
SELECT 
    '[10-15>' AS bin,
    SUM(CASE
        WHEN duration/60 >= 10 AND duration/60 < 15 THEN 1 ELSE 0
    END) AS total
FROM Sessions
UNION
SELECT 
    '15 or more' AS bin,
    SUM(CASE
        WHEN duration/60 >= 15 THEN 1 ELSE 0
    END) AS total
FROM Sessions
""").show()

[Stage 0:>                                                          (0 + 1) / 1][Stage 0:>                  (0 + 1) / 1][Stage 1:>                  (0 + 1) / 1]

+----------+-----+
|       bin|total|
+----------+-----+
|     [0-5>|    3|
|    [5-10>|    1|
|   [10-15>|    0|
|15 or more|    1|
+----------+-----+



                                                                                

### 1148 Article Views I


In [9]:
dfs = spark_df_reader('Leetcode_Q_1148', ['Views'])
temporary_view_registrator(dfs)

In [13]:
spark.sql("""
SELECT DISTINCT author_id AS id
FROM Views
WHERE author_id = viewer_id
ORDER BY id 
""").show()

+---+
| id|
+---+
|  4|
|  7|
+---+



### 2687. Bikes Last Time Used


In [8]:
dfs = spark_df_reader('Leetcode_Q_2687', ['Bikes'])
temporary_view_registrator(dfs)

In [9]:
spark.sql("""
SELECT *
FROM Bikes
""").show()



+-------+-----------+-------------------+-------------------+
|ride_id|bike_number|         start_time|           end_time|
+-------+-----------+-------------------+-------------------+
|      1|     W00576|2012-03-25 11:30:00|2012-03-25 12:40:00|
|      2|     W00300|2012-03-25 10:30:00|2012-03-25 10:50:00|
|      3|     W00455|2012-03-26 14:30:00|2012-03-26 17:40:00|
|      4|     W00455|2012-03-25 12:30:00|2012-03-25 13:40:00|
|      5|     W00576|2012-03-25 08:10:00|2012-03-25 09:10:00|
|      6|     W00576|2012-03-28 02:30:00|2012-03-28 02:50:00|
+-------+-----------+-------------------+-------------------+



                                                                                

In [12]:
spark.sql("""
SELECT 
    bike_number,
    end_time
FROM Bikes
WHERE end_time IN (
        SELECT MAX(end_time)
        FROM Bikes
        GROUP BY bike_number

)
ORDER BY end_time DESC

""").show()

+-----------+-------------------+
|bike_number|           end_time|
+-----------+-------------------+
|     W00576|2012-03-28 02:50:00|
|     W00455|2012-03-26 17:40:00|
|     W00300|2012-03-25 10:50:00|
+-----------+-------------------+



### 175. Combine Two Tables


In [14]:
dfs = spark_df_reader('Leetcode_Q_175', ['Person', 'Address'])
temporary_view_registrator(dfs)

In [15]:
spark.sql("""
SELECT 
    firstName,
    lastName,
    city,
    state
FROM Person p
LEFT JOIN Address a
    USING (personId)
    """).show()

+---------+--------+-------------+--------+
|firstName|lastName|         city|   state|
+---------+--------+-------------+--------+
|     Wang|   Allen|         null|    null|
|    Alice|     Bob|New York City|New York|
+---------+--------+-------------+--------+



### 511. Game Play Analysis I


In [16]:
dfs = spark_df_reader('Leetcode_Q_511', ['Activity'])
temporary_view_registrator(dfs)

In [17]:
spark.sql("""
SELECT 
    player_id,
    min(event_date) AS first_login
FROM Activity
GROUP BY player_id
""").show()

+---------+-----------+
|player_id|first_login|
+---------+-----------+
|        1| 2016-03-01|
|        3| 2016-03-02|
|        2| 2017-06-25|
+---------+-----------+



### 1082. Sales Analysis I


In [18]:
dfs = spark_df_reader('Leetcode_Q_1082', ['Product', 'Sales'])
temporary_view_registrator(dfs)

In [19]:
spark.sql("""
WITH cte AS (SELECT 
    seller_id, SUM(price) AS summ
FROM Sales
GROUP BY seller_id)

SELECT seller_id
FROM cte
WHERE summ = (
    SELECT MAX(summ)
    FROM cte
)

""").show()

+---------+
|seller_id|
+---------+
|        1|
|        3|
+---------+



### 577. Employee Bonus


In [9]:
dfs = spark_df_reader('Leetcode_Q_577', ['Employee', 'Bonus'])
temporary_view_registrator(dfs)

In [11]:
spark.sql("""
SELECT 
    e.name,
    b.bonus
FROM Employee e
LEFT JOIN Bonus b
    USING (empId)
WHERE bonus < 1000 OR bonus IS NULL 
""").show()

+----+-----+
|name|bonus|
+----+-----+
|John| null|
|Brad| null|
| Dan|  500|
+----+-----+



### 2072. The Winner University


In [12]:
dfs = spark_df_reader('Leetcode_Q_2072', ['NewYork', 'California'])
temporary_view_registrator(dfs)

In [17]:
spark.sql("""
WITH cte AS (SELECT 
    'New York University' AS id,
    SUM(
        CASE WHEN score >= 90 THEN 1 ELSE 0 END
    ) AS num_exc
FROM NewYork
UNION 
SELECT 
    'California University' AS id, 
    SUM(
        CASE WHEN score >= 90 THEN 1 ELSE 0 END
    ) AS num_exc
FROM California)

SELECT 
    IF(COUNT(id) = 1, id, 'No Winner') AS winner
FROM cte
WHERE num_exc = (
    SELECT MAX(num_exc)
    FROM cte    
)
GROUP BY id
""").show()

+-------------------+
|             winner|
+-------------------+
|New York University|
+-------------------+



### 2837. Total Traveled Distance

In [18]:
dfs = spark_df_reader('Leetcode_Q_2837', ['Users', 'Rides'])
temporary_view_registrator(dfs)

In [36]:
spark.sql("""

SELECT
    user_id,
    name,
    IFNULL(SUM(distance), 0) AS `traveled distance`
FROM Users u
LEFT JOIN Rides r
    USING (user_id)
GROUP BY user_id, name
ORDER BY user_id



""").show()

+-------+-------+-----------------+
|user_id|   name|traveled distance|
+-------+-------+-----------------+
|      2|  Avery|              393|
|      4|Michael|              416|
|     10|Eleanor|                0|
|     14|  Ethan|              186|
|     17|Addison|              160|
+-------+-------+-----------------+



### 620. Not Boring Movies


In [38]:
dfs = spark_df_reader('Leetcode_Q_620', ['cinema'])
temporary_view_registrator(dfs)

In [40]:
spark.sql("""

SELECT 
    *
FROM cinema
WHERE id%2 <> 0 AND description <> 'boring'
ORDER BY rating DESC
""").show()

+---+----------+-----------+------+
| id|     movie|description|rating|
+---+----------+-----------+------+
|  5|House card|Interesting|   9.1|
|  1|       War|   great 3D|   8.9|
+---+----------+-----------+------+



### 1965. Employees With Missing Information

In [7]:
dfs = spark_df_reader('Leetcode_Q_1965', ['Employees', 'Salaries'])
temporary_view_registrator(dfs)

In [23]:
spark.sql("""
SELECT employee_id
FROM Employees e
LEFT JOIN Salaries S
    USING (employee_id)
WHERE salary IS NULL
UNION ALL
SELECT employee_id
FROM Employees e
RIGHT JOIN Salaries S
    USING (employee_id)
WHERE name IS NULL
ORDER BY employee_id
""").show()

+-----------+
|employee_id|
+-----------+
|          1|
|          2|
+-----------+



### 1327. List the Products Ordered in a Period


In [24]:
dfs = spark_df_reader('Leetcode_Q_1327', ['Products', 'Orders'])
temporary_view_registrator(dfs)

In [44]:
spark.sql("""
SELECT product_name, SUM(unit) AS unit
FROM Products p
JOIN Orders o
    USING (product_id) 
WHERE MONTH(order_date) = 2 AND YEAR(order_date) = 2020
GROUP BY product_name
HAVING SUM(unit) >= 100 
""").show()

+------------------+----+
|      product_name|unit|
+------------------+----+
|      Leetcode Kit| 100|
|Leetcode Solutions| 130|
+------------------+----+

