In [11]:
from pyspark.sql import SparkSession

# spark = SparkSession.builder.master("local[*]").getOrCreate()
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [12]:
# create data array (rows), and columns name array
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)
df.show()

# Creates a temporay session-scoped logical table from a DataFrame
df.createOrReplaceTempView("sqltable")

# runing an SQL querry - produces a new dataframe - notice how we querry from the temp table sqltable we created
res = spark.sql("select * from sqltable where age > 25")
res.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+

+-------+---+
|   Name|Age|
+-------+---+
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [13]:
# JSON style data
data = [
 {"name": "Alice", "age": 25, "department": "HR", "salary": 50000},
 {"name": "Bob", "age": 30, "department": "IT", "salary": 70000},
 {"name": "Charlie", "age": 35, "department": "Finance", "salary": 80000},
 {"name": "David", "age": 40, "department": "IT", "salary": 90000},
 {"name": "Eve", "age": 45, "department": "Finance", "salary": 100000},
]

df_json = spark.createDataFrame(data)
df_json.show()

df_json.createOrReplaceTempView("jsontable")
spark.sql("select * from jsontable where salary >= 80000").show()

# ex with a bigger querry
query = """
 SELECT department, COUNT(*) AS employee_count, AVG(salary) AS avg_salary
 FROM jsontable
 WHERE age > 20
 GROUP BY department
 ORDER BY avg_salary DESC
"""

result = spark.sql(query)
result.show()

+---+----------+-------+------+
|age|department|   name|salary|
+---+----------+-------+------+
| 25|        HR|  Alice| 50000|
| 30|        IT|    Bob| 70000|
| 35|   Finance|Charlie| 80000|
| 40|        IT|  David| 90000|
| 45|   Finance|    Eve|100000|
+---+----------+-------+------+

+---+----------+-------+------+
|age|department|   name|salary|
+---+----------+-------+------+
| 35|   Finance|Charlie| 80000|
| 40|        IT|  David| 90000|
| 45|   Finance|    Eve|100000|
+---+----------+-------+------+

+----------+--------------+----------+
|department|employee_count|avg_salary|
+----------+--------------+----------+
|   Finance|             2|   90000.0|
|        IT|             2|   80000.0|
|        HR|             1|   50000.0|
+----------+--------------+----------+



In [23]:
employees_data = [
 (1, "Alice", "HR"),
 (2, "Bob", "IT"),
 (3, "Charlie", "Finance"),
 (4, "David", "IT"),
 (5, "Eve", "Finance"),
]

employees_columns = ["emp_id", "name", "department"]

emp_df = spark.createDataFrame(employees_data, employees_columns)

# again, we do this to create a temporary SQL table
emp_df.createOrReplaceTempView("employees")

salaries_data = [
 (1, 50000),
 (2, 70000),
 (3, 80000),
 (4, 90000),
 (5, 100000),
]

salaries_columns = ["emp_id", "salary"]
salary_df = spark.createDataFrame(salaries_data, salaries_columns)
salary_df.createOrReplaceTempView("salary")

# SQL refresh - we are selecting from both tables, but we are merging both first
# then we look at whatever the data is in the merge, and select from there.

# The 'from' here tells SQL what table to start with, what is the beggining dataset
# to work with, and then the 'join' joins that dataset with the salary table.
q = """
select e.emp_id as ID, e.name as Name, e.department as Dept, s.salary as Bread
from employees e
join salary s on e.emp_id = s.emp_id
"""

# joining both tables, this produces another table which SPARK saves as a DF.
employees_joined = spark.sql(q)
employees_joined.show()

new_employees_data = [
 (6, "Frank", "Marketing", 110000),
 (7, "Grace", "HR", 60000),
]

new_employees_columns = ["emp_id", "name", "department", "salary"]

new_emp_df = spark.createDataFrame(new_employees_data, new_employees_columns)
new_emp_df.createOrReplaceTempView("newemps")

# recall: union appends rows to the table (vertical), "join" appends columns (horizontal).
# in the nested querry, we are creating a new talbe, which we call "joined" - from now on,
# to referance the values inside this new table we need to do "joined.<col name>"
q_union = """
select joined.emp_id as ID, joined.name as Name, joined.department as Dept, joined.salary as Bread
  from (select e.emp_id, e.name, e.department, s.salary
  from employees e
  join salary s on e.emp_id = s.emp_id) joined
union
select ne.emp_id, ne.name, ne.department, ne.salary
from newemps ne
"""
spark.sql(q_union).show()

+---+-------+-------+------+
| ID|   Name|   Dept| Bread|
+---+-------+-------+------+
|  1|  Alice|     HR| 50000|
|  2|    Bob|     IT| 70000|
|  3|Charlie|Finance| 80000|
|  4|  David|     IT| 90000|
|  5|    Eve|Finance|100000|
+---+-------+-------+------+

+------+-------+----------+------+
|emp_id|   name|department|salary|
+------+-------+----------+------+
|     5|    Eve|   Finance|100000|
|     3|Charlie|   Finance| 80000|
|     2|    Bob|        IT| 70000|
|     4|  David|        IT| 90000|
|     1|  Alice|        HR| 50000|
|     6|  Frank| Marketing|110000|
|     7|  Grace|        HR| 60000|
+------+-------+----------+------+

