In [None]:
# Setup
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Start a Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("SQL Queries Practice")
    .getOrCreate()
)

# Employee rows: (id, name, department, salary, hire_date).
# hire_date is kept as plain "YYYY-MM-DD" text.
employees_data = [
    (1, "Alice", "Engineering", 75000, "2020-01-15"),
    (2, "Bob", "Sales", 65000, "2019-03-20"),
    (3, "Charlie", "Engineering", 80000, "2018-06-10"),
    (4, "Diana", "Marketing", 70000, "2021-02-28"),
    (5, "Eve", "Sales", 68000, "2017-11-05"),
    (6, "Frank", "Engineering", 82000, "2020-09-12"),
]
employees_df = spark.createDataFrame(
    employees_data,
    schema=["id", "name", "department", "salary", "hire_date"],
)

# Department rows: (dept_name, division, manager).
# "HR" has no matching row in employees_data.
departments_data = [
    ("Engineering", "Tech", "Alice Johnson"),
    ("Sales", "Business", "Bob Smith"),
    ("Marketing", "Business", "Charlie Brown"),
    ("HR", "Support", "Diana Prince"),
]
departments_df = spark.createDataFrame(
    departments_data,
    schema=["dept_name", "division", "manager"],
)

# Preview both DataFrames
print("Employees DataFrame:")
employees_df.show()

print("Departments DataFrame:")
departments_df.show()


In [None]:
# Register both DataFrames as session-scoped temporary views so that
# spark.sql() can refer to them by name. createOrReplaceTempView makes
# this cell safe to re-run.
employees_df.createOrReplaceTempView("employees")
departments_df.createOrReplaceTempView("departments")

print("Temporary views created successfully!")

# List the temporary views only. listTables() returns every table and view
# in the current database, so keep just the entries flagged isTemporary
# to match the heading printed here. The list is the cell's displayed output.
print("\nCurrent temporary views:")
[table for table in spark.catalog.listTables() if table.isTemporary]


In [None]:
# Basic SELECT queries
def _show_query(heading, query):
    """Print ``heading``, run ``query`` through spark.sql, show the result and return it."""
    print(heading)
    frame = spark.sql(query)
    frame.show()
    return frame


# Whole table: no projection, no filtering
result1 = _show_query("1. Select all employees:", "SELECT * FROM employees")

# Projection: only the listed columns
result2 = _show_query(
    "\n2. Select specific columns:",
    "SELECT name, department, salary FROM employees",
)

# Row filter on salary
result3 = _show_query(
    "\n3. Filter with WHERE clause:",
    "SELECT name, salary FROM employees WHERE salary > 70000",
)

# Sort, highest salary first
result4 = _show_query(
    "\n4. Order by salary:",
    "SELECT name, salary FROM employees ORDER BY salary DESC",
)

# Aggregate: head-count per department
result5 = _show_query(
    "\n5. Count employees by department:",
    """
    SELECT department, COUNT(*) as employee_count
    FROM employees
    GROUP BY department
""",
)


In [None]:
# Create global temporary view.
# createOrReplaceGlobalTempView keeps this cell re-runnable: plain
# createGlobalTempView raises an error when the view name already exists,
# which is what happens the second time the cell is executed within the
# same Spark application.
employees_df.createOrReplaceGlobalTempView("global_employees")

print("Global temporary view created!")

# Access global temporary view (note the global_temp prefix)
print("\nAccess global temporary view:")
result_global = spark.sql("SELECT * FROM global_temp.global_employees WHERE department = 'Engineering'")
result_global.show()

# The view is also visible from a different session of the same application.
# Use spark.newSession() to get one: SparkSession.builder.getOrCreate() would
# hand back the session that already exists instead of a separate one.
# new_spark = spark.newSession()
# result_from_new_session = new_spark.sql("SELECT COUNT(*) FROM global_temp.global_employees")
# result_from_new_session.show()

print("\nDifference between temporary and global temporary views:")
print("- Temporary views: Session-scoped, accessed directly by name")
print("- Global temporary views: Application-scoped, accessed via global_temp.view_name")


In [None]:
# Example 1: the same filter + projection, written once per API
print("=== EXAMPLE 1: Filter and Select ===")

# DataFrame API: keep rows with salary above 70000, then pick three columns
print("DataFrame API:")
df_api_result = (
    employees_df
    .filter(employees_df["salary"] > 70000)
    .select("name", "department", "salary")
)
df_api_result.show()

# SQL: equivalent query against the "employees" temp view
print("SQL:")
sql_result = spark.sql("""
    SELECT name, department, salary
    FROM employees
    WHERE salary > 70000
""")
sql_result.show()

# Example 2: per-department count, average salary and maximum salary
print("\n=== EXAMPLE 2: Group By and Aggregate ===")

# DataFrame API: one aggregate expression per output column
print("DataFrame API:")
df_api_agg = (
    employees_df
    .groupBy("department")
    .agg(
        F.count("*").alias("count"),
        F.avg("salary").alias("avg_salary"),
        F.max("salary").alias("max_salary"),
    )
)
df_api_agg.show()

# SQL: same aggregates expressed with GROUP BY
print("SQL:")
sql_agg = spark.sql("""
    SELECT department,
           COUNT(*) as count,
           AVG(salary) as avg_salary,
           MAX(salary) as max_salary
    FROM employees
    GROUP BY department
""")
sql_agg.show()


In [None]:
# JOIN operations
# Each query's text is held in a named variable and then passed to spark.sql().
print("=== JOIN OPERATIONS ===")

# Only employees whose department has a matching row in "departments"
inner_join_sql = """
    SELECT e.name, e.salary, d.division, d.manager
    FROM employees e
    INNER JOIN departments d ON e.department = d.dept_name
"""
print("1. Inner join employees with departments:")
join_result = spark.sql(inner_join_sql)
join_result.show()

# Every employee is kept; division is NULL when no department row matches
left_join_sql = """
    SELECT e.name, e.department, e.salary, d.division
    FROM employees e
    LEFT JOIN departments d ON e.department = d.dept_name
"""
print("\n2. Left join to include all employees:")
left_join_result = spark.sql(left_join_sql)
left_join_result.show()

# Subqueries
print("\n=== SUBQUERIES ===")

# Scalar subquery: compare each salary with the overall average
above_average_sql = """
    SELECT name, salary
    FROM employees
    WHERE salary > (SELECT AVG(salary) FROM employees)
    ORDER BY salary DESC
"""
print("3. Employees with above-average salary:")
subquery_result = spark.sql(above_average_sql)
subquery_result.show()

# Derived table: average per department, then keep the single highest one
top_department_sql = """
    SELECT department, avg_salary
    FROM (
        SELECT department, AVG(salary) as avg_salary
        FROM employees
        GROUP BY department
    ) dept_avg
    ORDER BY avg_salary DESC
    LIMIT 1
"""
print("\n4. Department with highest average salary:")
dept_avg_result = spark.sql(top_department_sql)
dept_avg_result.show()


In [None]:
# Window functions
# Every ordering below ends in a tie-breaker so that results stay the same
# from run to run even when two employees share a salary.
print("=== WINDOW FUNCTIONS ===")

print("5. Rank employees by salary within each department:")
# RANK() deliberately gives tied salaries the same rank inside a department.
# ROW_NUMBER() must hand out distinct numbers, so its ordering is made unique
# with id. The final ORDER BY fixes the displayed row order, which is
# otherwise undefined for a query without one.
window_result = spark.sql("""
    SELECT name, department, salary,
           RANK() OVER (PARTITION BY department ORDER BY salary DESC) as rank_in_dept,
           ROW_NUMBER() OVER (ORDER BY salary DESC, id) as overall_rank
    FROM employees
    ORDER BY department, rank_in_dept, overall_rank
""")
window_result.show()

print("\n6. Running total of salaries:")
# ROWS UNBOUNDED PRECEDING sums from the first row up to the current row.
# With a ROWS frame the order of tied rows changes the intermediate totals,
# so id is used as tie-breaker in both the window and the final ordering.
running_total_result = spark.sql("""
    SELECT name, department, salary,
           SUM(salary) OVER (ORDER BY salary, id ROWS UNBOUNDED PRECEDING) as running_total
    FROM employees
    ORDER BY salary, id
""")
running_total_result.show()

# Common Table Expressions (CTEs)
print("\n=== COMMON TABLE EXPRESSIONS ===")

print("7. Using CTE to find top performers:")
# dept_stats: average salary per department.
# top_performers: employees paid more than their own department's average.
# name breaks ties between equal salaries in the final ordering.
cte_result = spark.sql("""
    WITH dept_stats AS (
        SELECT department, AVG(salary) as avg_salary
        FROM employees
        GROUP BY department
    ),
    top_performers AS (
        SELECT e.name, e.department, e.salary
        FROM employees e
        JOIN dept_stats d ON e.department = d.department
        WHERE e.salary > d.avg_salary
    )
    SELECT * FROM top_performers
    ORDER BY salary DESC, name
""")
cte_result.show()


In [None]:
# Extra sample data for the exercises

# Product rows: (product_id, product_name, category, price, stock)
products_data = [
    (1, "Laptop", "Electronics", 1200, 50),
    (2, "Mouse", "Electronics", 25, 200),
    (3, "Keyboard", "Electronics", 75, 150),
    (4, "Chair", "Furniture", 300, 30),
    (5, "Desk", "Furniture", 500, 20),
    (6, "Book", "Education", 15, 1000),
]
products_df = spark.createDataFrame(
    products_data,
    schema=["product_id", "product_name", "category", "price", "stock"],
)

# Order rows: (order_id, product_id, quantity, order_date).
# Product 6 ("Book") does not appear in any order.
orders_data = [
    (101, 1, 2, "2024-01-15"),
    (102, 2, 5, "2024-01-16"),
    (103, 1, 1, "2024-01-17"),
    (104, 3, 3, "2024-01-18"),
    (105, 4, 1, "2024-01-19"),
    (106, 5, 2, "2024-01-20"),
]
orders_df = spark.createDataFrame(
    orders_data,
    schema=["order_id", "product_id", "quantity", "order_date"],
)

# Make both tables queryable from spark.sql()
products_df.createOrReplaceTempView("products")
orders_df.createOrReplaceTempView("orders")

print("Products table:")
products_df.show()
print("Orders table:")
orders_df.show()

# TODO: Complete the following exercises using SQL.
# The placeholder variables are named exerciseN_result so that uncommenting
# them does not overwrite result1..result5 from the basic SELECT cell.

print("\n=== EXERCISE 1 ===", "Find all products with stock less than 100", sep="\n")
# Your SQL query here
# exercise1_result = spark.sql("YOUR QUERY HERE")
# exercise1_result.show()

print("\n=== EXERCISE 2 ===", "Calculate total revenue by category", sep="\n")
# Your SQL query here (hint: join products and orders, then group by category)
# exercise2_result = spark.sql("YOUR QUERY HERE")
# exercise2_result.show()

print("\n=== EXERCISE 3 ===", "Find the most expensive product in each category", sep="\n")
# Your SQL query here (hint: use window functions)
# exercise3_result = spark.sql("YOUR QUERY HERE")
# exercise3_result.show()

print("\n=== EXERCISE 4 ===", "List products that have never been ordered", sep="\n")
# Your SQL query here (hint: use LEFT JOIN or NOT EXISTS)
# exercise4_result = spark.sql("YOUR QUERY HERE")
# exercise4_result.show()

print("\n=== EXERCISE 5 ===", "Create a report showing order details with product information", sep="\n")
# Your SQL query here
# exercise5_result = spark.sql("YOUR QUERY HERE")
# exercise5_result.show()
