# Data Exploration


In [None]:
from spark_init import init_spark
# NOTE(review): this import rebinds the names `sum`, `max` and `min` in the
# notebook namespace to the Spark column functions, hiding Python's builtins
# of the same name for every later cell.
from pyspark.sql.functions import col, count, sum, avg, max, min

# Initialize Spark (HADOOP_HOME is set in spark_init.py)
# The resulting `spark` session is the one every later cell in this notebook
# uses. "DataExploration" is the argument handed to the project helper
# init_spark (presumably the Spark application name - confirm in spark_init.py).
spark = init_spark("DataExploration")

In [None]:
# Create tables from existing Delta data
#
# Registers each Delta directory under <project_root>/data as a table in the
# session catalog so the cells below can query it by name. Nothing is copied:
# the tables only point at the files that are already on disk.
import os
from pathlib import Path

# Get the project root (parent of src); this assumes the notebook is run with
# src/ as the current working directory.
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, "data")

# One table per Delta directory; the table name doubles as the directory name.
table_names = ["employees", "departments", "projects", "assignments"]

for table_name in table_names:
    # Use forward slashes in the LOCATION literal. On Windows os.path.join
    # produces backslashes, which Spark SQL interprets as escape characters
    # inside a quoted string and would therefore corrupt the path.
    location = Path(data_path, table_name).as_posix()
    spark.sql(
        f"CREATE TABLE IF NOT EXISTS {table_name} "
        f"USING DELTA LOCATION '{location}'"
    )

# \u2713 is the check mark; written as an escape so the notebook source stays
# ASCII and the character cannot be mangled by a wrong file encoding again.
print(f"\u2713 Tables created from {data_path}")
spark.sql("SHOW TABLES").show()

In [None]:
# Query the employees table with SQL and display the result in the cell output
(
    spark
    .sql("select * from employees")
    .show()
)

In [None]:
# Look up the departments table by name and display it in the cell output
(
    spark
    .table("departments")
    .show()
)

In [None]:
# Look up the projects table by name and display it in the cell output
(
    spark
    .table("projects")
    .show()
)

In [None]:
# Look up the assignments table by name and display it in the cell output
(
    spark
    .table("assignments")
    .show()
)

## Playground


## SHUTDOWN


In [None]:
# Shutdown embedded Derby metastore (expected SQLException on success) and stop Spark
#
# Steps:
#   1. Verify a `spark` session exists in the notebook namespace.
#   2. Find the metastore_db directory and ask Derby to shut that database down.
#   3. Stop the Spark session.
from pathlib import Path
import os
import time

if 'spark' not in globals() or spark is None:
    raise RuntimeError("No active Spark session found in `spark` variable.")

# SQLState Derby attaches to the SQLException it throws when a single database
# was shut down cleanly. Any other state means the shutdown did not succeed.
DERBY_SHUTDOWN_OK = "08006"


def _derby_sql_state(exc):
    """Return the SQLState of the Java exception wrapped by *exc*, or None."""
    java_exc = getattr(exc, "java_exception", None)
    if java_exc is None:
        return None
    try:
        return java_exc.getSQLState()
    except Exception:
        # The wrapped Java exception is not a java.sql.SQLException (it has no
        # getSQLState method), so the state is unknown.
        return None


# Locate metastore directory: the current working directory first, then its parent.
candidates = [
    Path.cwd() / "metastore_db",
    Path.cwd().parent / "metastore_db",
]
metastore_dir = next((p for p in candidates if p.exists()), None)

if metastore_dir is None:
    # Nothing to shut down here; attempting it on a non-existent path would
    # only produce a "database not found" error.
    print(
        "No metastore_db directory found; skipping Derby shutdown. Looked in:",
        ", ".join(str(p) for p in candidates),
    )
else:
    # as_posix() keeps the JDBC URL free of Windows backslashes.
    metastore_jdbc = f"jdbc:derby:{metastore_dir.as_posix()};shutdown=true"

    print("Attempting Derby shutdown for:", metastore_dir)
    try:
        spark._jvm.java.sql.DriverManager.getConnection(metastore_jdbc)
    except Exception as e:
        # Derby signals a successful shutdown by throwing; tell that case apart
        # from genuine failures instead of treating every error as success.
        sql_state = _derby_sql_state(e)
        if sql_state == DERBY_SHUTDOWN_OK:
            print("Derby shutdown raised (expected on success)")
        else:
            print(f"WARNING: Derby shutdown failed (SQLState={sql_state}): {e}")
    else:
        print("WARNING: Derby shutdown returned without the expected exception; "
              "the database may still be running.")

    # small pause to allow JVM threads to settle
    time.sleep(0.5)

# Stop Spark context (best effort: runs whether or not the Derby shutdown worked)
spark.stop()
print("Spark stopped. If Derby lock persists, restart the kernel to fully terminate the JVM.")