### Desarrollo Taller 04 - Databricks
#### Mario Alonso Vento Alvarado

#### Arquitectura Medallion

In [0]:
# Target catalog for this workshop, followed by the schema name used for
# each medallion layer (bronze -> silver -> gold).
catalog_name = "dmc_taller04"
schema_bronze, schema_silver, schema_gold = "bronze", "silver", "gold"

In [0]:
# Create the workshop catalog; IF NOT EXISTS makes the cell safe to re-run.
create_catalog_stmt = f"CREATE CATALOG IF NOT EXISTS {catalog_name}"
spark.sql(create_catalog_stmt)

In [0]:
# Create one schema per medallion layer inside the workshop catalog.
# IF NOT EXISTS makes the cell safe to re-run.
for layer_schema in (schema_bronze, schema_silver, schema_gold):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{layer_schema}")

### Bronze

In [0]:
# Volume folder that holds the raw CSV files (value keeps its trailing slash).
path_base = "/Volumes/dmc_taller04/default/input/"

# path_base already ends in "/", so each file name is appended directly;
# inserting another "/" would yield ".../input//clientes.csv".
path_clientes = f"{path_base}clientes.csv"
path_envios_base = f"{path_base}envios_base.csv"
path_envios_incremento = f"{path_base}envios_incremento.csv"
path_rutas = f"{path_base}rutas.csv"
path_sucursales = f"{path_base}sucursales.csv"

In [0]:
# Load every raw CSV declared in the paths cell. The first row is treated as
# the header and column types are inferred by Spark.
clientes = spark.read.option("header", True).option("inferSchema", True).csv(path_clientes)
envios_base = spark.read.option("header", True).option("inferSchema", True).csv(path_envios_base)
envios_incremento = spark.read.option("header", True).option("inferSchema", True).csv(path_envios_incremento)
# rutas and sucursales complete the set of declared sources. This replaces a
# read from `path_matriculas`, a name no visible cell defines.
rutas = spark.read.option("header", True).option("inferSchema", True).csv(path_rutas)
sucursales = spark.read.option("header", True).option("inferSchema", True).csv(path_sucursales)

In [0]:
# Persist each raw CSV as a bronze Delta table (overwritten on every run).
# Every source is read straight from its declared path, so this cell depends
# only on the path_* variables and not on DataFrame names left in the kernel.
# NOTE(review): the Silver section reads bronze.estudiantes / cursos /
# profesores / matriculas, which are not produced from these sources —
# confirm which dataset this notebook is meant to process.
bronze_sources = {
    "clientes": path_clientes,
    "envios_base": path_envios_base,
    "envios_incremento": path_envios_incremento,
    "rutas": path_rutas,
    "sucursales": path_sucursales,
}

for bronze_table, source_path in bronze_sources.items():
    (
        spark.read.option("header", True).option("inferSchema", True).csv(source_path)
        .write.format("delta").mode("overwrite")
        .saveAsTable(f"{catalog_name}.{schema_bronze}.{bronze_table}")
    )

### Silver

In [0]:
from pyspark.sql.functions import col, trim, initcap, when, lit, year, month, dayofmonth, concat, concat_ws

In [0]:
# Clean the bronze student table: tidy the text columns, cast fecha_ingreso
# to date, and keep one row per non-null id_estudiante.
estudiantes_bronze = spark.table(f"{catalog_name}.{schema_bronze}.estudiantes")

estudiantes_texto = estudiantes_bronze
for text_column in ("nombre", "apellido", "carrera"):
    # Strip surrounding whitespace, then capitalise the first letter of each word.
    estudiantes_texto = estudiantes_texto.withColumn(
        text_column, initcap(trim(col(text_column)))
    )

silver_estudiantes = (
    estudiantes_texto
    .withColumn("fecha_ingreso", col("fecha_ingreso").cast("date"))
    .dropna(subset=["id_estudiante"])
    .dropDuplicates(["id_estudiante"])
)

In [0]:
# Clean the bronze course table: tidy the text columns, cast creditos to int,
# keep only rows with creditos > 0 (which also drops null creditos), and keep
# one row per non-null id_curso.
cursos_bronze = spark.table(f"{catalog_name}.{schema_bronze}.cursos")

cursos_texto = cursos_bronze
for text_column in ("nombre_curso", "facultad"):
    # Strip surrounding whitespace, then capitalise the first letter of each word.
    cursos_texto = cursos_texto.withColumn(text_column, initcap(trim(col(text_column))))

cursos_tipados = cursos_texto.withColumn("creditos", col("creditos").cast("int"))

silver_cursos = (
    cursos_tipados
    .filter(col("creditos") > 0)
    .dropna(subset=["id_curso"])
    .dropDuplicates(["id_curso"])
)

display(silver_cursos)

In [0]:
# Clean the bronze teacher table: tidy the text columns and keep one row per
# non-null id_profesor.
profesores_bronze = spark.table(f"{catalog_name}.{schema_bronze}.profesores")

profesores_texto = profesores_bronze
for text_column in ("nombre", "apellido", "facultad"):
    # Strip surrounding whitespace, then capitalise the first letter of each word.
    profesores_texto = profesores_texto.withColumn(
        text_column, initcap(trim(col(text_column)))
    )

silver_profesores = (
    profesores_texto
    .dropna(subset=["id_profesor"])
    .dropDuplicates(["id_profesor"])
)

display(silver_profesores)

In [0]:
# Clean the bronze enrolment table.
# - Rows missing id_matricula, id_estudiante or id_curso are dropped.
# - One row is kept per id_matricula, matching how the other silver tables
#   deduplicate on their own id.
# - fecha_matricula is cast to date (as fecha_ingreso is for students) so the
#   gold time dimension, which builds a yyyymmdd key from the distinct values
#   of this column, gets at most one row per calendar day.
# - nota_final is cast to int; aprobado is 1 when nota_final >= 11, else 0.
#   A null nota_final falls through to otherwise() and therefore yields 0.
silver_matriculas = (
    spark.table(f"{catalog_name}.{schema_bronze}.matriculas")
    .dropna(subset=["id_matricula", "id_estudiante", "id_curso"])
    .dropDuplicates(["id_matricula"])
    .withColumn("fecha_matricula", col("fecha_matricula").cast("date"))
    .withColumn("nota_final", col("nota_final").cast("int"))
    .withColumn(
        "aprobado",
        when(col("nota_final") >= 11, lit(1))
        .otherwise(lit(0))
    )
)

display(silver_matriculas)

In [0]:
silver_estudiantes.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{schema_silver}.estudiantes")
silver_cursos.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{schema_silver}.cursos")
silver_profesores.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{schema_silver}.profesores")
silver_matriculas.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{schema_silver}.matriculas")

### Gold

In [0]:
# The TIEMPO (time) dimension is built from the data of the table we identify
# as transactional (the one with the largest number of records).
#
# Quick reference:
#   drop null values      -> .dropna()
#   drop duplicate values -> .distinct()  or  .dropDuplicates(subset=["columna"])
#   drop a column         -> .drop("columna")

# Distinct, non-null enrolment dates; each one becomes a row of the dimension.
fechas_matricula = (
    spark.table(f"{catalog_name}.{schema_silver}.matriculas")
    .select(col("fecha_matricula").alias("fecha"))
    .dropna()
    .distinct()
)

# "01" when mes <= 7, otherwise "02".
sufijo_semestre = when(col("mes") <= 7, lit("01")).otherwise(lit("02"))
# Integer key in yyyymmdd form: anio*10000 + mes*100 + dia.
clave_tiempo = (col("anio") * 10000 + col("mes") * 100 + col("dia")).cast("int")

dim_tiempo = (
    fechas_matricula
    .withColumn("anio", year(col("fecha")))
    .withColumn("mes", month(col("fecha")))
    .withColumn("dia", dayofmonth(col("fecha")))
    .withColumn("semestre", concat_ws("-", col("anio"), sufijo_semestre))
    .withColumn("id_tiempo", clave_tiempo)
)

display(dim_tiempo)

In [0]:
# Student dimension: the silver student table with at most one row per
# id_estudiante.
estudiantes_silver = spark.table(f"{catalog_name}.{schema_silver}.estudiantes")
dim_estudiante = estudiantes_silver.dropDuplicates(["id_estudiante"])

display(dim_estudiante)

In [0]:
# Course dimension: the silver course table with at most one row per id_curso.
cursos_silver = spark.table(f"{catalog_name}.{schema_silver}.cursos")
dim_cursos = cursos_silver.dropDuplicates(["id_curso"])

display(dim_cursos)

In [0]:
# Teacher dimension: the silver teacher table with at most one row per
# id_profesor.
profesores_silver = spark.table(f"{catalog_name}.{schema_silver}.profesores")
dim_profesor = profesores_silver.dropDuplicates(["id_profesor"])

display(dim_profesor)

In [0]:
# Enrolment fact table. Starts from silver matriculas and left-joins:
#   - dim_tiempo on the enrolment date, to pick up id_tiempo
#   - dim_cursos on id_curso, to pick up the course credits
# Left joins keep every enrolment row even when no dimension row matches.
matriculas_m = spark.table(f"{catalog_name}.{schema_silver}.matriculas").alias("m")
tiempo_t = dim_tiempo.alias("t")
cursos_c = dim_cursos.alias("c")

# Output columns, in order; creditos is exposed as creditos_curso.
columnas_fact = [
    col("m.id_matricula"),
    col("t.id_tiempo"),
    col("m.id_curso"),
    col("m.id_estudiante"),
    col("m.id_profesor"),
    col("c.creditos").alias("creditos_curso"),
    col("m.nota_final"),
    col("m.aprobado"),
]

fact_matriculas = (
    matriculas_m
    .join(tiempo_t, on=col("m.fecha_matricula") == col("t.fecha"), how="left")
    .join(cursos_c, on=col("m.id_curso") == col("c.id_curso"), how="left")
    .select(*columnas_fact)
)

display(fact_matriculas)

In [0]:
# Persist the star schema to the gold layer as Delta tables, overwriting any
# previous version. Dimensions are written first, then the fact table.
gold_tables = {
    "dim_tiempo": dim_tiempo,
    "dim_estudiante": dim_estudiante,
    "dim_cursos": dim_cursos,
    "dim_profesor": dim_profesor,
    "fact_matriculas": fact_matriculas,
}

for gold_table, gold_df in gold_tables.items():
    gold_df.write.format("delta").mode("overwrite").saveAsTable(
        f"{catalog_name}.{schema_gold}.{gold_table}"
    )

In [0]:
%sql
-- Query the gold layer with the %sql magic: preview of the enrolment fact table.
-- NOTE(review): stands in for a placeholder that was not valid SQL; replace
-- with the intended query if it differs.
SELECT * FROM dmc_taller04.gold.fact_matriculas LIMIT 10

In [0]:
# Query the gold layer through spark.sql: preview of the enrolment fact table.
# NOTE(review): stands in for a placeholder string that was not valid SQL;
# replace with the intended query if it differs.
display(spark.sql(f"SELECT * FROM {catalog_name}.{schema_gold}.fact_matriculas LIMIT 10"))

In [0]:
%sql
-- (Re)create the KPI view with one row per carrera: distinct students,
-- average nota_final and total course credits across enrolments.
-- OR REPLACE is used on its own because it cannot be combined with
-- IF NOT EXISTS, and the view lives in this workshop's catalog.
CREATE OR REPLACE VIEW dmc_taller04.gold.vw_kpi_carrera
AS
SELECT
    de.carrera,
    COUNT(DISTINCT fm.id_estudiante) AS cantidad_alumnos,
    AVG(fm.nota_final) AS promedio_notas,
    SUM(fm.creditos_curso) AS creditos_totales
FROM dmc_taller04.gold.fact_matriculas fm
LEFT JOIN dmc_taller04.gold.dim_estudiante de ON fm.id_estudiante = de.id_estudiante
GROUP BY de.carrera


In [0]:
# (Re)create the gold KPI view with one row per carrera:
#   cantidad_alumnos  - distinct students enrolled
#   promedio_notas    - average nota_final
#   creditos_totales  - sum of creditos_curso over all enrolment rows
# The fact table is left-joined to the student dimension to obtain carrera.
kpi_view = f"{catalog_name}.{schema_gold}.vw_kpi_carrera"
fact_table = f"{catalog_name}.{schema_gold}.fact_matriculas"
student_dim = f"{catalog_name}.{schema_gold}.dim_estudiante"

create_kpi_view_sql = f"""
    CREATE OR REPLACE VIEW {kpi_view}
    AS
    SELECT 
    de.carrera,
    COUNT(DISTINCT fm.id_estudiante) AS cantidad_alumnos,
    AVG(fm.nota_final) AS promedio_notas,
    SUM(fm.creditos_curso) AS creditos_totales
    FROM {fact_table} fm
    LEFT JOIN {student_dim} de ON fm.id_estudiante = de.id_estudiante
    GROUP BY de.carrera
"""

spark.sql(create_kpi_view_sql)

In [0]:
%sql
-- Show the KPI view from this workshop's catalog, where the notebook creates it.
select * from dmc_taller04.gold.vw_kpi_carrera