In [2]:
file_path = "abfss://fsogvd@datalakeogvdlabs.dfs.core.windows.net/user/products.csv"

df = spark.read.option("header", "true").option("inferSchema", "true").csv(file_path)

print("Datos de productos cargados:")
print(f"Registros: {df.count()}")
print(f"Columnas: {len(df.columns)}")
df.show(10)

StatementMeta(sparkpool01, 0, 2, Finished, Available, Finished)

Datos de productos cargados:
Registros: 295
Columnas: 4
+---------+--------------------+--------------+---------+
|ProductID|         ProductName|      Category|ListPrice|
+---------+--------------------+--------------+---------+
|      771|Mountain-100 Silv...|Mountain Bikes|  3399.99|
|      772|Mountain-100 Silv...|Mountain Bikes|  3399.99|
|      773|Mountain-100 Silv...|Mountain Bikes|  3399.99|
|      774|Mountain-100 Silv...|Mountain Bikes|  3399.99|
|      775|Mountain-100 Blac...|Mountain Bikes|  3374.99|
|      776|Mountain-100 Blac...|Mountain Bikes|  3374.99|
|      777|Mountain-100 Blac...|Mountain Bikes|  3374.99|
|      778|Mountain-100 Blac...|Mountain Bikes|  3374.99|
|      779|Mountain-200 Silv...|Mountain Bikes|  2319.99|
|      780|Mountain-200 Silv...|Mountain Bikes|  2319.99|
+---------+--------------------+--------------+---------+
only showing top 10 rows



In [3]:
df.createOrReplaceTempView("products")
print("Vista 'products' creada para consultas SQL")

print("Esquema de datos:")
df.printSchema()

StatementMeta(sparkpool01, 0, 3, Finished, Available, Finished)

Vista 'products' creada para consultas SQL
Esquema de datos:
root
 |-- ProductID: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- ListPrice: double (nullable = true)



In [6]:
total_products = spark.sql("SELECT COUNT(*) as total_products FROM products")
total_products.show()

expensive_products = spark.sql("""
SELECT ProductName, ListPrice 
FROM products 
WHERE ListPrice IS NOT NULL 
ORDER BY CAST(ListPrice AS DOUBLE) DESC 
LIMIT 10
""")
print("Top 10 productos más caros:")
expensive_products.show()

products_by_category = spark.sql("""
SELECT Category, COUNT(*) as product_count
FROM products 
WHERE Category IS NOT NULL
GROUP BY Category
ORDER BY product_count DESC
""")
print("Productos por categoría:")
products_by_category.show()

price_stats = spark.sql("""
SELECT 
    Category,
    COUNT(*) as product_count,
    AVG(CAST(ListPrice AS DOUBLE)) as avg_price,
    MIN(CAST(ListPrice AS DOUBLE)) as min_price,
    MAX(CAST(ListPrice AS DOUBLE)) as max_price
FROM products 
WHERE ListPrice IS NOT NULL
GROUP BY Category
ORDER BY avg_price DESC
""")
print("Estadísticas de precios por categoría:")
price_stats.show()

StatementMeta(sparkpool01, 0, 6, Finished, Available, Finished)

+--------------+
|total_products|
+--------------+
|           295|
+--------------+

Top 10 productos más caros:
+--------------------+---------+
|         ProductName|ListPrice|
+--------------------+---------+
|    Road-150 Red, 44|  3578.27|
|    Road-150 Red, 62|  3578.27|
|    Road-150 Red, 48|  3578.27|
|    Road-150 Red, 52|  3578.27|
|    Road-150 Red, 56|  3578.27|
|Mountain-100 Silv...|  3399.99|
|Mountain-100 Silv...|  3399.99|
|Mountain-100 Silv...|  3399.99|
|Mountain-100 Silv...|  3399.99|
|Mountain-100 Blac...|  3374.99|
+--------------------+---------+

Productos por categoría:
+-----------------+-------------+
|         Category|product_count|
+-----------------+-------------+
|       Road Bikes|           43|
|      Road Frames|           33|
|   Mountain Bikes|           32|
|  Mountain Frames|           28|
|    Touring Bikes|           22|
|   Touring Frames|           18|
|           Wheels|           14|
|  Tires and Tubes|           11|
|          Saddles|     

In [7]:
# Cell 4: Guardar resultados en Data Lake (según PDF)
print("Guardando resultados en Data Lake...")

# Guardar productos caros como Parquet
expensive_products.write.mode("overwrite").parquet("abfss://fsogvd@datalakeogvdlabs.dfs.core.windows.net/user/expensive_products.parquet")

# Guardar estadísticas por categoría
products_by_category.write.mode("overwrite").parquet("abfss://fsogvd@datalakeogvdlabs.dfs.core.windows.net/user/products_by_category.parquet")

print("Datos guardados en formato Parquet en Data Lake")
print("Archivos creados:")
print("- expensive_products.parquet")
print("- products_by_category.parquet")

StatementMeta(sparkpool01, 0, 7, Finished, Available, Finished)

Guardando resultados en Data Lake...
Datos guardados en formato Parquet en Data Lake
Archivos creados:
- expensive_products.parquet
- products_by_category.parquet
