In [1]:
# Instalar PySpark
!pip install pyspark

# Importar las bibliotecas necesarias
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, approx_count_distinct, corr
from pyspark.sql import functions as F




In [2]:
# Obtain the Spark session used by every cell below
# (getOrCreate returns an existing session if one is already active).
spark = (
    SparkSession.builder
    .appName("InformeDeVentas")
    .getOrCreate()
)


In [3]:
# Sample records: one (product ID, sales, inventory) tuple per product
data = [
    (1, 100, 50),
    (2, 150, 60),
    (3, 130, 55),
    (4, 170, 40),
    (5, 200, 30),
    (6, 130, 20),
    (7, 110, 70),
]
columns = ["ProductoID", "Ventas", "Inventario"]

# Build the DataFrame with the column names above and preview it
df = spark.createDataFrame(data, schema=columns)
df.show()


+----------+------+----------+
|ProductoID|Ventas|Inventario|
+----------+------+----------+
|         1|   100|        50|
|         2|   150|        60|
|         3|   130|        55|
|         4|   170|        40|
|         5|   200|        30|
|         6|   130|        20|
|         7|   110|        70|
+----------+------+----------+



In [4]:
# Average of the sales column, displayed as a one-row table
df.select(
    F.mean("Ventas").alias("Media de Ventas")
).show()


+------------------+
|   Media de Ventas|
+------------------+
|141.42857142857142|
+------------------+



In [5]:
# Compute the median of sales.
# A relative error of 0.0 asks approxQuantile for the exact quantile rather than
# an approximation within 1% of rank; that is affordable on a dataset this small.
# approxQuantile returns a list with one value per requested probability, and the
# value is always one present in the column (the two middle values are not
# averaged when the row count is even).
mediana = df.approxQuantile("Ventas", [0.5], 0.0)
print("Mediana de Ventas:", mediana[0])


Mediana de Ventas: 130.0


In [6]:
# Compute the mode of sales: count how often each value occurs and rank the
# values from most to least frequent.
# The secondary sort on "Ventas" makes the top row deterministic when several
# values share the highest count (the smallest such value comes first).
mode_df = (
    df.groupBy("Ventas")
    .count()
    .orderBy(F.desc("count"), F.asc("Ventas"))
)
mode_df.show(1)  # The first row is the mode


+------+-----+
|Ventas|count|
+------+-----+
|   130|    2|
+------+-----+
only showing top 1 row



In [7]:
# Pearson correlation between sales and inventory, displayed as a one-row table
df.select(
    F.corr(col("Ventas"), col("Inventario")).alias("Correlación Ventas-Inventario")
).show()


+-----------------------------+
|Correlación Ventas-Inventario|
+-----------------------------+
|          -0.4687535763196143|
+-----------------------------+

