In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('agrupaciones')
    .getOrCreate()
)

In [3]:
# Load the sales CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    'ventas.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the loaded rows as a text table to sanity-check the CSV parse.
df.show()

+-------+-------+------+
|Empresa|Persona|Ventas|
+-------+-------+------+
|   GOOG| Carlos|   200|
|   GOOG|   Juan|   120|
|   GOOG| Felipe|   340|
|   MSFT|   Tina|   600|
|   MSFT| Andrea|   124|
|   MSFT|  Carla|   243|
|     FB|   Sara|   870|
|     FB|Ignacio|   350|
|   APPL| Miguel|   250|
|   APPL|  Oscar|   130|
|   APPL|  Jorge|   750|
|   APPL|   Ivan|   350|
+-------+-------+------+



In [5]:
# Print column names and types to confirm what inferSchema detected.
df.printSchema()

root
 |-- Empresa: string (nullable = true)
 |-- Persona: string (nullable = true)
 |-- Ventas: integer (nullable = true)



## groupBy()

In [6]:
# Group by company and count how many rows each company has.
(
    df.groupBy('Empresa')
    .count()
    .show()
)

+-------+-----+
|Empresa|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



## Agregaciones

### agg()

In [7]:
# Grand total of the Ventas column: agg() on the ungrouped DataFrame
# aggregates over every row.
df.agg(
    {'Ventas': 'sum'}
).show()

+-----------+
|sum(Ventas)|
+-----------+
|       4327|
+-----------+



In [8]:
# Smallest single sale over the whole DataFrame.
# NOTE: the key is spelled 'ventas' although the schema column is 'Ventas'.
# Spark resolves column names case-insensitively unless
# spark.sql.caseSensitive is enabled, and the result header echoes the
# spelling used here.
(
    df
    .agg({'ventas': 'min'})
    .show()
)

+-----------+
|min(ventas)|
+-----------+
|        120|
+-----------+



In [9]:
# Average sale over the whole DataFrame; the 'mean' aggregate is reported
# under an avg(...) header. The lowercase 'ventas' key still matches the
# 'Ventas' column (case-insensitive resolution by default).
(
    df
    .agg({'ventas': 'mean'})
    .show()
)

+-----------------+
|      avg(ventas)|
+-----------------+
|360.5833333333333|
+-----------------+



In [10]:
# Largest single sale over the whole DataFrame. The lowercase 'ventas' key
# still matches the 'Ventas' column (case-insensitive resolution by default).
(
    df
    .agg({'ventas': 'max'})
    .show()
)

+-----------+
|max(ventas)|
+-----------+
|        870|
+-----------+



In [12]:
# Keep the grouped-by-company object in a variable so aggregations can be
# applied to it in a later cell without repeating the groupBy.
agrupado= df.groupBy('Empresa')

In [13]:
# Largest sale within each company, computed on the grouped data.
(
    agrupado
    .agg({'ventas': 'max'})
    .show()
)

+-------+-----------+
|Empresa|max(ventas)|
+-------+-----------+
|   APPL|        750|
|   GOOG|        340|
|     FB|        870|
|   MSFT|        600|
+-------+-----------+



## Otras Funciones

In [15]:
from pyspark.sql.functions import countDistinct, avg, stddev

In [18]:
# Number of distinct values in the Ventas column (not the company column).
df.select(countDistinct('Ventas')).show()

+----------------------+
|count(DISTINCT Ventas)|
+----------------------+
|                    11|
+----------------------+



In [19]:
# Number of distinct companies, i.e. distinct values in the Empresa column.
df.select(
    countDistinct('Empresa')
).show()

+-----------------------+
|count(DISTINCT Empresa)|
+-----------------------+
|                      4|
+-----------------------+



In [20]:
# Average sale, with alias() giving the result column a readable header.
df.select(
    avg('Ventas').alias('Promedio de las ventas')
).show()

+----------------------+
|Promedio de las ventas|
+----------------------+
|     360.5833333333333|
+----------------------+



In [22]:
# Standard deviation of the sales, shown at full floating-point precision
# under an aliased header.
df.select(
    stddev('Ventas').alias('Desviacion Estandar')
).show()

+-------------------+
|Desviacion Estandar|
+-------------------+
| 250.08742410799007|
+-------------------+



In [23]:
from pyspark.sql.functions import format_number #Para reducir los decimales 

In [25]:
# Standard deviation of the sales formatted to two decimal places.
# The DataFrame is bound to ventas_stdv first and displayed in a separate
# step: show() only prints and returns None, so chaining it onto the
# assignment would leave ventas_stdv set to None.
# format_number yields a formatted string column, intended for display.
ventas_stdv = df.select(
    format_number(stddev('Ventas'), 2).alias('Desviacion Estandar')
)
ventas_stdv.show()

+-------------------+
|Desviacion Estandar|
+-------------------+
|             250.09|
+-------------------+



## Para ordenar

### orderBy()

In [26]:
# Sort rows by sales amount; orderBy sorts ascending when no direction is given.
(
    df
    .orderBy('Ventas')
    .show()
)

+-------+-------+------+
|Empresa|Persona|Ventas|
+-------+-------+------+
|   GOOG|   Juan|   120|
|   MSFT| Andrea|   124|
|   APPL|  Oscar|   130|
|   GOOG| Carlos|   200|
|   MSFT|  Carla|   243|
|   APPL| Miguel|   250|
|   GOOG| Felipe|   340|
|     FB|Ignacio|   350|
|   APPL|   Ivan|   350|
|   MSFT|   Tina|   600|
|   APPL|  Jorge|   750|
|     FB|   Sara|   870|
+-------+-------+------+



In [29]:
# Sort rows by sales amount in descending order (largest sale first).
(
    df
    .orderBy('Ventas', ascending=False)
    .show()
)

+-------+-------+------+
|Empresa|Persona|Ventas|
+-------+-------+------+
|     FB|   Sara|   870|
|   APPL|  Jorge|   750|
|   MSFT|   Tina|   600|
|     FB|Ignacio|   350|
|   APPL|   Ivan|   350|
|   GOOG| Felipe|   340|
|   APPL| Miguel|   250|
|   MSFT|  Carla|   243|
|   GOOG| Carlos|   200|
|   APPL|  Oscar|   130|
|   MSFT| Andrea|   124|
|   GOOG|   Juan|   120|
+-------+-------+------+

