#### Import the necessary libraries.
#### Since we are using Python, import SparkSession and related functions from the PySpark module.

In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

#### Build a SparkSession using the SparkSession APIs. If one does not exist, create an instance. There can only be one SparkSession per JVM.

In [2]:
# Reuse the active SparkSession if there is one; otherwise start a new
# session registered under the application name "PythonMnMCount".
spark = SparkSession.builder.appName("PythonMnMCount").getOrCreate()

#### Set the M&M data set filename. In this notebook the path is hard-coded rather than read from the command-line arguments.

In [3]:
# Path of the M&M CSV data set; hard-coded here as a relative path.
mnm_file = "mnm_dataset.csv"

#### Read the file into a Spark DataFrame using the CSV
#### format by inferring the schema and specifying that the
#### file contains a header, which provides column names for comma-
#### separated fields.

In [4]:
# Load the CSV into a DataFrame: the header line supplies the column
# names and Spark infers each column's type from the data.
mnm_df = spark.read.load(
    mnm_file,
    format="csv",
    header="true",
    inferSchema="true",
)

#### We use the DataFrame high-level APIs. Note
#### that we don't use RDDs at all. Because some of Spark's
#### functions return the same object, we can chain function calls.
#### 1. Select from the DataFrame the fields "State", "Color", and "Count"
#### 2. Since we want to group each state and its M&M color count, we use groupBy()
#### 3. Aggregate counts of all colors and groupBy() State and Color
#### 4. orderBy() in descending order

In [5]:
# Build (lazily) the per-(State, Color) tally, largest totals first.
# NOTE: count("Count") tallies the rows in each group; it does not sum
# the values held in the Count column.
count_mnm_df = (
    mnm_df.select("State", "Color", "Count")
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .orderBy("Total", ascending=False)
)

#### Show the resulting aggregations for all the states and colors;
#### a total count of each color per state.
#### Note show() is an action, which will trigger the above
#### query to be executed.

In [6]:
# show() is an action: it runs the query and prints up to 60 rows
# without truncating the cell values.
count_mnm_df.show(60, False)
# count() is a second, separate action that reports how many rows the
# aggregated result contains.
print("Total Rows = %d" % count_mnm_df.count())

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|CA   |Yellow|1807 |
|WA   |Green |1779 |
|OR   |Orange|1743 |
|TX   |Green |1737 |
|TX   |Red   |1725 |
|CA   |Green |1723 |
|CO   |Yellow|1721 |
|CA   |Brown |1718 |
|CO   |Green |1713 |
|NV   |Orange|1712 |
|TX   |Yellow|1703 |
|NV   |Green |1698 |
|AZ   |Brown |1698 |
|CO   |Blue  |1695 |
|WY   |Green |1695 |
|NM   |Red   |1690 |
|AZ   |Orange|1689 |
|NM   |Yellow|1688 |
|NM   |Brown |1687 |
|UT   |Orange|1684 |
|NM   |Green |1682 |
|UT   |Red   |1680 |
|AZ   |Green |1676 |
|NV   |Yellow|1675 |
|NV   |Blue  |1673 |
|WA   |Red   |1671 |
|WY   |Red   |1670 |
|WA   |Brown |1669 |
|NM   |Orange|1665 |
|WY   |Blue  |1664 |
|WA   |Yellow|1663 |
|WA   |Orange|1658 |
|CA   |Orange|1657 |
|NV   |Brown |1657 |
|CO   |Brown |1656 |
|CA   |Red   |1656 |
|UT   |Blue  |1655 |
|AZ   |Yellow|1654 |
|TX   |Orange|1652 |
|AZ   |Red   |1648 |
|OR   |Blue  |1646 |
|UT   |Yellow|1645 |
|OR   |Red   |1645 |
|CO   |Orange|1642 |
|TX   |Brown 

#### While the above code aggregated and counted for all the states, what if we just want to see the data for a single state, e.g., CA?
#### 1. Select from all rows in the DataFrame
#### 2. Filter only CA state
#### 3. groupBy() State and Color as we did above
#### 4. Aggregate the counts for each color
#### 5. orderBy() in descending order
#### Find the aggregate count for California by filtering

In [7]:
# Same per-color tally as for all states, restricted to rows whose
# State is "CA" before grouping; largest totals first.
ca_count_mnm_df = (
    mnm_df.select("State", "Color", "Count")
    .where(mnm_df["State"] == "CA")
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .orderBy("Total", ascending=False)
)

#### Show the resulting aggregation for California.
#### As above, show() is an action that will trigger the execution of the entire computation. 

In [8]:
# Run the California query and print at most 10 rows, untruncated.
ca_count_mnm_df.show(10, False)

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|CA   |Yellow|1807 |
|CA   |Green |1723 |
|CA   |Brown |1718 |
|CA   |Orange|1657 |
|CA   |Red   |1656 |
|CA   |Blue  |1603 |
+-----+------+-----+



In [None]:
# Stop the SparkSession once the notebook no longer needs it; run this
# cell last, since the cells above use `spark` and DataFrames built from it.
spark.stop()