In [1]:
import os
import sys

# Point both PySpark interpreter variables at the Python executable running this kernel
for interpreter_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[interpreter_var] = sys.executable

# Import the basic spark library
from pyspark.sql import SparkSession

# Entry point to the PySpark application: a local-master session named "spark_tutorial",
# reused if one already exists
session_builder = SparkSession.builder.master("local").appName("spark_tutorial")
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/12/06 15:22:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/06 15:22:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/12/06 15:22:54 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType

# Create the schema; every field is declared as StructField(name, type, nullable)
schema = StructType([
    StructField("Pizza Name", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("Ingredients", ArrayType(StringType()), True),
])

# One tuple per menu item: (name, price, list of ingredients)
df_data = [
    ("Margherita", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Basil"]),
    ("Calzone", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Diavola", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Spicy Salame"]),
    ("Prosciutto", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Speck & Brie", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Speck", "Brie"]),
    ("Tonno & Cipolle", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Tuna", "Onions"]),
    ("Fries", 3.95, ["Potatoes"]),
]

df = spark.createDataFrame(data=df_data, schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- Pizza Name: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)



[Stage 0:>                                                          (0 + 1) / 1]

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Margherita     |5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Diavola        |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]    |
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
|Fries          |3.95 |[Potatoes]                                         |
+---------------+-----+---------------------------------------------------+



                                                                                

<h4>Grouping using groupBy</h4>

In [3]:
# Count how many rows share each price
price_counts = df.groupBy("Price").count()
price_counts.show(truncate=False)

+-----+-----+
|Price|count|
+-----+-----+
|7.95 |4    |
|3.95 |1    |
|5.95 |2    |
+-----+-----+



In [6]:
# Minimum price; groupBy() without columns aggregates over the whole DataFrame
overall_min_price = df.groupBy().min("Price")
overall_min_price.show(truncate=False)

+----------+
|min(Price)|
+----------+
|3.95      |
+----------+



In [9]:
# Average price; groupBy() without columns aggregates over the whole DataFrame
overall_avg_price = df.groupBy().avg("Price")
overall_avg_price.show(truncate=False)

+-----------------+
|avg(Price)       |
+-----------------+
|6.807142700467791|
+-----------------+



<h4>Grouping Multiple Columns</h4>

In [11]:
# Let's explode our array to perform more interesting operations
from pyspark.sql.functions import explode, col

# One output row per (pizza, ingredient) pair; the exploded column is named
# "Ingredient" right away instead of renaming the default "col" afterwards
exploded_df = df.select(
    col("Pizza Name"),
    df.Price,
    explode(df.Ingredients).alias("Ingredient"),
)
exploded_df.printSchema()
exploded_df.show(truncate=False)

root
 |-- Pizza Name: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Ingredient: string (nullable = true)

+---------------+-----+-----------------+
|Pizza Name     |Price|Ingredient       |
+---------------+-----+-----------------+
|Margherita     |5.95 |Tomato Sauce     |
|Margherita     |5.95 |Mozzarella Cheese|
|Margherita     |5.95 |Basil            |
|Calzone        |7.95 |Tomato Sauce     |
|Calzone        |7.95 |Mozzarella Cheese|
|Calzone        |7.95 |Prosciutto Cotto |
|Diavola        |5.95 |Tomato Sauce     |
|Diavola        |5.95 |Mozzarella Cheese|
|Diavola        |5.95 |Spicy Salame     |
|Prosciutto     |7.95 |Tomato Sauce     |
|Prosciutto     |7.95 |Mozzarella Cheese|
|Prosciutto     |7.95 |Prosciutto Cotto |
|Speck & Brie   |7.95 |Tomato Sauce     |
|Speck & Brie   |7.95 |Mozzarella Cheese|
|Speck & Brie   |7.95 |Speck            |
|Speck & Brie   |7.95 |Brie             |
|Tonno & Cipolle|7.95 |Tomato Sauce     |
|Tonno & Cipolle|7.95 |Mozzarella 

In [12]:
# Count the rows for every (Ingredient, Price) combination
ingredient_price_counts = exploded_df.groupBy("Ingredient", "Price").count()
ingredient_price_counts.show(truncate=False)

+-----------------+-----+-----+
|Ingredient       |Price|count|
+-----------------+-----+-----+
|Prosciutto Cotto |7.95 |2    |
|Basil            |5.95 |1    |
|Mozzarella Cheese|7.95 |4    |
|Tomato Sauce     |7.95 |4    |
|Tuna             |7.95 |1    |
|Speck            |7.95 |1    |
|Brie             |7.95 |1    |
|Mozzarella Cheese|5.95 |2    |
|Tomato Sauce     |5.95 |2    |
|Spicy Salame     |5.95 |1    |
|Onions           |7.95 |1    |
|Potatoes         |3.95 |1    |
+-----------------+-----+-----+



<h4>Multiple Aggregations</h4>

In [13]:
# Use the functions module through an alias: importing `sum` and `max` by name
# would replace Python's built-in sum() and max() in every later cell.
from pyspark.sql import functions as F
# avg and count do not clash with a builtin and are called unqualified elsewhere in this notebook.
from pyspark.sql.functions import avg, count

# Several aggregations per pizza in a single agg() call; alias() names each result column.
# Each exploded row of a pizza carries that pizza's price, so "Sum Price" adds the
# same price once per ingredient.
exploded_df.groupBy("Pizza Name").agg(
    F.sum("Price").alias("Sum Price"),
    F.avg("Price").alias("Average Price"),
    F.count("Ingredient").alias("Number of Ingredients"),
    F.max("Price").alias("Price"),
).show(truncate=False)

+---------------+------------------+-----------------+---------------------+-----+
|Pizza Name     |Sum Price         |Average Price    |Number of Ingredients|Price|
+---------------+------------------+-----------------+---------------------+-----+
|Calzone        |23.84999942779541 |7.949999809265137|3                    |7.95 |
|Diavola        |17.84999942779541 |5.949999809265137|3                    |5.95 |
|Fries          |3.950000047683716 |3.950000047683716|1                    |3.95 |
|Prosciutto     |23.84999942779541 |7.949999809265137|3                    |7.95 |
|Margherita     |17.84999942779541 |5.949999809265137|3                    |5.95 |
|Speck & Brie   |31.799999237060547|7.949999809265137|4                    |7.95 |
|Tonno & Cipolle|31.799999237060547|7.949999809265137|4                    |7.95 |
+---------------+------------------+-----------------+---------------------+-----+



In [14]:
# Let's keep only the pizzas with at least four ingredients
ingredients_per_pizza = exploded_df.groupBy("Pizza Name").agg(
    count("Ingredient").alias("Number of Ingredients")
)
ingredients_per_pizza.filter(col("Number of Ingredients") >= 4).show(truncate=False)

+---------------+---------------------+
|Pizza Name     |Number of Ingredients|
+---------------+---------------------+
|Speck & Brie   |4                    |
|Tonno & Cipolle|4                    |
+---------------+---------------------+



<h4>Aggregate Functions</h4>

In [15]:
from pyspark.sql.functions import approx_count_distinct

# Approximate count of the distinct values in a column
distinct_ingredient_count = exploded_df.select(approx_count_distinct("Ingredient")).collect()[0][0]
print("Number of different ingredients", distinct_ingredient_count)

23/12/06 15:31:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
Number of different ingredients 10


In [17]:
from pyspark.sql.functions import avg

# Compute the average price over all rows of the menu
average_price = df.select(avg("Price")).collect()[0][0]
print("Average Price: ", average_price)

Average Price:  6.807142700467791


In [18]:
from pyspark.sql.functions import collect_list

# Gather every value of the column into a single list (duplicates are kept)
ingredient_list_row = exploded_df.select(collect_list("Ingredient")).collect()[0]
ingredient_list_row[0]

['Tomato Sauce',
 'Mozzarella Cheese',
 'Basil',
 'Tomato Sauce',
 'Mozzarella Cheese',
 'Prosciutto Cotto',
 'Tomato Sauce',
 'Mozzarella Cheese',
 'Spicy Salame',
 'Tomato Sauce',
 'Mozzarella Cheese',
 'Prosciutto Cotto',
 'Tomato Sauce',
 'Mozzarella Cheese',
 'Speck',
 'Brie',
 'Tomato Sauce',
 'Mozzarella Cheese',
 'Tuna',
 'Onions',
 'Potatoes']

In [19]:
# Same aggregation, displayed as a one-row DataFrame instead of a Python list
all_ingredients_df = exploded_df.select(collect_list("Ingredient"))
all_ingredients_df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|collect_list(Ingredient)                                                                                                                                                                                                                                                                            |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Tomato Sauce, Mozzarella Cheese, Basil, Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto, Tomato Sauce, Mozzarel

In [20]:
from pyspark.sql.functions import collect_set

# Gather the values of the column into a single list with duplicates removed
ingredient_set_row = exploded_df.select(collect_set("Ingredient")).collect()[0]
ingredient_set_row[0]

['Basil',
 'Tuna',
 'Mozzarella Cheese',
 'Brie',
 'Onions',
 'Speck',
 'Prosciutto Cotto',
 'Tomato Sauce',
 'Potatoes',
 'Spicy Salame']

In [21]:
from pyspark.sql.functions import countDistinct

# Count the distinct (Ingredient, Price) combinations.
# alias() names the result column directly; renaming the auto-generated
# "count(DISTINCT Ingredient, Price)" column afterwards silently does nothing
# whenever Spark generates a different name.
exploded_df.select(countDistinct("Ingredient", "Price").alias("count")).show(truncate=False)

+-----+
|count|
+-----+
|12   |
+-----+



In [41]:
from pyspark.sql.functions import first, last

# first() returns the first value it sees in the column; nulls are skipped
# only when ignorenulls=True is passed
df.select(first("Ingredients")).show(truncate=False)

# last() returns the last value it sees, with the same null handling
df.select(last("Ingredients")).show(truncate=False)

# Many more aggregate functions are available, such as avg(), sum(), mean(), variance(), stddev(), etc.

+----------------------------------------+
|first(Ingredients)                      |
+----------------------------------------+
|[Tomato Sauce, Mozzarella Cheese, Basil]|
+----------------------------------------+

+-----------------+
|last(Ingredients)|
+-----------------+
|[Potatoes]       |
+-----------------+

