<h4>Operations</h4>
<p>Spark supports two different types of operations</p>
<ul>
    <li><b>Transformations</b> on RDDs return another RDD as a result (e.g., filter()). They are called lazy operations because Spark does not execute them until an action requires their result.</li>
    <li><b>Actions</b> trigger the actual computation and return values from RDDs (e.g., collect(), count())</li>
</ul>

In [5]:
import os
import sys

# WARNING: both environment variables must be set BEFORE the Spark session is
# created; the original author notes that otherwise nothing works.
# sys.executable is the Python interpreter running this notebook.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Import the basic Spark library
from pyspark.sql import SparkSession

# Entry point to the PySpark application: "local" master, app name "spark_tutorial".
# getOrCreate() reuses an already-running session if there is one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("spark_tutorial")
    .getOrCreate()
)

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType
from pyspark.sql.functions import col

# Create the schema: one StructField(name, dataType, nullable) per column
schema = StructType([
    StructField(name="Pizza Name", dataType=StringType(), nullable=True),
    StructField(name="Price", dataType=FloatType(), nullable=True),
    StructField(name="Ingredients", dataType=ArrayType(StringType()), nullable=True),
])

# One tuple per row, in the same column order as the schema:
# (pizza name, price, list of ingredients)
df_data = [
    ("Margherita", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Basil"]),
    ("Calzone", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Diavola", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Spicy Salame"]),
    ("Prosciutto", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Speck & Brie", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Speck", "Brie"]),
    ("Tonno & Cipolle", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Tuna", "Onions"]),
    ("Fries", 3.95, ["Potatoes"]),
]

# Build the DataFrame with the explicit schema, then inspect its structure and content
df = spark.createDataFrame(df_data, schema)
df.printSchema()
df.show()

root
 |-- Pizza Name: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---------------+-----+--------------------+
|     Pizza Name|Price|         Ingredients|
+---------------+-----+--------------------+
|     Margherita| 5.95|[Tomato Sauce, Mo...|
|        Calzone| 7.95|[Tomato Sauce, Mo...|
|        Diavola| 5.95|[Tomato Sauce, Mo...|
|     Prosciutto| 7.95|[Tomato Sauce, Mo...|
|   Speck & Brie| 7.95|[Tomato Sauce, Mo...|
|Tonno & Cipolle| 7.95|[Tomato Sauce, Mo...|
|          Fries| 3.95|          [Potatoes]|
+---------------+-----+--------------------+



<h4>Filtering operations (i.e., WHERE conditions)</h4>

In [8]:
# Filtering using an equality condition.
# col("<column name>") is the best way to refer to a column.
# NOTE(review): Price is a FloatType column compared against a string literal;
# Spark appears to coerce the literal for the comparison (the 7.95 rows are
# returned). A Python float literal may not compare equal to the stored
# single-precision value - verify before replacing "7.95" with 7.95.
price_is_7_95 = col("Price") == "7.95"
df.filter(price_is_7_95).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+



In [4]:
# Filtering using a "not equal" condition; here the column is accessed as an
# attribute of the DataFrame (df.Price) instead of through col().
price_is_not_7_95 = df.Price != "7.95"
df.filter(price_is_not_7_95).show(truncate=False)

+----------+-----+-----------------------------------------------+
|Pizza Name|Price|Ingredients                                    |
+----------+-----+-----------------------------------------------+
|Margherita|5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]       |
|Diavola   |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]|
|Fries     |3.95 |[Potatoes]                                     |
+----------+-----+-----------------------------------------------+



In [5]:
# Filtering using the col() function (col comes from pyspark.sql.functions)
expensive_only = df.filter(col("Price") == "7.95")
expensive_only.show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+



In [6]:
# Filtering using a SQL expression: the condition is passed to filter() as a string
sql_condition = "Price == '7.95'"
df.filter(sql_condition).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+



In [11]:
# Filtering with multiple conditions, combined with | (logical OR).
# N.B. When the comparisons are written inline, e.g. (a == x) | (b == y), the
# parentheses are essential: in Python, | binds more tightly than ==.
# Naming each condition first makes the grouping explicit.
costs_7_95 = df.Price == "7.95"
is_margherita = col("Pizza Name") == "Margherita"
df.filter(costs_7_95 | is_margherita).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Margherita     |5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+



In [18]:
# Filtering w.r.t. a list of values
favourite_pizzas = ["Speck & Brie", "Tonno & Cipolle"]

# "is in" filtering: keep the rows whose Pizza Name is one of the listed values
df.filter(col("Pizza Name").isin(favourite_pizzas)).show(truncate = False)

# "is not in" filtering: ~ negates a Column condition, which is the idiomatic
# form instead of comparing the condition to the literal False
df.filter(~col("Pizza Name").isin(favourite_pizzas)).show(truncate = False)

+---------------+-----+-----------------------------------------------+
|Pizza Name     |Price|Ingredients                                    |
+---------------+-----+-----------------------------------------------+
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie] |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]|
+---------------+-----+-----------------------------------------------+

+----------+-----+---------------------------------------------------+
|Pizza Name|Price|Ingredients                                        |
+----------+-----+---------------------------------------------------+
|Margherita|5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone   |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Diavola   |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]    |
|Prosciutto|7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Fries     |3.95 |[Potatoes]                                         |

In [19]:
# Filtering w.r.t. a list of values computed from the DataFrame itself

# collect() -> returns the rows of the resulting DataFrame as a list of Row objects
expensive_rows = df.filter(col("Price") == "7.95").select("Pizza Name").collect()

print(expensive_rows)

# Extract the value of the chosen field from each Row.
# A separate name is used so that expensive_pizzas only ever holds plain strings.
expensive_pizzas = [row["Pizza Name"] for row in expensive_rows]

print(expensive_pizzas)

# "is in" filtering
df.filter(col("Pizza Name").isin(expensive_pizzas)).show(truncate = False)

# "is not in" filtering: ~ negates a Column condition, which is the idiomatic
# form instead of comparing the condition to the literal False
df.filter(~col("Pizza Name").isin(expensive_pizzas)).show(truncate = False)

[Row(Pizza Name='Calzone'), Row(Pizza Name='Prosciutto'), Row(Pizza Name='Speck & Brie'), Row(Pizza Name='Tonno & Cipolle')]
['Calzone', 'Prosciutto', 'Speck & Brie', 'Tonno & Cipolle']
+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+

+----------+-----+-----------------------------------------------+
|Pizza Name|Price|Ingredients                                    |
+----------+-----+-----------------------------------------------+
|Mar

In [20]:
# Filtering based on the content of a string column
pizza_name = col("Pizza Name")

content_conditions = (
    pizza_name.startswith("To"),   # value begins with the given letter(s)
    pizza_name.endswith("one"),    # value ends with the given letter(s)
    pizza_name.contains("&"),      # value contains the given substring
)

# Show the rows matching each condition, one table per condition
for condition in content_conditions:
    df.filter(condition).show(truncate=False)

+---------------+-----+-----------------------------------------------+
|Pizza Name     |Price|Ingredients                                    |
+---------------+-----+-----------------------------------------------+
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]|
+---------------+-----+-----------------------------------------------+

+----------+-----+---------------------------------------------------+
|Pizza Name|Price|Ingredients                                        |
+----------+-----+---------------------------------------------------+
|Calzone   |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
+----------+-----+---------------------------------------------------+

+---------------+-----+-----------------------------------------------+
|Pizza Name     |Price|Ingredients                                    |
+---------------+-----+-----------------------------------------------+
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Bri

In [21]:
# Filtering using like (i.e., SQL LIKE): % matches any sequence of characters
df.filter(col("Pizza Name").like("%on%")).show(truncate = False)

# Filtering using rlike (i.e., regular-expression match): a word, " & ", another word.
# [A-Za-z] is used instead of [A-z], because the A-z range also covers the
# characters [ \ ] ^ _ ` that sit between 'Z' and 'a'. The + quantifiers and the
# literal spaces make the pattern require a word on each side of the ampersand;
# with * and no spaces it matched any value that merely contains "&".
df.filter(col("Pizza Name").rlike("[A-Za-z]+ & [A-Za-z]+")).show(truncate = False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+

+---------------+-----+-----------------------------------------------+
|Pizza Name     |Price|Ingredients                                    |
+---------------+-----+-----------------------------------------------+
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie] |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]|
+---------------+-----+-----------------------------------------------+



In [23]:
# Filtering on array columns
from pyspark.sql.functions import array_contains

# Filtering on a single value: keep rows whose Ingredients array contains "Tomato Sauce"
has_tomato_sauce = array_contains(df.Ingredients, "Tomato Sauce")
df.filter(has_tomato_sauce).show(truncate=False)

# Filtering on multiple values: both ingredients must be present (conditions joined with &)
has_basil = array_contains(df.Ingredients, "Basil")
df.filter(has_tomato_sauce & has_basil).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Margherita     |5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Diavola        |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]    |
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+

+----------+-----+----------------------------------------+
|Pizza Name|Price|Ingredients                             |
+----------+-----+----------------------------------------+
|Margherita|5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]|

In [24]:
# Limit the result to 5 rows, then display them without truncating the column values
five_rows = df.limit(5)
five_rows.show(truncate=False)

+------------+-----+---------------------------------------------------+
|Pizza Name  |Price|Ingredients                                        |
+------------+-----+---------------------------------------------------+
|Margherita  |5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Diavola     |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]    |
|Prosciutto  |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie|7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
+------------+-----+---------------------------------------------------+

