# PySpark DataFrames, Schemas, and Data Types

## Import Modules

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, DateType, BooleanType
from pyspark.sql.functions import col, expr, concat_ws, round, year, array_contains, count, desc

from datetime import datetime

## Initiate Spark Session

In [2]:
# Obtain the Spark session for this notebook under the application name below.
spark = (
    SparkSession.builder
    .appName("PySparkDataFrames")
    .getOrCreate()
)

25/03/03 15:34:56 WARN Utils: Your hostname, Cesars-MBP.local resolves to a loopback address: 127.0.0.1; using 172.20.10.3 instead (on interface en0)
25/03/03 15:34:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/03/03 15:34:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Sample person records as a list of 6-field tuples:
# (first name, middle name, last name, id, gender, salary).
# Some records carry empty strings or a -1 salary, presumably as
# placeholders for missing values.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

In [4]:
# Confirm that the raw records are held in a plain Python list.
type(data)

list

## StructType

A `StructType()` is a collection of `StructField()` objects, each of which defines a column's name, its data type, and a boolean flag that specifies whether the field can hold NULL values.

In [5]:
# Explicit schema for the sample person records. Every field is declared
# nullable; only salary is an integer, the rest (including id) are strings.
schema = (
    StructType()
    .add("firstName", StringType(), True)
    .add("middleName", StringType(), True)
    .add("lastName", StringType(), True)
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
)

## Create DataFrame

In [6]:
# Build the DataFrame from the in-memory records using the explicit schema,
# then print the resulting column names and types.
df = spark.createDataFrame(data, schema)
df.printSchema()

root
 |-- firstName: string (nullable = true)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [7]:
# Display the rows without truncating long cell values.
df.show(truncate=False)

                                                                                

+---------+----------+--------+-----+------+------+
|firstName|middleName|lastName|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



## DataFrame Reader and Writer

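The DataFrame Reader is a built-in API, accessed through `spark.read`, that allows one to read various source files (CSV, JSON) and other Big Data file types (Parquet, ORC, and AVRO). Its counterpart, the DataFrame Writer, is accessed through a DataFrame's `write` attribute.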

In [8]:
# Location of the fire-incidents CSV input, relative to this notebook.
file_path = "../../data/input/fire-incidents.csv"

In [9]:
# Read the CSV, treating the first line as column names and letting Spark
# infer each column's type from the data.
fire_incidents_df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

                                                                                

In [10]:
# Project three columns and preview the first 10 rows.
fire_incidents_df.select("IncidentNumber", "IncidentDate", "City").show(10)

+--------------+-------------------+-------------+
|IncidentNumber|       IncidentDate|         City|
+--------------+-------------------+-------------+
|      20104668|2020-09-11 00:00:00|San Francisco|
|      20104708|2020-09-11 00:00:00|San Francisco|
|      20104648|2020-09-10 00:00:00|San Francisco|
|      20104598|2020-09-10 00:00:00|San Francisco|
|      20104575|2020-09-10 00:00:00|San Francisco|
|      20104477|2020-09-10 00:00:00|San Francisco|
|      20104443|2020-09-10 00:00:00|San Francisco|
|      20104605|2020-09-10 00:00:00|San Francisco|
|      20104474|2020-09-10 00:00:00|San Francisco|
|      20104652|2020-09-10 00:00:00|San Francisco|
+--------------+-------------------+-------------+
only showing top 10 rows



***Note***: The `select()` statement performs a *projection*: it keeps only the columns one requires. It is a lazy transformation, so Spark does not process any rows until an action such as `show()` is called.

In [11]:
# Inspect the column types that inferSchema derived from the CSV.
fire_incidents_df.printSchema()

root
 |-- IncidentNumber: integer (nullable = true)
 |-- ExposureNumber: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- IncidentDate: timestamp (nullable = true)
 |-- CallNumber: integer (nullable = true)
 |-- AlarmDtTm: timestamp (nullable = true)
 |-- ArrivalDtTm: timestamp (nullable = true)
 |-- CloseDtTm: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- ZIPCode: string (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- SuppressionUnits: integer (nullable = true)
 |-- SuppressionPersonnel: integer (nullable = true)
 |-- EMSUnits: integer (nullable = true)
 |-- EMSPersonnel: integer (nullable = true)
 |-- OtherUnits: integer (nullable = true)
 |-- OtherPersonnel: integer (nullable = true)
 |-- FirstUnitOnScene: string (nullable = true)
 |-- EstimatedPropertyLoss: integer (nullable = true)
 |-- EstimatedContentsLoss: d

Adding a datetime stamp to the directory name that will contain the collection of PARQUET files.

In [12]:
# Current local time rendered as YYYYMMDD_HHMMSS, for use in a directory name.
date_timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

In [13]:
# Timestamped output directory for this export.
export_path = f"../../data/output/fire_incidents_{date_timestamp}"

# Write the DataFrame as Parquet, replacing the directory if it already exists.
fire_incidents_df.write.mode("overwrite").parquet(export_path)

25/03/03 15:35:18 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 6:>                                                          (0 + 8) / 8]

25/03/03 15:35:19 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

## Structured Operations

### Reading JSON Files

In [14]:
# Explicit schema for the persons JSON file; every field is nullable.
schema_json_persons = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    # fav_movies is an array whose elements are strings.
    .add("fav_movies", ArrayType(StringType()), True)
    .add("salary", FloatType(), True)
    .add("image_url", StringType(), True)
    .add("date_of_birth", DateType(), True)
    .add("active", BooleanType(), True)
)

In [15]:
# Location of the persons JSON input, relative to this notebook.
json_file_path = "../../data/input/persons.json"

In [16]:
# Read the JSON file with the explicit schema. multiLine is enabled so that
# records spanning several lines are parsed.
persons_df = (
    spark.read
    .schema(schema_json_persons)
    .option("multiLine", True)
    .json(json_file_path)
)

In [17]:
# Verify that the DataFrame carries the schema supplied at read time.
persons_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- fav_movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- salary: float (nullable = true)
 |-- image_url: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- active: boolean (nullable = true)



In [18]:
# Preview the first 10 rows without truncating long cell values.
persons_df.show(10, truncate=False)

+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                   |salary |image_url                                      |date_of_birth|active|
+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy    |[I giorni contati]                                           |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|2  |Emelyne   |Blaza    |[Musketeer, The, Topralli]                                   |3006.04|http://dummyimage.com/158x106.bmp/cc0000/ffffff|1991-11-02   |false |
|3  |Max       |Rettie   |[The Forgotten Space, Make It Happen]                        |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|4  

### Columns and Expressions

In [19]:
# Select columns by passing their names as plain strings.
persons_df.select("first_name", "last_name", "date_of_birth").show(5)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|     Drucy|    Poppy|   1991-02-16|
|   Emelyne|    Blaza|   1991-11-02|
|       Max|   Rettie|   1990-03-03|
|    Ilario|     Kean|   1987-06-09|
|     Toddy|   Drexel|   1992-10-28|
+----------+---------+-------------+
only showing top 5 rows



In [20]:
# Select the same columns, this time as explicit Column objects built with col().
persons_df.select(col("first_name"), col("last_name"), col("date_of_birth")).show(5)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|     Drucy|    Poppy|   1991-02-16|
|   Emelyne|    Blaza|   1991-11-02|
|       Max|   Rettie|   1990-03-03|
|    Ilario|     Kean|   1987-06-09|
|     Toddy|   Drexel|   1992-10-28|
+----------+---------+-------------+
only showing top 5 rows



***Note***: There is no noticeable difference between the two outputs, but there is a difference:
- In the first example, one passes the column names as strings. Spark converts these strings into column objects, which is concise and works well for simple column selections.
- By using the `col()` function, one explicitly creates a column object for each specified column name, which is more flexible when wanting to perform further transformations on the column.

In [21]:
# Join first and last name into full_name, and derive salary_increase
# (salary plus 10%) with arithmetic on Column objects.
persons_df.select(
    concat_ws(" ", "first_name", "last_name").alias("full_name"),
    "salary",
    (col("salary") * 0.10 + col("salary")).alias("salary_increase"),
).show(10)

+----------------+-------+------------------+
|       full_name| salary|   salary_increase|
+----------------+-------+------------------+
|     Drucy Poppy|1463.36|1609.6959838867188|
|   Emelyne Blaza|3006.04|  3306.64404296875|
|      Max Rettie|1422.88|1565.1680053710938|
|     Ilario Kean|3561.36|3917.4961181640624|
|    Toddy Drexel|4934.87|  5428.35712890625|
| Oswald Petrolli|1153.23| 1268.552978515625|
|   Adrian Clarey|1044.73| 1149.202978515625|
|Dominica Goodnow|1147.76|1262.5360107421875|
|   Emory Slocomb|1082.11|1190.3209838867188|
|   Jeremias Bode|3472.63|  3819.89287109375|
+----------------+-------+------------------+
only showing top 10 rows



In [22]:
# Same projection, but salary_increase (salary plus 10%) is written as a
# SQL expression string passed to expr().
persons_df.select(
    concat_ws(" ", "first_name", "last_name").alias("full_name"),
    "salary",
    expr("salary * 0.10 + salary").alias("salary_increase"),
).show(10)

+----------------+-------+------------------+
|       full_name| salary|   salary_increase|
+----------------+-------+------------------+
|     Drucy Poppy|1463.36|1609.6959838867188|
|   Emelyne Blaza|3006.04|  3306.64404296875|
|      Max Rettie|1422.88|1565.1680053710938|
|     Ilario Kean|3561.36|3917.4961181640624|
|    Toddy Drexel|4934.87|  5428.35712890625|
| Oswald Petrolli|1153.23| 1268.552978515625|
|   Adrian Clarey|1044.73| 1149.202978515625|
|Dominica Goodnow|1147.76|1262.5360107421875|
|   Emory Slocomb|1082.11|1190.3209838867188|
|   Jeremias Bode|3472.63|  3819.89287109375|
+----------------+-------+------------------+
only showing top 10 rows



***Note***: The outputs are identical; the two approaches differ only in how the `salary_increase` column is computed.
- In both examples, `concat_ws()` concatenates two columns into a new feature, another feature is added based on a calculation on the `salary` column, and `alias()` gives each added feature its name.
- In the first example, the calculation is built from column objects and Python operators: `col("salary") * 0.10 + col("salary")`.
- In the second example, `expr()` takes the same calculation as a SQL expression string, which makes the code cleaner and the transformation easier to apply.

In [23]:
# Same projection, with salary_increase rounded to two decimal places.
# `round` here is the function imported from pyspark.sql.functions,
# not Python's builtin.
persons_df.select(
    concat_ws(" ", "first_name", "last_name").alias("full_name"),
    "salary",
    round(expr("salary * 0.10 + salary"), 2).alias("salary_increase"),
).show(10)

+----------------+-------+---------------+
|       full_name| salary|salary_increase|
+----------------+-------+---------------+
|     Drucy Poppy|1463.36|         1609.7|
|   Emelyne Blaza|3006.04|        3306.64|
|      Max Rettie|1422.88|        1565.17|
|     Ilario Kean|3561.36|         3917.5|
|    Toddy Drexel|4934.87|        5428.36|
| Oswald Petrolli|1153.23|        1268.55|
|   Adrian Clarey|1044.73|         1149.2|
|Dominica Goodnow|1147.76|        1262.54|
|   Emory Slocomb|1082.11|        1190.32|
|   Jeremias Bode|3472.63|        3819.89|
+----------------+-------+---------------+
only showing top 10 rows



***Note***: Rounding is applied to the `salary_increase` column. This is not necessary for data engineering purposes, but it is useful for data analysis.

### Filter and Where Condition

In [24]:
# Keep rows matching a SQL-style condition string, using filter().
persons_df.filter("salary > 3000").show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|
| 10|  Jeremias|     Bode|[Farewell to Arms...|3472.63|http://dummyimage...|   1997-08-02|  true|
| 14|   Ambrosi| Vidineev|[Wall Street: Mon...|4550.88|http://dummyimage...|   1989-07-20|  true|
| 18|     Alfie| Hatliffe|     [Lord of Tears]| 3893.1|http://dummyimage...|   1989-06-21|  true|
| 19|      Lura|   Follis|[My Life in Pink ...|3331.26|http://dummyimage...|   1998-11-03| false|
| 20|      Maxi|    

In [25]:
# Same condition as the filter() cell, expressed with where().
persons_df.where("salary > 3000").show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|
| 10|  Jeremias|     Bode|[Farewell to Arms...|3472.63|http://dummyimage...|   1997-08-02|  true|
| 14|   Ambrosi| Vidineev|[Wall Street: Mon...|4550.88|http://dummyimage...|   1989-07-20|  true|
| 18|     Alfie| Hatliffe|     [Lord of Tears]| 3893.1|http://dummyimage...|   1989-06-21|  true|
| 19|      Lura|   Follis|[My Life in Pink ...|3331.26|http://dummyimage...|   1998-11-03| false|
| 20|      Maxi|    

In [26]:
# Combine two Column conditions with &: salary of at most 3000 and an active
# record. `active` is already a boolean column, so it is used directly as the
# predicate rather than being compared against True.
persons_df.where((col("salary") <= 3000) & col("active")).show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  9|     Emory|  Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 16|   Margaux| Archbold|[And Now a Word f...|1013.75|http://dummyimage...|   1988-07-29|  true|
| 26|     Clive|      Lax|             [Rabid]|2126.87|http://dummyimage...|   1981-10-26|  true|
| 33|  Sherline|  Primett|   [Jungle Fighters]|2309.39|http://dummyimage...|   1972-07-23|  true|
| 34|     Davis|    Pinks|          [Hounddog]|1337.14|http://dummyimage...|   1989-07-27|  true|
| 37|    Carlen|  Sharply|[Dr. Jekyll and M...|2051.85|http://dummyimage...|   2002-06-01|  true|
| 40|    Jordan|   L

In [27]:
# Keep the people whose birth year is either 2000 or 1989.
persons_df.filter(year("date_of_birth").isin(2000, 1989)).show(10)

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| 14|   Ambrosi|   Vidineev|[Wall Street: Mon...|4550.88|http://dummyimage...|   1989-07-20|  true|
| 15|    Feodor|Nancekivell|   [Monsoon Wedding]|2218.46|http://dummyimage...|   2000-10-07| false|
| 18|     Alfie|   Hatliffe|     [Lord of Tears]| 3893.1|http://dummyimage...|   1989-06-21|  true|
| 25|     Kelcy|     Wogdon|    [Iron Mask, The]|4512.51|http://dummyimage...|   2000-10-20|  true|
| 32|      Redd|   Akenhead|[Century of the D...| 2470.9|http://dummyimage...|   2000-06-05| false|
| 34|     Davis|      Pinks|          [Hounddog]|1337.14|http://dummyimage...|   1989-07-27|  true|
| 61|    Shanna|    Samples|[Thomas in Love (...| 2703.0|http://dummyimage...|   1989-07-07| false|


In [28]:
# Keep the rows whose fav_movies array contains the given title.
persons_df.where(
    array_contains(col("fav_movies"), "Land of the Lost")
).show()

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| 11|   Timothy|   Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+



### Distinct, Drop Duplicates, and Order By

In [29]:
# Preview the raw values of the active column (duplicates included).
persons_df.select("active").show(10)

+------+
|active|
+------+
|  true|
| false|
| false|
|  true|
|  true|
| false|
| false|
| false|
|  true|
|  true|
+------+
only showing top 10 rows



In [30]:
# Reduce the active column to its distinct values.
persons_df.select("active").distinct().show()

+------+
|active|
+------+
|  true|
| false|
+------+



In [31]:
# Project first name, birth year and active flag, then sort by year and,
# within a year, by first name.
persons_df.select(
    "first_name",
    year("date_of_birth").alias("year"),
    "active",
).orderBy("year", "first_name").show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|       Sky|1971| false|
|   Timothy|1971| false|
|    Lucita|1972|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|     Toddy|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
+----------+----+------+
only showing top 10 rows



In [32]:
# Keep a single row per (year, active) pair, then sort by year and first name.
# NOTE(review): nothing here controls which first_name survives for each
# pair — confirm whether a specific row is expected.
persons_dropped_df = (
    persons_df
    .select("first_name", year("date_of_birth").alias("year"), "active")
    .dropDuplicates(["year", "active"])
    .orderBy("year", "first_name")
)

In [33]:
# Display the de-duplicated result.
persons_dropped_df.show()

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
|   Balduin|1974| false|
|     Emory|1974|  true|
|    Janean|1975|  true|
|       Bev|1976|  true|
| Franciska|1976| false|
|     Johny|1977| false|
|    Daveta|1978| false|
|   Guthrie|1978|  true|
|      Maxi|1979| false|
|   Melinda|1979|  true|
|    Carter|1980| false|
|   Loralyn|1980|  true|
|     Clive|1981|  true|
|   Leanora|1981| false|
+----------+----+------+
only showing top 20 rows



In [34]:
# Same projection, sorted from the most recent birth year to the oldest.
persons_df.select(
    "first_name",
    year("date_of_birth").alias("year"),
    "active",
).orderBy(col("year").desc()).show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|     Daron|2002|  true|
|    Virgie|2002|  true|
|    Carlen|2002|  true|
|   Lorilee|2002| false|
|    Maxine|2001| false|
|    Feodor|2000| false|
|     Kelcy|2000|  true|
|  Annabell|2000|  true|
|      Redd|2000| false|
|     Jobie|2000| false|
+----------+----+------+
only showing top 10 rows



### Rows and Unions

In [35]:
# A single positional Row; the values follow the column order of the persons
# schema. The date of birth is supplied as a plain string here.
person_row = Row(
    101,
    "Juan",
    "Jimenez",
    ["The Godfather", "Gladiator"],
    45900.08,
    "http://dummyimage.com/126x166.png/cc0000/fgfgfg",
    "1980-03-01",
    True,
)

In [36]:
# Confirm the object is a pyspark Row.
type(person_row)

pyspark.sql.types.Row

In [37]:
# Show the Row's printed representation.
print(person_row)

<Row(101, 'Juan', 'Jimenez', ['The Godfather', 'Gladiator'], 45900.08, 'http://dummyimage.com/126x166.png/cc0000/fgfgfg', '1980-03-01', True)>


In [38]:
# Rows support positional indexing; index 1 is the second field.
person_row[1]

'Juan'

In [39]:
# Two more positional Rows with the same field order as person_row.
persons_row_list = [
    Row(
        102,
        "Antonio",
        "Dominguez",
        ["Goodfellas", "Heat"],
        48900.08,
        "http://dummyimage.com/144x144.png/cc0000/fgfgfg",
        "1990-05-31",
        True,
    ),
    Row(
        103,
        "Felix",
        "Caro",
        ["Bronx Tale", "The Matrix"],
        54490.08,
        "http://dummyimage.com/96x144.png/cc0000/fgfgfg",
        "2000-11-04",
        False,
    ),
]

In [40]:
# The collection of Rows is an ordinary Python list.
type(persons_row_list)

list

In [41]:
# Add the single Row to the list in place.
# Note: re-running this cell appends the same Row again.
persons_row_list.append(person_row)

In [42]:
# Print the list to confirm the appended Row is now the last element.
print(persons_row_list)

[<Row(102, 'Antonio', 'Dominguez', ['Goodfellas', 'Heat'], 48900.08, 'http://dummyimage.com/144x144.png/cc0000/fgfgfg', '1990-05-31', True)>, <Row(103, 'Felix', 'Caro', ['Bronx Tale', 'The Matrix'], 54490.08, 'http://dummyimage.com/96x144.png/cc0000/fgfgfg', '2000-11-04', False)>, <Row(101, 'Juan', 'Jimenez', ['The Godfather', 'Gladiator'], 45900.08, 'http://dummyimage.com/126x166.png/cc0000/fgfgfg', '1980-03-01', True)>]


### Add, Rename, and Drop Columns

In [43]:
# Add a salary_increase column (salary plus 10%) and preview the result.
augment_persons_df = persons_df.withColumn(
    "salary_increase",
    expr("salary * 0.10 + salary"),
)
augment_persons_df.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+------------------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|   salary_increase|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+------------------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|1609.6959838867188|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|  3306.64404296875|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|1565.1680053710938|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|3917.4961181640624|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|  5428.35712890625|
|  6|    Oswald| Petrolli|[Wing and the Thi...|1153.23|http://du

In [44]:
# List the column names, including the newly added salary_increase.
augment_persons_df.columns

['id',
 'first_name',
 'last_name',
 'fav_movies',
 'salary',
 'image_url',
 'date_of_birth',
 'active',
 'salary_increase']

In [46]:
# Add birth_year, rename fav_movies to movies, store the rounded
# salary_increase under a new name, and drop the unrounded column.
# NOTE(review): salary_x10 holds salary plus 10% rounded to two decimals,
# not salary times ten — consider a clearer column name.
augment_persons_df_2 = (
    augment_persons_df
    .withColumn("birth_year", year(col("date_of_birth")))
    .withColumnRenamed("fav_movies", "movies")
    .withColumn("salary_x10", round(col("salary_increase"), 2))
    .drop("salary_increase")
)

In [47]:
# Preview the reshaped DataFrame.
augment_persons_df_2.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
| id|first_name|last_name|              movies| salary|           image_url|date_of_birth|active|birth_year|salary_x10|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|      1991|    1609.7|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|      1991|   3306.64|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|      1990|   1565.17|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|      1987|    3917.5|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|      1992|   5428.36|
|  6|    Oswald| Petrolli|[Wing and the 

### Missing/Bad Data

In [48]:
# Three-field Rows with None in various positions; only the last Row has
# every field populated, and the first has none.
bad_movies_list = [Row(None, None, None),
                   Row(None, None, 2020),
                   Row("John Doe", "Awesome Movie", None),
                   Row(None, "Awesome Movie", 2021),
                   Row("Mary Jane", None, 2019),
                   Row("Vikter Duplaix", "Not another teen movie", 2001)]

In [49]:
# Column names, in the same order as the fields of each Row.
bad_movies_list_columns = ["actor_name", "movie_title", "produced_year"]

In [50]:
# Build the DataFrame from the Rows; only column names are supplied, so the
# column types are left for Spark to determine from the data.
bad_movies_df = spark.createDataFrame(
    bad_movies_list,
    bad_movies_list_columns,
)

In [51]:
# Display all rows, including those with null values.
bad_movies_df.show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|          null|                null|         null|
|          null|                null|         2020|
|      John Doe|       Awesome Movie|         null|
|          null|       Awesome Movie|         2021|
|     Mary Jane|                null|         2019|
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



In [52]:
# Drop rows containing nulls, using na.drop() with its default arguments.
bad_movies_df.na.drop().show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



***Note***: Called with no arguments, the `na.drop()` function drops every record that contains a NULL value in at least one column (the default is `how="any"`).

In [55]:
# Drop every row that has a null in at least one column.
bad_movies_df.na.drop(how="any").show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



In [56]:
# Drop only the rows in which every column is null.
bad_movies_df.na.drop(how="all").show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|          null|                null|         2020|
|      John Doe|       Awesome Movie|         null|
|          null|       Awesome Movie|         2021|
|     Mary Jane|                null|         2019|
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



***Note***: The `na.drop()` function has specific features when handling missing values
- `na.drop("any")` will drop records with missing values from any column
- `na.drop("all")` will drop only the records whose values are missing in every column

In [57]:
# Keep only the rows whose actor_name is present. isNotNull() states the
# intent directly instead of comparing the isNull() flag against True.
bad_movies_df.filter(col("actor_name").isNotNull()).show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|      John Doe|       Awesome Movie|         null|
|     Mary Jane|                null|         2019|
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



***Note***: Filters out rows where the `actor_name` column is NULL from the DataFrame.

In [58]:
# Keep only the rows whose actor_name is missing. isNull() already yields a
# boolean column, so it needs no comparison against False.
bad_movies_df.filter(col("actor_name").isNull()).show()

+----------+-------------+-------------+
|actor_name|  movie_title|produced_year|
+----------+-------------+-------------+
|      null|         null|         null|
|      null|         null|         2020|
|      null|Awesome Movie|         2021|
+----------+-------------+-------------+



***Note***: Filters out rows where the `actor_name` column is not NULL from the DataFrame.

In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column.
bad_movies_df.describe().show()

+-------+--------------+--------------------+-----------------+
|summary|    actor_name|         movie_title|    produced_year|
+-------+--------------+--------------------+-----------------+
|  count|             3|                   3|                4|
|   mean|          null|                null|          2015.25|
| stddev|          null|                null|9.535023160258536|
|    min|      John Doe|       Awesome Movie|             2001|
|    max|Vikter Duplaix|Not another teen ...|             2021|
+-------+--------------+--------------------+-----------------+



In [63]:
# Summary statistics restricted to the produced_year column.
bad_movies_df.describe("produced_year").show()

+-------+-----------------+
|summary|    produced_year|
+-------+-----------------+
|  count|                4|
|   mean|          2015.25|
| stddev|9.535023160258536|
|    min|             2001|
|    max|             2021|
+-------+-----------------+



***Note***: The `describe()` function will list the basic statistical summary of the DataFrame or specified column.

### User Defined Functions