In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [2]:
spark = (SparkSession.builder.appName('PySparkDfPractice').getOrCreate())

In [3]:
data = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

In [4]:
schema = StructType([
    StructField("firstName", StringType(), True),
    StructField("middleName", StringType(), True),
    StructField("lastName", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

In [5]:
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()

root
 |-- firstName: string (nullable = true)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [6]:
df.show(truncate=False)

+---------+----------+--------+-----+------+------+
|firstName|middleName|lastName|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



In [7]:
file_path = './data/fire-incidents/fire-incidents.csv'

In [8]:
fire_df = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load(file_path)
)

In [9]:
fire_df.select('IncidentNumber', 'IncidentDate', 'City').show(10)

+--------------+-------------------+-------------+
|IncidentNumber|       IncidentDate|         City|
+--------------+-------------------+-------------+
|      20104668|2020-09-11 00:00:00|San Francisco|
|      20104708|2020-09-11 00:00:00|San Francisco|
|      20104648|2020-09-10 00:00:00|San Francisco|
|      20104598|2020-09-10 00:00:00|San Francisco|
|      20104575|2020-09-10 00:00:00|San Francisco|
|      20104477|2020-09-10 00:00:00|San Francisco|
|      20104443|2020-09-10 00:00:00|San Francisco|
|      20104605|2020-09-10 00:00:00|San Francisco|
|      20104474|2020-09-10 00:00:00|San Francisco|
|      20104652|2020-09-10 00:00:00|San Francisco|
+--------------+-------------------+-------------+
only showing top 10 rows



In [10]:
fire_df.printSchema()

root
 |-- IncidentNumber: integer (nullable = true)
 |-- ExposureNumber: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- IncidentDate: timestamp (nullable = true)
 |-- CallNumber: integer (nullable = true)
 |-- AlarmDtTm: timestamp (nullable = true)
 |-- ArrivalDtTm: timestamp (nullable = true)
 |-- CloseDtTm: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- ZIPCode: string (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- SuppressionUnits: integer (nullable = true)
 |-- SuppressionPersonnel: integer (nullable = true)
 |-- EMSUnits: integer (nullable = true)
 |-- EMSPersonnel: integer (nullable = true)
 |-- OtherUnits: integer (nullable = true)
 |-- OtherPersonnel: integer (nullable = true)
 |-- FirstUnitOnScene: string (nullable = true)
 |-- EstimatedPropertyLoss: integer (nullable = true)
 |-- EstimatedContentsLoss: d

In [11]:
fire_df.columns

['IncidentNumber',
 'ExposureNumber',
 'ID',
 'Address',
 'IncidentDate',
 'CallNumber',
 'AlarmDtTm',
 'ArrivalDtTm',
 'CloseDtTm',
 'City',
 'ZIPCode',
 'Battalion',
 'StationArea',
 'Box',
 'SuppressionUnits',
 'SuppressionPersonnel',
 'EMSUnits',
 'EMSPersonnel',
 'OtherUnits',
 'OtherPersonnel',
 'FirstUnitOnScene',
 'EstimatedPropertyLoss',
 'EstimatedContentsLoss',
 'FireFatalities',
 'FireInjuries',
 'CivilianFatalities',
 'CivilianInjuries',
 'NumberofAlarms',
 'PrimarySituation',
 'MutualAid',
 'ActionTakenPrimary',
 'ActionTakenSecondary',
 'ActionTakenOther',
 'DetectorAlertedOccupants',
 'PropertyUse',
 'AreaofFireOrigin',
 'IgnitionCause',
 'IgnitionFactorPrimary',
 'IgnitionFactorSecondary',
 'HeatSource',
 'ItemFirstIgnited',
 'HumanFactorsAssociatedwithIgnition',
 'StructureType',
 'StructureStatus',
 'FloorofFireOrigin',
 'FireSpread',
 'NoFlameSpead',
 'Numberoffloorswithminimumdamage',
 'Numberoffloorswithsignificantdamage',
 'Numberoffloorswithheavydamage',
 'Numbe

In [12]:
output_path = './data/output/fireincidents'
fire_df.write.format('parquet').mode('overwrite').save(output_path)

# Working with Structured Operations

## Reading a JSON file

In [13]:
from pyspark.sql.types import ArrayType, FloatType, DateType, BooleanType

In [14]:
persons_schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('first_name', StringType(), True),
    StructField('last_name', StringType(), True),
    StructField('fav_movies', ArrayType(StringType()), True),
    StructField('salary', FloatType(), True),
    StructField('image_url', StringType(), True),
    StructField('date_of_birth', DateType(), True),
    StructField('active', BooleanType(), True),
])

In [15]:
json_file_path = './data/persons.json'
persons_df = (
    spark.read.json(json_file_path, persons_schema, multiLine='True')
)

In [16]:
persons_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- fav_movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- salary: float (nullable = true)
 |-- image_url: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- active: boolean (nullable = true)



In [17]:
persons_df.show(10, truncate=10)

+---+----------+---------+----------+-------+----------+-------------+------+
| id|first_name|last_name|fav_movies| salary| image_url|date_of_birth|active|
+---+----------+---------+----------+-------+----------+-------------+------+
|  1|     Drucy|    Poppy|[I gior...|1463.36|http://...|   1991-02-16|  true|
|  2|   Emelyne|    Blaza|[Musket...|3006.04|http://...|   1991-11-02| false|
|  3|       Max|   Rettie|[The Fo...|1422.88|http://...|   1990-03-03| false|
|  4|    Ilario|     Kean|[Up Clo...|3561.36|http://...|   1987-06-09|  true|
|  5|     Toddy|   Drexel|[Walk i...|4934.87|http://...|   1992-10-28|  true|
|  6|    Oswald| Petrolli|[Wing a...|1153.23|http://...|   1986-09-02| false|
|  7|    Adrian|   Clarey|[Walkin...|1044.73|http://...|   1971-08-24| false|
|  8|  Dominica|  Goodnow|[Hearts...|1147.76|http://...|   1973-08-27| false|
|  9|     Emory|  Slocomb|[Snake ...|1082.11|http://...|   1974-06-08|  true|
| 10|  Jeremias|     Bode|[Farewe...|3472.63|http://...|   1997-

# Columns and Expressions

In [18]:
from pyspark.sql.functions import col, expr

In [19]:
persons_df.select(col('first_name'), col('last_name'), col('date_of_birth')).show(5)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|     Drucy|    Poppy|   1991-02-16|
|   Emelyne|    Blaza|   1991-11-02|
|       Max|   Rettie|   1990-03-03|
|    Ilario|     Kean|   1987-06-09|
|     Toddy|   Drexel|   1992-10-28|
+----------+---------+-------------+
only showing top 5 rows



In [20]:
persons_df.select(col('first_name'), expr('last_name'), expr('date_of_birth')).show(5)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|     Drucy|    Poppy|   1991-02-16|
|   Emelyne|    Blaza|   1991-11-02|
|       Max|   Rettie|   1990-03-03|
|    Ilario|     Kean|   1987-06-09|
|     Toddy|   Drexel|   1992-10-28|
+----------+---------+-------------+
only showing top 5 rows



In [21]:
from pyspark.sql.functions import concat_ws

In [22]:
(
    persons_df.
    select(concat_ws(' ', col('first_name'), col('last_name')).alias('full_name'), 
           col('salary'),
           expr('salary * 0.10 + salary').alias('salary_increase'))
).show(10)

+----------------+-------+------------------+
|       full_name| salary|   salary_increase|
+----------------+-------+------------------+
|     Drucy Poppy|1463.36|1609.6959838867188|
|   Emelyne Blaza|3006.04|  3306.64404296875|
|      Max Rettie|1422.88|1565.1680053710938|
|     Ilario Kean|3561.36|3917.4961181640624|
|    Toddy Drexel|4934.87|  5428.35712890625|
| Oswald Petrolli|1153.23| 1268.552978515625|
|   Adrian Clarey|1044.73| 1149.202978515625|
|Dominica Goodnow|1147.76|1262.5360107421875|
|   Emory Slocomb|1082.11|1190.3209838867188|
|   Jeremias Bode|3472.63|  3819.89287109375|
+----------------+-------+------------------+
only showing top 10 rows



## Filter and where condition

In [23]:
persons_df.filter('salary <= 3000').show(3)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|
|  6|    Oswald| Petrolli|[Wing and the Thi...|1153.23|http://dummyimage...|   1986-09-02| false|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
only showing top 3 rows



In [24]:
persons_df.where('salary <= 3000').show()

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|      Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  3|       Max|     Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|
|  6|    Oswald|   Petrolli|[Wing and the Thi...|1153.23|http://dummyimage...|   1986-09-02| false|
|  7|    Adrian|     Clarey|[Walking Tall, Pa...|1044.73|http://dummyimage...|   1971-08-24| false|
|  8|  Dominica|    Goodnow|    [Hearts Divided]|1147.76|http://dummyimage...|   1973-08-27| false|
|  9|     Emory|    Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 11|   Timothy|     Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|


In [25]:
persons_df.where((col('salary') <= 3000) & (col('active') == True)).show()

+---+----------+----------+--------------------+-------+--------------------+-------------+------+
| id|first_name| last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+----------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|     Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  9|     Emory|   Slocomb|[Snake and Crane ...|1082.11|http://dummyimage...|   1974-06-08|  true|
| 16|   Margaux|  Archbold|[And Now a Word f...|1013.75|http://dummyimage...|   1988-07-29|  true|
| 26|     Clive|       Lax|             [Rabid]|2126.87|http://dummyimage...|   1981-10-26|  true|
| 33|  Sherline|   Primett|   [Jungle Fighters]|2309.39|http://dummyimage...|   1972-07-23|  true|
| 34|     Davis|     Pinks|          [Hounddog]|1337.14|http://dummyimage...|   1989-07-27|  true|
| 37|    Carlen|   Sharply|[Dr. Jekyll and M...|2051.85|http://dummyimage...|   2002-06-01|  true|
| 40|    J

In [26]:
from pyspark.sql.functions import year

In [27]:
persons_df.filter((year('date_of_birth') == 2000) | (year('date_of_birth') == 1989)).show()

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| 14|   Ambrosi|   Vidineev|[Wall Street: Mon...|4550.88|http://dummyimage...|   1989-07-20|  true|
| 15|    Feodor|Nancekivell|   [Monsoon Wedding]|2218.46|http://dummyimage...|   2000-10-07| false|
| 18|     Alfie|   Hatliffe|     [Lord of Tears]| 3893.1|http://dummyimage...|   1989-06-21|  true|
| 25|     Kelcy|     Wogdon|    [Iron Mask, The]|4512.51|http://dummyimage...|   2000-10-20|  true|
| 32|      Redd|   Akenhead|[Century of the D...| 2470.9|http://dummyimage...|   2000-06-05| false|
| 34|     Davis|      Pinks|          [Hounddog]|1337.14|http://dummyimage...|   1989-07-27|  true|
| 61|    Shanna|    Samples|[Thomas in Love (...| 2703.0|http://dummyimage...|   1989-07-07| false|


In [28]:
from pyspark.sql.functions import array_contains

In [29]:
persons_df.where(array_contains(persons_df.fav_movies, 'Land of the Lost')).show()

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| 11|   Timothy|   Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+



## Distinct, Drop duplicates, Order by

In [30]:
from pyspark.sql.functions import count, desc

In [31]:
persons_df.select('active').distinct().show()

+------+
|active|
+------+
|  true|
| false|
+------+



In [32]:
(
    persons_df
    .select(
        col('first_name'), year(col('date_of_birth')).alias('year'), col('active'))
    .orderBy('year', 'first_name')
    .show()
)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|       Sky|1971| false|
|   Timothy|1971| false|
|    Lucita|1972|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|     Toddy|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
|  Wolfgang|1973|  true|
|   Balduin|1974| false|
|     Emory|1974|  true|
|    Norean|1974|  true|
|    Janean|1975|  true|
|       Bev|1976|  true|
| Franciska|1976| false|
|    Bennie|1977| false|
|     Johny|1977| false|
|    Daveta|1978| false|
+----------+----+------+
only showing top 20 rows



In [33]:
dropped_df = (
    persons_df
    .select(
        col('first_name'),
        year(col('date_of_birth')).alias('year'),
        col('active')
    )
    .dropDuplicates(['year', 'active'])
    .orderBy('year', 'first_name')
)

In [34]:
dropped_df.show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
|   Balduin|1974| false|
|     Emory|1974|  true|
|    Janean|1975|  true|
|       Bev|1976|  true|
+----------+----+------+
only showing top 10 rows



In [35]:
(
    persons_df
    .select(
        col('first_name'), year(col('date_of_birth')).alias('year'), col('active'))
    .orderBy('year', ascending=False)
    .show()
)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|   Lorilee|2002| false|
|    Virgie|2002|  true|
|    Carlen|2002|  true|
|     Daron|2002|  true|
|    Maxine|2001| false|
|    Feodor|2000| false|
|  Annabell|2000|  true|
|     Kelcy|2000|  true|
|     Jobie|2000| false|
|      Redd|2000| false|
|  Theodore|1999| false|
| Kendricks|1999|  true|
|    Cecily|1999|  true|
|      Jere|1999| false|
|  Elianora|1999| false|
|     Deina|1999|  true|
|      Jere|1998| false|
|    Wilden|1998| false|
|      Rudy|1998|  true|
|    Eugine|1998| false|
+----------+----+------+
only showing top 20 rows



## Unions and rows

In [36]:
from pyspark.sql import Row

In [37]:
persons_row = Row(101, 'Breno', 'Gomes', ['Arrive', 'Parasite'], 4300.54, 'github.com/BrenoShelby', '20-08-2001', True)

In [38]:
persons_rows_list = [
    Row(101, 'Breno', 'Gomes', ['Arrive', 'Parasite'], 4300.54, 'github.com/BrenoShelby', '20-08-2001', True),
    Row(103, 'Juliana', 'Gomes', ['Arrive', 'Parasite'], 4300.54, 'github.com/BrenoShelby', '13-06-2000', True)
]

In [39]:
news_persons_df = spark.createDataFrame(
    persons_rows_list, ['id', 'first_name', 'last_name', 'fav_movies', 'salary', 'url', 'date_of_birth', 'active']
)

In [40]:
news_persons_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- fav_movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- salary: double (nullable = true)
 |-- url: string (nullable = true)
 |-- date_of_birth: string (nullable = true)
 |-- active: boolean (nullable = true)



In [41]:
# desc means descending order
union_persons_df = persons_df.union(news_persons_df)
union_persons_df.sort(desc('id')).show(10)

+---+----------+---------+--------------------+------------------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies|            salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+------------------+--------------------+-------------+------+
|103|   Juliana|    Gomes|  [Arrive, Parasite]|           4300.54|github.com/BrenoS...|   13-06-2000|  true|
|101|     Breno|    Gomes|  [Arrive, Parasite]|           4300.54|github.com/BrenoS...|   20-08-2001|  true|
|100|    Virgie| Domanski|[Horseman, The, S...| 2165.929931640625|http://dummyimage...|   2002-01-05|  true|
| 99|   Rozalie|   Wannop|[Suddenly, The No...|1259.6400146484375|http://dummyimage...|   1997-03-25| false|
| 98|     Davin|     Labb|[Viva Riva!, Kill...| 1452.739990234375|http://dummyimage...|   1988-01-27|  true|
| 97|      Rodi|   Farnan|[Code, The (Menta...|   2325.8798828125|http://dummyimage...|   1972-01-04| false|
| 96|       Dew| Co

## Adding, Renaming and Dropping columns

In [42]:
from pyspark.sql.functions import round

In [43]:
aug_persons_df1 = persons_df.withColumn('salary_increase', expr('salary * .1 + salary'))
aug_persons_df1.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+------------------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|   salary_increase|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+------------------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|1609.6959838867188|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|  3306.64404296875|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|1565.1680053710938|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|3917.4961181640624|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|  5428.35712890625|
|  6|    Oswald| Petrolli|[Wing and the Thi...|1153.23|http://du

In [44]:
aug_persons_df1.columns

['id',
 'first_name',
 'last_name',
 'fav_movies',
 'salary',
 'image_url',
 'date_of_birth',
 'active',
 'salary_increase']

In [45]:
aug_persons_df2 = (
    aug_persons_df1
    .withColumn('birth_year', year('date_of_birth'))
    .withColumnRenamed('fav_movies', 'movies')
    .withColumn('salary_x10', round(col('salary_increase'), 2))
    .drop('salary_increase')
) 

In [46]:
aug_persons_df2.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
| id|first_name|last_name|              movies| salary|           image_url|date_of_birth|active|birth_year|salary_x10|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|      1991|    1609.7|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|      1991|   3306.64|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|      1990|   1565.17|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|      1987|    3917.5|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|      1992|   5428.36|
|  6|    Oswald| Petrolli|[Wing and the 

## Working with missing or bad data

In [47]:
bad_movies_list = [Row(None, None, None),
                   Row(None, None, 2020),
                   Row("John Doe", "Awesome Movie", None),
                   Row(None, "Awesome Movie", 2021),
                   Row("Mary Jane", None, 2019),
                   Row("Vikter Duplaix", "Not another teen movie", 2001)]

In [48]:
bad_movies_columns = ['actor_name', 'movie_title', 'produced_year']

In [49]:
bad_movies_df = spark.createDataFrame(bad_movies_list, schema=bad_movies_columns)

In [50]:
bad_movies_df.show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|          null|                null|         null|
|          null|                null|         2020|
|      John Doe|       Awesome Movie|         null|
|          null|       Awesome Movie|         2021|
|     Mary Jane|                null|         2019|
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



In [51]:
bad_movies_df.na.drop().show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



In [52]:
bad_movies_df.na.drop('any').show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



In [53]:
bad_movies_df.na.drop('all').show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|          null|                null|         2020|
|      John Doe|       Awesome Movie|         null|
|          null|       Awesome Movie|         2021|
|     Mary Jane|                null|         2019|
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



In [54]:
bad_movies_df.filter(col('actor_name').isNull() != True).show()

+--------------+--------------------+-------------+
|    actor_name|         movie_title|produced_year|
+--------------+--------------------+-------------+
|      John Doe|       Awesome Movie|         null|
|     Mary Jane|                null|         2019|
|Vikter Duplaix|Not another teen ...|         2001|
+--------------+--------------------+-------------+



In [55]:
bad_movies_df.describe('produced_year').show()

+-------+-----------------+
|summary|    produced_year|
+-------+-----------------+
|  count|                4|
|   mean|          2015.25|
| stddev|9.535023160258536|
|    min|             2001|
|    max|             2021|
+-------+-----------------+



In [56]:
bad_movies_df.describe('actor_name').show()

+-------+--------------+
|summary|    actor_name|
+-------+--------------+
|  count|             3|
|   mean|          null|
| stddev|          null|
|    min|      John Doe|
|    max|Vikter Duplaix|
+-------+--------------+



## Working with user Defined Functions

### Spark UDF are very poorly optimized, try to avoid using them as much as you possibly can

In [57]:
from pyspark.sql.functions import udf

In [58]:
students_list = [
    ('Joe', 85),
    ('Jane', 88),
    ('Mary', 83),
]

In [59]:
students_columns = ['name', 'score']

In [60]:
students_df = spark.createDataFrame(students_list, schema=students_columns)

In [61]:
def letterGrade(score: int):
    grade = ''
    
    if score > 100:
        grade = 'Cheating'
    if score >= 90 and score <= 100:
        grade = 'A'
    if score >= 80 and score < 90:
        grade = 'B'
    if score >= 70 and score < 80:
        grade = 'C'
    
    return grade

In [62]:
letterGradeUDF = udf(letterGrade)

In [63]:
students_df.select('name', 'score', letterGradeUDF(col('score')).alias('grade')).show()

+----+-----+-----+
|name|score|grade|
+----+-----+-----+
| Joe|   85|    B|
|Jane|   88|    B|
|Mary|   83|    B|
+----+-----+-----+

