In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the active Spark session, or create one registered under the
# application name "example_2".
spark = (
    SparkSession.builder
    .appName("example_2")
    .getOrCreate()
)

In [6]:
# Load the semicolon-separated file; the first line holds the column names
# and column types are inferred from the data.
df = spark.read.csv(
    "example_2.csv",
    sep=";",
    header=True,
    inferSchema=True,
)
df.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|   Agatha|  31|        12| 30000|
|     Beth|  32|         2| 25000|
|  Charles|  44|        12| 20000|
|   Damian|  23|        43| 20000|
|    Euler|  12|        12| 15000|
|Feuerbach|  23|        23| 18000|
|  Gabriel|  23|        43| 40000|
|    Henry|NULL|      NULL| 38000|
|     NULL|  34|        10|  NULL|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [7]:
# Show only the rows that contain no null in any column.
# The result is a new DataFrame; `df` itself is not modified.
df.dropna().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|   Agatha| 31|        12| 30000|
|     Beth| 32|         2| 25000|
|  Charles| 44|        12| 20000|
|   Damian| 23|        43| 20000|
|    Euler| 12|        12| 15000|
|Feuerbach| 23|        23| 18000|
|  Gabriel| 23|        43| 40000|
+---------+---+----------+------+



In [8]:
# Display `df` again: the previous drop was not assigned back, so the rows
# containing nulls are still present here.
df.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|   Agatha|  31|        12| 30000|
|     Beth|  32|         2| 25000|
|  Charles|  44|        12| 20000|
|   Damian|  23|        43| 20000|
|    Euler|  12|        12| 15000|
|Feuerbach|  23|        23| 18000|
|  Gabriel|  23|        43| 40000|
|    Henry|NULL|      NULL| 38000|
|     NULL|  34|        10|  NULL|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [9]:
# Keep only the rows that have at least 2 non-null values.
# `thresh` overrides `how`, so the previous how="any" had no effect and was
# misleading (rows that contain nulls are still kept, as the output shows).
df.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|   Agatha|  31|        12| 30000|
|     Beth|  32|         2| 25000|
|  Charles|  44|        12| 20000|
|   Damian|  23|        43| 20000|
|    Euler|  12|        12| 15000|
|Feuerbach|  23|        23| 18000|
|  Gabriel|  23|        43| 40000|
|    Henry|NULL|      NULL| 38000|
|     NULL|  34|        10|  NULL|
+---------+----+----------+------+



In [11]:
# Drop the rows whose Name is null; nulls in other columns are tolerated.
# The column is spelled exactly as in the schema ("Name") so the lookup does
# not depend on Spark's case-insensitive column resolution being enabled.
df.na.drop(how="any", subset=["Name"]).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|   Agatha|  31|        12| 30000|
|     Beth|  32|         2| 25000|
|  Charles|  44|        12| 20000|
|   Damian|  23|        43| 20000|
|    Euler|  12|        12| 15000|
|Feuerbach|  23|        23| 18000|
|  Gabriel|  23|        43| 40000|
|    Henry|NULL|      NULL| 38000|
+---------+----+----------+------+



In [12]:
# Replace null names with the placeholder "missing"; the numeric columns are
# left untouched because the fill is restricted to the Name column.
# The column is spelled exactly as in the schema ("Name") so the lookup does
# not depend on Spark's case-insensitive column resolution being enabled.
df.na.fill("missing", subset=["Name"]).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|   Agatha|  31|        12| 30000|
|     Beth|  32|         2| 25000|
|  Charles|  44|        12| 20000|
|   Damian|  23|        43| 20000|
|    Euler|  12|        12| 15000|
|Feuerbach|  23|        23| 18000|
|  Gabriel|  23|        43| 40000|
|    Henry|NULL|      NULL| 38000|
|  missing|  34|        10|  NULL|
|  missing|  36|      NULL|  NULL|
+---------+----+----------+------+



In [13]:
from pyspark.ml.feature import Imputer

In [27]:
# Numeric columns whose nulls are replaced by the column mean. Each imputed
# result is written to a new "filled_<column>" column next to the original.
impute_cols = ["Age", "Experience", "Salary"]

imputer = Imputer(
    strategy="mean",
    inputCols=impute_cols,
    outputCols=[f"filled_{name}" for name in impute_cols],
)

# NOTE(review): the displayed output shows whole numbers (e.g. 28, 19), so the
# imputed means appear to be cast back to the integer input type - confirm
# that this truncation is acceptable.
filled_df = imputer.fit(df).transform(df)
filled_df.show()

+---------+----+----------+------+----------+-----------------+-------------+
|     Name| Age|Experience|Salary|filled_Age|filled_Experience|filled_Salary|
+---------+----+----------+------+----------+-----------------+-------------+
|   Agatha|  31|        12| 30000|        31|               12|        30000|
|     Beth|  32|         2| 25000|        32|                2|        25000|
|  Charles|  44|        12| 20000|        44|               12|        20000|
|   Damian|  23|        43| 20000|        23|               43|        20000|
|    Euler|  12|        12| 15000|        12|               12|        15000|
|Feuerbach|  23|        23| 18000|        23|               23|        18000|
|  Gabriel|  23|        43| 40000|        23|               43|        40000|
|    Henry|NULL|      NULL| 38000|        28|               19|        38000|
|     NULL|  34|        10|  NULL|        34|               10|        25750|
|     NULL|  36|      NULL|  NULL|        36|               19|        25750|
+---------+----+----------+------+----------+-----------------+-------------+

In [30]:
# Replace each original numeric column with its imputed "filled_*" counterpart
# and label rows that have no name.
#
# The `in filled_df.columns` guard makes this cell safe to re-run: once a
# "filled_*" column has been renamed, a second pass skips it instead of
# dropping the already-imputed column (the previous version raised on re-run
# because the "filled_*" columns no longer existed).
for column_name in ["Age", "Experience", "Salary"]:
    imputed_name = f"filled_{column_name}"
    if imputed_name in filled_df.columns:
        # Drop the column that still contains nulls, then give the imputed
        # column the original name.
        filled_df = filled_df.drop(column_name).withColumnRenamed(imputed_name, column_name)

# Column spelled as in the schema ("Name") so the lookup does not depend on
# case-insensitive column resolution.
filled_df = filled_df.na.fill("missing", subset=["Name"])

filled_df.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|   Agatha| 31|        12| 30000|
|     Beth| 32|         2| 25000|
|  Charles| 44|        12| 20000|
|   Damian| 23|        43| 20000|
|    Euler| 12|        12| 15000|
|Feuerbach| 23|        23| 18000|
|  Gabriel| 23|        43| 40000|
|    Henry| 28|        19| 38000|
|  missing| 34|        10| 25750|
|  missing| 36|        19| 25750|
+---------+---+----------+------+

