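We can drop rows containing missing values using the method .dropna(); its how, thresh, and subset arguments control which rows are removed. (In PySpark, .dropna() removes rows only; columns have to be removed explicitly with .drop().)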
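We can fill missing data with a specific value (a single constant, or a dictionary mapping column names to constants) using the method .fillna(). Unlike pandas, PySpark's .fillna() does not provide interpolation methods.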
We can impute missing values using statistical methods, such as mean or median, using Imputer.


In [5]:
# Install numpy if not already installed
!pip install numpy

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.feature import Imputer

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("MissingDataHandling")
    .getOrCreate()
)

# Schema: all four columns are declared nullable so None values are accepted
schema = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in (
        ("name", StringType()),
        ("age", IntegerType()),
        ("price", FloatType()),
        ("rooms", IntegerType()),
    )
])

# Sample records built from a positional Row factory; None marks a missing value
HouseRow = Row("name", "age", "price", "rooms")
row_list = [
    HouseRow("James", 23, 300000.0, 3),
    HouseRow("Ann", None, None, 2),
    HouseRow("John", 45, 500000.0, None),
    HouseRow(None, 30, 400000.0, 4),
]

# Build the DataFrame from the records and display it
df = spark.createDataFrame(data=row_list, schema=schema)
df.show()

# Drop every row that has a missing value in at least one column.
# df.na.drop() is the alias of df.dropna().
df_dropped = df.na.drop(how="any")
df_dropped.show()

# How to fill missing values with a constant.
# Restrict the fill to the "price" column via `subset`: a bare numeric
# fillna(value=...) would write the same constant into every numeric column
# (age, price and rooms alike), which is rarely meaningful.
# Other columns keep their nulls.
df_filled = df.fillna(value=100000.0, subset=["price"])
df_filled.show()

# Impute missing values with the column median.
# The results go into new "<column>_imputed" columns; the input columns are
# left unchanged.
impute_cols = ["price", "rooms"]
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=[f"{col_name}_imputed" for col_name in impute_cols],
    strategy="median",
)

# Fit the imputer on df, then apply the fitted model to the same data
model = imputer.fit(df)
df_imputed = model.transform(df)
df_imputed.show()

+-----+----+--------+-----+
| name| age|   price|rooms|
+-----+----+--------+-----+
|James|  23|300000.0|    3|
|  Ann|NULL|    NULL|    2|
| John|  45|500000.0| NULL|
| NULL|  30|400000.0|    4|
+-----+----+--------+-----+

+-----+---+--------+-----+
| name|age|   price|rooms|
+-----+---+--------+-----+
|James| 23|300000.0|    3|
+-----+---+--------+-----+

+-----+----+--------+-----+
| name| age|   price|rooms|
+-----+----+--------+-----+
|James|  23|300000.0|    3|
|  Ann|NULL|100000.0|    2|
| John|  45|500000.0| NULL|
| NULL|  30|400000.0|    4|
+-----+----+--------+-----+

+-----+----+--------+-----+-------------+-------------+
| name| age|   price|rooms|price_imputed|rooms_imputed|
+-----+----+--------+-----+-------------+-------------+
|James|  23|300000.0|    3|     300000.0|            3|
|  Ann|NULL|    NULL|    2|     400000.0|            2|
| John|  45|500000.0| NULL|     500000.0|            3|
| NULL|  30|400000.0|    4|     400000.0|            4|
+-----+----+--------+-