In [0]:
# Load the retail orders dataset from CSV using an explicit DDL schema string,
# so column names and types are fixed up front.
orders = (
    spark.read
    .schema('order_id INT, order_date STRING, order_customer_id INT, order_status STRING')
    .csv('/public/retail_db/orders')
)
from pyspark.sql.functions import *

In [0]:
# DROPPING COLUMNS
# Quick syntax reference. None of the results below are assigned, so `orders`
# is left untouched by this cell.

# Drop a single column by name.
orders.drop("order_status")

# Drop several columns in one call by unpacking a list of Column objects.
cols_to_drop = [col('order_id'), col('order_date')]
orders.drop(*cols_to_drop)


# DROPPING ROWS

# Remove rows that are duplicates across all columns.
orders.distinct()
# Remove rows that are duplicates with respect to the listed columns only.
orders.dropDuplicates(['order_date', 'order_customer_id'])

# Remove rows containing nulls. `how` and `thresh` are alternatives: when
# `thresh` is given it overrides `how`, so they are shown as separate calls.
# `thresh` must not exceed the number of columns in `subset`; a larger value
# (e.g. thresh=3 with a 2-column subset) can never be satisfied and would
# drop every row.
orders.na.drop(how='any', subset=['order_date', 'order_customer_id'])
orders.na.drop(thresh=1, subset=['order_date', 'order_customer_id'])

In [0]:
# Typical usage: drop one column by name and preview the first rows.

(
    orders
    .drop("order_status")
    .show(3, truncate=False)
)

+--------+---------------------+-----------------+
|order_id|order_date           |order_customer_id|
+--------+---------------------+-----------------+
|1       |2013-07-25 00:00:00.0|11599            |
|2       |2013-07-25 00:00:00.0|256              |
|3       |2013-07-25 00:00:00.0|12111            |
+--------+---------------------+-----------------+
only showing top 3 rows



In [0]:
# Several columns can be dropped in a single call by unpacking a list of
# Column objects.
# NOTE(review): passing more than one Column object presumably requires
# PySpark 3.4+; plain column-name strings should work as well — confirm
# against the target Spark version.

cols_to_drop = [col(name) for name in ('order_id', 'order_date')]
(
    orders
    .drop(*cols_to_drop)
    .show(3, truncate=False)
)

+-----------------+---------------+
|order_customer_id|order_status   |
+-----------------+---------------+
|11599            |CLOSED         |
|256              |PENDING_PAYMENT|
|12111            |COMPLETE       |
+-----------------+---------------+
only showing top 3 rows



In [0]:
# Dropping a column that does not exist is a no-op: no error is raised and
# every original column is kept.

(
    orders
    .drop("order_sample")
    .show(3, truncate=False)
)

+--------+---------------------+-----------------+---------------+
|order_id|order_date           |order_customer_id|order_status   |
+--------+---------------------+-----------------+---------------+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |
+--------+---------------------+-----------------+---------------+
only showing top 3 rows



In [0]:
# df.distinct() compares whole rows (all columns of the DataFrame it is called
# on), so select the column(s) of interest first to count their distinct values.

print(orders.count())
(
    orders
    .select('order_date')
    .distinct()
    .count()
)

68883


364

In [0]:
# df.dropDuplicates(subset) takes a list of column names; only those columns
# are compared when deciding whether two rows are duplicates.

print(orders.count())
(
    orders
    .dropDuplicates(['order_date', 'order_customer_id'])
    .count()
)

68883


68321

In [0]:
# Rows containing nulls can be dropped with either of two equivalent calls
# (the help text below states they are aliases of each other):
#   df.na.drop()
#   df.dropna()
#
# The bare string literal below is a copy of the "Parameters" section of the
# method's docstring, kept for quick reference; as a standalone expression it
# has no runtime effect. Note that `thresh`, when given, overrides `how`.

'''
    Parameters
    ----------
    how : str, optional, the values that can be 'any' or 'all', default 'any'.
        If 'any', drop a row if it contains any nulls.
        If 'all', drop a row only if all its values are null.
    thresh: int, optional, default None.
        If specified, drop rows that have less than `thresh` non-null values.
        This overwrites the `how` parameter.
    subset : str, tuple or list, optional
        optional list of column names to consider.
'''

# Print the full docstring of the null-dropping method.
help(orders.na.drop)

Help on method drop in module pyspark.sql.dataframe:

drop(how: str = 'any', thresh: Optional[int] = None, subset: Union[str, Tuple[str, ...], List[str], NoneType] = None) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.dataframe.DataFrameNaFunctions instance
    Returns a new :class:`DataFrame` omitting rows with null values.
    :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are
    aliases of each other.
    
    .. versionadded:: 1.3.1
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    how : str, optional, the values that can be 'any' or 'all', default 'any'.
        If 'any', drop a row if it contains any nulls.
        If 'all', drop a row only if all its values are null.
    thresh: int, optional, default None.
        If specified, drop rows that have less than `thresh` non-null values.
        This overwrites the `how` parameter.
    subset : str, tuple or list, optional
        optional list of co