## Dropping columns from a Spark DataFrame

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
import datetime

In [2]:
# Create (or reuse) the SparkSession that the rest of the notebook refers to as `spark`.
spark = (
    SparkSession.builder
    .appName('DropColumnsSpark')
    .getOrCreate()
)

In [3]:
# Sample user records for the column-dropping examples.
# Each record carries a nested `phone_numbers` Row and a `courses` list in
# addition to the flat scalar fields.
users = [
    {
        "id": 1, "first_name": "Pheobe", "last_name": "Buffay",
        "phone_numbers": Row(mobile="82349238942", home="2348910249", office="8273929", shop=None),
        "courses": [1, 3, 5, 7],
        "email": "pheobebuffay@abc.com", "is_customer": True, "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 13),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0),
    },
    {
        "id": 2, "first_name": "Joey", "last_name": "Tribbiani",
        "phone_numbers": Row(mobile="82349238942", home="2348910249", office=None, shop=None),
        "courses": [2, 4, 5],
        "email": "joey@abc.com", "is_customer": True, "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0),
    },
    {
        "id": 3, "first_name": "Monica", "last_name": "Geller",
        "phone_numbers": Row(mobile=None, home=None, office=None, shop=None),
        "courses": [2],
        "email": "monica@abc.com", "is_customer": True, "amount_paid": 1000.90,
        "customer_from": datetime.date(2021, 2, 22),
        "last_updated_ts": datetime.datetime(2021, 2, 28, 7, 33, 0),
    },
    {
        "id": 4, "first_name": "Ross", "last_name": "Geller",
        "phone_numbers": Row(mobile="82349238942", home=None, office=None, shop=None),
        "courses": [],
        "email": "ross@abc.com", "is_customer": True, "amount_paid": 1200.55,
        "customer_from": datetime.date(2021, 1, 19),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0),
    },
    {
        "id": 5, "first_name": "Rachel", "last_name": "Green",
        "phone_numbers": Row(mobile="82349238942", home="2348910249", office="8273929", shop="5343434654"),
        "courses": [3],
        "email": "rachel@abc.com", "is_customer": True, "amount_paid": None,
        "customer_from": datetime.date(2021, 2, 24),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0),
    },
    {
        "id": 6, "first_name": "Chandler", "last_name": "Bing",
        "phone_numbers": Row(mobile="8273929", home=None, office=None, shop=None),
        "courses": [2, 4],
        "email": "bing@abc.com", "is_customer": True, "amount_paid": 1000.80,
        "customer_from": datetime.date(2021, 2, 22),
        "last_updated_ts": datetime.datetime(2021, 2, 25, 7, 33, 0),
    },
]

In [4]:
# Build the DataFrame from the list of dicts (no explicit schema is supplied)
# and print the resulting schema.
usersDF = spark.createDataFrame(data=users)
usersDF.printSchema()



root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_name: string (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)



In [5]:
# Show the built-in documentation for the DataFrame `drop` method.
help(usersDF.drop)

Help on method drop in module pyspark.sql.dataframe:

drop(*cols) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` that drops the specified column.
    This is a no-op if schema doesn't contain the given column name(s).
    
    :param cols: a string name of the column to drop, or a
        :class:`Column` to drop, or a list of string name of the columns to drop.
    
    >>> df.drop('age').collect()
    [Row(name='Alice'), Row(name='Bob')]
    
    >>> df.drop(df.age).collect()
    [Row(name='Alice'), Row(name='Bob')]
    
    >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
    [Row(age=5, height=85, name='Bob')]
    
    >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()
    [Row(age=5, name='Bob', height=85)]
    
    >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()
    [Row(name='Bob')]
    
    .. versionadded:: 1.4



In [6]:
# Drop a single column by passing its name as a string.
usersDF.drop('last_updated_ts').printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_name: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)



In [7]:
# Same drop, this time identifying the column with a Column object taken from the DataFrame.
usersDF.drop(usersDF['last_updated_ts']).printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_name: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)



In [8]:
# Same drop again, building the Column object with the `col` function.
usersDF.drop(col('last_updated_ts')).printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_name: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)



In [9]:
# Dropping a column that does not exist in the schema is a no-op:
# the request is ignored and the schema comes back unchanged.
usersDF.drop(col('users_id')).printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_name: string (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)



In [10]:
# Drop multiple columns by passing each column name as a separate string argument.
usersDF.drop('first_name', 'last_name').printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)



In [11]:
# Passing more than one Column object to `drop` raises a TypeError on the Spark
# version used for this run; to drop several columns, pass the names as strings.
# The error is caught and printed so the failure stays visible while the notebook
# can still be executed top to bottom (Restart Kernel -> Run All).
# NOTE(review): other PySpark releases may accept several Column objects, in which
# case this cell simply prints the schema - confirm against the installed version.
try:
    usersDF.drop(col('first_name'), col('last_name')).printSchema()
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: each col in the param list should be a string

In [12]:
# A single Column object is accepted by `drop`.
usersDF.drop(col('first_name')).printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_name: string (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)



In [13]:
# Names that do not exist in the schema (here 'user_id') are ignored;
# the columns that do exist are still dropped.
usersDF.drop('user_id', 'first_name', 'last_name').printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)



**NOTE:**
* `drop` accepts one or more column names, each passed as a separate string argument.
* A single `Column` object is accepted, but passing more than one `Column` object fails with a `TypeError`.

#### Drop duplicate records

* Dropping duplicates based on all columns is known as distinct.
* Duplicates can also be dropped based on certain columns only.
* `distinct` takes no arguments, so it covers only the first scenario; `drop_duplicates` or `dropDuplicates` can be used for both scenarios.

In [14]:
# Sample records for the de-duplication examples.
# The two `id` 2 rows differ only in `amount_paid`; the two `id` 4 rows are
# identical in every field.
users = [
    {
        "id": 1, "first_name": "Pheobe", "last_name": "Buffay",
        "email": "pheobebuffay@abc.com", "is_customer": True, "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 13),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0),
    },
    {
        "id": 2, "first_name": "Joey", "last_name": "Tribbiani",
        "email": "joey@abc.com", "is_customer": True, "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0),
    },
    {
        "id": 2, "first_name": "Joey", "last_name": "Tribbiani",
        "email": "joey@abc.com", "is_customer": True, "amount_paid": 1000.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0),
    },
    {
        "id": 4, "first_name": "Ross", "last_name": "Geller",
        "email": "ross@abc.com", "is_customer": True, "amount_paid": 1200.55,
        "customer_from": datetime.date(2021, 1, 19),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0),
    },
    {
        "id": 4, "first_name": "Ross", "last_name": "Geller",
        "email": "ross@abc.com", "is_customer": True, "amount_paid": 1200.55,
        "customer_from": datetime.date(2021, 1, 19),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0),
    },
]

In [15]:
# Rebuild `usersDF` from the current `users` list.
usersDF = spark.createDataFrame(data=users)

In [16]:
# Display the rows of the current DataFrame.
usersDF.show()

+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name| id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|  1|       true|   Buffay|2021-02-10 01:15:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|  2|       true|Tribbiani|2021-02-18 03:33:00|
|     1000.0|   2021-02-14|        joey@abc.com|      Joey|  2|       true|Tribbiani|2021-02-18 03:33:00|
|    1200.55|   2021-01-19|        ross@abc.com|      Ross|  4|       true|   Geller|2021-02-18 01:10:00|
|    1200.55|   2021-01-19|        ross@abc.com|      Ross|  4|       true|   Geller|2021-02-18 01:10:00|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+



In [17]:
# Row count before any de-duplication.
usersDF.count()

5

In [18]:
# Show the built-in documentation for `distinct`.
help(usersDF.distinct)

Help on method distinct in module pyspark.sql.dataframe:

distinct() method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
    
    >>> df.distinct().count()
    2
    
    .. versionadded:: 1.3



In [19]:
# Drop exact duplicates: a row is removed only when every column value matches another row.
usersDF.distinct().show()

+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name| id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|    1200.55|   2021-01-19|        ross@abc.com|      Ross|  4|       true|   Geller|2021-02-18 01:10:00|
|     1000.0|   2021-02-14|        joey@abc.com|      Joey|  2|       true|Tribbiani|2021-02-18 03:33:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|  2|       true|Tribbiani|2021-02-18 03:33:00|
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|  1|       true|   Buffay|2021-02-10 01:15:00|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+



In [20]:
# `drop_duplicates` is an alias for `dropDuplicates`; its help text only points to that method.
help(usersDF.drop_duplicates)

Help on method dropDuplicates in module pyspark.sql.dataframe:

dropDuplicates(subset=None) method of pyspark.sql.dataframe.DataFrame instance
    :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
    
    .. versionadded:: 1.4



In [21]:
# Show the documentation for `dropDuplicates`, including its optional `subset` parameter.
help(usersDF.dropDuplicates)

Help on method dropDuplicates in module pyspark.sql.dataframe:

dropDuplicates(subset=None) method of pyspark.sql.dataframe.DataFrame instance
    Return a new :class:`DataFrame` with duplicate rows removed,
    optionally only considering certain columns.
    
    For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming
    :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop
    duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can
    be and system will accordingly limit the state. In addition, too late data older than
    watermark will be dropped to avoid any possibility of duplicates.
    
    :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
    
    >>> from pyspark.sql import Row
    >>> df = sc.parallelize([ \
    ...     Row(name='Alice', age=5, height=80), \
    ...     Row(name='Alice', age=5, height=80), \
    ...     Row(name='Alice', age=10, height=80)]).to

In [22]:
# Duplicates can also be dropped based on certain columns only.
# `dropDuplicates` expects the subset as a sequence such as a list; passing a bare
# string fails on the Spark version used for this run.
# The error is caught and printed so the failure stays visible while the cells
# below can still execute on Restart Kernel -> Run All.
# NOTE(review): the exception class raised here (Py4JError in this run) may differ
# between PySpark versions, hence the broad `Exception` - confirm and narrow if possible.
try:
    usersDF.dropDuplicates('id').show()
except Exception as exc:
    print(f"{type(exc).__name__}: {exc}")

Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonUtils.toSeq. Trace:
py4j.Py4JException: Method toSeq([class java.lang.String]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:339)
	at py4j.Gateway.invoke(Gateway.java:276)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)



In [None]:
# Pass the subset as a list: duplicates are judged on the `id` column only.
usersDF.dropDuplicates(['id']).show()

In [None]:
# Duplicates are judged on the combination of `id` and `amount_paid`.
usersDF.dropDuplicates(['id', 'amount_paid']).show()

#### Dropping null-based records from a Spark DataFrame

* Drop records where all column values are null.
* Drop records where any column value is null.
* Drop records that have less than `thresh` non-null values.
* Drop records where any or all column values are null within a provided subset of columns.
* We can use `df.na.drop` or `df.dropna` to handle records that contain null values.

In [23]:
# Sample records for the null-handling examples: two fully populated rows,
# one row that is entirely null, and two partially populated rows.
users = [
    {
        "id": 1, "first_name": "Pheobe", "last_name": "Buffay",
        "email": "pheobebuffay@abc.com", "is_customer": True, "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 13),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0),
    },
    {
        "id": 2, "first_name": "Joey", "last_name": "Tribbiani",
        "email": "joey@abc.com", "is_customer": True, "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0),
    },
    {
        "id": None, "first_name": None, "last_name": None,
        "email": None, "is_customer": None, "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": None,
    },
    {
        "id": None, "first_name": None, "last_name": None,
        "email": "ross@abc.com", "is_customer": None, "amount_paid": None,
        "customer_from": datetime.date(2021, 1, 19),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0),
    },
    {
        "id": 4, "first_name": None, "last_name": None,
        "email": None, "is_customer": None, "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0),
    },
]

In [25]:
# Rebuild `usersDF` from the current `users` list and display its rows.
usersDF = spark.createDataFrame(data=users)
usersDF.show()

+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name|  id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|   1|       true|   Buffay|2021-02-10 01:15:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|   2|       true|Tribbiani|2021-02-18 03:33:00|
|       null|         null|                null|      null|null|       null|     null|               null|
|       null|   2021-01-19|        ross@abc.com|      null|null|       null|     null|2021-02-18 01:10:00|
|       null|         null|                null|      null|   4|       null|     null|2021-02-18 01:10:00|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+



In [26]:
# `usersDF.na` exposes DataFrameNaFunctions, the helpers for working with missing data.
help(usersDF.na)

Help on DataFrameNaFunctions in module pyspark.sql.dataframe object:

class DataFrameNaFunctions(builtins.object)
 |  DataFrameNaFunctions(df)
 |  
 |  Functionality for working with missing data in :class:`DataFrame`.
 |  
 |  .. versionadded:: 1.4
 |  
 |  Methods defined here:
 |  
 |  __init__(self, df)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  drop(self, how='any', thresh=None, subset=None)
 |      Returns a new :class:`DataFrame` omitting rows with null values.
 |      :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.
 |      
 |      :param how: 'any' or 'all'.
 |          If 'any', drop a row if it contains any nulls.
 |          If 'all', drop a row only if all its values are null.
 |      :param thresh: int, default None
 |          If specified, drop rows that have less than `thresh` non-null values.
 |          This overwrites the `how` parameter.
 |      :param subset: optional list of column nam

In [30]:
# Default `how='any'`: drop every row that contains at least one null value.
usersDF.na.drop().show()

+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name| id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|  1|       true|   Buffay|2021-02-10 01:15:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|  2|       true|Tribbiani|2021-02-18 03:33:00|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+



In [32]:
# `how='all'`: drop a row only when all of its values are null.
usersDF.na.drop(how='all').show()

+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name|  id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|   1|       true|   Buffay|2021-02-10 01:15:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|   2|       true|Tribbiani|2021-02-18 03:33:00|
|       null|   2021-01-19|        ross@abc.com|      null|null|       null|     null|2021-02-18 01:10:00|
|       null|         null|                null|      null|   4|       null|     null|2021-02-18 01:10:00|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+



In [33]:
# `thresh=2`: drop rows that have fewer than 2 non-null values.
usersDF.na.drop(thresh=2).show()

+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name|  id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|   1|       true|   Buffay|2021-02-10 01:15:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|   2|       true|Tribbiani|2021-02-18 03:33:00|
|       null|   2021-01-19|        ross@abc.com|      null|null|       null|     null|2021-02-18 01:10:00|
|       null|         null|                null|      null|   4|       null|     null|2021-02-18 01:10:00|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+



In [34]:
# `thresh=3`: drop rows that have fewer than 3 non-null values.
usersDF.na.drop(thresh=3).show()

+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name|  id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|   1|       true|   Buffay|2021-02-10 01:15:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|   2|       true|Tribbiani|2021-02-18 03:33:00|
|       null|   2021-01-19|        ross@abc.com|      null|null|       null|     null|2021-02-18 01:10:00|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+



In [35]:
# Limit the null check to `id` and `email`: a row is dropped only when both are null.
(
    usersDF.na
    .drop(how='all', subset=['id', 'email'])
    .show()
)

+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name|  id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|   1|       true|   Buffay|2021-02-10 01:15:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|   2|       true|Tribbiani|2021-02-18 03:33:00|
|       null|   2021-01-19|        ross@abc.com|      null|null|       null|     null|2021-02-18 01:10:00|
|       null|         null|                null|      null|   4|       null|     null|2021-02-18 01:10:00|
+-----------+-------------+--------------------+----------+----+-----------+---------+-------------------+



In [36]:
# Limit the null check to `id` and `email`: a row is dropped when either one is null.
(
    usersDF.na
    .drop(how='any', subset=['id', 'email'])
    .show()
)

+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name| id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|    1000.55|   2021-01-13|pheobebuffay@abc.com|    Pheobe|  1|       true|   Buffay|2021-02-10 01:15:00|
|      900.0|   2021-02-14|        joey@abc.com|      Joey|  2|       true|Tribbiani|2021-02-18 03:33:00|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+

