In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType, LongType)
spark = SparkSession.builder.appName("Schema_Recovery").getOrCreate()

#DATASET 1 — USER PROFILE API (CORRUPTED TYPES)

###Problems intentionally introduced
Age as string,
Non-numeric age values,
Salary mixed as int, string, shorthand ( 45k ),
Missing names,
Empty salary

Exercises

1. Design a StructType schema for this data
2. Load the data using the schema
3. Identify records that fail type conversion
4. Convert age to integer safely
5. Normalize salary into integer (handle k )
6. Replace missing names with "UNKNOWN"
7. Drop records where age cannot be recovered
8. Produce a final clean DataFrame

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
#1
raw_users = [
    ("U001", "Amit", "29", "Hyderabad", "50000"),
    ("U002", "Neha", "Thirty Two", "Delhi", "62000"),
    ("U003", "Ravi", None, "Bangalore", "45k"),
    ("U004", "Pooja", "28", "Mumbai", 58000),
    ("U005", None, "31", "Chennai", "")
]
user_schema = StructType([
    StructField("user_id", StringType()),
    StructField("name", StringType()),
    StructField("age_raw", StringType()),
    StructField("city", StringType()),
    StructField("salary_raw", StringType())
])

In [5]:
#2
df_users = spark.createDataFrame(raw_users, user_schema)
df_users.printSchema()
df_users.show()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age_raw: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary_raw: string (nullable = true)

+-------+-----+----------+---------+----------+
|user_id| name|   age_raw|     city|salary_raw|
+-------+-----+----------+---------+----------+
|   U001| Amit|        29|Hyderabad|     50000|
|   U002| Neha|Thirty Two|    Delhi|     62000|
|   U003| Ravi|      NULL|Bangalore|       45k|
|   U004|Pooja|        28|   Mumbai|     58000|
|   U005| NULL|        31|  Chennai|          |
+-------+-----+----------+---------+----------+



In [9]:
#3
df_invalid_age = df_users.filter(~col("age_raw").rlike("^[0-9]+$") | col("age_raw").isNull()
)

df_invalid_age.show()

+-------+----+----------+---------+----------+
|user_id|name|   age_raw|     city|salary_raw|
+-------+----+----------+---------+----------+
|   U002|Neha|Thirty Two|    Delhi|     62000|
|   U003|Ravi|      NULL|Bangalore|       45k|
+-------+----+----------+---------+----------+



In [11]:
#4
df_age_converted = df_users.withColumn(
    "age_int",
    when(col("age_raw").rlike("^[0-9]+$"), col("age_raw").cast(IntegerType()))
)

df_age_converted.show()

+-------+-----+----------+---------+----------+-------+
|user_id| name|   age_raw|     city|salary_raw|age_int|
+-------+-----+----------+---------+----------+-------+
|   U001| Amit|        29|Hyderabad|     50000|     29|
|   U002| Neha|Thirty Two|    Delhi|     62000|   NULL|
|   U003| Ravi|      NULL|Bangalore|       45k|   NULL|
|   U004|Pooja|        28|   Mumbai|     58000|     28|
|   U005| NULL|        31|  Chennai|          |     31|
+-------+-----+----------+---------+----------+-------+



In [13]:
#5
df_salary_normalized = df_users.withColumn(
    "salary_int",
    when(col("salary_raw").rlike("^[0-9]+$"), col("salary_raw").cast(IntegerType()))
    .when(col("salary_raw").rlike("k$"),
          regexp_replace(col("salary_raw"), "k", "").cast(IntegerType()) * 1000)
)

df_salary_normalized.show()

+-------+-----+----------+---------+----------+----------+
|user_id| name|   age_raw|     city|salary_raw|salary_int|
+-------+-----+----------+---------+----------+----------+
|   U001| Amit|        29|Hyderabad|     50000|     50000|
|   U002| Neha|Thirty Two|    Delhi|     62000|     62000|
|   U003| Ravi|      NULL|Bangalore|       45k|     45000|
|   U004|Pooja|        28|   Mumbai|     58000|     58000|
|   U005| NULL|        31|  Chennai|          |      NULL|
+-------+-----+----------+---------+----------+----------+



In [14]:
#6
df_name_fixed = df_users.withColumn(
    "name",
    when(col("name").isNull(), "UNKNOWN").otherwise(col("name"))
)

df_name_fixed.show()

+-------+-------+----------+---------+----------+
|user_id|   name|   age_raw|     city|salary_raw|
+-------+-------+----------+---------+----------+
|   U001|   Amit|        29|Hyderabad|     50000|
|   U002|   Neha|Thirty Two|    Delhi|     62000|
|   U003|   Ravi|      NULL|Bangalore|       45k|
|   U004|  Pooja|        28|   Mumbai|     58000|
|   U005|UNKNOWN|        31|  Chennai|          |
+-------+-------+----------+---------+----------+



In [15]:
#7
df_valid_age = df_age_converted.withColumn(
    "age_int",
    when(col("age_raw").rlike("^[0-9]+$"), col("age_raw").cast(IntegerType()))
).filter(col("age_int").isNotNull())

df_valid_age.show()

+-------+-----+-------+---------+----------+-------+
|user_id| name|age_raw|     city|salary_raw|age_int|
+-------+-----+-------+---------+----------+-------+
|   U001| Amit|     29|Hyderabad|     50000|     29|
|   U004|Pooja|     28|   Mumbai|     58000|     28|
|   U005| NULL|     31|  Chennai|          |     31|
+-------+-----+-------+---------+----------+-------+



In [17]:
#8
df_clean_final = (
    df_users.withColumn("name",when(col("name").isNull(), "UNKNOWN").otherwise(col("name"))
    ).withColumn("age_raw",
        when(col("age_raw").rlike("^[0-9]+$"), col("age_raw").cast(IntegerType()))
    )
    .withColumn("salary_raw",
        when(col("salary_raw").rlike("^[0-9]+$"), col("salary_raw").cast(IntegerType()))
        .when(col("salary_raw").rlike("k$"),
              regexp_replace(col("salary_raw"), "k", "").cast(IntegerType()) * 1000)
    )
    .filter(col("age_raw").isNotNull())
    .select("user_id", "name", "age_raw", "city", "salary_raw")
)

df_clean_final.show()

+-------+-------+-------+---------+----------+
|user_id|   name|age_raw|     city|salary_raw|
+-------+-------+-------+---------+----------+
|   U001|   Amit|     29|Hyderabad|     50000|
|   U004|  Pooja|     28|   Mumbai|     58000|
|   U005|UNKNOWN|     31|  Chennai|      NULL|
+-------+-------+-------+---------+----------+



#DATASET 2 — E-COMMERCE ORDERS (ARRAY CORRUPTION)

Problems intentionally introduced

Items sometimes string, sometimes array

Different delimiters ( , and | )

Single item as string

Null items

Exercises

1. Define a schema with ArrayType
2. Normalize all item values into arrays
3. Handle multiple delimiters
4. Replace null items with empty arrays
5. Explode items into one row per item
6. Count frequency of each item
7. Identify orders with more than 2 items

In [19]:
#1
raw_orders = [
("O001","U001","Laptop,Mobile,Tablet",75000),
("O002","U002",["Mobile","Tablet"],32000),
("O003","U003","Laptop",72000),
("O004","U004",None,25000),
("O005","U005","Laptop|Mobile",68000)
]

schema = StructType([
    StructField("order_id", StringType()),
    StructField("user_id", StringType()),
    StructField("items", StringType()),
    StructField("amount", IntegerType())
])

In [20]:
df = spark.createDataFrame(raw_orders, schema)
df.printSchema()
df.show()

root
 |-- order_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- items: string (nullable = true)
 |-- amount: integer (nullable = true)

+--------+-------+--------------------+------+
|order_id|user_id|               items|amount|
+--------+-------+--------------------+------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|
|    O002|   U002|    [Mobile, Tablet]| 32000|
|    O003|   U003|              Laptop| 72000|
|    O004|   U004|                NULL| 25000|
|    O005|   U005|       Laptop|Mobile| 68000|
+--------+-------+--------------------+------+



In [22]:
#2
df_items_array = df.withColumn(
    "items_array",when(col("items").isNull(), array()).otherwise(split(col("items"), ","))
)
df_items_array.show()


+--------+-------+--------------------+------+--------------------+
|order_id|user_id|               items|amount|         items_array|
+--------+-------+--------------------+------+--------------------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|
|    O002|   U002|    [Mobile, Tablet]| 32000| [[Mobile,  Tablet]]|
|    O003|   U003|              Laptop| 72000|            [Laptop]|
|    O004|   U004|                NULL| 25000|                  []|
|    O005|   U005|       Laptop|Mobile| 68000|     [Laptop|Mobile]|
+--------+-------+--------------------+------+--------------------+



In [24]:
#3
df_items_delim = df.withColumn(
    "items_array",when(col("items").isNull(), array())
    .otherwise(split(regexp_replace(col("items"), "\\|", ","), ","))
)
df_items_delim.show()

+--------+-------+--------------------+------+--------------------+
|order_id|user_id|               items|amount|         items_array|
+--------+-------+--------------------+------+--------------------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|
|    O002|   U002|    [Mobile, Tablet]| 32000| [[Mobile,  Tablet]]|
|    O003|   U003|              Laptop| 72000|            [Laptop]|
|    O004|   U004|                NULL| 25000|                  []|
|    O005|   U005|       Laptop|Mobile| 68000|    [Laptop, Mobile]|
+--------+-------+--------------------+------+--------------------+



In [26]:
#4
df_no_null_items = df_items_delim.withColumn("items_array",
    when(col("items_array").isNull(), array()).otherwise(col("items_array"))
)
df_no_null_items.show()

+--------+-------+--------------------+------+--------------------+
|order_id|user_id|               items|amount|         items_array|
+--------+-------+--------------------+------+--------------------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|
|    O002|   U002|    [Mobile, Tablet]| 32000| [[Mobile,  Tablet]]|
|    O003|   U003|              Laptop| 72000|            [Laptop]|
|    O004|   U004|                NULL| 25000|                  []|
|    O005|   U005|       Laptop|Mobile| 68000|    [Laptop, Mobile]|
+--------+-------+--------------------+------+--------------------+



In [27]:
#5
df_exploded = df_no_null_items.withColumn("item", explode(col("items_array"))
)
df_exploded.show()

+--------+-------+--------------------+------+--------------------+--------+
|order_id|user_id|               items|amount|         items_array|    item|
+--------+-------+--------------------+------+--------------------+--------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|  Laptop|
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|  Mobile|
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|  Tablet|
|    O002|   U002|    [Mobile, Tablet]| 32000| [[Mobile,  Tablet]]| [Mobile|
|    O002|   U002|    [Mobile, Tablet]| 32000| [[Mobile,  Tablet]]| Tablet]|
|    O003|   U003|              Laptop| 72000|            [Laptop]|  Laptop|
|    O005|   U005|       Laptop|Mobile| 68000|    [Laptop, Mobile]|  Laptop|
|    O005|   U005|       Laptop|Mobile| 68000|    [Laptop, Mobile]|  Mobile|
+--------+-------+--------------------+------+--------------------+--------+



In [28]:
#6
df_exploded.groupBy("item").count().show()

+--------+-----+
|    item|count|
+--------+-----+
| [Mobile|    1|
|  Laptop|    3|
|  Mobile|    2|
|  Tablet|    1|
| Tablet]|    1|
+--------+-----+



In [29]:
#7
df_more_than_2 = df_no_null_items.withColumn("item_count", size(col("items_array"))
).filter(col("item_count") > 2)
df_more_than_2.show()

+--------+-------+--------------------+------+--------------------+----------+
|order_id|user_id|               items|amount|         items_array|item_count|
+--------+-------+--------------------+------+--------------------+----------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|         3|
+--------+-------+--------------------+------+--------------------+----------+



#DATASET 3 — DEVICE USAGE (MAP CORRUPTION)

Problems intentionally introduced

Map sometimes string

Inconsistent delimiters

Values as strings

Missing maps

Exercises

1. Design a MapType(StringType, IntegerType) schema
2. Parse string maps into proper maps
3. Convert all usage values to integers
4. Handle malformed key-value pairs
5. Replace missing maps with empty maps
6. Extract mobile usage safely
7. Identify users with usage above a threshold

In [36]:
#1
raw_devices = [
("U001",{"mobile":120,"laptop":300}),
("U002","mobile:200,tablet:100"),
("U003",{"desktop":"400","mobile":"150"}),
("U004",None),
("U005","laptop-250")
]
device_schema = StructType([
    StructField("user_id", StringType(), False),
    StructField("device_usuage", StringType(), True) # Changed to StringType
])

df_devices = spark.createDataFrame(raw_devices, schema=device_schema)
df_devices.printSchema()
df_devices.show(truncate = False)

root
 |-- user_id: string (nullable = false)
 |-- device_usuage: string (nullable = true)

+-------+-------------------------+
|user_id|device_usuage            |
+-------+-------------------------+
|U001   |{mobile=120, laptop=300} |
|U002   |mobile:200,tablet:100    |
|U003   |{mobile=150, desktop=400}|
|U004   |NULL                     |
|U005   |laptop-250               |
+-------+-------------------------+



In [41]:
#Normalizing
df_clean_str = df_devices.withColumn(
    "usage_str",
    when(col("device_usuage").isNull(), lit(None))
    .otherwise(
        regexp_replace(
            regexp_replace(
                regexp_replace(
                    col("device_usuage"),
                    "\\{|\\}", ""  # Remove curly braces
                ),
                "=", ":"         # Replace = with :
            ),
            "-", ":"             # Replace - with :
        )
    )
)

df_map = (
    df_clean_str
    .withColumn("pairs", split(col("usage_str"), ","))
    .withColumn("kv", explode(col("pairs")))
    .withColumn("key", element_at(split(col("kv"), ":"), 1)) # Use element_at for safe access
    .withColumn("value", element_at(split(col("kv"), ":"), 2).cast("int")) # Use element_at for safe access
    .filter(col("key").isNotNull() & col("value").isNotNull())
    .groupBy("user_id")
    .agg(
        map_from_entries(
            collect_list(struct(col("key"), col("value")))
        ).alias("usage")
    )
)
df_map.printSchema()
df_map.show(truncate = False)

root
 |-- user_id: string (nullable = false)
 |-- usage: map (nullable = false)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)

+-------+--------------------------------+
|user_id|usage                           |
+-------+--------------------------------+
|U002   |{mobile -> 200, tablet -> 100}  |
|U001   |{mobile -> 120,  laptop -> 300} |
|U005   |{laptop -> 250}                 |
|U003   |{mobile -> 150,  desktop -> 400}|
+-------+--------------------------------+



In [43]:
#3
df_int_values = df_map.withColumn(
    "usage",
    expr("transform_values(usage, (k, v) -> cast(v as int))")
)

df_int_values.show(truncate=False)

+-------+--------------------------------+
|user_id|usage                           |
+-------+--------------------------------+
|U002   |{mobile -> 200, tablet -> 100}  |
|U001   |{mobile -> 120,  laptop -> 300} |
|U005   |{laptop -> 250}                 |
|U003   |{mobile -> 150,  desktop -> 400}|
+-------+--------------------------------+



In [44]:
#4
df_valid_usage = df_int_values.withColumn(
    "usage",
    expr("""  map_filter(usage, (k, v) -> k IS NOT NULL AND v IS NOT NULL)""")
)

df_valid_usage.show(truncate=False)

+-------+--------------------------------+
|user_id|usage                           |
+-------+--------------------------------+
|U002   |{mobile -> 200, tablet -> 100}  |
|U001   |{mobile -> 120,  laptop -> 300} |
|U005   |{laptop -> 250}                 |
|U003   |{mobile -> 150,  desktop -> 400}|
+-------+--------------------------------+



In [46]:
#5
df_no_null_maps = df_valid_usage.withColumn(
    "usage",
    when(col("usage").isNull(), create_map()).otherwise(col("usage"))
)

df_no_null_maps.show(truncate=False)

+-------+--------------------------------+
|user_id|usage                           |
+-------+--------------------------------+
|U002   |{mobile -> 200, tablet -> 100}  |
|U001   |{mobile -> 120,  laptop -> 300} |
|U005   |{laptop -> 250}                 |
|U003   |{mobile -> 150,  desktop -> 400}|
+-------+--------------------------------+



In [47]:
#6
df_mobile_usage = df_no_null_maps.withColumn(
    "mobile_usage",
    col("usage").getItem("mobile")
)

df_mobile_usage.show(truncate=False)

+-------+--------------------------------+------------+
|user_id|usage                           |mobile_usage|
+-------+--------------------------------+------------+
|U002   |{mobile -> 200, tablet -> 100}  |200         |
|U001   |{mobile -> 120,  laptop -> 300} |120         |
|U005   |{laptop -> 250}                 |NULL        |
|U003   |{mobile -> 150,  desktop -> 400}|150         |
+-------+--------------------------------+------------+



In [48]:
#7
THRESHOLD = 150

df_high_users = df_mobile_usage.filter(
    col("mobile_usage") > THRESHOLD
)

df_high_users.show(truncate=False)

+-------+------------------------------+------------+
|user_id|usage                         |mobile_usage|
+-------+------------------------------+------------+
|U002   |{mobile -> 200, tablet -> 100}|200         |
+-------+------------------------------+------------+



#DATASET 4 — NESTED ADDRESS JSON (BROKEN STRUCTS)

Problems intentionally introduced

Address sometimes string, map, tuple

Missing fields

Pincode as string

Partial address

Exercises

1. Design a nested StructType for address
2. Normalize all address formats into struct
3. Extract city, state, pincode safely
4. Set default pincode when missing
5. Drop irrecoverable records
6. Flatten the struct into columns

In [52]:
raw_profiles = [
("U001","Hyderabad,Telangana,500081"),
("U002",{"city":"Delhi","state":"Delhi","pincode":"110001"}),
("U003",("Bangalore","Karnataka",560001)),
("U004","Mumbai,MH"),
("U005",None)
]

schema = StructType([
    StructField("user_id", StringType()),
    StructField("address_raw", StringType())
])
df = spark.createDataFrame(raw_profiles, schema)
df.printSchema()
df.show()

root
 |-- user_id: string (nullable = true)
 |-- address_raw: string (nullable = true)

+-------+--------------------+
|user_id|         address_raw|
+-------+--------------------+
|   U001|Hyderabad,Telanga...|
|   U002|{pincode=110001, ...|
|   U003|[Ljava.lang.Objec...|
|   U004|           Mumbai,MH|
|   U005|                NULL|
+-------+--------------------+



In [53]:
#1
address_schema = StructType([
    StructField("city", StringType()),
    StructField("state", StringType()),
    StructField("pincode", StringType())
])

In [55]:
#2
df.withColumn(
    "parts", split(col("address_raw"), ",")
).show()

+-------+--------------------+--------------------+
|user_id|         address_raw|               parts|
+-------+--------------------+--------------------+
|   U001|Hyderabad,Telanga...|[Hyderabad, Telan...|
|   U002|{pincode=110001, ...|[{pincode=110001,...|
|   U003|[Ljava.lang.Objec...|[[Ljava.lang.Obje...|
|   U004|           Mumbai,MH|        [Mumbai, MH]|
|   U005|                NULL|                NULL|
+-------+--------------------+--------------------+



In [75]:
#3
df_extracted = (
    df_addr
    .withColumn("city", col("parts")[0])
    .withColumn("state", col("parts")[1])
    .withColumn("pincode", col("parts")[2])
)

In [63]:
#4
df_pincode = df_extracted.withColumn(
    "pincode",
    when(col("pincode").isNull(), "000000").otherwise(col("pincode"))
)

In [64]:
#5
df_valid = df_pincode.filter(col("city").isNotNull())

In [68]:
#6
df_flat = df_valid.select("user_id", "city", "state", "pincode")

#DATASET 5 — TRANSACTION LOGS (MIXED DATES & NUMBERS)

Problems intentionally introduced

Multiple date formats

Amount as words

Missing dates

Inconsistent separators

Exercises
1. Design schema using StructType
2. Normalize all dates into DateType
3. Convert amount into integer
4. Identify unrecoverable records
5. Separate valid vs invalid transactions
6. Produce a clean transactions DataFrame

In [69]:
raw_transactions = [
    ("T001", "2024-01-05", "45000"),
    ("T002", "05/01/2024", 52000),
    ("T003", "Jan 06 2024", "Thirty Thousand"),
    ("T004", None, 38000),
    ("T005", "2024/01/07", "42000")
]

schema = StructType([
    StructField("txn_id", StringType()),
    StructField("date_raw", StringType()),
    StructField("amount_raw", StringType())
])

df = spark.createDataFrame(raw_transactions, schema)
df.printSchema()
df.show()

root
 |-- txn_id: string (nullable = true)
 |-- date_raw: string (nullable = true)
 |-- amount_raw: string (nullable = true)

+------+-----------+---------------+
|txn_id|   date_raw|     amount_raw|
+------+-----------+---------------+
|  T001| 2024-01-05|          45000|
|  T002| 05/01/2024|          52000|
|  T003|Jan 06 2024|Thirty Thousand|
|  T004|       NULL|          38000|
|  T005| 2024/01/07|          42000|
+------+-----------+---------------+



In [79]:
from pyspark.sql.functions import to_date, coalesce, to_timestamp, try_to_timestamp, lit
from pyspark.sql.types import DateType

txn_schema = schema
df_dates = df.withColumn(
    "txn_date",
    coalesce(
        try_to_timestamp("date_raw", lit("yyyy-MM-dd")),
        try_to_timestamp("date_raw", lit("dd/MM/yyyy")),
        try_to_timestamp("date_raw", lit("MMM dd yyyy")),
        try_to_timestamp("date_raw", lit("yyyy/MM/dd"))
    ).cast(DateType())
)
df_dates.show()

+------+-----------+---------------+----------+
|txn_id|   date_raw|     amount_raw|  txn_date|
+------+-----------+---------------+----------+
|  T001| 2024-01-05|          45000|2024-01-05|
|  T002| 05/01/2024|          52000|2024-01-05|
|  T003|Jan 06 2024|Thirty Thousand|2024-01-06|
|  T004|       NULL|          38000|      NULL|
|  T005| 2024/01/07|          42000|2024-01-07|
+------+-----------+---------------+----------+



In [81]:
df_amount = df_dates.withColumn(
    "amount",
    when(col("amount_raw").rlike("^[0-9]+$"), col("amount_raw").cast("int"))
)
df_amount.show()

+------+-----------+---------------+----------+------+
|txn_id|   date_raw|     amount_raw|  txn_date|amount|
+------+-----------+---------------+----------+------+
|  T001| 2024-01-05|          45000|2024-01-05| 45000|
|  T002| 05/01/2024|          52000|2024-01-05| 52000|
|  T003|Jan 06 2024|Thirty Thousand|2024-01-06|  NULL|
|  T004|       NULL|          38000|      NULL| 38000|
|  T005| 2024/01/07|          42000|2024-01-07| 42000|
+------+-----------+---------------+----------+------+



In [82]:
df_invalid = df_amount.filter(
    col("txn_date").isNull() | col("amount").isNull()
)
df_invalid.show()

+------+-----------+---------------+----------+------+
|txn_id|   date_raw|     amount_raw|  txn_date|amount|
+------+-----------+---------------+----------+------+
|  T003|Jan 06 2024|Thirty Thousand|2024-01-06|  NULL|
|  T004|       NULL|          38000|      NULL| 38000|
+------+-----------+---------------+----------+------+



In [83]:
df_valid = df_amount.filter(
    col("txn_date").isNotNull() & col("amount").isNotNull()
)
df_valid.show()

+------+----------+----------+----------+------+
|txn_id|  date_raw|amount_raw|  txn_date|amount|
+------+----------+----------+----------+------+
|  T001|2024-01-05|     45000|2024-01-05| 45000|
|  T002|05/01/2024|     52000|2024-01-05| 52000|
|  T005|2024/01/07|     42000|2024-01-07| 42000|
+------+----------+----------+----------+------+



In [85]:
df_valid.select("txn_id", "txn_date", "amount").show()

+------+----------+------+
|txn_id|  txn_date|amount|
+------+----------+------+
|  T001|2024-01-05| 45000|
|  T002|2024-01-05| 52000|
|  T005|2024-01-07| 42000|
+------+----------+------+

