In [3]:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

raw_users = [
    ("U001","Amit","29","Hyderabad","50000"),
    ("U002","Neha","Thirty Two","Delhi","62000"),
    ("U003","Ravi",None,"Bangalore","45k"),
    ("U004","Pooja","28","Mumbai",58000),
    ("U005",None,"31","Chennai","")
]
user_schema = StructType([
    StructField("user_id", StringType(), False),
    StructField("name",    StringType(), True),
    StructField("age_raw", StringType(), True),
    StructField("city",    StringType(), True),
    StructField("salary_raw", StringType(), True)
])
users_df = spark.createDataFrame(raw_users, schema=user_schema)

users_df = users_df.withColumn("age_trim", F.trim(F.col("age_raw")))
users_df.show()
users_df = users_df.withColumn("age_int",
    F.when(F.col("age_trim").rlike(r"^[0-9]+$"), F.col("age_trim").cast(IntegerType()))
     .otherwise(F.lit(None).cast(IntegerType()))
)
users_df.show()

conversion_fail_df = users_df.filter(F.col("age_int").isNull())

# 4) Convert age to integer safely (already done in age_int)
# 5) Normalize salary into integer (handle 'k', empty, None)
def normalize_salary(col):
    return F.when(F.col(col).isNull() | (F.trim(F.col(col)) == ""), F.lit(None).cast(IntegerType())) \
            .when(F.lower(F.trim(F.col(col))).rlike(r"^[0-9]+k$"),
                  (F.regexp_extract(F.lower(F.trim(F.col(col))), r"^([0-9]+)k$", 1).cast(IntegerType()) * F.lit(1000))) \
            .when(F.trim(F.col(col)).rlike(r"^[0-9]+$"),
                  F.trim(F.col(col)).cast(IntegerType())) \
            .otherwise(F.lit(None).cast(IntegerType()))

users_df = users_df.withColumn("salary_int", normalize_salary("salary_raw"))
users_df.show()

users_df = users_df.withColumn("name_clean",
    F.when(F.col("name").isNull() | (F.trim(F.col("name")) == ""), F.lit("UNKNOWN"))
     .otherwise(F.col("name"))
)
users_df.show()


clean_users_df = users_df.filter(F.col("age_int").isNotNull())
clean_users_df.show()


final_users_df = clean_users_df.select(
    "user_id",
    F.col("name_clean").alias("name"),
    F.col("age_int").alias("age"),
    "city",
    F.col("salary_int").alias("salary")
)


conversion_fail_df.show(truncate=False)
final_users_df.show()




+-------+-----+----------+---------+----------+----------+
|user_id| name|   age_raw|     city|salary_raw|  age_trim|
+-------+-----+----------+---------+----------+----------+
|   U001| Amit|        29|Hyderabad|     50000|        29|
|   U002| Neha|Thirty Two|    Delhi|     62000|Thirty Two|
|   U003| Ravi|      NULL|Bangalore|       45k|      NULL|
|   U004|Pooja|        28|   Mumbai|     58000|        28|
|   U005| NULL|        31|  Chennai|          |        31|
+-------+-----+----------+---------+----------+----------+

+-------+-----+----------+---------+----------+----------+-------+
|user_id| name|   age_raw|     city|salary_raw|  age_trim|age_int|
+-------+-----+----------+---------+----------+----------+-------+
|   U001| Amit|        29|Hyderabad|     50000|        29|     29|
|   U002| Neha|Thirty Two|    Delhi|     62000|Thirty Two|   NULL|
|   U003| Ravi|      NULL|Bangalore|       45k|      NULL|   NULL|
|   U004|Pooja|        28|   Mumbai|     58000|        28|     28|

In [8]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql import functions as F

raw_orders = [
    ("O001","U001","Laptop,Mobile,Tablet",75000),
    ("O002","U002",["Mobile","Tablet"],32000),
    ("O003","U003","Laptop",72000),
    ("O004","U004",None,25000),
    ("O005","U005","Laptop\nMobile",68000)
]

# 1) Schema with ArrayType for items (we'll ingest as string and array flexibly)
orders_schema = StructType([
    StructField("order_id", StringType(), False),
    StructField("user_id",  StringType(), False),
    StructField("items_raw", StringType(), True),   # we will harmonize to arrays
    StructField("amount",   IntegerType(), True)
])

# Create DF but we need to coerce list entries to strings to fit schema uniformly
# So we transform raw_orders: if items is list -> join with comma; if None -> None; else str
def to_uniform_row(row):
    order_id, user_id, items, amount = row
    if items is None:
        items_str = None
    elif isinstance(items, list):
        items_str = ",".join(items)
    else:
        items_str = str(items)
    return (order_id, user_id, items_str, int(amount) if amount is not None else None)

uniform_orders = list(map(to_uniform_row, raw_orders))
print(uniform_orders)

orders_df = spark.createDataFrame(uniform_orders, schema=orders_schema)


split_pattern = r"[,\n]+"

orders_df = orders_df.withColumn(
    "items_array",
    F.when(F.col("items_raw").isNull(), F.array())  # 4) replace null items with empty arrays
     .otherwise(
         F.filter(
             F.transform(
                 F.split(F.col("items_raw"), split_pattern),
                 lambda x: F.trim(x)
             ),
             lambda x: x != ""
         )
     )
)
orders_df.show()

# 5) Explode items into one row per item
exploded_df = orders_df.select("order_id", "user_id", "amount", F.explode("items_array").alias("item"))

# 6) Count frequency of each item
item_freq_df = exploded_df.groupBy("item").agg(F.count("*").alias("freq")).orderBy(F.desc("freq"))

# 7) Identify orders with more than 2 items
orders_with_counts = orders_df.withColumn("item_count", F.size("items_array"))
orders_gt2_df = orders_with_counts.filter(F.col("item_count") > 2)

print("Orders normalized:")
orders_df.show(truncate=False)

print("Exploded items:")
exploded_df.show(truncate=False)

print("Item frequencies:")
item_freq_df.show(truncate=False)

print("Orders with > 2 items:")
orders_gt2_df.select("order_id", "user_id", "item_count").show(truncate=False)


[('O001', 'U001', 'Laptop,Mobile,Tablet', 75000), ('O002', 'U002', 'Mobile,Tablet', 32000), ('O003', 'U003', 'Laptop', 72000), ('O004', 'U004', None, 25000), ('O005', 'U005', 'Laptop\nMobile', 68000)]
+--------+-------+--------------------+------+--------------------+
|order_id|user_id|           items_raw|amount|         items_array|
+--------+-------+--------------------+------+--------------------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|
|    O002|   U002|       Mobile,Tablet| 32000|    [Mobile, Tablet]|
|    O003|   U003|              Laptop| 72000|            [Laptop]|
|    O004|   U004|                NULL| 25000|                  []|
|    O005|   U005|      Laptop\nMobile| 68000|    [Laptop, Mobile]|
+--------+-------+--------------------+------+--------------------+

Orders normalized:
+--------+-------+--------------------+------+------------------------+
|order_id|user_id|items_raw           |amount|items_array             |
+--------+-------+-----

In [9]:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# -----------------------------
# 0) Raw input (as provided)
# -----------------------------
raw_users = [
    ("U001","Amit","28","Hyderabad","['AI','ML','Cloud']"),
    ("U002","Neha","Thirty","Delhi","AI,Testing"),
    ("U003","Ravi",None,"Bangalore",["Data","Spark"]),
    ("U004","Pooja","29","Mumbai",None),
    ("U005","", "31","Chennai","['DevOps']")
]

# -------------------------------------------------------
# 1) Explicit schema (store skills as STRING initially)
# -------------------------------------------------------
users_schema = StructType([
    StructField("user_id",   StringType(), False),
    StructField("name_raw",  StringType(), True),
    StructField("age_raw",   StringType(), True),
    StructField("city",      StringType(), True),
    StructField("skills_raw",StringType(), True)   # normalize to string for robust parsing
])

# We will convert any Python list in raw_users to a comma-separated string
def normalize_row(row):
    user_id, name, age, city, skills = row
    # skills: list -> "a,b", None -> None, str -> as-is
    if isinstance(skills, list):
        skills_str = ",".join(map(str, skills))
    else:
        skills_str = None if skills is None else str(skills)
    # age: ensure string/None
    age_str = None if age is None else str(age)
    # name: keep as-is (string or empty)
    return (str(user_id), None if name is None else str(name), age_str, str(city), skills_str)

uniform_rows = list(map(normalize_row, raw_users))

users_df = spark.createDataFrame(uniform_rows, schema=users_schema)

# -------------------------------------------------------
# 2) Normalize age into IntegerType
# -------------------------------------------------------
# Rule:
# - If age_raw is digits only -> cast to int
# - If age_raw equals 'thirty' (case-insensitive) -> 30
# - Else -> null
users_df = users_df.withColumn("age_trim", F.trim(F.col("age_raw")))
users_df = users_df.withColumn(
    "age_int",
    F.when(F.col("age_trim").rlike(r"^[0-9]+$"), F.col("age_trim").cast(IntegerType()))
     .when(F.lower(F.col("age_trim")) == F.lit("thirty"), F.lit(30))
     .otherwise(F.lit(None).cast(IntegerType()))
)

# -------------------------------------------------------
# 3) Normalize skills into ArrayType(StringType)
# -------------------------------------------------------
# We handle formats:
# - "['AI','ML','Cloud']"  -> strip [ ], strip quotes -> split by comma
# - "AI,Testing"           -> split by comma
# - None                   -> empty array
# After split: trim tokens and drop blanks.

# Step A: default null -> empty string for string preprocessing
skills_str = F.coalesce(F.col("skills_raw"), F.lit(""))

# Step B: remove surrounding [ ] if present, and remove single/double quotes
skills_no_brackets = F.regexp_replace(skills_str, r"^\s*\[|\]\s*$", "")  # strip leading '[' and trailing ']'
skills_no_quotes   = F.regexp_replace(skills_no_brackets, r"[\"']", "")  # drop any quotes

# Step C: split by comma, then trim each item, then filter out empty items
split_items = F.split(skills_no_quotes, r"\s*,\s*")
trimmed_items = F.transform(split_items, lambda x: F.trim(x))
nonempty_items = F.filter(trimmed_items, lambda x: x != "")

# Step D: if original was null -> empty array; otherwise use parsed array
users_df = users_df.withColumn(
    "skills",
    F.when(F.col("skills_raw").isNull(), F.array())  # empty array
     .otherwise(nonempty_items)
)

# (Optional) deduplicate skills and standardize case (uncomment if desired)
# users_df = users_df.withColumn(
#     "skills",
#     F.array_distinct(F.transform(F.col("skills"), lambda x: F.initcap(x)))  # e.g., "ml" -> "Ml"
# )

# -------------------------------------------------------
# 4) Handle empty or missing names
# -------------------------------------------------------
users_df = users_df.withColumn(
    "name",
    F.when(F.col("name_raw").isNull() | (F.trim(F.col("name_raw")) == ""), F.lit("UNKNOWN"))
     .otherwise(F.col("name_raw"))
)

# -------------------------------------------------------
# 5) Produce the final clean users_df
# -------------------------------------------------------
final_users_df = users_df.select(
    "user_id",
    "name",
    F.col("age_int").alias("age"),
    "city",
    "skills"
)

# (Optional) If you want to drop rows where age is unrecoverable:
# final_users_df = final_users_df.filter(F.col("age").isNotNull())

# Show results
print("Final cleaned users_df:")
final_users_df


Final cleaned users_df:


DataFrame[user_id: string, name: string, age: int, city: string, skills: array<string>]