In [1]:
from google.colab import files

# Launch Colab's upload dialog and pick the two input files in one go:
#   - orders_large_bad.csv
#   - orders_large_bad.json
uploaded = files.upload()

# Echo the received file names so the upload can be verified at a glance.
print("Uploaded files:", list(uploaded))

Saving orders_large_bad.csv to orders_large_bad.csv
Saving orders_large_bad.json to orders_large_bad.json
Uploaded files: ['orders_large_bad.csv', 'orders_large_bad.json']


In [2]:
# List /content with human-readable sizes to confirm the uploaded files are present.
!ls -lh /content

total 70M
-rw-r--r-- 1 root root  20M Dec 26 05:47 orders_large_bad.csv
-rw-r--r-- 1 root root  50M Dec 26 05:47 orders_large_bad.json
drwxr-xr-x 1 root root 4.0K Dec 11 14:34 sample_data


In [3]:
from pyspark.sql import SparkSession, functions as F, types as T

def _show_random_rows(df, n: int = 20) -> None:
    """Print ``n`` randomly ordered rows of ``df`` without truncating cell values."""
    df.orderBy(F.rand()).limit(n).show(truncate=False)


def _read_csv_as_strings(spark, csv_path: str):
    """Read the CSV at ``csv_path`` with a header row and no schema inference.

    NOTE: the CSV reader already yields strings when inferSchema=False (the
    default); the option is set explicitly for clarity.
    """
    return (
        spark.read
        .option("header", True)
        .option("inferSchema", False)
        .option("mode", "PERMISSIVE")   # keep bad records instead of failing
        .option("multiLine", False)     # CSV is single-line records
        .csv(csv_path)
    )


def _read_json_as_strings(spark, json_path: str):
    """Read line-delimited JSON at ``json_path`` with primitives forced to string.

    "primitivesAsString" stops the reader from inferring numeric/boolean types,
    which is what Phase 1 (observation only) needs.
    """
    return (
        spark.read
        .option("multiLine", False)                 # line-delimited JSON
        .option("primitivesAsString", True)         # <- critical for Phase 1
        .option("mode", "PERMISSIVE")
        .json(json_path)
    )


def _compare_schemas(df_csv, df_json, csv_count: int, json_count: int) -> None:
    """Print the column-name differences and row counts of the two sources."""
    csv_cols = set(df_csv.columns)
    json_cols = set(df_json.columns)

    print("\n=== SCHEMA COMPARISON (CSV vs JSON) ===")
    print(f"Columns only in CSV : {sorted(csv_cols - json_cols)}")
    print(f"Columns only in JSON: {sorted(json_cols - csv_cols)}")
    print(f"Have same columns?  {csv_cols == json_cols}")
    print(f"CSV rows: {csv_count:,} | JSON rows: {json_count:,}\n")


def _run_csv_quality_probes(spark, df_csv) -> None:
    """Run the read-only data-quality probes (4.1 - 4.7) on the CSV frame and print them.

    Nothing is fixed here: Phase 1 only observes. ``spark`` is needed solely to
    build the small Probe 1 summary frame.
    """
    # Every column trimmed of leading/trailing spaces; used by probes that
    # should not be affected by padding.
    df_trim = df_csv.select(*[F.trim(F.col(c)).alias(c) for c in df_csv.columns])

    # 4.1 Missing/empty values in some critical columns.
    # All columns are counted in ONE aggregation pass (instead of one Spark job
    # per column); count() skips the NULLs that when() yields for non-matching
    # rows, so each result is the number of NULL-or-blank rows.
    critical_cols = ["order_id", "customer_id", "city", "category", "product", "amount", "order_date", "status"]
    blank_counts = df_trim.agg(
        *[
            F.count(F.when(F.col(c).isNull() | (F.length(F.col(c)) == 0), 1)).alias(c)
            for c in critical_cols
        ]
    ).first()
    missing_summary = spark.createDataFrame(
        [(c, blank_counts[c]) for c in critical_cols],
        ["column", "missing_rows"],
    )
    print("=== DQ Probe 1 — Missing/Empty in Critical Columns (CSV) ===")
    missing_summary.orderBy(F.desc("missing_rows")).show(100, truncate=False)

    # 4.2 'amount' obvious issues:
    #  - literal 'invalid'
    #  - contains commas (e.g., "12,000")
    #  - non-numeric characters after removing commas
    amount_trim = F.trim(F.col("amount"))
    amount_no_commas = F.regexp_replace(amount_trim, ",", "")
    amt_invalid_literal = (F.lower(amount_trim) == F.lit("invalid"))
    amt_has_comma = F.col("amount").rlike(",")
    # NULL amounts evaluate to NULL here, so they are not counted/flagged by
    # this predicate; missing amounts are reported by Probe 1 instead.
    amt_non_numeric = ~amount_no_commas.rlike("^[0-9]+$")

    print("=== DQ Probe 2 — 'amount' validity buckets (CSV) ===")
    df_trim.agg(
        F.sum(amt_invalid_literal.cast("int")).alias("count_invalid_literal"),
        F.sum(amt_has_comma.cast("int")).alias("count_contains_comma"),
        # The literal 'invalid' is reported in its own bucket, so exclude it here.
        F.sum((amt_non_numeric & ~amt_invalid_literal).cast("int")).alias("count_non_numeric_after_strip"),
    ).show(truncate=False)

    # 4.3 'order_date' mixed formats / invalid tokens
    #  Patterns we commonly see in this dataset:
    #   - ISO:          YYYY-MM-DD
    #   - Slash (DMY):  DD/MM/YYYY
    #   - Slash (YMD):  YYYY/MM/DD
    #   - 'invalid_date'
    order_date = F.col("order_date")
    is_iso = order_date.rlike(r"^\d{4}-\d{2}-\d{2}$")
    is_slash_dmy = order_date.rlike(r"^\d{2}/\d{2}/\d{4}$")
    is_slash_ymd = order_date.rlike(r"^\d{4}/\d{2}/\d{2}$")
    is_invalid_token = F.lower(order_date) == F.lit("invalid_date")
    is_missing = order_date.isNull() | (F.length(F.trim(order_date)) == 0)

    print("=== DQ Probe 3 — 'order_date' format buckets (CSV) ===")
    df_trim.agg(
        F.sum(is_iso.cast("int")).alias("count_iso"),
        F.sum(is_slash_dmy.cast("int")).alias("count_slash_dmy"),
        F.sum(is_slash_ymd.cast("int")).alias("count_slash_ymd"),
        F.sum(is_invalid_token.cast("int")).alias("count_invalid_token"),
        F.sum(is_missing.cast("int")).alias("count_missing_or_blank"),
    ).show(truncate=False)

    # 4.4 Extra leading/trailing spaces in common string columns.
    # Must run on the RAW frame: the trimmed frame has no padding left to find.
    str_cols = ["city", "category", "product"]
    exprs_space = [
        F.sum((F.col(c) != F.trim(F.col(c))).cast("int")).alias(f"rows_with_space_{c}")
        for c in str_cols
    ]
    print("=== DQ Probe 4 — Leading/Trailing Spaces (CSV) ===")
    df_csv.agg(*exprs_space).show(truncate=False)

    # 4.5 Case inconsistency in 'city' (distinct before vs after lower+trim)
    city_distinct_before = df_csv.select("city").distinct().count()
    city_distinct_after = df_trim.select(F.lower(F.col("city")).alias("city_norm")).distinct().count()
    print("=== DQ Probe 5 — City distinct values (case/space inconsistency) (CSV) ===")
    print(f"Distinct city values (raw):  {city_distinct_before}")
    print(f"Distinct city values (norm): {city_distinct_after}")

    print("Top examples of city variants mapping to the same normalized token:")
    (
        # Group the RAW values under the normalized key so that both case and
        # whitespace variants show up; collecting from the trimmed frame would
        # hide every padding variant.
        df_csv
        .withColumn("city_norm", F.lower(F.trim(F.col("city"))))
        .groupBy("city_norm")
        .agg(F.collect_set("city").alias("variants"), F.count("*").alias("cnt"))
        .orderBy(F.desc("cnt"))
        .limit(15)
        .show(truncate=False)
    )

    # 4.6 Detect stray spaces in product/category tokens (like "Mobile " or " home ")
    # Only rows with leading/trailing padding are selected; the token arrays
    # show the value split on runs of spaces with empty tokens removed.
    print("=== DQ Probe 6 — Tokens with internal/edge spaces (CSV) ===")
    (
        df_csv
        .select(
            F.expr("filter(split(category, ' +'), x -> x <> '')").alias("category_tokens"),
            F.expr("filter(split(product, ' +'), x -> x <> '')").alias("product_tokens"),
            "category", "product"
        )
        .where((F.col("category") != F.trim(F.col("category"))) | (F.col("product") != F.trim(F.col("product"))))
        .limit(20)
        .show(truncate=False)
    )

    # 4.7 (Optional) Show a few suspicious rows for manual review.
    # Reuses the Probe 2 / Probe 3 predicates so the sample always matches
    # what those probes counted.
    print("=== DQ Probe 7 — Sample suspicious rows (amount invalid/comma/non-numeric OR invalid_date) (CSV) ===")
    suspicious = df_trim.where(
        amt_invalid_literal
        | amt_has_comma
        | amt_non_numeric
        | is_invalid_token
        | is_missing
    )
    _show_random_rows(suspicious)


def _print_phase_summary() -> None:
    """Print the closing Phase 1 summary of the issue categories probed."""
    print("\n=== Phase 1 COMPLETE ===")
    print("Identified issues include (non-exhaustive):")
    print(" - Missing / blank values (amount, order_date, category, product, etc.)")
    print(" - 'amount' with 'invalid', commas ('12,000'), or non-numeric characters")
    print(" - Mixed date formats (YYYY-MM-DD, DD/MM/YYYY, YYYY/MM/DD) and 'invalid_date'")
    print(" - Leading/trailing whitespace & inconsistent casing in city/category/product")
    print(" - Occasional trailing spaces in tokens (e.g., 'Mobile ' or ' home ')")
    print("\nNext Phase: define an explicit schema and validate rows against it.")


def main(
    csv_path: str = "orders_large_bad.csv",
    json_path: str = "orders_large_bad.json",
) -> None:
    """Phase 1 ingestion: load the orders CSV and JSON as strings and profile them.

    Prints each source's schema, row count and 20 random rows, compares the two
    column sets, then runs the read-only data-quality probes on the CSV.

    Args:
        csv_path: Path of the orders CSV file (with a header row).
        json_path: Path of the line-delimited orders JSON file.

    The Spark session is stopped on exit, including when a step raises.
    """
    spark = (
        SparkSession.builder.appName("Orders-Phase-1-Ingestion")
        # Optional: tweak for local runs; adjust as needed
        .config("spark.sql.shuffle.partitions", "200")
        .getOrCreate()
    )

    try:
        # ----------------------------------------------------------------
        # 1) READ CSV (disable schema inference; everything as STRING)
        # ----------------------------------------------------------------
        # Cached because the count, the random sample and every probe below
        # would otherwise re-read and re-parse the file.
        df_csv = _read_csv_as_strings(spark, csv_path).cache()

        print("\n=== CSV — PRINT SCHEMA ===")
        df_csv.printSchema()

        csv_count = df_csv.count()
        print(f"\n=== CSV — RECORD COUNT: {csv_count:,} ===\n")

        print("=== CSV — 20 RANDOM ROWS (for eyeballing) ===")
        _show_random_rows(df_csv)

        # ----------------------------------------------------------------
        # 2) READ JSON (force primitives as STRING to avoid inference)
        # ----------------------------------------------------------------
        df_json = _read_json_as_strings(spark, json_path)

        print("\n=== JSON — PRINT SCHEMA ===")
        df_json.printSchema()

        json_count = df_json.count()
        print(f"\n=== JSON — RECORD COUNT: {json_count:,} ===\n")

        print("=== JSON — 20 RANDOM ROWS (for eyeballing) ===")
        _show_random_rows(df_json)

        # ----------------------------------------------------------------
        # 3) COMPARE CSV vs JSON basic metadata
        # ----------------------------------------------------------------
        _compare_schemas(df_csv, df_json, csv_count, json_count)

        # ----------------------------------------------------------------
        # 4) QUICK DATA-QUALITY PROBES (identify >= 5 issues)
        #    We're still in Phase 1 (observation), so we don't fix anything yet.
        # ----------------------------------------------------------------
        _run_csv_quality_probes(spark, df_csv)

        _print_phase_summary()
    finally:
        # Always release the session (and the cached CSV), even when a read or
        # probe fails part-way through.
        spark.stop()


if __name__ == "__main__":
    # Run Phase 1 against the two files in Colab's /content directory.
    main(
        csv_path="/content/orders_large_bad.csv",
        json_path="/content/orders_large_bad.json",
    )




=== CSV — PRINT SCHEMA ===
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)


=== CSV — RECORD COUNT: 300,000 ===

=== CSV — 20 RANDOM ROWS (for eyeballing) ===
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id   |customer_id|city     |category   |product    |amount |order_date|status   |
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD00045790|C045790    |Delhi    |Fashion    |Shoes      |invalid|2024-01-11|Completed|
|ORD00171632|C021632    | mumbai  |Home       |AirPurifier|65633  |2024-02-02|Completed|
|ORD00190278|C040278    |Hyderabad| grocery   |Sugar      |24858  |19/01/2024|Completed|
|ORD00145245|C045245    |Hyderab