In [90]:
import os
import re
import logging
from datetime import datetime
from typing import List, Optional, Tuple, Dict

from pyspark.sql import SparkSession, DataFrame # type: ignore
from pyspark.sql.functions import col, input_file_name, year, month, to_timestamp, lit, to_date # type: ignore

# Logging setup
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("InsertRawToBronze")

# Constants
SOURCE_NAMESPACE='test_append'
BASE_RAW_DATA_DIR = "/src/data/gold_test.csv"
TARGET_CATALOG = "datalake"
TARGET_NAMESPACE = f"{TARGET_CATALOG}.bronze"
DATE_FORMATS = [
    'MM-dd-yy',
    "yyyy-MM-dd'T'HH:mm:ss.SSSZ", 
    "yyyy-MM-dd'T'HH:mm:ssZ", 
    'yyyy-MM-dd HH:mm:ss.SSS', 
    'yyyy-MM-dd HH:mm:ss', 
    'MM/dd/yyyy HH:mm:ss', 
    'MM/dd/yyyy', 
    'yyyy-MM-dd', 
    'dd/MM/yyyy', 
    'dd-MM-yyyy', 
    'MM-dd-yyyy',
    'MMM d, yyyy',
    'MMMM d, yyyy'
]

# Standard column mapping
STANDARD_COLUMNS = {
    'date': ['date', 'datetime', 'time'],
    'price': ['price', 'close'],
    'open': ['open', 'Open'],
    'high': ['high', 'High'],
    'low': ['low'],
    'volume': ['vol', 'vol.', 'volume', 'Volume'],
    'change': ['change', 'change %'],
    'id': ['id'],
    'adj': ['adj', 'Adj'],
    'price_tip': ['close_tip'],  
    'adj_price': ['adj_close', 'adj close']
}

In [91]:
spark = (
        SparkSession.builder.appName("Test_read")
        .enableHiveSupport()
        .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
        .config("spark.sql.avro.datetimeRebaseModeInWrite", "CORRECTED")
        .getOrCreate()
    )
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {SOURCE_NAMESPACE}")

DataFrame[]

In [81]:
def clean_name(name: str) -> str:
    """Clean folder or column names for SQL use"""
    if not name:
        return "unnamed"
    
    # Replace non-alphanumeric characters with underscore
    cleaned = re.sub(r"[^a-zA-Z0-9_]", "_", name)
    # Clean up consecutive underscores
    cleaned = re.sub(r"_+", "_", cleaned)
    # Remove leading/trailing underscores
    cleaned = cleaned.strip("_")
    # Convert to lowercase
    cleaned = cleaned.lower()
    # Add leading underscore if starts with digit
    if cleaned and cleaned[0].isdigit():
        cleaned = "_" + cleaned
        
    return cleaned

def clean_column_names(df: DataFrame) -> DataFrame:
    """Clean all column names in DataFrame"""
    original_columns = df.columns
    new_columns = [clean_name(col) for col in original_columns]
    
    # Handle duplicate column names after cleaning
    final_columns = []
    seen = {}
    for col in new_columns:
        if col in seen:
            seen[col] += 1
            final_columns.append(f"{col}_{seen[col]}")
        else:
            seen[col] = 0
            final_columns.append(col)
    
    if original_columns != final_columns:
        logger.warning(f"Duplicate or invalid column names detected after cleaning. Renaming columns: {list(zip(original_columns, final_columns))}")
        return df.toDF(*final_columns)
    
    return df

In [82]:
def standardize_schema(df: DataFrame) -> DataFrame:
    """Map columns to standard names and ensure consistent schema"""
    # Log original columns for debugging
    logger.info(f"Original columns: {df.columns}")
    
    # Create a mapping from actual column names to standard column names
    column_mapping = {}
    
    # First, find all potential matches
    potential_matches = {}
    for std_col, possible_names in STANDARD_COLUMNS.items():
        for actual_col in df.columns:
            if actual_col.lower() in [name.lower() for name in possible_names]:
                if std_col not in potential_matches:
                    potential_matches[std_col] = []
                potential_matches[std_col].append(actual_col)
    
    # Resolve the mappings, ensuring no duplicate target columns
    used_columns = set()
    for std_col, matched_columns in potential_matches.items():
        if len(matched_columns) == 1:
            # Only one match for this standard column
            column_mapping[matched_columns[0]] = std_col
            used_columns.add(matched_columns[0])
        else:
            # Multiple matches - need to create unique names
            for idx, matched_col in enumerate(matched_columns):
                if idx == 0:
                    # First match gets the standard name
                    column_mapping[matched_col] = std_col
                else:
                    # Subsequent matches get suffixed names
                    column_mapping[matched_col] = f"{std_col}_{idx}"
                used_columns.add(matched_col)
    
    # Include any remaining columns with their original names
    for col in df.columns:
        if col not in used_columns and col != "_source_file":
            column_mapping[col] = col
            
    logger.info(f"Column mapping: {column_mapping}")
    
    # Create a standardized DataFrame
    std_df = df
    
    # Rename columns according to mapping
    for orig_col, std_col in column_mapping.items():
        std_df = std_df.withColumnRenamed(orig_col, std_col)
    
    # Ensure source_file column exists
    if "_source_file" in std_df.columns:
        std_df = std_df.withColumnRenamed("_source_file", "source_file")
    
    logger.info(f"Standardized columns: {std_df.columns}")
    return std_df

In [83]:
def parse_date_column(df: DataFrame, date_col_name: str) -> DataFrame:
    """Try to parse the date column using multiple formats"""
    logger.info(f"Attempting to parse date column '{date_col_name}' using formats: {DATE_FORMATS}")
    
    # Create a new DataFrame with a parsed date column
    parsed_df = df
    
    # Try each date format in succession
    for idx, date_format in enumerate(DATE_FORMATS):
        try:
            logger.info(f"Trying date format #{idx+1}: {date_format}")
            # Create temporary column with this format
            temp_df = parsed_df.withColumn(
                "parsed_date", 
                to_timestamp(col(date_col_name), date_format)
            )
            
            # Count non-null values with this format
            valid_count = temp_df.filter(col("parsed_date").isNotNull()).count()
            logger.info(f"Format {date_format} produced {valid_count} valid dates")
            
            if valid_count > 0:
                # Use this format since it worked for some values
                parsed_df = temp_df
                logger.info(f"Using format '{date_format}' for date parsing")
                break
        except Exception as e:
            logger.warning(f"Format '{date_format}' failed with error: {str(e)}")
    
    return parsed_df

In [84]:
def add_partition_columns(df: DataFrame) -> Tuple[DataFrame, List[str]]:
    """Add year and month partition columns from date column"""
    logger.info("Attempting to add partition columns...")
    
    # Find date column (case insensitive)
    date_col_candidates = [c for c in df.columns if c.lower() == "date"]
    
    if not date_col_candidates:
        logger.warning("No 'date' column found. Skipping partitioning.")
        return df, []
    
    date_col_name = date_col_candidates[0]
    logger.info(f"Found date column: '{date_col_name}'")
    
    try:
        # Parse the date column
        df_with_parsed_date = parse_date_column(df, date_col_name)
        
        # Check if we successfully parsed any dates
        if "parsed_date" not in df_with_parsed_date.columns:
            logger.warning("Failed to parse date column with any format. Skipping partitioning.")
            return df, []
            
        # Add year and month columns from the parsed date
        df_partitioned = (
            df_with_parsed_date
            .withColumn("year", year(col("parsed_date")))
            .withColumn("month", month(col("parsed_date")))
        )
        
        # Check if partition columns were successfully added
        null_years = df_partitioned.filter(col("year").isNull()).count()
        if null_years > 0:
            logger.warning(f"{null_years} rows have NULL year values after date parsing")
        
        logger.info(f"Successfully added partition columns 'year', 'month' from '{date_col_name}'")
        
        # Replace the original date column with the parsed_date and then drop the parsed_date column
        df_partitioned = df_partitioned.drop(date_col_name).withColumnRenamed("parsed_date", date_col_name)
        
        # Return the DataFrame with the partition columns
        return df_partitioned, ["year", "month"]
            
    except Exception as e:
        logger.error(f"Error adding partition columns: {str(e)}")
        logger.error("Continuing without partition columns")
        return df, []

In [85]:
def write_to_iceberg(df: DataFrame, table_name: str, partition_by: List[str]):
    """Write DataFrame to Iceberg table"""
    logger.info(f"Starting write operation to Iceberg table: {table_name}")
    logger.info(f"Partitioning by: {partition_by}")
    logger.info(f"Write mode: overwrite")
    logger.info(f"DataFrame contains {df.count()} rows to write")
    
    try:
        writer = (
            df.write
            .format("iceberg")
            .mode("append")
            .option("mergeSchema", "true")
        )
        
        if partition_by and all(p in df.columns for p in partition_by):
            # Ensure partition columns have valid data
            null_partition_count = df.filter(
                ' OR '.join([f"{p} IS NULL" for p in partition_by])
            ).count()
            
            if null_partition_count > 0:
                logger.warning(f"{null_partition_count} rows have NULL partition values")
            
            if null_partition_count < df.count():  # Only partition if some rows have valid values
                writer = writer.partitionBy(*partition_by)
                logger.info(f"Partitioning by {partition_by}")
            else:
                logger.warning("Skipping partitioning as all partition columns contain NULL values")
        else:
            logger.warning(f"Partition columns missing from DataFrame. Skipping partitioning.")
        
        writer.saveAsTable(table_name)
        logger.info(f"Successfully wrote data to Iceberg table: {table_name}")
        
    except Exception as e:
        logger.error(f"Failed to write to Iceberg table {table_name}: {str(e)}")
        raise

In [55]:
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("mode", "PERMISSIVE")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv(BASE_RAW_DATA_DIR)
)

In [56]:
df_cleaned = clean_column_names(df)



In [57]:
df_standardized = standardize_schema(df_cleaned)

2025-05-20 16:03:01,395 - INFO - Original columns: ['c0', 'date', 'price', 'open', 'high', 'low', 'vol', 'change']
2025-05-20 16:03:01,396 - INFO - Column mapping: {'date': 'date', 'price': 'price', 'open': 'open', 'high': 'high', 'low': 'low', 'vol': 'volume', 'change': 'change', 'c0': 'c0'}
2025-05-20 16:03:01,423 - INFO - Standardized columns: ['c0', 'date', 'price', 'open', 'high', 'low', 'volume', 'change']


In [58]:
df_partitioned, partition_cols = add_partition_columns(df_standardized)

2025-05-20 16:03:02,121 - INFO - Attempting to add partition columns...
2025-05-20 16:03:02,124 - INFO - Found date column: 'date'
2025-05-20 16:03:02,126 - INFO - Attempting to parse date column 'date' using formats: ['MM-dd-yy', "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd'T'HH:mm:ssZ", 'yyyy-MM-dd HH:mm:ss.SSS', 'yyyy-MM-dd HH:mm:ss', 'MM/dd/yyyy HH:mm:ss', 'MM/dd/yyyy', 'yyyy-MM-dd', 'dd/MM/yyyy', 'dd-MM-yyyy', 'MM-dd-yyyy', 'MMM d, yyyy', 'MMMM d, yyyy']
2025-05-20 16:03:02,127 - INFO - Trying date format #1: MM-dd-yy
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 174.0 failed 1 times, most recent failure: Lost task 0.0 in stage 174.0 (TID 123) (dca658d2bc53 executor driver): org.apache.spark.SparkUpgradeException: [INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER] You may get a different result due to the upgrading to Spark >= 3.0:
Fail to parse '03-08-2025' in the new parser. You can set "spark.sql.legacy.timeParserPolicy" to "L

In [59]:
full_table_name = 'datalake.bronze.gold'

In [64]:
df_partitioned = df_partitioned.drop(col('volume'), col('change')).withColumn('source_file', lit('abc'))
df_partitioned.show(5)

+---+--------+--------+--------+--------+-------------------+----+-----+-----------+
| c0|   price|    open|    high|     low|               date|year|month|source_file|
+---+--------+--------+--------+--------+-------------------+----+-----+-----------+
|  1|2,926.11|2,916.80|2,931.26|2,906.30|2025-08-03 00:00:00|2025|    8|        abc|
|  2|2,926.60|2,929.50|2,935.90|2,897.60|2025-09-03 00:00:00|2025|    9|        abc|
|  3|2,926.00|2,929.00|2,941.30|2,903.40|2025-10-03 00:00:00|2025|   10|        abc|
|  4|2,920.60|2,904.20|2,939.80|2,892.50|2025-11-03 00:00:00|2025|   11|        abc|
|  5|2,901.10|2,872.00|2,906.40|2,866.30|2025-12-03 00:00:00|2025|   12|        abc|
+---+--------+--------+--------+--------+-------------------+----+-----+-----------+



In [67]:
FINANCIAL_COLUMNS = ["price", "price_tip", "adj_price", "open", "high", "low", "volume", "change"]
for column in [c for c in df_partitioned.columns if c in FINANCIAL_COLUMNS]:
    if column in df_partitioned.columns:
        df_partitioned = df_partitioned.withColumn(column, col(column).cast("double"))

In [68]:
# Check schema before writing to ensure 'date' is properly typed
df_partitioned.printSchema()
print(f"Number of rows: {df_partitioned.count()}")

root
 |-- c0: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- source_file: string (nullable = false)

Number of rows: 5


In [69]:
write_to_iceberg(df_partitioned, full_table_name, partition_cols)

2025-05-20 16:06:46,503 - INFO - Starting write operation to Iceberg table: datalake.bronze.gold
2025-05-20 16:06:46,505 - INFO - Partitioning by: ['year', 'month']
2025-05-20 16:06:46,505 - INFO - Write mode: overwrite
2025-05-20 16:06:46,641 - INFO - DataFrame contains 5 rows to write
2025-05-20 16:06:46,875 - INFO - Partitioning by ['year', 'month']
2025-05-20 16:06:48,845 - INFO - Successfully wrote data to Iceberg table: datalake.bronze.gold


In [76]:
test = spark.table('datalake.bronze.gold')
test.orderBy('date')

DataFrame[c0: int, date: date, price: double, open: double, high: double, low: double, source_file: string, year: int, month: int]

In [80]:
# 👇 Lọc các dòng có date < 2025-08-03
filtered_df = test.filter(col("date") < to_timestamp(lit("2025-08-03")))

# 🖨️ In kết quả
filtered_df.head(5)

[Row(c0=10133, date=datetime.date(2002, 10, 1), price=322.2, open=324.3, high=325.0, low=321.6, source_file='file:///src/data/raw/Gold/Gold.csv', year=2002, month=10),
 Row(c0=10134, date=datetime.date(2002, 10, 2), price=322.8, open=322.1, high=323.6, low=320.8, source_file='file:///src/data/raw/Gold/Gold.csv', year=2002, month=10),
 Row(c0=10135, date=datetime.date(2002, 10, 3), price=322.4, open=322.8, high=325.5, low=322.0, source_file='file:///src/data/raw/Gold/Gold.csv', year=2002, month=10),
 Row(c0=10136, date=datetime.date(2002, 10, 4), price=323.3, open=322.1, high=323.6, low=320.0, source_file='file:///src/data/raw/Gold/Gold.csv', year=2002, month=10),
 Row(c0=10137, date=datetime.date(2002, 10, 5), price=323.3, open=322.1, high=323.6, low=320.0, source_file='file:///src/data/raw/Gold/Gold.csv', year=2002, month=10)]

In [86]:
df_partitioned, partition_cols = add_partition_columns(filtered_df)

2025-05-22 15:25:41,681 - INFO - Attempting to add partition columns...
2025-05-22 15:25:41,686 - INFO - Found date column: 'date'
2025-05-22 15:25:41,687 - INFO - Attempting to parse date column 'date' using formats: ['MM-dd-yy', "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd'T'HH:mm:ssZ", 'yyyy-MM-dd HH:mm:ss.SSS', 'yyyy-MM-dd HH:mm:ss', 'MM/dd/yyyy HH:mm:ss', 'MM/dd/yyyy', 'yyyy-MM-dd', 'dd/MM/yyyy', 'dd-MM-yyyy', 'MM-dd-yyyy', 'MMM d, yyyy', 'MMMM d, yyyy']
2025-05-22 15:25:41,688 - INFO - Trying date format #1: MM-dd-yy
2025-05-22 15:25:43,241 - INFO - Format MM-dd-yy produced 11022 valid dates     
2025-05-22 15:25:43,242 - INFO - Using format 'MM-dd-yy' for date parsing
2025-05-22 15:25:44,409 - INFO - Successfully added partition columns 'year', 'month' from 'date'


In [88]:
full_table_name = 'datalake.bronze.gold'
write_to_iceberg(df_partitioned, full_table_name, partition_cols)

2025-05-22 15:26:04,891 - INFO - Starting write operation to Iceberg table: datalake.bronze.gold
2025-05-22 15:26:04,892 - INFO - Partitioning by: ['year', 'month']
2025-05-22 15:26:04,893 - INFO - Write mode: overwrite
2025-05-22 15:26:05,870 - INFO - DataFrame contains 11022 rows to write         
2025-05-22 15:26:07,686 - INFO - Partitioning by ['year', 'month']              
2025-05-22 15:26:16,682 - INFO - Successfully wrote data to Iceberg table: datalake.bronze.gold


In [89]:
df_partitioned.show(5)

+-----+-----+-----+-----+-----+--------------------+----+-----+-------------------+
|   c0|price| open| high|  low|         source_file|year|month|               date|
+-----+-----+-----+-----+-----+--------------------+----+-----+-------------------+
|10133|322.2|324.3|325.0|321.6|file:///src/data/...|2002|   10|2002-10-01 00:00:00|
|10134|322.8|322.1|323.6|320.8|file:///src/data/...|2002|   10|2002-10-02 00:00:00|
|10135|322.4|322.8|325.5|322.0|file:///src/data/...|2002|   10|2002-10-03 00:00:00|
|10136|323.3|322.1|323.6|320.0|file:///src/data/...|2002|   10|2002-10-04 00:00:00|
|10137|323.3|322.1|323.6|320.0|file:///src/data/...|2002|   10|2002-10-05 00:00:00|
+-----+-----+-----+-----+-----+--------------------+----+-----+-------------------+
only showing top 5 rows

