In [None]:
import os
import re
import logging
from datetime import datetime
from typing import List, Optional, Tuple, Dict

from pyspark.sql import SparkSession, DataFrame # type: ignore
from pyspark.sql.functions import col, input_file_name, year, month, to_timestamp # type: ignore

# ---------------------------------------------------------------------------
# Logging: INFO and above, rendered as "<timestamp> - <level> - <message>".
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("InsertRawToBronze")

# ---------------------------------------------------------------------------
# Locations
# ---------------------------------------------------------------------------
# Path handed to the CSV reader (despite the name, it points at a single file).
BASE_RAW_DATA_DIR = "/src/data/raw/gold_test.csv"
TARGET_CATALOG = "datalake"
TARGET_NAMESPACE = f"{TARGET_CATALOG}.bronze"

# ---------------------------------------------------------------------------
# Timestamp patterns tried, in list order, by parse_date_column().
# ---------------------------------------------------------------------------
DATE_FORMATS = [
    # ISO-8601 style, with zone offset
    "yyyy-MM-dd'T'HH:mm:ss.SSSZ",
    "yyyy-MM-dd'T'HH:mm:ssZ",
    # date + time
    "yyyy-MM-dd HH:mm:ss.SSS",
    "yyyy-MM-dd HH:mm:ss",
    "MM/dd/yyyy HH:mm:ss",
    # date only, numeric
    "MM/dd/yyyy",
    "yyyy-MM-dd",
    "dd/MM/yyyy",
    "dd-MM-yyyy",
    "MM-dd-yyyy",
    # date only, month spelled out
    "MMM d, yyyy",
    "MMMM d, yyyy",
]

# ---------------------------------------------------------------------------
# Standard column name -> header spellings that map onto it.
# standardize_schema() compares these spellings case-insensitively.
# ---------------------------------------------------------------------------
STANDARD_COLUMNS = {
    "date": ["date", "datetime", "time"],
    "price": ["price", "close"],
    "open": ["open", "Open"],
    "high": ["high", "High"],
    "low": ["low"],
    "volume": ["vol", "vol.", "volume", "Volume"],
    "change": ["change", "change %"],
    "id": ["id"],
    "adj": ["adj", "Adj"],
    "price_tip": ["close_tip"],
    "adj_price": ["adj_close", "adj close"],
}

In [3]:
# Build (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder.appName("Test_read")
    .enableHiveSupport()
    # Use the CORRECTED datetime rebase mode for both Parquet and Avro writes.
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.avro.datetimeRebaseModeInWrite", "CORRECTED")
    .getOrCreate()
)

# Make sure the namespace this notebook writes to exists.
# The previous version referenced SOURCE_NAMESPACE, which is not defined
# anywhere in the notebook and raised NameError on a fresh kernel.
# NOTE(review): TARGET_NAMESPACE is assumed to be the intended namespace - confirm.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {TARGET_NAMESPACE}")

DataFrame[]

In [4]:
# Look up three existing catalog tables and bind them to module-level names.
# NOTE(review): the variable names and the error message say "silver", but the
# tables are read from the datalake.gold namespace - confirm which layer is intended.
try:
    silver_indices = spark.table("datalake.gold.indices")
    silver_macro = spark.table("datalake.gold.macro")
    silver_usbonds = spark.table("datalake.gold.USbonds")
    
    print("Tables loaded successfully")
except Exception as e:
    # The failure is only printed, not re-raised: if a lookup raises, that
    # variable and the ones after it stay unassigned and execution continues.
    print(f"Error loading silver tables: {e}")

Tables loaded successfully


In [None]:
def clean_column_names(df: DataFrame) -> DataFrame:
    """Return ``df`` with every column name cleaned and made unique.

    Each name is passed through ``clean_name`` (defined outside this cell).
    When cleaning leaves several columns with the same name, the first keeps
    the cleaned name and later ones get a numeric suffix (``name_1``,
    ``name_2``, ...). A suffixed name that is already taken by another column
    is skipped, so the resulting names never collide
    (e.g. ``a, a, a_1`` becomes ``a, a_1, a_1_1`` instead of ``a, a_1, a_1``).

    Args:
        df: DataFrame whose columns should be renamed.

    Returns:
        ``df`` itself when no name changes, otherwise a new DataFrame with the
        columns renamed positionally.
    """
    original_columns = df.columns
    # `name` rather than `col`, so the pyspark `col` function is not shadowed.
    cleaned_columns = [clean_name(name) for name in original_columns]

    final_columns = []
    taken = set()        # names already assigned to an earlier column
    last_suffix = {}     # cleaned name -> highest suffix handed out so far
    for name in cleaned_columns:
        candidate = name
        if candidate in taken:
            suffix = last_suffix.get(name, 0)
            # Keep bumping the suffix until the name is free; this also
            # covers a suffixed name that already exists as a real column.
            while candidate in taken:
                suffix += 1
                candidate = f"{name}_{suffix}"
            last_suffix[name] = suffix
        taken.add(candidate)
        final_columns.append(candidate)

    if original_columns != final_columns:
        logger.warning(f"Duplicate or invalid column names detected after cleaning. Renaming columns: {list(zip(original_columns, final_columns))}")
        return df.toDF(*final_columns)

    return df

In [27]:
def standardize_schema(df: DataFrame) -> DataFrame:
    """Rename columns to the standard names declared in ``STANDARD_COLUMNS``.

    A column matches a standard name when its name, compared
    case-insensitively, is one of that standard name's listed spellings.
    The first matching column receives the standard name; further matches
    receive ``<standard>_1``, ``<standard>_2``, ... Columns that match nothing
    keep their name, except ``_source_file`` which becomes ``source_file``.

    All renames are applied in one positional ``toDF`` call. Chaining
    ``withColumnRenamed`` was unsafe: an earlier rename could create a name
    that a later rename then matched, so several columns ended up with the
    same name (e.g. ``close, price`` became ``price_1, price_1``).
    Target names already used by an unmatched column are skipped for the
    same reason.

    Assumes the incoming column names are unique (for example after
    ``clean_column_names``); duplicate input names are not disambiguated.

    Args:
        df: DataFrame to standardize.

    Returns:
        A DataFrame with the same columns in the same order, renamed.
    """
    source_columns = list(df.columns)
    # Log original columns for debugging
    logger.info(f"Original columns: {df.columns}")

    # Group the DataFrame's columns by the standard name they match.
    potential_matches = {}
    for std_col, possible_names in STANDARD_COLUMNS.items():
        accepted = {name.lower() for name in possible_names}
        for actual_col in source_columns:
            if actual_col.lower() in accepted:
                potential_matches.setdefault(std_col, []).append(actual_col)

    matched_columns = {
        name for matches in potential_matches.values() for name in matches
    }

    # Final names of the columns that are NOT being standardized. They are
    # tracked lower-cased because Spark resolves column names
    # case-insensitively by default, so names differing only in case clash.
    taken = {
        ("source_file" if name == "_source_file" else name).lower()
        for name in source_columns
        if name not in matched_columns
    }

    # Resolve each matched column to a unique target name.
    column_mapping = {}
    for std_col, matches in potential_matches.items():
        suffix = 0
        for matched_col in matches:
            # First match gets the bare standard name, later ones a suffix.
            candidate = std_col if suffix == 0 else f"{std_col}_{suffix}"
            while candidate.lower() in taken:
                suffix += 1
                candidate = f"{std_col}_{suffix}"
            taken.add(candidate.lower())
            column_mapping[matched_col] = candidate
            suffix += 1

    # Include any remaining columns with their original names
    for name in source_columns:
        if name not in matched_columns and name != "_source_file":
            column_mapping[name] = name

    logger.info(f"Column mapping: {column_mapping}")

    # The only column that can be absent from the mapping is an unmatched
    # "_source_file", which is exposed as "source_file".
    final_columns = [
        column_mapping.get(name, "source_file") for name in source_columns
    ]

    # Positional rename of every column at once.
    std_df = df.toDF(*final_columns)

    logger.info(f"Standardized columns: {std_df.columns}")
    return std_df

Các cột có giá trị null: []


                                                                                

+-----------+----------+
|column_name|null_count|
+-----------+----------+
+-----------+----------+



In [28]:
# Read the raw CSV at BASE_RAW_DATA_DIR into a DataFrame.
df = (
    spark.read
    .option("header", "true")       # first line holds the column names
    .option("inferSchema", "true")  # let Spark infer column types from the data
    .option("mode", "PERMISSIVE")   # keep malformed rows instead of failing the read
    .option("multiLine", "true")    # a quoted value may span several lines
    .option("escape", "\"")         # a double quote escapes a quote inside a quoted value
    .csv(BASE_RAW_DATA_DIR)
)

Các cột có giá trị null: ['gold', 'oil', 'us_dollar', 'usd_vnd']
+-----------+----------+
|column_name|null_count|
+-----------+----------+
|       gold|         2|
|        oil|         2|
|  us_dollar|         2|
|    usd_vnd|         1|
+-----------+----------+



In [29]:
def parse_date_column(df: DataFrame, date_col_name: str) -> DataFrame:
    """Add a ``parsed_date`` timestamp column using the best-fitting format.

    Every pattern in ``DATE_FORMATS`` is tried in order and the one that
    parses the most values is kept; on a tie the earlier pattern wins.
    Previously the first pattern that parsed even a single value was used,
    so an ambiguous pattern such as ``MM/dd/yyyy`` could be chosen over
    ``dd/MM/yyyy`` and leave most rows NULL.

    The search stops as soon as a pattern parses every non-null value, and
    is skipped entirely when the column has no non-null values.

    Args:
        df: DataFrame containing the date column.
        date_col_name: Name of the column to parse.

    Returns:
        ``df`` with an extra ``parsed_date`` column, or ``df`` unchanged when
        no pattern parses any value. Values the chosen pattern cannot parse
        are NULL in ``parsed_date``.
    """
    logger.info(f"Attempting to parse date column '{date_col_name}' using formats: {DATE_FORMATS}")

    source_value = col(date_col_name)

    # Upper bound for any format's hit count; used for the early exit below.
    try:
        non_null_total = df.filter(source_value.isNotNull()).count()
    except Exception as e:
        logger.warning(f"Could not count non-null values in '{date_col_name}': {str(e)}")
        non_null_total = None

    if non_null_total == 0:
        logger.warning(f"Column '{date_col_name}' has no non-null values to parse")
        return df

    best_df = None
    best_format = None
    best_count = 0

    for idx, date_format in enumerate(DATE_FORMATS):
        try:
            logger.info(f"Trying date format #{idx+1}: {date_format}")
            candidate_df = df.withColumn(
                "parsed_date",
                to_timestamp(source_value, date_format)
            )

            # Count the values this format managed to parse
            valid_count = candidate_df.filter(col("parsed_date").isNotNull()).count()
            logger.info(f"Format {date_format} produced {valid_count} valid dates")
        except Exception as e:
            # A pattern that cannot be applied is skipped, not fatal.
            logger.warning(f"Format '{date_format}' failed with error: {str(e)}")
            continue

        # Strict '>' keeps the earlier format when two formats tie.
        if valid_count > best_count:
            best_df, best_format, best_count = candidate_df, date_format, valid_count

        # Nothing can beat a format that parsed every non-null value.
        if non_null_total is not None and best_count >= non_null_total:
            break

    if best_df is None:
        return df

    logger.info(f"Using format '{best_format}' for date parsing")
    return best_df

Các cột có giá trị null: ['gold', 'us_2_year_bond', 'us_5_year_bond']
+--------------+----------+
|   column_name|null_count|
+--------------+----------+
|          gold|         1|
|us_2_year_bond|         1|
|us_5_year_bond|         1|
+--------------+----------+



In [30]:
def add_partition_columns(df: DataFrame) -> Tuple[DataFrame, List[str]]:
    """Derive ``year`` and ``month`` partition columns from the date column.

    The date column is the first column whose name equals "date" in any
    letter case. It is parsed with ``parse_date_column`` and the resulting
    timestamp is used to compute the two partition columns.

    Returns:
        ``(partitioned_df, ["year", "month"])`` on success. When there is no
        date column, no format parses it, or any step raises, the original
        ``df`` is returned together with an empty list.
    """
    logger.info("Attempting to add partition columns...")

    # Case-insensitive lookup; only the first hit is used.
    date_col_name = next(
        (name for name in df.columns if name.lower() == "date"),
        None,
    )
    if date_col_name is None:
        logger.warning("No 'date' column found. Skipping partitioning.")
        return df, []

    logger.info(f"Found date column: '{date_col_name}'")

    try:
        with_timestamp = parse_date_column(df, date_col_name)

        # parse_date_column only adds "parsed_date" when a format worked.
        if "parsed_date" not in with_timestamp.columns:
            logger.warning("Failed to parse date column with any format. Skipping partitioning.")
            return df, []

        parsed = col("parsed_date")
        with_year = with_timestamp.withColumn("year", year(parsed))
        df_partitioned = with_year.withColumn("month", month(parsed))

        # Rows whose date could not be parsed end up with a NULL year.
        null_years = df_partitioned.filter(col("year").isNull()).count()
        if null_years > 0:
            logger.warning(f"{null_years} rows have NULL year values after date parsing")

        logger.info(f"Successfully added partition columns 'year', 'month' from '{date_col_name}'")

        # The helper timestamp column is temporary and not part of the result.
        return df_partitioned.drop("parsed_date"), ["year", "month"]

    except Exception as e:
        # Best effort: fall back to the unpartitioned input.
        logger.error(f"Error adding partition columns: {str(e)}")
        logger.error("Continuing without partition columns")
        return df, []

In [None]:
def write_to_iceberg(df: DataFrame, table_name: str, partition_by: List[str]):
    """Overwrite the Iceberg table ``table_name`` with the contents of ``df``.

    The DataFrame is first passed through ``check_duplicate_columns``
    (defined outside this cell). The table is partitioned by ``partition_by``
    only when every listed column exists in the DataFrame and at least one
    row has non-NULL values in all of them; otherwise it is written
    unpartitioned.

    Args:
        df: Data to write.
        table_name: Name of the target table.
        partition_by: Partition column names; an empty list (or ``None``)
            means no partitioning was requested.

    Raises:
        Exception: Any error raised while preparing or performing the write
            is logged and re-raised.
    """
    logger.info(f"Starting write operation to Iceberg table: {table_name}")
    logger.info(f"Partitioning by: {partition_by}")
    logger.info("Write mode: overwrite")
    logger.info(f"DataFrame contains {df.count()} rows to write")

    # Check for duplicate column names before writing
    df = check_duplicate_columns(df)

    try:
        # NOTE(review): confirm that the Iceberg writer honours the
        # "overwriteSchema" option; it is kept as in the original code.
        writer = (
            df.write
            .format("iceberg")
            .mode("overwrite")
            .option("overwriteSchema", "true")
        )

        if not partition_by:
            # Nothing was requested, so this is not a "missing columns" problem.
            logger.info("No partition columns requested. Writing an unpartitioned table.")
        elif all(p in df.columns for p in partition_by):
            # Backtick-quote each name so columns containing dots, spaces or
            # other special characters still form a valid SQL predicate.
            quoted_columns = ["`" + p.replace("`", "``") + "`" for p in partition_by]
            null_predicate = " OR ".join(f"{q} IS NULL" for q in quoted_columns)

            # Ensure partition columns have valid data
            null_partition_count = df.filter(null_predicate).count()

            if null_partition_count > 0:
                logger.warning(f"{null_partition_count} rows have NULL partition values")

            if null_partition_count < df.count():  # Only partition if some rows have valid values
                writer = writer.partitionBy(*partition_by)
                logger.info(f"Partitioning by {partition_by}")
            else:
                logger.warning("Skipping partitioning as all partition columns contain NULL values")
        else:
            logger.warning("Partition columns missing from DataFrame. Skipping partitioning.")

        writer.saveAsTable(table_name)
        logger.info(f"Successfully wrote data to Iceberg table: {table_name}")

    except Exception as e:
        logger.error(f"Failed to write to Iceberg table {table_name}: {str(e)}")
        raise

In [9]:
# Shut down the Spark session; any cell run after this needs the session
# cell to be executed again first.
spark.stop()