In [0]:

from datetime import datetime
import logging

# Pipeline logging configuration: a dedicated named logger at INFO level, plus
# the ADLS location (bronze container, logs/ folder) of the processing log file.
logger = logging.getLogger("RawDataPipeline")
logger.setLevel(logging.INFO)
log_file_path = "abfss://bronze@logisticsandtransport.dfs.core.windows.net/logs/processing_log_bronze.log"

class ADLSHandler(logging.Handler):
    """Logging handler that appends each record to one log file via ``dbutils.fs``.

    Every ``emit`` reads the current contents of ``log_file_path`` with
    ``dbutils.fs.head``, appends the new timestamped line and writes the whole
    text back with ``dbutils.fs.put(..., overwrite=True)``.

    Relies on the notebook globals ``spark`` and ``log_file_path``.
    Any failure while writing is printed and never raised, so logging cannot
    abort the pipeline.
    """

    # Maximum number of bytes of the existing log requested from dbutils.fs.head().
    MAX_LOG_BYTES = 1048576

    def emit(self, record):
        """Append ``record`` (formatted, prefixed with the current time) to the log file.

        ``head`` returns at most ``MAX_LOG_BYTES`` of the file. Once the file has
        grown to that size, re-writing "head + new line" would silently discard
        everything past the cap on each call. To avoid that, a full file is moved
        to a timestamped archive next to it and a fresh log file is started.
        """
        try:
            from pyspark.dbutils import DBUtils
            dbutils = DBUtils(spark)
            message = self.format(record)
            now = datetime.now()
            timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
            full_message = f"[{timestamp}] {message}\n"

            try:
                existing_logs = dbutils.fs.head(log_file_path, self.MAX_LOG_BYTES)
            except Exception:
                # Presumably the file does not exist yet (first log line);
                # any other read failure is treated the same way.
                existing_logs = ""

            if len(existing_logs.encode("utf-8")) >= self.MAX_LOG_BYTES:
                # head() hit its cap, so the file may hold more than we read.
                # Preserve it intact under an archive name and start over.
                archive_path = f"{log_file_path}.{now.strftime('%Y%m%d%H%M%S%f')}"
                dbutils.fs.mv(log_file_path, archive_path)
                existing_logs = ""

            updated_logs = existing_logs + full_message
            dbutils.fs.put(log_file_path, updated_logs, overwrite=True)

        except Exception as e:
            # Best effort: report the problem on stdout instead of raising.
            print(f"Failed to write log: {e}")

# Build the formatter first, then the ADLS-backed handler that will use it.
formatter = logging.Formatter('%(levelname)s - %(message)s')
log_handler = ADLSHandler()
log_handler.setFormatter(formatter)

# Re-running this cell must not stack a second handler on the same logger,
# so the handler is attached only while the logger has none.
if len(logger.handlers) == 0:
    logger.addHandler(log_handler)

logger.info("Logging initialized.")


In [0]:
try:
    # Pull the Spark SQL API and helper modules into the notebook namespace.
    # NOTE(review): the wildcard imports are kept because later cells rely on
    # names such as current_timestamp/lit; be aware that
    # pyspark.sql.functions also rebinds some builtin names (e.g. sum, min, max).
    from pyspark.sql import *
    from pyspark.sql.functions import *
    import logging
    import os
    logger.info("Library imported successfully.")
except Exception as exc:
    # Record the failure in the pipeline log, then let the cell fail.
    logger.error(f"Error: {str(exc)}")
    raise

In [0]:
try:
    # Storage-account access keys are credentials and must not be stored in
    # notebook source. They are fetched at run time from a Databricks secret
    # scope and handed straight to the Spark configuration.
    # NOTE(review): the scope and secret names below are placeholders -- create
    # them (or rename to match the workspace) before running. The keys that were
    # previously hardcoded in this cell should be considered exposed and rotated.
    from pyspark.dbutils import DBUtils

    STORAGE_SECRET_SCOPE = "logistics-storage"
    storage_account_secrets = {
        "logisticsandtransport": "logisticsandtransport-account-key",
        "transportandlogicticsraw": "transportandlogicticsraw-account-key",
    }

    secret_client = DBUtils(spark).secrets
    for storage_account, secret_name in storage_account_secrets.items():
        spark.conf.set(
            f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
            secret_client.get(scope=STORAGE_SECRET_SCOPE, key=secret_name))
    logger.info("Acess key verified successfully.")
except Exception as e:
    # Log the failure, then stop the cell: nothing downstream can run
    # without storage access.
    logger.error(f"Error: {str(e)}")
    raise

Wrote 62 bytes.


In [0]:
try:
    # Root URIs used by the ingest cells: the bronze container that receives
    # the Parquet output and the raw container that holds the source CSV files.
    bronze_path = "abfss://bronze@logisticsandtransport.dfs.core.windows.net/"
    raw_data_path = "abfss://raw@transportandlogicticsraw.dfs.core.windows.net/"
    logger.info("paths set successfully.")
except Exception as exc:
    # Record the failure in the pipeline log, then let the cell fail.
    logger.error(f"Error: {str(exc)}")
    raise

Wrote 115 bytes.


In [0]:
try:
    # Convert the raw delivery CSV into a bronze Parquet dataset.
    # A single handler covers both the read and the write: any failure is logged
    # and re-raised, so the success messages are only emitted for steps that
    # actually completed and the cell stops on error.
    df_delivery = (
        spark.read.format('csv')
        .options(header='true', inferSchema='true')
        .load(f"{raw_data_path}delivery_data.csv")
    )
    logger.info("Read operation successful.")

    # Tag every row with the ingestion time and the file it came from.
    delivery_bronze = (
        df_delivery
        .withColumn('date_of_ingestion', current_timestamp())
        .withColumn('source_file', lit('delivery_data.csv'))
    )
    delivery_bronze.write.mode('overwrite').parquet(f"{bronze_path}delivery_data_bronze.parquet")
    logger.info("Parquet generated successfully.")
except Exception as e:
    logger.error(f"Error: {str(e)}")
    raise

Wrote 171 bytes.
Wrote 232 bytes.


In [0]:
try:
    # Convert the raw driver CSV into a bronze Parquet dataset.
    # A single handler covers both the read and the write: any failure is logged
    # and re-raised, so the success messages are only emitted for steps that
    # actually completed and the cell stops on error.
    df_driver = (
        spark.read.format('csv')
        .options(header='true', inferSchema='true')
        .load(f"{raw_data_path}driver_data.csv")
    )
    logger.info("Read operation successful.")

    # Tag every row with the ingestion time and the file it came from.
    driver_bronze = (
        df_driver
        .withColumn('date_of_ingestion', current_timestamp())
        .withColumn('source_file', lit('driver_data.csv'))
    )
    driver_bronze.write.mode('overwrite').parquet(f"{bronze_path}driver_data_bronze.parquet")
    logger.info("Parquet generated successfully.")
except Exception as e:
    logger.error(f"Error: {str(e)}")
    raise

Wrote 288 bytes.
Wrote 349 bytes.


In [0]:
try:
    # Convert the raw route CSV into a bronze Parquet dataset.
    # A single handler covers both the read and the write: any failure is logged
    # and re-raised, so the success messages are only emitted for steps that
    # actually completed and the cell stops on error.
    df_route = (
        spark.read.format('csv')
        .options(header='true', inferSchema='true')
        .load(f"{raw_data_path}route_data.csv")
    )
    logger.info("Read operation successful.")

    # Tag every row with the ingestion time and the file it came from.
    route_bronze = (
        df_route
        .withColumn('date_of_ingestion', current_timestamp())
        .withColumn('source_file', lit('route_data.csv'))
    )
    route_bronze.write.mode('overwrite').parquet(f"{bronze_path}route_data_bronze.parquet")
    logger.info("Parquet generated successfully.")
except Exception as e:
    logger.error(f"Error: {str(e)}")
    raise

Wrote 405 bytes.
Wrote 466 bytes.


In [0]:
try:
    # Convert the raw vehicle CSV into a bronze Parquet dataset.
    # A single handler covers both the read and the write: any failure is logged
    # and re-raised, so the success messages are only emitted for steps that
    # actually completed and the cell stops on error.
    df_vehicle = (
        spark.read.format('csv')
        .options(header='true', inferSchema='true')
        .load(f"{raw_data_path}vehicle_data.csv")
    )
    logger.info("Read operation successful.")

    # Tag every row with the ingestion time and the file it came from.
    vehicle_bronze = (
        df_vehicle
        .withColumn('date_of_ingestion', current_timestamp())
        .withColumn('source_file', lit('vehicle_data.csv'))
    )
    vehicle_bronze.write.mode('overwrite').parquet(f"{bronze_path}vehicle_data_bronze.parquet")
    logger.info("Parquet generated successfully.")
except Exception as e:
    logger.error(f"Error: {str(e)}")
    raise

Wrote 522 bytes.
Wrote 583 bytes.
