### Part B. Task 1. - Data Ingestion Pipeline ###

#### Docker Desktop Environment ####

In [1]:
import os
import subprocess
from datetime import datetime
import pandas as pd

# CONFIGURATIONS
# Path on the local machine of the CSV file that is validated and uploaded.
LOCAL_FILE_PATH = 'C:/hadoopsetup/data/GDDA707_A2_Processed_Bitcoin_Transaction_HDFS.csv'
# Target HDFS directory; created with `hadoop fs -mkdir -p` before the upload.
HDFS_DIRECTORY = '/user/data/bitcoin_transactions'
# File that log_message appends to (relative to the current working directory).
LOG_FILE = 'bitcoin_ingestion_log.txt'
# Name of the Docker container targeted by `docker exec` / `docker cp`.
HADOOP_CONTAINER_NAME = "namenode"

def log_message(message):
    """Append a timestamped message to the log file and echo it to stdout.

    Args:
        message: Text to record. It is written to ``LOG_FILE`` prefixed with
            the current local time, and printed unchanged.
    """
    # Pin the encoding: messages can embed exception text or file paths, and
    # the platform default codec (e.g. cp1252 on Windows) may be unable to
    # encode them, which would raise UnicodeEncodeError while logging.
    with open(LOG_FILE, 'a', encoding='utf-8') as log:
        log.write(f"{datetime.now()}: {message}\n")
    print(message)

def validate_data(file_path):
    """
    Validates the data to ensure integrity before loading into HDFS.
    - Checks for missing values
    - Checks that the required 'Date' column is present
    - Validates date format (every 'Date' value must parse as YYYY-MM-DD)

    Args:
        file_path: Path of the CSV file to validate.

    Returns:
        True when every check passes, False otherwise. Failures are logged
        through log_message instead of being raised.
    """
    try:
        data = pd.read_csv(file_path)

        # Check for missing values
        if data.isnull().values.any():
            raise ValueError("Data contains missing values. Please clean the data.")

        # Without this guard a missing column surfaces as a bare KeyError
        # whose logged text is just "'Date'", which does not explain the problem.
        if 'Date' not in data.columns:
            raise ValueError("Data is missing the required 'Date' column.")

        # Validate 'Date' column format (ISO 8601 calendar date); the parsed
        # result is discarded, only a parsing error matters here.
        pd.to_datetime(data['Date'], format='%Y-%m-%d')
    except Exception as e:
        # Deliberately broad: any read or parse failure means the file must
        # not be uploaded, and the reason is recorded in the log.
        log_message(f"FAILED: Data Validation! : {e}")
        return False
    else:
        log_message("SUCCESSFUL: Data Validation.")
        return True

def _run_docker(args):
    """Run ``docker <args>``, capturing text output; raise on a non-zero exit."""
    return subprocess.run(
        ["docker", *args],
        check=True,
        text=True,
        capture_output=True,
    )


def upload_to_hdfs(local_path, hdfs_path, container_name):
    """
    Uploads the file to HDFS using the Docker namenode container.

    Steps: ensure the HDFS directory exists, copy the local file into the
    container's /tmp, then `hadoop fs -put -f` it into the HDFS directory.

    Args:
        local_path: Path of the file on the local machine.
        hdfs_path: Target HDFS directory.
        container_name: Name of the Docker container to run the commands in.

    Returns:
        True if all three steps succeeded, False if any step failed. Failures
        are logged through log_message instead of being raised.
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    container_tmp_path = f"/tmp/{os.path.basename(local_path)}"
    # The hadoop commands are handed to `bash -c` as one string, so the paths
    # must be shell-quoted; otherwise spaces or metacharacters would split or
    # alter the command.
    quoted_hdfs_path = shlex.quote(hdfs_path)
    quoted_tmp_path = shlex.quote(container_tmp_path)

    try:
        # HDFS directory confirmation
        _run_docker(
            ["exec", container_name, "bash", "-c",
             f"hadoop fs -mkdir -p {quoted_hdfs_path}"]
        )
        log_message(f"HDFS Directory: {hdfs_path}")

        # Copy to Docker container (argv form, so no shell quoting is needed)
        _run_docker(["cp", local_path, f"{container_name}:{container_tmp_path}"])
        log_message(f"Copy File to Docker Container: Completed: {container_tmp_path}")

        # Upload the file to HDFS from the Docker container
        _run_docker(
            ["exec", container_name, "bash", "-c",
             f"hadoop fs -put -f {quoted_tmp_path} {quoted_hdfs_path}"]
        )
        log_message(f"SUCCESSFULLY UPLOADED {local_path} to HDFS at {hdfs_path}")
        return True
    except subprocess.CalledProcessError as e:
        log_message(f"FAILED TO UPLOAD FILE: {e.stderr.strip()}")
    except OSError as e:
        # Raised when the `docker` executable itself cannot be started
        # (e.g. not installed or not on PATH).
        log_message(f"FAILED TO UPLOAD FILE: {e}")
    return False

def main_pipeline():
    """Main function to execute the data ingestion pipeline.

    Validates LOCAL_FILE_PATH and, only when validation succeeds, uploads it
    to HDFS_DIRECTORY through the HADOOP_CONTAINER_NAME container. Progress
    is reported through log_message.
    """
    log_message(" >>>>>>>> Data Ingestion Pipeline >>>>>>>> .")

    # Step 1: Validate Data
    if not validate_data(LOCAL_FILE_PATH):
        # Nothing was uploaded, so do not report the pipeline as completed.
        log_message("DATA INGESTION PIPELINE ABORTED: validation failed, nothing was uploaded.")
        return

    # Step 2: Upload to HDFS
    upload_to_hdfs(LOCAL_FILE_PATH, HDFS_DIRECTORY, HADOOP_CONTAINER_NAME)

    log_message("DATA INGESTION PIPELINE COMPLETED!!!.")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main_pipeline()

 >>>>>>>> Data Ingestion Pipeline >>>>>>>> .
SUCCESSFUL: Data Validation.
HDFS Directory: /user/data/bitcoin_transactions
Copy File to Docker Container: Completed: /tmp/GDDA707_A2_Processed_Bitcoin_Transaction_HDFS.csv
SUCCESSFULLY UPLOADED C:/hadoopsetup/data/GDDA707_A2_Processed_Bitcoin_Transaction_HDFS.csv to HDFS at /user/data/bitcoin_transactions
DATA INGESTION PIPELINE COMPLETED!!!.
