# In [1]:
import glob
import json
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime
import os

# Install pandas if not already installed, then import it.
# NOTE: pandas is also imported unconditionally at the top of this file, so
# this fallback only takes effect if that import is removed or moved below
# this guard.
try:
    import pandas as pd
except ImportError:
    import subprocess
    import sys

    # Invoke pip through the running interpreter so the package is installed
    # into the environment this script actually uses; check_call raises if
    # the installation fails instead of silently continuing.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas"])
    # Retry the import so `pd` is bound after a successful install.
    import pandas as pd

# Output locations: both the run log and the transformed dataset live in the
# same project folder.
_output_dir = "D:\\Data Engineering ETL project 1"
log_file_path = _output_dir + "\\log_file.txt"
transformed_data_path = _output_dir + "\\transformed_data.csv"

# Logging function
def log(message):
    """Append a timestamped entry to the log file.

    The entry has the form ``[YYYY-MM-DD HH:MM:SS] message`` followed by a
    newline, using the current local time, and is appended to the file named
    by the module-level ``log_file_path``.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] {message}\n"
    # Append mode keeps entries from earlier runs.
    with open(log_file_path, "a") as handle:
        handle.write(entry)

# Extract Data

def extract_csv(file_path):
    """Read the CSV at ``file_path`` into a DataFrame with pandas' defaults."""
    frame = pd.read_csv(file_path)
    return frame

def extract_json(file_path):
    """Read a line-delimited JSON file (one object per line) into a DataFrame."""
    frame = pd.read_json(file_path, lines=True)
    return frame

def extract_xml(file_path):
    """Parse an XML file into a DataFrame, one row per child of the root.

    Each direct child of the root element is treated as a record, and each of
    that record's children becomes a column named after its tag, holding the
    element's text (values are therefore strings or ``None``).

    Values are matched to columns by tag name rather than by position, so
    records whose fields appear in a different order stay aligned, and a
    field missing from a record is left as a missing value. A root with no
    records yields an empty DataFrame instead of raising ``IndexError``.

    Args:
        file_path: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per record.
    """
    root = ET.parse(file_path).getroot()
    # One dict per record keyed by tag lets pandas align the columns itself.
    records = [{field.tag: field.text for field in record} for record in root]
    return pd.DataFrame(records)

def extract_data(directory):
    """Read every CSV, JSON and XML file in ``directory`` into one DataFrame.

    Files are selected by extension (case-insensitively) and handed to
    ``extract_csv``, ``extract_json`` or ``extract_xml``; files with any other
    extension are ignored. The per-file frames are concatenated with a fresh
    0..n-1 index.

    Args:
        directory: Folder whose direct children are scanned.

    Returns:
        The combined ``pandas.DataFrame``.

    Raises:
        ValueError: If the directory contains no supported files.
    """
    log("Starting data extraction")
    readers = {
        ".csv": extract_csv,
        ".json": extract_json,
        ".xml": extract_xml,
    }
    data_frames = []
    # glob returns matches in arbitrary order; sorting makes the row order of
    # the combined frame reproducible from run to run.
    for file in sorted(glob.glob(os.path.join(directory, "*"))):
        extension = os.path.splitext(file)[1].lower()
        reader = readers.get(extension)
        if reader is not None:
            data_frames.append(reader(file))
    if not data_frames:
        # pd.concat would raise an opaque "No objects to concatenate" error;
        # fail with the same exception type but say what is actually wrong.
        message = f"No .csv, .json or .xml files found in {directory!r}"
        log(message)
        raise ValueError(message)
    combined_data = pd.concat(data_frames, ignore_index=True)
    log("Data extraction completed")
    return combined_data

# Transform Data
def transform_data(df):
    """Return a copy of ``df`` with height and weight converted to metric.

    The ``height`` column is cast to float and converted from inches to
    meters; the ``weight`` column is cast to float and converted from pounds
    to kilograms. The input frame is left untouched, so calling this again on
    the same extracted data does not apply the conversion twice.

    Args:
        df: DataFrame containing ``height`` and ``weight`` columns.

    Returns:
        A new ``pandas.DataFrame`` with the converted columns.
    """
    log("Starting data transformation")
    # Work on a copy so the caller's frame is not modified in place.
    transformed = df.copy()
    transformed['height'] = transformed['height'].astype(float) * 0.0254  # Inches to meters
    transformed['weight'] = transformed['weight'].astype(float) * 0.453592  # Pounds to kilograms
    log("Data transformation completed")
    return transformed

# Load Data
def load_data(df):
    """Write ``df`` as CSV to ``transformed_data_path`` without the index column."""
    log("Starting data loading")
    destination = transformed_data_path
    df.to_csv(destination, index=False)
    log("Data loading completed")

# ETL Execution
def run_etl(directory):
    """Run the full pipeline on ``directory``: extract, transform, then load.

    Start and completion of the overall process are written to the log.
    """
    log("ETL process started")

    # Each stage feeds the next: extract -> transform -> load.
    load_data(transform_data(extract_data(directory)))

    log("ETL process completed")

# Example Usage
# Set the directory containing your data files
data_directory = "C:\\Users\\Akshaya\\Downloads\\source"

# Run the pipeline only when this file is executed directly (as a script or a
# notebook cell), not as a side effect of importing it from another module.
if __name__ == "__main__":
    run_etl(data_directory)