<a href="https://colab.research.google.com/github/Dinesh-1602/Project-1/blob/main/Mini_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Step 1: Gather Data Files

In [None]:
import os

# Create a folder to store the data (if it doesn't exist)
DATA_FOLDER = './data'
os.makedirs(DATA_FOLDER, exist_ok=True)

# Step 1b: Download the dataset using wget
!wget -P {DATA_FOLDER} https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/source.zip

# Step 1c: Extract the files using the unzip command
!unzip {DATA_FOLDER}/source.zip -d {DATA_FOLDER}

print("Data files downloaded and extracted successfully.")


--2025-07-24 05:54:12--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/source.zip
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2707 (2.6K) [application/zip]
Saving to: ‘./data/source.zip’


2025-07-24 05:54:13 (655 MB/s) - ‘./data/source.zip’ saved [2707/2707]

Archive:  ./data/source.zip
  inflating: ./data/source3.json     
  inflating: ./data/source1.csv      
  inflating: ./data/source2.csv      
  inflating: ./data/source3.csv      
  inflating: ./data/source1.json     
  inflating: ./data/source2.json     
  inflating: ./data/source1.

 Step 2: Import Libraries and Set Paths

In [None]:
import glob
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime

# Set up local paths
LOG_FILE = './log_file.txt'
OUTPUT_FILE = './transformed_data.csv'
DATA_FOLDER = './data'


Step 3: ETL Process

In [None]:
# Extract data from CSV
def extract_csv(file_path):
    log(f'Extracting data from CSV file: {file_path}')
    return pd.read_csv(file_path)

# Extract data from JSON
def extract_json(file_path):
    log(f'Extracting data from JSON file: {file_path}')
    return pd.read_json(file_path)

# Extract data from XML
def extract_xml(file_path):
    log(f'Extracting data from XML file: {file_path}')
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = []
    for record in root:
        row = {field.tag: field.text for field in record}
        data.append(row)
    return pd.DataFrame(data)

# Master function to extract all files
def extract():
    log('Starting data extraction...')
    all_files = glob.glob(f"{DATA_FOLDER}/*")

    data_frames = []

    for file in all_files:
        try:
            if file.endswith('.csv'):
                data_frames.append(extract_csv(file))
            elif file.endswith('.json'):
                data_frames.append(extract_json(file))
            elif file.endswith('.xml'):
                data_frames.append(extract_xml(file))
        except Exception as e:
            log(f"Error processing file {file}: {e}")

    if not data_frames:
        log("No valid data found!")
        return pd.DataFrame()

    combined_data = pd.concat(data_frames, ignore_index=True)
    log('Data extraction completed.')
    return combined_data


In [None]:
def transform(data):
    log('Starting data transformation...')

    # Convert height from inches to meters
    if 'height' in data.columns:
        data['height'] = data['height'].astype(float) * 0.0254

    # Convert weight from pounds to kilograms
    if 'weight' in data.columns:
        data['weight'] = data['weight'].astype(float) * 0.453592

    log('Data transformation completed.')
    return data


In [None]:
def load(data):
    log(f'Saving data to {OUTPUT_FILE}...')
    data.to_csv(OUTPUT_FILE, index=False)
    log('Data loading completed.')

In [None]:
  def log(message):
      timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
      with open(LOG_FILE, 'a') as log_file:
          log_file.write(f'[{timestamp}] {message}\n')
      print(f'[{timestamp}] {message}')


Step 4: ETL Execution

In [None]:
def run_etl():
    log('Starting ETL process...')
    data = extract()

    if not data.empty:
        data = transform(data)
        load(data)

    log('ETL process completed.')


In [None]:
if __name__ == "__main__":
    run_etl()


[2025-07-24 05:54:58] Starting ETL process...
[2025-07-24 05:54:58] Starting data extraction...
[2025-07-24 05:54:58] Extracting data from JSON file: ./data/source2.json
[2025-07-24 05:54:58] Error processing file ./data/source2.json: Trailing data
[2025-07-24 05:54:58] Extracting data from CSV file: ./data/source3.csv
[2025-07-24 05:54:58] Extracting data from CSV file: ./data/source1.csv
[2025-07-24 05:54:58] Extracting data from JSON file: ./data/source3.json
[2025-07-24 05:54:58] Error processing file ./data/source3.json: Trailing data
[2025-07-24 05:54:58] Extracting data from XML file: ./data/source2.xml
[2025-07-24 05:54:58] Extracting data from XML file: ./data/source3.xml
[2025-07-24 05:54:58] Extracting data from XML file: ./data/source1.xml
[2025-07-24 05:54:58] Extracting data from CSV file: ./data/source2.csv
[2025-07-24 05:54:58] Extracting data from JSON file: ./data/source1.json
[2025-07-24 05:54:58] Error processing file ./data/source1.json: Trailing data
[2025-07-24 0