In [8]:
import pandas as pd
import json
import logging
import xml.etree.ElementTree as ET

# Named logger shared by every ETL step in this file.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.INFO)

# Define the log message format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Reuse the file handler when this code has already run in the same
# interpreter (e.g. a re-executed notebook cell). Attaching a second
# FileHandler would write every record to logfile.log twice.
handler = next(
  (h for h in logger.handlers if isinstance(h, logging.FileHandler)),
  None,
)
if handler is None:
  # Create a file handler to log messages to a file
  handler = logging.FileHandler('logfile.log')
  logger.addHandler(handler)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

# The module-level logging.info() call that used to sit here implicitly ran
# basicConfig() on the root logger, which is what echoes this logger's
# records to the console. Keep that side effect, but make it explicit.
# basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig()

# Log start of ETL process through the configured logger. Records logged via
# logging.info() go to the root logger and are never seen by the file handler
# attached to 'my_logger', so the start marker was missing from logfile.log.
logger.info('Started ETL process')

def extract_csv(file_path):
  """Read the CSV file at ``file_path`` into a pandas DataFrame.

  The path is logged at INFO level before the file is read with
  ``pd.read_csv`` using its default options.
  """
  logger.info(f"Extracting data from csv file: {file_path}")
  csv_frame = pd.read_csv(file_path)
  return csv_frame

def extract_json(file_path):
  """Read a line-delimited JSON file at ``file_path`` into a DataFrame.

  The path is logged at INFO level first. ``lines=True`` makes pandas treat
  each line of the file as one JSON object.
  """
  logger.info(f"Extracting data from json file: {file_path}")
  json_frame = pd.read_json(file_path, lines=True)
  return json_frame

def extract_xml(file_path):
  """Parse an XML file of records into a DataFrame.

  Every direct child of the document root is read as one record. The text of
  its ``name``, ``height`` and ``weight`` child elements is stored exactly as
  ElementTree returns it (``str``, or ``None`` for an element with no text);
  no numeric conversion happens here.

  Args:
    file_path: Path of the XML file to parse.

  Returns:
    A DataFrame with the columns ``name``, ``height`` and ``weight``, one row
    per record. It has zero rows when the root element has no children.

  Raises:
    AttributeError: If a record lacks one of the three expected elements.
  """
  logger.info(f"Extracting data from xml file: {file_path}")
  cols = ['name', 'height', 'weight']

  # Parse the file the caller asked for; the path was previously hard-coded,
  # so the file_path argument was logged but never actually read.
  root = ET.parse(file_path).getroot()
  rows = [{col: record.find(col).text for col in cols} for record in root]

  # Build the frame once, after all rows are collected. Doing it inside the
  # loop rebuilt it on every record and left the result undefined when the
  # root had no children.
  return pd.DataFrame(rows, columns=cols)

# Unit conversion: inches -> centimetres.
def inches_to_cm(inches):
  """Return ``inches`` multiplied by 2.54 (centimetres per inch)."""
  cm_per_inch = 2.54
  return inches * cm_per_inch

# Unit conversion: pounds -> kilograms.
def pounds_to_kg(pounds):
  """Return ``pounds`` multiplied by 0.45359237 (kilograms per pound)."""
  kg_per_pound = 0.45359237
  return pounds * kg_per_pound

def transform_data(df_csv,df_json,df_xml):
  """Combine the three extracted frames and convert their units.

  The frames are concatenated row-wise with a fresh index. The ``height``
  and ``weight`` columns are cast to float, converted with ``inches_to_cm``
  and ``pounds_to_kg`` into the new columns ``height_in cm`` and
  ``weight_in kg``, and the original two columns are then removed.

  Args:
    df_csv: DataFrame extracted from the CSV source.
    df_json: DataFrame extracted from the JSON source.
    df_xml: DataFrame extracted from the XML source.

  Returns:
    A new DataFrame holding every remaining column of the combined data plus
    ``height_in cm`` and ``weight_in kg``.

  Raises:
    KeyError: If the combined data has no ``height`` or ``weight`` column.
    ValueError: If a height or weight value cannot be converted to float.
  """
  logger.info("Combining the dataframes")
  combined_data = pd.concat([df_csv, df_json, df_xml], ignore_index=True)

  logger.info("Converting the datatypes of the dataframe")
  # Values parsed from XML arrive as strings, so cast before doing arithmetic.
  transformed_data = combined_data.astype({'height': float, 'weight': float})

  # The helpers are plain multiplications, so they work on a whole column at
  # once; no per-element apply() is needed.
  logger.info("Converting the height column from inches to cms")
  transformed_data['height_in cm'] = inches_to_cm(transformed_data['height'])

  logger.info("Converting the weight column from pounds to kgs")
  transformed_data['weight_in kg'] = pounds_to_kg(transformed_data['weight'])

  # Drop the source columns by name. Dropping positions [1, 2] removed the
  # wrong columns whenever the inputs were not ordered name/height/weight.
  logger.info("Dropping the extra columns")
  return transformed_data.drop(columns=['height', 'weight'])


def main():
  """Run the ETL pipeline: extract three sources, transform, write a CSV.

  Reads ``/content/source1.csv``, ``/content/source1.json`` and
  ``/content/source1.xml``, passes the three frames to ``transform_data``
  and writes the result to ``transformed_data.csv`` without the index.

  A failure during extraction or while writing the CSV is logged with its
  traceback and ends the run early. The completion message is logged only
  when every step succeeded.
  """
  # Extraction
  try:
    df_csv = extract_csv('/content/source1.csv')
    df_json = extract_json('/content/source1.json')
    df_xml = extract_xml('/content/source1.xml')
  except Exception as e:
    # Without all three frames there is nothing to transform. Stop here
    # instead of falling through to an UnboundLocalError further down.
    logger.exception(f"Error in loading data : {e}")
    return

  # Transformation
  logger.info("Necessary data transformation")
  transformed_data = transform_data(df_csv,df_json,df_xml)

  # Load Transformed Data into csv
  logger.info("Loading transformed dataframe to a csv file")
  try:
    transformed_data.to_csv('transformed_data.csv', index=False)
  except Exception as e:
    # Use the configured logger (not the root logger) so the failure lands in
    # the log file, and do not report completion after a failed write.
    logger.exception(f"Error in loading data : {e}")
    return

  logger.info("Etl process completed")

if __name__ == '__main__':
  main()

INFO:my_logger:Extracting data from csv file: /content/source1.csv
INFO:my_logger:Extracting data from json file: /content/source1.json
INFO:my_logger:Extracting data from xml file: /content/source1.xml
INFO:my_logger:Necessary data transformation
INFO:my_logger:Combining the dataframes
INFO:my_logger:Converting the datatypes of the dataframe
INFO:my_logger:Converting the height column from inches to cms
INFO:my_logger:Converting the weight column from pounds to kgs
INFO:my_logger:Dropping the extra columns
INFO:my_logger:Loading transformed dataframe to a csv file
INFO:my_logger:Etl process completed
