In [15]:
import pandas as pd
import glob
import xml.etree.ElementTree as ET
from datetime import datetime


In [27]:
# File that log_progress() appends its timestamped entries to.
log_file = "log_file.txt"
# Destination CSV handed to load() at the end of the pipeline.
target_file = "transformed_data.csv"


In [16]:
def extract_from_csv(data):
    """Load one CSV source into a DataFrame.

    Parameters
    ----------
    data
        Source forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(data)

def extract_from_json(data):
    """Load one line-delimited JSON source into a DataFrame.

    Parameters
    ----------
    data
        Source forwarded unchanged to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line (the source is read with ``lines=True``).
    """
    return pd.read_json(data, lines=True)

def extract_from_xml(data):
    """Parse an XML document of car records into a DataFrame.

    Every direct child of the document root is treated as one record and is
    expected to contain ``car_model``, ``year_of_manufacture``, ``price`` and
    ``fuel`` sub-elements.

    Parameters
    ----------
    data
        Source forwarded unchanged to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``car_model``, ``year_of_manufacture`` (converted with
        ``int``), ``price`` (converted with ``float``) and ``fuel``. A root
        with no children yields an empty frame with the same four columns.

    Raises
    ------
    AttributeError
        If a record is missing one of the four expected sub-elements.
    ValueError
        If the year or price text cannot be converted to a number.
    """
    columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']

    # Collect plain dicts and build the frame once at the end: concatenating
    # inside the loop copies the whole frame on every row (quadratic), and
    # seeding with an empty frame makes the result dtypes depend on how the
    # installed pandas treats empty entries in concat.
    records = []
    for row in ET.parse(data).getroot():
        records.append({
            'car_model': row.find('car_model').text,
            'year_of_manufacture': int(row.find('year_of_manufacture').text),
            'price': float(row.find('price').text),
            'fuel': row.find('fuel').text,
        })

    # Passing ``columns`` fixes the column order and keeps the schema when
    # there are no records at all.
    return pd.DataFrame(records, columns=columns)
    

In [23]:

def extract():
    """Read every CSV, JSON and XML file in the working directory into one frame.

    Files are matched by extension (``*.csv``, ``*.json``, ``*.xml``) and read
    with ``extract_from_csv``, ``extract_from_json`` and ``extract_from_xml``
    respectively. The file named by the module-level ``target_file`` is
    skipped so that the output of a previous run is not fed back in as input.

    Returns
    -------
    pandas.DataFrame
        All rows, CSV sources first, then JSON, then XML, with a fresh
        0..n-1 index. ``car_model``, ``year_of_manufacture``, ``price`` and
        ``fuel`` are always present and come first; any other columns found
        in the sources follow. With no matching files the result is an empty
        frame with those four columns.
    """
    columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    readers = (
        ('*.csv', extract_from_csv),
        ('*.json', extract_from_json),
        ('*.xml', extract_from_xml),
    )

    frames = []
    for pattern, reader in readers:
        # glob order is unspecified; sorting keeps the row order reproducible.
        for path in sorted(glob.glob(pattern)):
            # The patterns are relative to the working directory, so a plain
            # name comparison is enough to recognise our own output file.
            if path == target_file:
                continue
            frames.append(reader(path))

    if not frames:
        return pd.DataFrame(columns=columns)

    # One concat over real data only: no per-file copying and no empty seed
    # frame influencing the resulting dtypes.
    combined = pd.concat(frames, ignore_index=True)

    # Expected columns first (created as NaN if no source had them), then
    # whatever else the sources contained, in first-seen order.
    extras = [name for name in combined.columns if name not in columns]
    return combined.reindex(columns=columns + extras)


In [26]:
def transform(data):
    """Return a copy of ``data`` with ``price`` rounded to two decimals.

    The input frame is left untouched, so the extracted data can still be
    inspected after the transform and re-running the step is side-effect free.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric ``price`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.

    Raises
    ------
    KeyError
        If ``data`` has no ``price`` column.
    """
    # ``assign`` builds a new frame instead of writing into the caller's one.
    return data.assign(price=data['price'].round(decimals=2))

In [32]:
def load(target_file, transformed_data):
    """Write ``transformed_data`` to ``target_file`` as CSV.

    The row index is not written: it would appear as an unnamed leading
    column and come back as ``Unnamed: 0`` when the file is read again.

    Parameters
    ----------
    target_file
        Destination forwarded to ``DataFrame.to_csv``.
    transformed_data : pandas.DataFrame
        Frame to persist.
    """
    transformed_data.to_csv(target_file, index=False)

In [33]:
def log_progress(message):
    """Append a timestamped entry to the file named by ``log_file``.

    Each entry is one line of the form ``<timestamp>, <message>``, where the
    timestamp is the local time formatted as ``Year-Mon-Day-Hour-Minute-Second``
    (for example ``2024-Jan-15-10-30-00``).

    Parameters
    ----------
    message : str
        Text to record after the timestamp.
    """
    # %b / %d are the documented Python directives for abbreviated month name
    # and day of month. The C-only %h and %D are platform dependent, and %D
    # expands to a full mm/dd/yy date rather than the day.
    timestamp_format = '%Y-%b-%d-%H-%M-%S'
    timestamp = datetime.now().strftime(timestamp_format)

    with open(log_file, 'a') as f:
        f.write(timestamp + ', ' + message + '\n')

In [None]:
# Run the pipeline end to end. Every phase is bracketed by a start and an end
# log entry, and so is the job as a whole, so the log shows how far a run got.
log_progress('ETL job started')

log_progress('Extract phase started')
extracted_data = extract()
log_progress('Extract phase ended')

log_progress('Transform phase started')
transformed_data = transform(extracted_data)
log_progress('Transform phase ended')

log_progress('Load phase started')
load(target_file, transformed_data)
log_progress('Load phase ended')

log_progress('ETL job ended')



