[Reference](https://medium.com/@dayooped/how-to-build-a-simple-etl-process-with-python-a-beginners-guide-745d30d70a8a)

# 1. Download and Unzip Files


In [1]:
import glob
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime

In [2]:
import urllib.request
# Remote location of the zipped source data and the local name to save it under.
url = "https://github.com/iopedare/data_source/raw/main/source.zip"
filename = "source.zip"
# Download the archive into the current working directory. urlretrieve returns
# a (local_filename, headers) tuple, which the notebook shows as the cell output.
urllib.request.urlretrieve(url, filename)

('source.zip', <http.client.HTTPMessage at 0x7f5bfb949250>)

In [3]:
import zipfile
# Open the archive in a context manager so the file handle is closed once
# extraction finishes, and avoid binding the name `zip`, which would shadow
# the builtin for the rest of the notebook session.
with zipfile.ZipFile("source.zip") as archive:
    # Unpack every member into the current working directory, where the
    # extract step later globs for *.csv / *.json / *.xml files.
    archive.extractall()

In [4]:
# Scratch file meant to hold all extracted data.
# NOTE(review): not referenced by any of the visible cells - confirm it is still needed.
tmpfile = "temp.tmp"
# File that receives all event log entries.
# NOTE(review): log() writes to the literal "logfile.txt" rather than reading this variable.
logfile = "logfile.txt"
# CSV file where the transformed data is stored (the first argument expected by load()).
targetfile = "transformed_data.csv"

# 2. Extract Data


In [5]:
# CSV reader
def extract_from_csv(file_to_process):
    """Read the CSV file at *file_to_process* and return it as a DataFrame."""
    return pd.read_csv(file_to_process)
# JSON reader
def extract_from_json(file_to_process):
    """Read a line-delimited JSON file (one record per line) into a DataFrame."""
    return pd.read_json(file_to_process, lines=True)
# XML Extract Function
def extract_from_xml(file_to_process):
    """Parse an XML file of person records into a DataFrame.

    Every direct child of the document root is treated as one record and
    must contain ``name``, ``height`` and ``weight`` sub-elements; height
    and weight are converted to ``float``.

    Args:
        file_to_process: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A DataFrame with columns ``name``, ``height``, ``weight`` - empty
        (but with those columns) when the root has no children.

    Raises:
        AttributeError: If a record lacks one of the three sub-elements.
        ValueError: If a height or weight is not a valid number.
    """
    root = ET.parse(file_to_process).getroot()
    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row copied the whole
    # frame on every iteration.
    records = [
        {
            "name": person.find("name").text,
            "height": float(person.find("height").text),
            "weight": float(person.find("weight").text),
        }
        for person in root
    ]
    return pd.DataFrame(records, columns=["name", "height", "weight"])

In [6]:
def extract(exclude=("transformed_data.csv",)):
    """Collect every CSV, JSON and XML file in the working directory.

    Files are read in the order csv -> json -> xml, alphabetically within
    each type so the row order is reproducible across platforms.

    Args:
        exclude: File names to skip. Defaults to the pipeline's own output
            file, which would otherwise match ``*.csv`` on a re-run and be
            ingested (and converted) again.

    Returns:
        One DataFrame with a fresh 0..n-1 index whose first columns are
        ``name``, ``height``, ``weight``; any additional columns found in
        the inputs follow in order of first appearance. When no input file
        is found, an empty frame with just those three columns.
    """
    columns = ["name", "height", "weight"]
    readers = (
        ("*.csv", extract_from_csv),
        ("*.json", extract_from_json),
        ("*.xml", extract_from_xml),
    )
    skipped = set(exclude)

    # Gather all frames first and concatenate once: growing a frame with
    # pd.concat inside the loop re-copies every previous row each time, and
    # concatenating onto an empty frame is deprecated in recent pandas.
    frames = []
    for pattern, reader in readers:
        for path in sorted(glob.glob(pattern)):
            if path in skipped:
                continue
            frames.append(reader(path))

    if not frames:
        return pd.DataFrame(columns=columns)

    combined = pd.concat(frames, ignore_index=True)
    # Keep the canonical columns first (created as NaN if absent), as the
    # original seed frame did, followed by any extra columns.
    extras = [col for col in combined.columns if col not in columns]
    return combined.reindex(columns=columns + extras)

# 3. Transform Data


In [7]:
def transform(data):
    """Convert height from inches to metres and weight from pounds to kilograms.

    Both columns are cast to float, converted, and rounded to two decimal
    places. The input frame is left untouched, so calling this again on
    the same extracted data (e.g. re-running a notebook cell) does not
    apply the conversion twice.

    Args:
        data: DataFrame with ``height`` (inches) and ``weight`` (pounds)
            columns holding numbers or numeric strings.

    Returns:
        A new DataFrame with the converted ``height`` and ``weight``.
    """
    metres_per_inch = 0.0254
    kilograms_per_pound = 0.45359237

    converted = data.copy()
    # Cast before multiplying so object/string-typed columns are handled too.
    converted["height"] = (converted["height"].astype(float) * metres_per_inch).round(2)
    converted["weight"] = (converted["weight"].astype(float) * kilograms_per_pound).round(2)
    return converted

# 4. Load Data



In [8]:
def load(targetfile, data_to_load):
    """Write *data_to_load* to *targetfile* as CSV (row index included)."""
    data_to_load.to_csv(path_or_buf=targetfile)

In [9]:
def log(message, path="logfile.txt"):
    """Append a timestamped entry to the log file.

    Each entry is one line of the form ``<timestamp>,<message>``, where the
    timestamp is Hour:Minute:Second-MonthAbbrev-Day-Year of the current
    local time.

    Args:
        message: Text to record.
        path: Log file to append to; created if it does not exist.
    """
    # %b is the documented directive for the abbreviated month name. The
    # previous %h is a C-library alias that Python does not list among its
    # supported format codes, so it is not guaranteed on every platform.
    timestamp_format = "%H:%M:%S-%b-%d-%Y"
    timestamp = datetime.now().strftime(timestamp_format)
    with open(path, "a", encoding="utf-8") as f:
        f.write(f"{timestamp},{message}\n")

In [10]:
# Record the start of the pipeline run in the log file.
log("ETL Job Started")

In [11]:
# Bracket the extract step with log entries.
log("Extract phase Started")
# Call the extraction function
extracted_data = extract()
log("Extract phase Ended")
# Display the first 5 rows of the extracted data as the cell output
extracted_data.head()

Unnamed: 0,name,height,weight
0,alex,65.78,112.99
1,ajay,71.52,136.49
2,alice,69.4,153.03
3,ravi,68.22,142.34
4,joe,67.79,144.3


In [12]:
# Bracket the transform step with log entries.
log("Transform phase Started")
# Call the transform function on the extracted data
transformed_data = transform(extracted_data)

log("Transform phase Ended")
# Display the first 5 rows of the transformed data as the cell output
transformed_data.head()

Unnamed: 0,name,height,weight
0,alex,1.67,51.25
1,ajay,1.82,61.91
2,alice,1.76,69.41
3,ravi,1.73,64.56
4,joe,1.72,65.45


In [13]:
# Record the end of the pipeline run in the log file.
# NOTE(review): load() is defined above but never called in the visible cells,
# so nothing is written to targetfile - confirm whether the Load phase was
# omitted intentionally.
log("ETL Job Ended")

In [14]:
# df = pd.read_csv("transformed_data.csv")
# print(df)