[Reference](https://medium.com/@topefolorunso/build-an-etl-data-pipeline-using-python-139c6875b046)

In [1]:
# clone only the main branch of the example repository into ~/basic-etl-pipeline
!git clone --single-branch --branch main https://github.com/topefolorunso/basic-etl-pipeline.git ~/basic-etl-pipeline

Cloning into '/root/basic-etl-pipeline'...
remote: Enumerating objects: 16, done.
remote: Counting objects: 100% (16/16), done.
remote: Compressing objects: 100% (12/12), done.
remote: Total 16 (delta 2), reused 12 (delta 1), pack-reused 0
Unpacking objects: 100% (16/16), done.


In [2]:
# explicit %cd magic: does not depend on automagic being enabled or on no variable being named `cd`
%cd ~/basic-etl-pipeline

/root/basic-etl-pipeline


In [3]:
import pandas as pd
from pandas.core.frame import DataFrame

def extract(file_path: str) -> DataFrame:
    '''
    reads a csv file into a pandas DataFrame
    args:
        file_path (str): location of the csv file to read

    returns:
        DataFrame: the parsed contents of the csv file
    '''

    # pandas does the parsing; the resulting frame is returned as-is
    return pd.read_csv(file_path)

In [4]:
def transform(df: DataFrame) -> DataFrame:
    '''
    cleans data without modifying the dataframe that was passed in
    args:
        df (DataFrame): pandas dataframe containing the raw data;
            it must have a "Year" column

    returns:
        clean_df (DataFrame): new pandas dataframe containing the clean data
    '''

    # drop every row that contains a null value; without inplace=True this
    # returns a new frame, so the caller's raw data stays intact
    clean_df = df.dropna()

    # remove decimal from year column and convert to string
    # (assign returns a new frame rather than writing into an existing one)
    clean_df = clean_df.assign(
        Year=clean_df["Year"].astype("int").astype("str")
    )

    return clean_df

In [5]:
def load(df: DataFrame, save_path: str):
    '''
    saves a pandas DataFrame as a csv file
    args:
        df (DataFrame): pandas dataframe holding the clean data
        save_path (str): destination path of the csv file

    returns:
        None
    '''

    # index=False keeps the row index out of the written file
    df.to_csv(path_or_buf=save_path, index=False)

In [6]:
# import the pipeline stages by name instead of with a wildcard, so it is
# explicit which names are taken from etl
# NOTE(review): this rebinds any extract/transform/load already defined in the session
from etl import extract, load, transform

# source csv to read and destination for the cleaned csv
file_path = "~/basic-etl-pipeline/data/economic-indicators.csv"
save_path = "~/basic-etl-pipeline/data/clean_economic-indicators.csv"

def run_pipeline(file_path:str, save_path:str):
    '''
    runs the extract, transform and load steps in order
    args:
        file_path (str): path passed to extract
        save_path (str): path passed to load

    returns:
        None
    '''

    # extract + transform: the extracted frame goes straight into transform
    clean_df = transform(df=extract(file_path=file_path))

    # load
    load(df=clean_df, save_path=save_path)


if __name__ == "__main__":
    # start the extract -> transform -> load run using the module-level paths
    run_pipeline(
        file_path=file_path,
        save_path=save_path,
    )