In [102]:
# Importing the required libraries
import requests
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
from datetime import datetime

# Initialize entities: pipeline-wide settings shared by the cells below.

# Archived snapshot of the Wikipedia page the GDP table is scraped from.
url = 'https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29'

# Column names given to the extracted frame, in order.
table_attribs = ['Country', 'GDP_USD_millions']

# SQLite destination: database file and the table written into it.
db_name = 'World_Economies.db'
table_name = 'Countries_by_GDP'

# Destination path of the CSV export.
csv_path = 'Countries_by_GDP.csv'

# Query executed at the end of the run: rows whose GDP_USD_billions is >= 100.
query_statement = f"SELECT * from {table_name} WHERE GDP_USD_billions >= 100"


In [103]:
def extract(url, table_attribs):
    """Scrape two columns of an HTML table from a web page into a DataFrame.

    Downloads ``url``, parses it with BeautifulSoup and reads the third
    ``<table>`` element of the page (index 2). The first three ``<tr>`` rows
    of that table are skipped (presumably header rows -- confirm against the
    page layout). For each remaining row, the stripped text of the first
    ``<td>`` cell is stored under ``table_attribs[0]`` and the stripped text
    of the third ``<td>`` cell under ``table_attribs[1]``.

    Args:
        url: Address of the page to download.
        table_attribs: Column names of the resulting frame; the first two
            entries receive the scraped values.

    Returns:
        A pandas DataFrame with columns ``table_attribs`` holding the cell
        texts as strings; empty (but with the columns present) when no
        usable row is found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page contains fewer than three tables.
    """
    # Fail early on a hung connection or an error page instead of trying to
    # parse whatever came back.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all('table')[2].find_all('tr')

    # Collect plain dicts and build the frame once; concatenating one-row
    # frames inside the loop copies the whole frame on every iteration.
    records = []
    for row in rows[3:]:
        cells = row.find_all('td')
        if len(cells) < 3:
            # Rows without enough data cells (e.g. header-only rows) cannot
            # supply both values; skip them rather than raise IndexError.
            continue
        records.append({
            table_attribs[0]: cells[0].get_text(strip=True),
            table_attribs[1]: cells[2].get_text(strip=True),
        })

    return pd.DataFrame(records, columns=table_attribs)

In [104]:
def transform(df):
    """Convert the GDP column from text in USD millions to float USD billions.

    The ``GDP_USD_millions`` column is expected to hold strings such as
    ``'26,854,599'``. An em dash (``'—'``) is replaced by ``'0'``, thousands
    separators are removed, the value is converted to float, divided by 1000
    and rounded to 2 decimal places. The column is then renamed to
    ``GDP_USD_billions``.

    The input frame is left untouched, so the function can be called again
    on the same input (e.g. when a notebook cell is re-run).

    Args:
        df: DataFrame containing a string column ``GDP_USD_millions``.

    Returns:
        A new DataFrame with the same rows and column order, in which
        ``GDP_USD_millions`` is replaced by the float column
        ``GDP_USD_billions``.

    Raises:
        KeyError: If ``df`` has no ``GDP_USD_millions`` column.
        ValueError: If a cleaned value cannot be parsed as a float.
    """
    # regex=False: both patterns are literal text, and the default value of
    # ``regex`` differs between pandas versions.
    gdp_millions = (
        df['GDP_USD_millions']
        .str.replace('—', '0', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    gdp_billions = (gdp_millions / 1000).round(2)

    # assign/rename return new frames, so the caller's frame is not mutated.
    return (
        df.assign(GDP_USD_millions=gdp_billions)
          .rename(columns={'GDP_USD_millions': 'GDP_USD_billions'})
    )

In [105]:
def load_to_csv(df, csv_path):
    """Write the final dataframe to a CSV file.

    Args:
        df: DataFrame to export. Its index is written as the first,
            unnamed column (pandas default).
        csv_path: Path of the CSV file to create or overwrite.

    Returns:
        None.
    """
    df.to_csv(path_or_buf=csv_path)

In [106]:
def load_to_db(df, sql_connection, table_name):
    """Store the final dataframe as a database table.

    An existing table with the same name is replaced, and the dataframe
    index is not written as a column.

    Args:
        df: DataFrame to store.
        sql_connection: Open database connection handed to pandas.
        table_name: Name of the table to (re)create.

    Returns:
        None.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )

In [107]:
def run_query(query_statement, sql_connection):
    """Run a query against the database and print the resulting frame.

    Args:
        query_statement: SQL text to execute.
        sql_connection: Open database connection handed to pandas.

    Returns:
        None; the query result is printed to standard output.
    """
    print(pd.read_sql(sql=query_statement, con=sql_connection))

In [108]:
def log_progress(message, log_file="./etl_project_log.txt"):
    """Append a timestamped progress message to a log file.

    Each call appends one line of the form ``<timestamp>,<message>``, where
    the timestamp is formatted as Year-Monthname-Day-Hour:Minute:Second
    (e.g. ``2023-Sep-02-18:53:26``).

    Args:
        message: Text describing the stage of the run that was reached.
        log_file: Path of the log file to append to; it is created when it
            does not exist. Defaults to ``./etl_project_log.txt``.

    Returns:
        None.
    """
    # %b is the documented, portable directive for the abbreviated month
    # name; %h is a C-library alias that not every platform accepts.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    # Explicit encoding so the file content does not depend on the locale.
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(timestamp + ',' + message + '\n')

In [109]:
# Run the ETL pipeline end to end. Every phase is bracketed by a "Started"
# and an "Ended" log entry so the log reflects what has actually happened;
# a completion message is never written before its step has run.

# Log the initialization of the ETL process
log_progress("Preliminaries complete. Initiating ETL process.")

# Extraction: download the page and scrape the table into a dataframe
log_progress("Extract phase Started")
extracted_data = extract(url, table_attribs)
log_progress("Extract phase Ended")

# Transformation: clean the GDP column and convert it to USD billions
log_progress("Transform phase Started")
transformed_data = transform(extracted_data)
log_progress("Transform phase Ended")

# Loading, part 1: write the transformed data to a CSV file
log_progress("Load CSV phase Started")
load_to_csv(transformed_data, csv_path)
log_progress("Load CSV phase Ended")

# Initiate sql connection
sql_connection = sqlite3.connect(db_name)
try:
    # Loading, part 2: write the transformed data to the database table
    log_progress("Load to DB phase Started")
    load_to_db(transformed_data, sql_connection, table_name)
    log_progress("Load to DB phase Ended")

    # Run the configured query and print its result
    log_progress("Query phase Started")
    run_query(query_statement, sql_connection)
    log_progress("Query phase Ended")
finally:
    # Closing db connection, also when loading or querying fails
    sql_connection.close()

# Log the completion of the ETL process
log_progress("ETL Job Ended")

          Country  GDP_USD_billions
0   United States          26854.60
1           China          19373.59
2           Japan           4409.74
3         Germany           4308.85
4           India           3736.88
..            ...               ...
64          Kenya            118.13
65         Angola            117.88
66           Oman            104.90
67      Guatemala            102.31
68       Bulgaria            100.64

[69 rows x 2 columns]
