# Project Scenario

An international firm that is looking to expand its business in different countries across the world has recruited you. You have been hired as a junior Data Engineer and are tasked with creating a script that can extract the list of the top 10 largest economies of the world in descending order of their GDPs in Billion USD (rounded to 2 decimal places), as logged by the International Monetary Fund (IMF).

The required data seems to be available on the URL mentioned below:

URL: https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29


# Install and import required libraries

In [1]:
!pip install lxml


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import sqlite3
from datetime import datetime


# Extract the data using web scraping

In [3]:
#Initialize the entities required

In [4]:
# Text file that log_progress() appends its timestamped entries to.
log_file = "./etl_project_log.txt"

In [5]:
# Live Wikipedia page scraped by extract(). Its contents change over time, and
# extract() addresses the table and its columns by fixed position.
URL = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
# Column names of the frame returned by extract(): country name, raw GDP figure.
table_columns = ['Countries','GDP(in Million USD)']
# SQLite database file name (spelling corrected from 'World_Ecoomies.db').
db_name = 'World_Economies.db'
# Name of the table the GDP data is stored under.
table_name = 'Countries_by_GDP'
# Destination of the CSV export.
csv_path = './Countries_by_GDP.csv'


In [24]:
def extract(URL,table_columns):
    """Scrape country names and GDP figures from the HTML page at ``URL``.

    The data is read from the third ``<tbody>`` element of the page, skipping
    its first three ``<tr>`` rows. A row is kept only when its first cell
    contains a link (taken as the country name) and its second cell does not
    contain an em dash (the placeholder for a missing figure).

    Parameters
    ----------
    URL : str
        Address of the page to download.
    table_columns : list of str
        Exactly two column names for the result: country name, GDP figure.

    Returns
    -------
    pandas.DataFrame
        One row per kept table row; both columns hold plain strings, with the
        GDP figure exactly as written in the page (whitespace stripped).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page has fewer than three ``<tbody>`` elements.
    """
    # A timeout keeps the notebook from hanging forever on a stalled request;
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Table position and header-row count are fixed for this page layout.
    table = soup.find_all('tbody')[2]

    # Collect rows first and build the frame once (avoids quadratic concat).
    records = []
    for row in table.find_all('tr')[3:]:
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        link = cells[0].find('a')
        if link is None:
            continue
        # Compare against the cell's text, not the Tag object itself.
        gdp_text = cells[1].get_text(strip=True)
        if '—' in gdp_text:
            continue
        records.append((link.get_text(strip=True), gdp_text))

    return pd.DataFrame(records, columns=table_columns)
    
            

In [25]:
# Preview the scraped frame; this call runs extract(), which downloads the page over HTTP.
extract(URL,table_columns)


Unnamed: 0,Countries,GDP(in Million USD)
0,United States,30507217
1,China,19231705
2,Germany,4744804
3,India,4187017
4,Japan,4186431
...,...,...
186,Palau,333
187,Kiribati,312
188,Marshall Islands,297
189,Nauru,169


# Transform Million USD to Billion USD

In [21]:
def transform(df):
    """Convert the GDP column from million USD strings to billion USD floats.

    Thousands separators and surrounding whitespace are removed from the
    'GDP(in Million USD)' column, the values are converted to integers,
    scaled by 0.001, rounded to 2 decimal places, and the column is renamed
    to 'GDP(Billion USD)'.

    The input frame is left untouched; a new frame is returned, so the
    function can safely be called again on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'GDP(in Million USD)' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the GDP column replaced by 'GDP(Billion USD)'
        in the same position.

    Raises
    ------
    KeyError
        If ``df`` has no 'GDP(in Million USD)' column.
    ValueError
        If a GDP value cannot be parsed as an integer.
    """
    million_col = 'GDP(in Million USD)'
    billion_col = 'GDP(Billion USD)'

    gdp_millions = (
        df[million_col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.strip()
        .astype(int)
    )

    # rename() without inplace returns a new frame, so the caller's df is not modified.
    result = df.rename(columns={million_col: billion_col})
    result[billion_col] = np.round(gdp_millions * 0.001, 2)
    return result

In [9]:
#df = extract(URL,table_columns)
#transform(df)

# Loading information

In [10]:
# Load the DataFrame into a CSV file

In [11]:
def load_csv(df,csv_path,index=False):
    """Write ``df`` to a CSV file at ``csv_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export.
    csv_path : str
        Destination file path; an existing file is overwritten.
    index : bool, default False
        Whether to write the frame's index as the first column. Off by
        default so the file holds only the data columns, matching what
        load_db() stores; pass True to include the index.
    """
    df.to_csv(csv_path, index=index)

In [12]:
# Load the DataFrame into a database table

In [13]:
def load_db(df,table_name,conn):
    """Store ``df`` as table ``table_name`` through the connection ``conn``.

    An existing table with the same name is replaced, and the frame's index
    is not written as a column.
    """
    df.to_sql(name=table_name, con=conn, index=False, if_exists='replace')

# Querying the database Table

In [17]:
def query_table(query_statement,sql_connection):
    """Echo an SQL statement, run it, and print the resulting frame.

    The statement text is printed before it is executed through
    ``sql_connection``; the rows come back as a DataFrame, which is printed
    as well. Nothing is returned.
    """
    print(query_statement)
    result_frame = pd.read_sql(sql=query_statement, con=sql_connection)
    print(result_frame)

# Logging progress

In [18]:
def log_progress(message):
    """Append a timestamped ``message`` line to the file named by ``log_file``.

    Each line has the form ``<timestamp> : <message>``, where the timestamp
    is the current local time. The file is opened in append mode, so earlier
    entries are kept.

    Parameters
    ----------
    message : str
        Text to record.
    """
    # Year-MonthAbbrev-Day-Hour-Minute-Second. %b is the documented directive
    # for the abbreviated month name; %h is a platform-specific alias for it.
    time_format = "%Y-%b-%d-%H-%M-%S"
    timestamp = datetime.now().strftime(time_format)
    with open(log_file, 'a') as f:
        f.write(timestamp + " : " + message + '\n')
        

# Calling the functions

In [23]:
# Run the pipeline end to end: extract -> transform -> CSV -> SQLite -> query,
# writing a log entry after each stage.
log_progress('Preliminaries complete. Initiating ETL process')
df = extract(URL,table_columns)
log_progress('Data extraction complete. Initiating Transformation process')
df = transform(df)
log_progress('Data transformation complete. Initiating loading process')
load_csv(df, csv_path)
log_progress('Data saved to CSV file')
# The file name is spelled out here rather than read from `db_name`, so the
# database path stays exactly what this cell has always created.
sql_connection = sqlite3.connect('World_Economies.db')
try:
    log_progress('SQL Connection initiated.')
    load_db(df,table_name,sql_connection)
    # Logged once (the original wrote this entry twice in a row).
    log_progress('Data loaded to Database as table. Running the query')
    # NOTE(review): this selects every economy of at least 100 billion USD;
    # the project scenario asks for the top 10 — confirm which is intended.
    query_statement = f"SELECT * from {table_name} WHERE [GDP(Billion USD)] >= 100"
    query_table(query_statement, sql_connection)
    log_progress('Process Complete.')
finally:
    # Close the connection even when loading or querying raises.
    sql_connection.close()

SELECT * from Countries_by_GDP WHERE [GDP(Billion USD)] >= 100
        Countries  GDP(Billion USD)
0   United States          30507.22
1           China          19231.71
2         Germany           4744.80
3           India           4187.02
4           Japan           4186.43
..            ...               ...
67       Bulgaria            117.01
68         Angola            113.34
69      Venezuela            108.51
70           Oman            104.35
71     Costa Rica            102.59

[72 rows x 2 columns]
