In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt 
%matplotlib inline

from functools import partial

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Connect to webpage.
url1 = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'

page1 = requests.get(url1)

# Extract site data.
soup1 = BeautifulSoup(page1.text, 'html')

In [3]:
# Isolate site data.
table1 = soup1.find('main')

table1

<main class="mw-body" id="content">
<header class="mw-body-header vector-page-titlebar">
<nav aria-label="Contents" class="vector-toc-landmark">
<div class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" id="vector-page-titlebar-toc">
<input aria-haspopup="true" aria-label="Toggle the table of contents" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-vector-page-titlebar-toc" id="vector-page-titlebar-toc-checkbox" role="button" type="checkbox"/>
<label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" for="vector-page-titlebar-toc-checkbox" id="vector-page-titlebar-toc-label"><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span>
<span class="vector-dropdown-label-text">Toggle the table of contents</span>
</label>
<div class="vector-dropdown-content">
<div class="vector-unpinned-container" id="vector-page-ti

In [4]:
# Create list of column titles.
col_titles1 = table1.find_all('th')[:1]

col_titles1

[<th rowspan="2">Country/Territory
 </th>]

In [5]:
# Create list of column titles.
col_titles1a = table1.find_all('th')[4:][:6]

col_titles1a

[<th>Forecast</th>,
 <th>Year</th>,
 <th>Estimate</th>,
 <th>Year</th>,
 <th>Estimate</th>,
 <th>Year
 </th>]

In [6]:
# Clean list of column titles.
col_table_titles1 = [title.text.strip() for title in col_titles1]

col_table_titles1

['Country/Territory']

In [7]:
# Clean list of column titles.
col_table_titles1a = [title.text.strip() for title in col_titles1a]

col_table_titles1a

['Forecast', 'Year', 'Estimate', 'Year', 'Estimate', 'Year']

In [8]:
# Merge final title column list.
col_table_titles1b = col_table_titles1 + col_table_titles1a

col_table_titles1b

['Country/Territory',
 'Forecast',
 'Year',
 'Estimate',
 'Year',
 'Estimate',
 'Year']

In [9]:
# Convert list to Pandas dataframe.
countries1 = pd.DataFrame(columns = col_table_titles1b).copy()

countries1

Unnamed: 0,Country/Territory,Forecast,Year,Estimate,Year.1,Estimate.1,Year.2


In [10]:
# Locate and extract table data.
row_dat1 = table1.find_all('table')

row_dat1

[<table border="0" cellpadding="2" cellspacing="0" style="float:right;">
 <tbody><tr>
 <td><div class="timeline-wrapper"><map name="timeline_fqqng0tmo92l3oig81vteqxt6t9f10k"></map><img src="//upload.wikimedia.org/wikipedia/en/timeline/fqqng0tmo92l3oig81vteqxt6t9f10k.png" usemap="#timeline_fqqng0tmo92l3oig81vteqxt6t9f10k"/></div>
 </td></tr>
 <tr>
 <td style="text-align:center; font-size:90%;">Largest economies in the world by GDP (nominal) in 2024<br/>according to <a href="/wiki/International_Monetary_Fund" title="International Monetary Fund">International Monetary Fund</a> estimates<sup class="reference" id="cite_ref-China-THM_1-0"><a href="#cite_note-China-THM-1">[n 1]</a></sup><sup class="reference" id="cite_ref-GDP_IMF_2-0"><a href="#cite_note-GDP_IMF-2">[1]</a></sup>
 </td></tr></tbody></table>,
 <table width="100%"> <tbody><tr> <td style="vertical-align:top;"> <style data-mw-deduplicate="TemplateStyles:r981673959">.mw-parser-output .legend{page-break-inside:avoid;break-inside:avo

In [11]:
# Create a structured list.
x1 = []

for row in row_dat1:
    row_data1 = row.find_all('td')
    ind_row_data1 = [data.text.strip() for data in row_data1]  
    ind_row_data1 = [data.text.split(',') for data in row_data1]  
    ind_row_data1 = [data.text.strip() for data in row_data1]
    x1.append(ind_row_data1)
    
    print(ind_row_data1)


['', 'Largest economies in the world by GDP (nominal) in 2024according to International Monetary Fund estimates[n 1][1]']
['> $20 trillion \xa0\xa0$10–20 trillion \xa0\xa0$5–10 trillion \xa0\xa0$1–5 trillion', '$750 billion – $1 trillion \xa0\xa0$500–750 billion \xa0\xa0$250–500 billion \xa0\xa0$100–250 billion', '$50–100 billion \xa0\xa0$25–50 billion \xa0\xa0$5–25 billion \xa0\xa0< $5 billion']
['World', '109,529,216', '2024', '105,435,540', '2023', '100,834,796', '2022', 'United States', '28,781,083', '2024', '27,360,935', '2023', '25,744,100', '2022', 'China', '18,532,633', '[n 1]2024', '17,794,782', '[n 3]2023', '17,963,170', '[n 1]2022', 'Germany', '4,591,100', '2024', '4,456,081', '2023', '4,076,923', '2022', 'Japan', '4,110,452', '2024', '4,212,945', '2023', '4,232,173', '2022', 'India', '3,937,011', '2024', '3,549,919', '2023', '3,465,541', '2022', 'United Kingdom', '3,495,261', '2024', '3,340,032', '2023', '3,089,072', '2022', 'France', '3,130,014', '2024', '3,030,904', '2023

In [12]:
# Isolate table data only.
x1 = x1[2].copy()

x1

['World',
 '109,529,216',
 '2024',
 '105,435,540',
 '2023',
 '100,834,796',
 '2022',
 'United States',
 '28,781,083',
 '2024',
 '27,360,935',
 '2023',
 '25,744,100',
 '2022',
 'China',
 '18,532,633',
 '[n 1]2024',
 '17,794,782',
 '[n 3]2023',
 '17,963,170',
 '[n 1]2022',
 'Germany',
 '4,591,100',
 '2024',
 '4,456,081',
 '2023',
 '4,076,923',
 '2022',
 'Japan',
 '4,110,452',
 '2024',
 '4,212,945',
 '2023',
 '4,232,173',
 '2022',
 'India',
 '3,937,011',
 '2024',
 '3,549,919',
 '2023',
 '3,465,541',
 '2022',
 'United Kingdom',
 '3,495,261',
 '2024',
 '3,340,032',
 '2023',
 '3,089,072',
 '2022',
 'France',
 '3,130,014',
 '2024',
 '3,030,904',
 '2023',
 '2,775,316',
 '2022',
 'Brazil',
 '2,331,391',
 '2024',
 '2,173,666',
 '2023',
 '1,920,095',
 '2022',
 'Italy',
 '2,328,028',
 '2024',
 '2,254,851',
 '2023',
 '2,046,952',
 '2022',
 'Canada',
 '2,242,182',
 '2024',
 '2,140,086',
 '2023',
 '2,137,939',
 '2022',
 'Russia',
 '2,056,844',
 '2024',
 '2,021,421',
 '2023',
 '2,240,422',
 '2022',
 '

## There are missing values that need to be inserted manually by manipulation.

In [13]:
# Create dataframe from list to utilize Pandas functionality.
zzz = pd.DataFrame(x1)

zzz

Unnamed: 0,0
0,World
1,109529216
2,2024
3,105435540
4,2023
...,...
1441,2024
1442,62
1443,2023
1444,59


In [14]:
# Rename column.
zzz.rename(columns={zzz.columns[0]: 'u'}, inplace=True)

zzz

Unnamed: 0,u
0,World
1,109529216
2,2024
3,105435540
4,2023
...,...
1441,2024
1442,62
1443,2023
1444,59


In [15]:
# Replacing dash marks (-) with 'XXXX'.
ddd = 'XXXX'

zzz = zzz.replace('—', ddd, regex = True).copy()

zzz

Unnamed: 0,u
0,World
1,109529216
2,2024
3,105435540
4,2023
...,...
1441,2024
1442,62
1443,2023
1444,59


In [16]:
# Convert dataframe to list.
zzz = zzz.u.values.tolist().copy()

zzz

['World',
 '109,529,216',
 '2024',
 '105,435,540',
 '2023',
 '100,834,796',
 '2022',
 'United States',
 '28,781,083',
 '2024',
 '27,360,935',
 '2023',
 '25,744,100',
 '2022',
 'China',
 '18,532,633',
 '[n 1]2024',
 '17,794,782',
 '[n 3]2023',
 '17,963,170',
 '[n 1]2022',
 'Germany',
 '4,591,100',
 '2024',
 '4,456,081',
 '2023',
 '4,076,923',
 '2022',
 'Japan',
 '4,110,452',
 '2024',
 '4,212,945',
 '2023',
 '4,232,173',
 '2022',
 'India',
 '3,937,011',
 '2024',
 '3,549,919',
 '2023',
 '3,465,541',
 '2022',
 'United Kingdom',
 '3,495,261',
 '2024',
 '3,340,032',
 '2023',
 '3,089,072',
 '2022',
 'France',
 '3,130,014',
 '2024',
 '3,030,904',
 '2023',
 '2,775,316',
 '2022',
 'Brazil',
 '2,331,391',
 '2024',
 '2,173,666',
 '2023',
 '1,920,095',
 '2022',
 'Italy',
 '2,328,028',
 '2024',
 '2,254,851',
 '2023',
 '2,046,952',
 '2022',
 'Canada',
 '2,242,182',
 '2024',
 '2,140,086',
 '2023',
 '2,137,939',
 '2022',
 'Russia',
 '2,056,844',
 '2024',
 '2,021,421',
 '2023',
 '2,240,422',
 '2022',
 '

In [17]:
# Add 'XXXX' to positions that were missing values. 
temp1 = []

for i in range(len(zzz)):
    if zzz[i] == ddd:
        temp1.extend([zzz[i], ddd])
    else:
        temp1.append(zzz[i])
        
print(temp1)

['World', '109,529,216', '2024', '105,435,540', '2023', '100,834,796', '2022', 'United States', '28,781,083', '2024', '27,360,935', '2023', '25,744,100', '2022', 'China', '18,532,633', '[n 1]2024', '17,794,782', '[n 3]2023', '17,963,170', '[n 1]2022', 'Germany', '4,591,100', '2024', '4,456,081', '2023', '4,076,923', '2022', 'Japan', '4,110,452', '2024', '4,212,945', '2023', '4,232,173', '2022', 'India', '3,937,011', '2024', '3,549,919', '2023', '3,465,541', '2022', 'United Kingdom', '3,495,261', '2024', '3,340,032', '2023', '3,089,072', '2022', 'France', '3,130,014', '2024', '3,030,904', '2023', '2,775,316', '2022', 'Brazil', '2,331,391', '2024', '2,173,666', '2023', '1,920,095', '2022', 'Italy', '2,328,028', '2024', '2,254,851', '2023', '2,046,952', '2022', 'Canada', '2,242,182', '2024', '2,140,086', '2023', '2,137,939', '2022', 'Russia', '2,056,844', '2024', '2,021,421', '2023', '2,240,422', '2022', 'Mexico', '2,017,025', '2024', '1,788,887', '2023', '1,463,323', '2022', 'Australia',

In [18]:
# Function to create structured list.
def split(temp1, chunk_size):

  for i in range(0, len(temp1), chunk_size):
    yield temp1[i:i + chunk_size]

chunk_size = 7

z0 = list(split(temp1, chunk_size))

z0

[['World',
  '109,529,216',
  '2024',
  '105,435,540',
  '2023',
  '100,834,796',
  '2022'],
 ['United States',
  '28,781,083',
  '2024',
  '27,360,935',
  '2023',
  '25,744,100',
  '2022'],
 ['China',
  '18,532,633',
  '[n 1]2024',
  '17,794,782',
  '[n 3]2023',
  '17,963,170',
  '[n 1]2022'],
 ['Germany', '4,591,100', '2024', '4,456,081', '2023', '4,076,923', '2022'],
 ['Japan', '4,110,452', '2024', '4,212,945', '2023', '4,232,173', '2022'],
 ['India', '3,937,011', '2024', '3,549,919', '2023', '3,465,541', '2022'],
 ['United Kingdom',
  '3,495,261',
  '2024',
  '3,340,032',
  '2023',
  '3,089,072',
  '2022'],
 ['France', '3,130,014', '2024', '3,030,904', '2023', '2,775,316', '2022'],
 ['Brazil', '2,331,391', '2024', '2,173,666', '2023', '1,920,095', '2022'],
 ['Italy', '2,328,028', '2024', '2,254,851', '2023', '2,046,952', '2022'],
 ['Canada', '2,242,182', '2024', '2,140,086', '2023', '2,137,939', '2022'],
 ['Russia', '2,056,844', '2024', '2,021,421', '2023', '2,240,422', '2022'],
 [

In [19]:
# Convert list to dataframe as well as rename columns.
df1 = pd.DataFrame(z0, columns = countries1.columns).astype(str)

df1.columns.values[0] = 'Country'
df1.columns.values[1] = 'Est. 2024 GDP(Bn)'
df1.columns.values[2] = 'Q'
df1.columns.values[3] = 'Y'
df1.columns.values[4] = 'W'
df1.columns.values[5] = '2022 GDP (Bn)'
df1.columns.values[6] = 'Z'

df1

Unnamed: 0,Country,Est. 2024 GDP(Bn),Q,Y,W,2022 GDP (Bn),Z
0,World,109529216,2024,105435540,2023,100834796,2022
1,United States,28781083,2024,27360935,2023,25744100,2022
2,China,18532633,[n 1]2024,17794782,[n 3]2023,17963170,[n 1]2022
3,Germany,4591100,2024,4456081,2023,4076923,2022
4,Japan,4110452,2024,4212945,2023,4232173,2022
...,...,...,...,...,...,...,...
205,Kiribati,311,2024,279,2023,223,2022
206,Palau,308,2024,263,2023,225,2022
207,Marshall Islands,305,2024,284,2023,279,2022
208,Nauru,161,2024,154,2023,147,2022


In [20]:
# Drop unnecessary columns.
df1 = df1.drop(columns =['Q', 'Y', 'W', 'Z']).copy()

df1

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn)
0,World,109529216,100834796
1,United States,28781083,25744100
2,China,18532633,17963170
3,Germany,4591100,4076923
4,Japan,4110452,4232173
...,...,...,...
205,Kiribati,311,223
206,Palau,308,225
207,Marshall Islands,305,279
208,Nauru,161,147


In [21]:
# Remove unnecessary characters from dataframe.
df1 = df1.replace(r'\[.*\]', '', regex = True).copy()

df1

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn)
0,World,109529216,100834796
1,United States,28781083,25744100
2,China,18532633,17963170
3,Germany,4591100,4076923
4,Japan,4110452,4232173
...,...,...,...
205,Kiribati,311,223
206,Palau,308,225
207,Marshall Islands,305,279
208,Nauru,161,147


In [22]:
# Create a .csv file.
df1.to_csv('highest gdp - wiki 1.csv', index = False)

# Read in .csv file.
df1 = pd.read_csv ('highest gdp - wiki 1.csv', sep = ',')

In [23]:
# Connect to webpage.
url2 = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita'

page2 = requests.get(url2)

# Extract site data.
soup2 = BeautifulSoup(page2.text, 'html')

In [24]:
# Isolate site data.
table2 = soup2.find('main')

table2

<main class="mw-body" id="content">
<header class="mw-body-header vector-page-titlebar">
<nav aria-label="Contents" class="vector-toc-landmark">
<div class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" id="vector-page-titlebar-toc">
<input aria-haspopup="true" aria-label="Toggle the table of contents" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-vector-page-titlebar-toc" id="vector-page-titlebar-toc-checkbox" role="button" type="checkbox"/>
<label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" for="vector-page-titlebar-toc-checkbox" id="vector-page-titlebar-toc-label"><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span>
<span class="vector-dropdown-label-text">Toggle the table of contents</span>
</label>
<div class="vector-dropdown-content">
<div class="vector-unpinned-container" id="vector-page-ti

In [25]:
# Create list of column titles.
col_titles2 = table2.find_all('th')[:1]

col_titles2

[<th rowspan="2">Country/Territory
 </th>]

In [26]:
# Create list of column titles.
col_titles2a = table2.find_all('th')[4:6]

col_titles2a

[<th>Estimate</th>, <th>Year</th>]

In [27]:
# Clean list of column titles.
col_table_titles2 = [title.text.strip() for title in col_titles2]

col_table_titles2

['Country/Territory']

In [28]:
# Clean list of column titles.
col_table_titles2a = [title.text.strip() for title in col_titles2a]

col_table_titles2a

['Estimate', 'Year']

In [29]:
# Merge final title column list.
col_table_titles2b = col_table_titles2 + col_table_titles2a + col_table_titles2a + col_table_titles2a

col_table_titles2b

['Country/Territory',
 'Estimate',
 'Year',
 'Estimate',
 'Year',
 'Estimate',
 'Year']

In [30]:
# Convert list to Pandas dataframe.
countries2 = pd.DataFrame(columns = col_table_titles2b).copy()

countries2

Unnamed: 0,Country/Territory,Estimate,Year,Estimate.1,Year.1,Estimate.2,Year.2


In [31]:
# Locate and extract table data.
row_dat2 = table2.find_all('table')

row_dat2

[<table width="100%"> <tbody><tr> <td valign="top"> <style data-mw-deduplicate="TemplateStyles:r981673959">.mw-parser-output .legend{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .legend-color{display:inline-block;min-width:1.25em;height:1.25em;line-height:1.25;margin:1px 0;text-align:center;border:1px solid black;background-color:transparent;color:black}.mw-parser-output .legend-text{}</style><div class="legend"><span class="legend-color mw-no-invert" style="background-color:#00008a; color:white;"> </span> &gt;$60,000</div> <link href="mw-data:TemplateStyles:r981673959" rel="mw-deduplicated-inline-style"/><div class="legend"><span class="legend-color mw-no-invert" style="background-color:#003c00; color:white;"> </span> $50,000–$60,000</div> <link href="mw-data:TemplateStyles:r981673959" rel="mw-deduplicated-inline-style"/><div class="legend"><span class="legend-color mw-no-invert" style="background-color:#008f00; color:black;"> </span> $40,000–$50,000</div> <link

In [32]:
# Create a structured list.
x2 = [[]]

for row in row_dat2:
    row_data2 = row.find_all('td')
    ind_row_data2 = [data.text.strip() for data in row_data2]  
    ind_row_data2 = [data.text.split(',') for data in row_data2]  
    ind_row_data2 = [data.text.strip() for data in row_data2]
    x2.append(ind_row_data2)
    
    print(ind_row_data2)

['>$60,000 \xa0\xa0$50,000–$60,000 \xa0\xa0$40,000–$50,000 \xa0\xa0$30,000–$40,000', '$20,000–$30,000 \xa0\xa0$10,000–$20,000 \xa0\xa0$5,000–$10,000 \xa0\xa0$2,500–$5,000', '$1,000–$2,500 \xa0\xa0$500–$1,000 \xa0\xa0<$500 \xa0\xa0No data']
['Monaco', '—', '240,862', '2022', '234,317', '2021', 'Liechtenstein', '—', '197,505', '2021', '169,260', '2021', 'Luxembourg', '131,384', '2024', '125,006', '2022', '133,745', '2021', 'Bermuda', '—', '118,775', '2022', '112,653', '2021', 'Ireland', '106,059', '2024', '103,983', '2022', '101,109', '2021', 'Switzerland', '105,669', '2024', '93,260', '2022', '93,525', '2021', 'Cayman Islands', '—', '99,625', '2022', '85,250', '2021', 'Norway', '94,660', '2024', '108,729', '2022', '89,242', '2021', 'Singapore', '88,447', '2024', '82,808', '2022', '66,822', '2021', 'United States', '85,373', '2024', '76,330', '2022', '69,185', '2021', 'Iceland', '84,594', '2024', '73,467', '2022', '69,133', '2021', 'Qatar', '81,400', '2024', '87,662', '2022', '66,799', '

In [33]:
# Isolate table data only.
x2 = x2[2].copy()

x2

['Monaco',
 '—',
 '240,862',
 '2022',
 '234,317',
 '2021',
 'Liechtenstein',
 '—',
 '197,505',
 '2021',
 '169,260',
 '2021',
 'Luxembourg',
 '131,384',
 '2024',
 '125,006',
 '2022',
 '133,745',
 '2021',
 'Bermuda',
 '—',
 '118,775',
 '2022',
 '112,653',
 '2021',
 'Ireland',
 '106,059',
 '2024',
 '103,983',
 '2022',
 '101,109',
 '2021',
 'Switzerland',
 '105,669',
 '2024',
 '93,260',
 '2022',
 '93,525',
 '2021',
 'Cayman Islands',
 '—',
 '99,625',
 '2022',
 '85,250',
 '2021',
 'Norway',
 '94,660',
 '2024',
 '108,729',
 '2022',
 '89,242',
 '2021',
 'Singapore',
 '88,447',
 '2024',
 '82,808',
 '2022',
 '66,822',
 '2021',
 'United States',
 '85,373',
 '2024',
 '76,330',
 '2022',
 '69,185',
 '2021',
 'Iceland',
 '84,594',
 '2024',
 '73,467',
 '2022',
 '69,133',
 '2021',
 'Qatar',
 '81,400',
 '2024',
 '87,662',
 '2022',
 '66,799',
 '2021',
 'Isle of Man',
 '—',
 '79,531',
 '2020',
 '—',
 'Macau',
 '78,962',
 '2024',
 '34,585',
 '2022',
 '43,555',
 '2021',
 'Denmark',
 '68,898',
 '2024',
 '67

## There are missing values that need to be inserted manually by manipulation.

In [34]:
# Create dataframe from list to utilize Pandas functionality.
yyy = pd.DataFrame(x2).copy()

yyy

Unnamed: 0,0
0,Monaco
1,—
2,240862
3,2022
4,234317
...,...
1507,2024
1508,259
1509,2022
1510,311


In [35]:
# Rename column.
yyy.rename(columns={yyy.columns[0]: 'u'}, inplace=True)

yyy

Unnamed: 0,u
0,Monaco
1,—
2,240862
3,2022
4,234317
...,...
1507,2024
1508,259
1509,2022
1510,311


In [36]:
# Replace dashes (-) with 'XXXX'
ddd = 'XXXX'

yyy = yyy.replace('—', ddd, regex = True).copy()

yyy

Unnamed: 0,u
0,Monaco
1,XXXX
2,240862
3,2022
4,234317
...,...
1507,2024
1508,259
1509,2022
1510,311


In [37]:
# Convert to single list.
yyy = yyy.u.values.tolist().copy()

yyy

['Monaco',
 'XXXX',
 '240,862',
 '2022',
 '234,317',
 '2021',
 'Liechtenstein',
 'XXXX',
 '197,505',
 '2021',
 '169,260',
 '2021',
 'Luxembourg',
 '131,384',
 '2024',
 '125,006',
 '2022',
 '133,745',
 '2021',
 'Bermuda',
 'XXXX',
 '118,775',
 '2022',
 '112,653',
 '2021',
 'Ireland',
 '106,059',
 '2024',
 '103,983',
 '2022',
 '101,109',
 '2021',
 'Switzerland',
 '105,669',
 '2024',
 '93,260',
 '2022',
 '93,525',
 '2021',
 'Cayman Islands',
 'XXXX',
 '99,625',
 '2022',
 '85,250',
 '2021',
 'Norway',
 '94,660',
 '2024',
 '108,729',
 '2022',
 '89,242',
 '2021',
 'Singapore',
 '88,447',
 '2024',
 '82,808',
 '2022',
 '66,822',
 '2021',
 'United States',
 '85,373',
 '2024',
 '76,330',
 '2022',
 '69,185',
 '2021',
 'Iceland',
 '84,594',
 '2024',
 '73,467',
 '2022',
 '69,133',
 '2021',
 'Qatar',
 '81,400',
 '2024',
 '87,662',
 '2022',
 '66,799',
 '2021',
 'Isle of Man',
 'XXXX',
 '79,531',
 '2020',
 'XXXX',
 'Macau',
 '78,962',
 '2024',
 '34,585',
 '2022',
 '43,555',
 '2021',
 'Denmark',
 '68,8

In [38]:
# Add 'XXXX' to positions that were missing values.
temp2 = []

for i in range(len(yyy)):
    if yyy[i] == ddd:
        temp2.extend([yyy[i], ddd])
    else:
        temp2.append(yyy[i])
        
print(temp2)

['Monaco', 'XXXX', 'XXXX', '240,862', '2022', '234,317', '2021', 'Liechtenstein', 'XXXX', 'XXXX', '197,505', '2021', '169,260', '2021', 'Luxembourg', '131,384', '2024', '125,006', '2022', '133,745', '2021', 'Bermuda', 'XXXX', 'XXXX', '118,775', '2022', '112,653', '2021', 'Ireland', '106,059', '2024', '103,983', '2022', '101,109', '2021', 'Switzerland', '105,669', '2024', '93,260', '2022', '93,525', '2021', 'Cayman Islands', 'XXXX', 'XXXX', '99,625', '2022', '85,250', '2021', 'Norway', '94,660', '2024', '108,729', '2022', '89,242', '2021', 'Singapore', '88,447', '2024', '82,808', '2022', '66,822', '2021', 'United States', '85,373', '2024', '76,330', '2022', '69,185', '2021', 'Iceland', '84,594', '2024', '73,467', '2022', '69,133', '2021', 'Qatar', '81,400', '2024', '87,662', '2022', '66,799', '2021', 'Isle of Man', 'XXXX', 'XXXX', '79,531', '2020', 'XXXX', 'XXXX', 'Macau', '78,962', '2024', '34,585', '2022', '43,555', '2021', 'Denmark', '68,898', '2024', '67,790', '2022', '68,037', '202

In [39]:
# Function to create structured list.
def split(temp2, chunk_size):

  for i in range(0, len(temp2), chunk_size):
    yield temp2[i:i + chunk_size]

chunk_size = 7

z1 = list(split(temp2, chunk_size))

z1

[['Monaco', 'XXXX', 'XXXX', '240,862', '2022', '234,317', '2021'],
 ['Liechtenstein', 'XXXX', 'XXXX', '197,505', '2021', '169,260', '2021'],
 ['Luxembourg', '131,384', '2024', '125,006', '2022', '133,745', '2021'],
 ['Bermuda', 'XXXX', 'XXXX', '118,775', '2022', '112,653', '2021'],
 ['Ireland', '106,059', '2024', '103,983', '2022', '101,109', '2021'],
 ['Switzerland', '105,669', '2024', '93,260', '2022', '93,525', '2021'],
 ['Cayman Islands', 'XXXX', 'XXXX', '99,625', '2022', '85,250', '2021'],
 ['Norway', '94,660', '2024', '108,729', '2022', '89,242', '2021'],
 ['Singapore', '88,447', '2024', '82,808', '2022', '66,822', '2021'],
 ['United States', '85,373', '2024', '76,330', '2022', '69,185', '2021'],
 ['Iceland', '84,594', '2024', '73,467', '2022', '69,133', '2021'],
 ['Qatar', '81,400', '2024', '87,662', '2022', '66,799', '2021'],
 ['Isle of Man', 'XXXX', 'XXXX', '79,531', '2020', 'XXXX', 'XXXX'],
 ['Macau', '78,962', '2024', '34,585', '2022', '43,555', '2021'],
 ['Denmark', '68,898

In [40]:
# Convert to dataframe and rename columns.
df2 = pd.DataFrame(z1, columns=col_table_titles2b).astype(str)

df2.columns.values[0] = 'Country'
df2.columns.values[2] = 'Q'
df2.columns.values[1] = 'Est. 2024 GDP per capita (Bn)'
df2.columns.values[4] = 'Y'
df2.columns.values[3] = '2022 GDP per capita (Bn)'
df2.columns.values[5] = 'Z'
df2.columns.values[6] = 'X'

df2

Unnamed: 0,Country,Est. 2024 GDP per capita (Bn),Q,2022 GDP per capita (Bn),Y,Z,X
0,Monaco,XXXX,XXXX,240862,2022,234317,2021
1,Liechtenstein,XXXX,XXXX,197505,2021,169260,2021
2,Luxembourg,131384,2024,125006,2022,133745,2021
3,Bermuda,XXXX,XXXX,118775,2022,112653,2021
4,Ireland,106059,2024,103983,2022,101109,2021
...,...,...,...,...,...,...,...
217,Malawi,481,2024,645,2022,613,2021
218,South Sudan,422,2024,1072,2015,400,2021
219,Afghanistan,422,2022,356,2021,373,2021
220,Syria,XXXX,XXXX,421,2021,925,2021


In [41]:
# Drop unnecessary columns.
df2 = df2.drop(columns =['Q', 'X', 'Y', 'Z']).reset_index(drop = True).copy()

df2

Unnamed: 0,Country,Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn)
0,Monaco,XXXX,240862
1,Liechtenstein,XXXX,197505
2,Luxembourg,131384,125006
3,Bermuda,XXXX,118775
4,Ireland,106059,103983
...,...,...,...
217,Malawi,481,645
218,South Sudan,422,1072
219,Afghanistan,422,356
220,Syria,XXXX,421


In [42]:
# Merge dataframes.
dfinal = df1.merge(df2, on = 'Country', how = 'left').copy()

dfinal

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn)
0,World,109529216,100834796,,
1,United States,28781083,25744100,85373,76330
2,China,18532633,17963170,13136,12720
3,Germany,4591100,4076923,54291,48718
4,Japan,4110452,4232173,33138,34017
...,...,...,...,...,...
205,Kiribati,311,223,2446,1702
206,Palau,308,225,17441,12922
207,Marshall Islands,305,279,6711,6225
208,Nauru,161,147,12362,11971


In [43]:
# Convert NaN values to 'XXXX'
dfinal = dfinal.fillna('XXXX').copy()

dfinal

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn)
0,World,109529216,100834796,XXXX,XXXX
1,United States,28781083,25744100,85373,76330
2,China,18532633,17963170,13136,12720
3,Germany,4591100,4076923,54291,48718
4,Japan,4110452,4232173,33138,34017
...,...,...,...,...,...
205,Kiribati,311,223,2446,1702
206,Palau,308,225,17441,12922
207,Marshall Islands,305,279,6711,6225
208,Nauru,161,147,12362,11971


In [44]:
# Remove unnecessary characters from datframe.
dfinal = dfinal.replace(r'\[.*\]', '', regex = True)

dfinal

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn)
0,World,109529216,100834796,XXXX,XXXX
1,United States,28781083,25744100,85373,76330
2,China,18532633,17963170,13136,12720
3,Germany,4591100,4076923,54291,48718
4,Japan,4110452,4232173,33138,34017
...,...,...,...,...,...
205,Kiribati,311,223,2446,1702
206,Palau,308,225,17441,12922
207,Marshall Islands,305,279,6711,6225
208,Nauru,161,147,12362,11971


In [45]:
# Create .csv file.
dfinal.to_csv('highest gdp - wiki 2.csv', index = False)

# Read .csv file.
dfinal = pd.read_csv ('highest gdp - wiki 2.csv', sep = ',')

In [46]:
# Connect to webpage.
url3 = 'https://www.worlddata.info/average-income.php'

page3 = requests.get(url3)

# Extract site data.
soup3 = BeautifulSoup(page3.text, 'html')

In [47]:
# Isolate site data.
table3 = soup3.find('main')

table3

<main id="main"><div id="intro1"><div id="blurbg" style="background-image:url(//cdn.worlddata.info/pics/coins.jpg);"></div><div id="intro3"><img alt="Income and salaries" class="deko" height="207" src="//cdn.worlddata.info/pics/coins.jpg" width="310"/><div class="floater"><h1>Average income around the world</h1>The worldwide highest income is earned in Monaco. The smallest budget per capita exists in Afghanistan. In our comparison over 91 countries, the USA comes 7th with an average income of 76,770 USD.<p>The average gross annual wage per full-time employee in the USA was $77,464 in 2022, or around $6,455 per month ($3,205/year more than in the previous year).</p></div><div class="clear"></div></div></div><div class="boxwhite"><h2>The average annual income</h2><div id="map"><picture><source media="(max-width:380px) and (prefers-color-scheme:dark)" srcset="//cdn.worlddata.info/maps/durchschnittseinkommen-dark.png"><source media="(max-width:380px)" srcset="//cdn.worlddata.info/maps/durc

In [48]:
# Create list of column titles.
col_titles3 = table3.find_all('th')[:3]

col_titles3

[<th class="left">Country/Region</th>,
 <th class="right" id="th1">Ø Annual income</th>,
 <th class="right" id="th2">Ø Gross annual wage</th>]

In [49]:
# Clean list of column titles.
col_table_titles3 = [title.text.strip() for title in col_titles3]

col_table_titles3

['Country/Region', 'Ø Annual income', 'Ø Gross annual wage']

In [50]:
# Convert list to Pandas dataframe.
countries3 = pd.DataFrame(columns = col_table_titles3).copy()

countries3

Unnamed: 0,Country/Region,Ø Annual income,Ø Gross annual wage


In [51]:
# Locate and extract table data.
row_dat3 = table3.find_all('table')

row_dat3

[<table class="std100 hover tabsort sticky" id="tabsort"><tr class="no-sort"><th class="left">Country/Region</th><th class="right" id="th1">Ø Annual income</th><th class="right" id="th2">Ø Gross annual wage</th></tr><tr><td><a class="fl_eu fl_MCO" href="/europe/monaco/index.php">Monaco</a></td><td data-val="186080">186,080 $</td><td></td></tr><tr><td><a class="fl_am fl_BMU" href="/america/bermuda/index.php">Bermuda</a> *</td><td data-val="125210">125,210 $</td><td></td></tr><tr><td><a class="fl_eu fl_CHE" href="/europe/switzerland/index.php">Switzerland</a></td><td data-val="95490">95,490 $</td><td data-val="97258">97,258 $</td></tr><tr><td><a class="fl_eu fl_NOR" href="/europe/norway/index.php">Norway</a></td><td data-val="94540">94,540 $</td><td data-val="68071">68,071 $</td></tr><tr><td><a class="fl_eu fl_LUX" href="/europe/luxembourg/index.php">Luxembourg</a></td><td data-val="89200">89,200 $</td><td data-val="79651">79,651 $</td></tr><tr><td><a class="fl_eu fl_IRL" href="/europe/i

In [52]:
# Create a structured list.
x3 = []
for row in row_dat3:
    row_data3 = row.find_all('td')
    ind_row_data3 = [data.text.strip() for data in row_data3]  
    ind_row_data3 = [data.text.split(',') for data in row_data3]  
    ind_row_data3 = [data.text.strip() for data in row_data3]
    x3.append(ind_row_data3)
    
    print(ind_row_data3)


['Monaco', '186,080 $', '', 'Bermuda *', '125,210 $', '', 'Switzerland', '95,490 $', '97,258 $', 'Norway', '94,540 $', '68,071 $', 'Luxembourg', '89,200 $', '79,651 $', 'Ireland', '79,730 $', '55,742 $', 'United States', '76,770 $', '77,464 $', 'Denmark', '73,520 $', '67,745 $', 'Qatar', '70,120 $', '', 'Iceland', '68,660 $', '87,334 $', 'Singapore', '67,200 $', '', 'Sweden', '63,500 $', '47,786 $', 'Australia', '60,840 $', '64,326 $', 'Netherlands', '60,230 $', '55,012 $', 'Austria', '55,720 $', '51,773 $', 'Israel', '55,140 $', '49,036 $', 'Finland', '54,890 $', '49,113 $', 'Hong Kong *', '54,370 $', '', 'Germany', '54,030 $', '47,836 $', 'Belgium', '53,890 $', '54,757 $', 'Canada', '53,310 $', '60,954 $', 'United Kingdom', '49,240 $', '50,209 $', 'United Arab Emirates', '49,160 $', '', 'New Zealand', '49,090 $', '51,073 $', 'San Marino', '47,120 $', '', 'France', '45,290 $', '43,888 $', 'Macao *', '43,680 $', '', 'Japan', '42,550 $', '34,578 $', 'Italy', '38,200 $', '33,179 $', 'Sou

In [53]:
# Isolate table data only.
l = 0
m = 4
n = [[]]

for item in range(165):
    n[0][l:m] = x3[0][l:m]
    l += 4
    m += 4
    
n[0]

['Monaco',
 '186,080 $',
 '',
 'Bermuda *',
 '125,210 $',
 '',
 'Switzerland',
 '95,490 $',
 '97,258 $',
 'Norway',
 '94,540 $',
 '68,071 $',
 'Luxembourg',
 '89,200 $',
 '79,651 $',
 'Ireland',
 '79,730 $',
 '55,742 $',
 'United States',
 '76,770 $',
 '77,464 $',
 'Denmark',
 '73,520 $',
 '67,745 $',
 'Qatar',
 '70,120 $',
 '',
 'Iceland',
 '68,660 $',
 '87,334 $',
 'Singapore',
 '67,200 $',
 '',
 'Sweden',
 '63,500 $',
 '47,786 $',
 'Australia',
 '60,840 $',
 '64,326 $',
 'Netherlands',
 '60,230 $',
 '55,012 $',
 'Austria',
 '55,720 $',
 '51,773 $',
 'Israel',
 '55,140 $',
 '49,036 $',
 'Finland',
 '54,890 $',
 '49,113 $',
 'Hong Kong *',
 '54,370 $',
 '',
 'Germany',
 '54,030 $',
 '47,836 $',
 'Belgium',
 '53,890 $',
 '54,757 $',
 'Canada',
 '53,310 $',
 '60,954 $',
 'United Kingdom',
 '49,240 $',
 '50,209 $',
 'United Arab Emirates',
 '49,160 $',
 '',
 'New Zealand',
 '49,090 $',
 '51,073 $',
 'San Marino',
 '47,120 $',
 '',
 'France',
 '45,290 $',
 '43,888 $',
 'Macao *',
 '43,680

In [54]:
# Create a structured list.
def split(list_c, chunk_size):

  for i in range(0, len(list_c), chunk_size):
    yield list_c[i:i + chunk_size]

chunk_size = 3

z3 = list(split(n[0], chunk_size))

z3

[['Monaco', '186,080 $', ''],
 ['Bermuda *', '125,210 $', ''],
 ['Switzerland', '95,490 $', '97,258 $'],
 ['Norway', '94,540 $', '68,071 $'],
 ['Luxembourg', '89,200 $', '79,651 $'],
 ['Ireland', '79,730 $', '55,742 $'],
 ['United States', '76,770 $', '77,464 $'],
 ['Denmark', '73,520 $', '67,745 $'],
 ['Qatar', '70,120 $', ''],
 ['Iceland', '68,660 $', '87,334 $'],
 ['Singapore', '67,200 $', ''],
 ['Sweden', '63,500 $', '47,786 $'],
 ['Australia', '60,840 $', '64,326 $'],
 ['Netherlands', '60,230 $', '55,012 $'],
 ['Austria', '55,720 $', '51,773 $'],
 ['Israel', '55,140 $', '49,036 $'],
 ['Finland', '54,890 $', '49,113 $'],
 ['Hong Kong *', '54,370 $', ''],
 ['Germany', '54,030 $', '47,836 $'],
 ['Belgium', '53,890 $', '54,757 $'],
 ['Canada', '53,310 $', '60,954 $'],
 ['United Kingdom', '49,240 $', '50,209 $'],
 ['United Arab Emirates', '49,160 $', ''],
 ['New Zealand', '49,090 $', '51,073 $'],
 ['San Marino', '47,120 $', ''],
 ['France', '45,290 $', '43,888 $'],
 ['Macao *', '43,680

In [55]:
# Convert list to Pandas dataframe and rename columns.
df3 = pd.DataFrame(z3, columns=col_table_titles3).astype(str).copy()

df3.columns.values[0] = 'Country'
df3.columns.values[1] = 'Income'

df3 = df3.drop(['Ø Gross annual wage'], axis = 1).copy()

df3

Unnamed: 0,Country,Income
0,Monaco,"186,080 $"
1,Bermuda *,"125,210 $"
2,Switzerland,"95,490 $"
3,Norway,"94,540 $"
4,Luxembourg,"89,200 $"
...,...,...
86,Kyrgyzstan,"1,440 $"
87,Nepal,"1,340 $"
88,Myanmar,"1,270 $"
89,Sudan,760 $


In [56]:
# Remove unnecessary characters from dataframe.
df3 = df3.replace(r'[\*\$]', '', regex = True).copy()

df3

Unnamed: 0,Country,Income
0,Monaco,186080
1,Bermuda,125210
2,Switzerland,95490
3,Norway,94540
4,Luxembourg,89200
...,...,...
86,Kyrgyzstan,1440
87,Nepal,1340
88,Myanmar,1270
89,Sudan,760


In [57]:
# Merge dataframes.
dfinal1 = dfinal.merge(df3, on = 'Country', how = 'right').copy()

dfinal1

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income
0,Monaco,XXXX,8772,XXXX,240862,186080
1,Bermuda,,,,,125210
2,Switzerland,938458,818426,105669,93260,95490
3,Norway,526951,579422,94660,108729,94540
4,Luxembourg,88556,81530,131384,125006,89200
...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440
87,Nepal,44179,39406,1397,1337,1340
88,Myanmar,68006,65211,1248,1149,1270
89,Sudan,26865,36729,547,1102,760


In [58]:
# Convert NaN values to 'XXXX'.
dfinal1 = dfinal1.fillna('XXXX')

dfinal1

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income
0,Monaco,XXXX,8772,XXXX,240862,186080
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210
2,Switzerland,938458,818426,105669,93260,95490
3,Norway,526951,579422,94660,108729,94540
4,Luxembourg,88556,81530,131384,125006,89200
...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440
87,Nepal,44179,39406,1397,1337,1340
88,Myanmar,68006,65211,1248,1149,1270
89,Sudan,26865,36729,547,1102,760


In [59]:
# Create .csv file.
dfinal1.to_csv('highest gdp - worlddata1.csv', index = False)

# Read .csv file.
dfinal1 = pd.read_csv ('highest gdp - worlddata1.csv', sep = ',')

In [60]:
# Connect to webpage.
url4 = 'https://www.britannica.com/topic/list-of-the-total-areas-of-the-worlds-countries-dependencies-and-territories-2130540'

page4 = requests.get(url4)

# Extract site data.
soup4 = BeautifulSoup(page4.text, 'html')

In [61]:
# Isolate site data.
table4 = soup4.find('main')

table4

<main>
<div class="md-page-wrapper">
<div class="md-content" id="content">
<div class="md-article-container template-desktop">
<div class="infinite-scroll-container article last">
<article class="article-content container-lg qa-content px-0 pt-0 pb-40 py-lg-20 content" data-topic-id="2130540">
<div class="grid gx-0">
<div class="col-auto">
<div class="topic-left-rail md-article-drawer position-relative d-flex border-right-sm border-left-sm open">
<div class="drawer d-flex flex-column open">
<div class="left-rail-section-content">
<div class="topic-left-rail-header text-truncate bg-gray-50 position-relative text-right d-flex align-items-center">
<div class="tlr-title px-20 py-15 text-left">
<em class="material-icons text-gray-400 d-lg-none" data-icon="toc"></em>
<a class="font-serif font-weight-bold text-black link-blue" href="https://www.britannica.com/topic/list-of-the-total-areas-of-the-worlds-countries-dependencies-and-territories-2130540">list of the world’s largest countries and d

In [62]:
# Create list of column titles.
col_titles4 = table4.find_all('th')[:4]

col_titles4

[<th>country, dependency, or territory</th>,
 <th>total area in square miles</th>,
 <th>total area in square km</th>,
 <th>total area rank</th>]

In [63]:
# Clean list of column titles.
col_table_titles4 = [title.text.strip() for title in col_titles4]

col_table_titles4

['country, dependency, or territory',
 'total area in square miles',
 'total area in square km',
 'total area rank']

In [64]:
# Convert list to Pandas dataframe.
countries4 = pd.DataFrame(columns = col_table_titles4).copy()

countries4

Unnamed: 0,"country, dependency, or territory",total area in square miles,total area in square km,total area rank


In [65]:
# Locate and extract table data.
row_dat4 = table4.find_all('table')

row_dat4

[<table> <thead> <tr> <th>country, dependency, or territory</th> <th>total area in square miles</th> <th>total area in square km</th> <th>total area rank</th> </tr> </thead> <tbody> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Russia">Russia</a></td> <td>6,592,812</td> <td>17,075,400</td> <td>1</td> </tr> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Canada">Canada</a></td> <td>3,855,081</td> <td>9,984,670</td> <td>2</td> </tr> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/United-States">United States</a><sup>1</sup></td> <td>3,809,525</td> <td>9,866,289</td> <td>3</td> </tr> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/China">China</a></td> <td>3,696,097</td> <td>9,572,900</td> <td>4</td> </tr> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.co

In [66]:
# Create a structured list.
x4 = []

for row in row_dat4:
    row_data4 = row.find_all('td')
    ind_row_data4 = [data.text.strip() for data in row_data4]  
    ind_row_data4 = [data.text.split(',') for data in row_data4]  
    ind_row_data4 = [data.text.strip() for data in row_data4]
    x4.append(ind_row_data4)
    
    print(ind_row_data4)


['Russia', '6,592,812', '17,075,400', '1', 'Canada', '3,855,081', '9,984,670', '2', 'United States1', '3,809,525', '9,866,289', '3', 'China', '3,696,097', '9,572,900', '4', 'Brazil', '3,285,872', '8,510,418', '5', 'Australia', '2,968,385', '7,688,126', '6', 'India', '1,269,292', '3,287,469', '7', 'Argentina', '1,044,319', '2,704,789', '8', 'Kazakhstan', '1,052,084', '2,724,900', '9', 'Algeria', '919,590', '2,381,741', '10', 'Democratic Republic of the Congo', '905,405', '2,345,000', '11', 'Greenland', '836,326', '2,166,086', '12', 'Saudi Arabia', '829,995', '2,149,690', '13', 'Mexico', '758,445', '1,964,375', '14', 'Indonesia', '730,660', '1,892,410', '15', 'Sudan', '710,689', '1,840,687', '16', 'Libya', '647,180', '1,676,198', '17', 'Iran', '629,670', '1,630,848', '18', 'Mongolia', '603,953', '1,564,241', '19', 'Peru', '496,171', '1,285,082', '20', 'Chad', '495,753', '1,284,000', '21', 'Niger', '489,189', '1,267,000', '22', 'Angola', '481,351', '1,246,700', '23', 'Mali', '479,242', '1

In [67]:
# Isolate table data only.
p = 0
q = 4
r = [[]]

for item in range(223):
    r[0][p:q] = x4[0][p:q]
    p += 4
    q += 4
    
r[0]

['Russia',
 '6,592,812',
 '17,075,400',
 '1',
 'Canada',
 '3,855,081',
 '9,984,670',
 '2',
 'United States1',
 '3,809,525',
 '9,866,289',
 '3',
 'China',
 '3,696,097',
 '9,572,900',
 '4',
 'Brazil',
 '3,285,872',
 '8,510,418',
 '5',
 'Australia',
 '2,968,385',
 '7,688,126',
 '6',
 'India',
 '1,269,292',
 '3,287,469',
 '7',
 'Argentina',
 '1,044,319',
 '2,704,789',
 '8',
 'Kazakhstan',
 '1,052,084',
 '2,724,900',
 '9',
 'Algeria',
 '919,590',
 '2,381,741',
 '10',
 'Democratic Republic of the Congo',
 '905,405',
 '2,345,000',
 '11',
 'Greenland',
 '836,326',
 '2,166,086',
 '12',
 'Saudi Arabia',
 '829,995',
 '2,149,690',
 '13',
 'Mexico',
 '758,445',
 '1,964,375',
 '14',
 'Indonesia',
 '730,660',
 '1,892,410',
 '15',
 'Sudan',
 '710,689',
 '1,840,687',
 '16',
 'Libya',
 '647,180',
 '1,676,198',
 '17',
 'Iran',
 '629,670',
 '1,630,848',
 '18',
 'Mongolia',
 '603,953',
 '1,564,241',
 '19',
 'Peru',
 '496,171',
 '1,285,082',
 '20',
 'Chad',
 '495,753',
 '1,284,000',
 '21',
 'Niger',
 '489,1

In [68]:
# Function to create structured list.
def split(list_e, chunk_size):

  for i in range(0, len(list_e), chunk_size):
    yield list_e[i:i + chunk_size]

chunk_size = 4

z4 = list(split(r[0], chunk_size))

z4

[['Russia', '6,592,812', '17,075,400', '1'],
 ['Canada', '3,855,081', '9,984,670', '2'],
 ['United States1', '3,809,525', '9,866,289', '3'],
 ['China', '3,696,097', '9,572,900', '4'],
 ['Brazil', '3,285,872', '8,510,418', '5'],
 ['Australia', '2,968,385', '7,688,126', '6'],
 ['India', '1,269,292', '3,287,469', '7'],
 ['Argentina', '1,044,319', '2,704,789', '8'],
 ['Kazakhstan', '1,052,084', '2,724,900', '9'],
 ['Algeria', '919,590', '2,381,741', '10'],
 ['Democratic Republic of the Congo', '905,405', '2,345,000', '11'],
 ['Greenland', '836,326', '2,166,086', '12'],
 ['Saudi Arabia', '829,995', '2,149,690', '13'],
 ['Mexico', '758,445', '1,964,375', '14'],
 ['Indonesia', '730,660', '1,892,410', '15'],
 ['Sudan', '710,689', '1,840,687', '16'],
 ['Libya', '647,180', '1,676,198', '17'],
 ['Iran', '629,670', '1,630,848', '18'],
 ['Mongolia', '603,953', '1,564,241', '19'],
 ['Peru', '496,171', '1,285,082', '20'],
 ['Chad', '495,753', '1,284,000', '21'],
 ['Niger', '489,189', '1,267,000', '22

In [69]:
# Convert to dataframe and rename columns.
df4 = pd.DataFrame(z4, columns=col_table_titles4).astype(str).copy()

df4 = df4[['country, dependency, or territory', 'total area in square miles']].astype(str).copy()

df4.columns.values[0] = 'Country'
df4.columns.values[1] = 'Area (Sq.Mi.)'

df4

Unnamed: 0,Country,Area (Sq.Mi.)
0,Russia,6592812
1,Canada,3855081
2,United States1,3809525
3,China,3696097
4,Brazil,3285872
...,...,...
218,Macau,13
219,Tuvalu,10
220,Nauru,8
221,Monaco,0.8


In [70]:
# Remove unnecessary characters from the datframe.
df4['Country'] = df4['Country'].replace('[0-9]', '', regex = True).astype(str).copy()

df4

Unnamed: 0,Country,Area (Sq.Mi.)
0,Russia,6592812
1,Canada,3855081
2,United States,3809525
3,China,3696097
4,Brazil,3285872
...,...,...
218,Macau,13
219,Tuvalu,10
220,Nauru,8
221,Monaco,0.8


In [71]:
# Merge datframes.
dfinal2 = dfinal1.merge(df4, on = 'Country', how = 'left').copy()

dfinal2

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.)
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,
2,Switzerland,938458,818426,105669,93260,95490,15942
3,Norway,526951,579422,94660,108729,94540,148449
4,Luxembourg,88556,81530,131384,125006,89200,998
...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199
87,Nepal,44179,39406,1397,1337,1340,56827
88,Myanmar,68006,65211,1248,1149,1270,261217
89,Sudan,26865,36729,547,1102,760,710689


In [72]:
# Convert NaN values to 'XXXX'.
dfinal2 = dfinal2.fillna('XXXX').astype(str).copy()

dfinal2

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.)
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX
2,Switzerland,938458,818426,105669,93260,95490,15942
3,Norway,526951,579422,94660,108729,94540,148449
4,Luxembourg,88556,81530,131384,125006,89200,998
...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199
87,Nepal,44179,39406,1397,1337,1340,56827
88,Myanmar,68006,65211,1248,1149,1270,261217
89,Sudan,26865,36729,547,1102,760,710689


In [73]:
# Create .csv file.
dfinal2.to_csv('highest gdp - brittanica.csv', index = False)

# Read .csv file.
dfinal2 = pd.read_csv ('highest gdp - brittanica.csv', sep = ',')

In [74]:
# Connect to webpage.
url5 = 'https://www.cia.gov/the-world-factbook/field/unemployment-rate/country-comparison/'

page5 = requests.get(url5)

# Extract site data.
soup5 = BeautifulSoup(page5.text, 'html')

In [75]:
# Isolate site data.
table5 = soup5.find('main')

table5

<main class="WFB" id="main-content" tabindex="-1"><section class="background-threads resources-threads"><div class="fade-in-section disable-fade-in fade-in-hero disable-transform"><div class="container hero-splash-container"><div class="row pv90-60"><div class="col-sm-12 col-md-8 hero-splash"><a class="hero-parent-title crown-link" href="/the-world-factbook/references/guide-to-country-comparisons/"><span class="h1 small mb0 d-inline h3-style">Country Comparisons</span></a><h1 class="hero-title">Unemployment rate</h1></div><div class="col-12"></div><div class="col-sm-12 col-md-5"><div class="hero-content">Unemployement rate compares the percent of the labor force that is without jobs.</div></div></div></div></div></section><section class="background-grid-short-white"><div class="container pv120-90" id="index-content-section"><div class="row"><div class="col-lg-3 col-md-12 col-sm-12"><div class="filter-wrapper"><div class="filter-header"><div class="filter-clear"><div class="label bold">

In [76]:
# Create list of column titles.
col_titles5 = table5.find_all('th')[:4]


col_titles5

[<th>Rank</th>, <th>Country</th>, <th>%</th>, <th>Date of Information</th>]

In [77]:
# Clean list of column titles.
col_table_titles5 = [title.text.strip() for title in col_titles5]

col_table_titles5[2] = 'Unemployment (%)'

col_table_titles5

['Rank', 'Country', 'Unemployment (%)', 'Date of Information']

In [78]:
# Convert list to Pandas dataframe.
countries5 = pd.DataFrame(columns = col_table_titles5).copy()

countries5

Unnamed: 0,Rank,Country,Unemployment (%),Date of Information


In [79]:
# Locate and extract table data.
row_dat5 = table5.find_all('table')

row_dat5

[<table class="content-table table-auto"><thead><tr class="header-row"><th>Rank</th><th>Country</th><th>%</th><th>Date of Information</th></tr></thead><tbody><tr class="content-row"><td>1</td><td><a class="text-button" href="/the-world-factbook/countries/cocos-keeling-islands/">Cocos (Keeling) Islands</a></td><td>0.1</td><td>2011 est.</td></tr><tr class="content-row"><td>2</td><td><a class="text-button" href="/the-world-factbook/countries/qatar/">Qatar</a></td><td>0.13</td><td>2022 est.</td></tr><tr class="content-row"><td>3</td><td><a class="text-button" href="/the-world-factbook/countries/cambodia/">Cambodia</a></td><td>0.23</td><td>2022 est.</td></tr><tr class="content-row"><td>4</td><td><a class="text-button" href="/the-world-factbook/countries/niger/">Niger</a></td><td>0.57</td><td>2022 est.</td></tr><tr class="content-row"><td>5</td><td><a class="text-button" href="/the-world-factbook/countries/burundi/">Burundi</a></td><td>0.91</td><td>2022 est.</td></tr><tr class="content-row">

In [80]:
# Create a structured list.
x5 = []

for row in row_dat5:
    row_data5 = row.find_all('td')
    ind_row_data5 = [data.text.strip() for data in row_data5]  
    ind_row_data5 = [data.text.split(',') for data in row_data5]  
    ind_row_data5 = [data.text.strip() for data in row_data5]
    x5.append(ind_row_data5)
    
    print(ind_row_data5)


['1', 'Cocos (Keeling) Islands', '0.1', '2011 est.', '2', 'Qatar', '0.13', '2022 est.', '3', 'Cambodia', '0.23', '2022 est.', '4', 'Niger', '0.57', '2022 est.', '5', 'Burundi', '0.91', '2022 est.', '6', 'Moldova', '0.91', '2022 est.', '7', 'Thailand', '0.94', '2022 est.', '8', 'Falkland Islands (Islas Malvinas)', '1', '2016 est.', '9', 'Gibraltar', '1', '2016 est.', '10', 'Chad', '1.1', '2022 est.', '11', 'Cuba', '1.25', '2022 est.', '12', 'Bahrain', '1.34', '2022 est.', '13', 'Benin', '1.48', '2022 est.', '14', 'Oman', '1.53', '2022 est.', '15', 'Vietnam', '1.54', '2022 est.', '16', 'Solomon Islands', '1.61', '2022 est.', '17', 'Palau', '1.7', '2015 est.', '18', 'Timor-Leste', '1.79', '2022 est.', '19', 'Madagascar', '1.9', '2022 est.', '20', 'Isle of Man', '2', 'April 2011 est.', '21', 'Tokelau', '2', '2015 est.', '22', 'Monaco', '2', '2012 est.', '23', 'Faroe Islands', '2.2', '2017 est.', '24', 'Kuwait', '2.2', '2022 est.', '25', 'Czechia', '2.22', '2022 est.', '26', 'Philippines', 

In [81]:
# Isolate table data only.
s = 0
t = 4
u = [[]]

for item in range(219):
    u[0][s:t] = x5[0][s:t]
    s += 4
    t += 4

u[0]

['1',
 'Cocos (Keeling) Islands',
 '0.1',
 '2011 est.',
 '2',
 'Qatar',
 '0.13',
 '2022 est.',
 '3',
 'Cambodia',
 '0.23',
 '2022 est.',
 '4',
 'Niger',
 '0.57',
 '2022 est.',
 '5',
 'Burundi',
 '0.91',
 '2022 est.',
 '6',
 'Moldova',
 '0.91',
 '2022 est.',
 '7',
 'Thailand',
 '0.94',
 '2022 est.',
 '8',
 'Falkland Islands (Islas Malvinas)',
 '1',
 '2016 est.',
 '9',
 'Gibraltar',
 '1',
 '2016 est.',
 '10',
 'Chad',
 '1.1',
 '2022 est.',
 '11',
 'Cuba',
 '1.25',
 '2022 est.',
 '12',
 'Bahrain',
 '1.34',
 '2022 est.',
 '13',
 'Benin',
 '1.48',
 '2022 est.',
 '14',
 'Oman',
 '1.53',
 '2022 est.',
 '15',
 'Vietnam',
 '1.54',
 '2022 est.',
 '16',
 'Solomon Islands',
 '1.61',
 '2022 est.',
 '17',
 'Palau',
 '1.7',
 '2015 est.',
 '18',
 'Timor-Leste',
 '1.79',
 '2022 est.',
 '19',
 'Madagascar',
 '1.9',
 '2022 est.',
 '20',
 'Isle of Man',
 '2',
 'April 2011 est.',
 '21',
 'Tokelau',
 '2',
 '2015 est.',
 '22',
 'Monaco',
 '2',
 '2012 est.',
 '23',
 'Faroe Islands',
 '2.2',
 '2017 est.',
 '24

In [82]:
# Create a structured list.
def split(list_f, chunk_size):

  for i in range(0, len(list_f), chunk_size):
    yield list_f[i:i + chunk_size]

chunk_size = 4

z5 = list(split(u[0], chunk_size))

z5

[['1', 'Cocos (Keeling) Islands', '0.1', '2011 est.'],
 ['2', 'Qatar', '0.13', '2022 est.'],
 ['3', 'Cambodia', '0.23', '2022 est.'],
 ['4', 'Niger', '0.57', '2022 est.'],
 ['5', 'Burundi', '0.91', '2022 est.'],
 ['6', 'Moldova', '0.91', '2022 est.'],
 ['7', 'Thailand', '0.94', '2022 est.'],
 ['8', 'Falkland Islands (Islas Malvinas)', '1', '2016 est.'],
 ['9', 'Gibraltar', '1', '2016 est.'],
 ['10', 'Chad', '1.1', '2022 est.'],
 ['11', 'Cuba', '1.25', '2022 est.'],
 ['12', 'Bahrain', '1.34', '2022 est.'],
 ['13', 'Benin', '1.48', '2022 est.'],
 ['14', 'Oman', '1.53', '2022 est.'],
 ['15', 'Vietnam', '1.54', '2022 est.'],
 ['16', 'Solomon Islands', '1.61', '2022 est.'],
 ['17', 'Palau', '1.7', '2015 est.'],
 ['18', 'Timor-Leste', '1.79', '2022 est.'],
 ['19', 'Madagascar', '1.9', '2022 est.'],
 ['20', 'Isle of Man', '2', 'April 2011 est.'],
 ['21', 'Tokelau', '2', '2015 est.'],
 ['22', 'Monaco', '2', '2012 est.'],
 ['23', 'Faroe Islands', '2.2', '2017 est.'],
 ['24', 'Kuwait', '2.2', '2

In [83]:
# Create dataframe as well as rename and drop columns.
df5 = pd.DataFrame(z5, columns = col_table_titles5).astype(str)

df5.columns.values[1] = 'Country'
df5.columns.values[3] = 'Date'

df5 = df5.drop(['Rank', 'Date'], axis = 1).copy()

df5

Unnamed: 0,Country,Unemployment (%)
0,Cocos (Keeling) Islands,0.1
1,Qatar,0.13
2,Cambodia,0.23
3,Niger,0.57
4,Burundi,0.91
...,...,...
214,Gaza Strip,24.42
215,Djibouti,26.67
216,South Africa,28.84
217,Kosovo,30.5


In [84]:
# Merge dataframes.
dfinal3 = dfinal2.merge(df5, on = 'Country', how = 'left').copy()

dfinal3

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%)
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8,2
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX,
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3
3,Norway,526951,579422,94660,108729,94540,148449,3.23
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58
...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02
87,Nepal,44179,39406,1397,1337,1340,56827,10.92
88,Myanmar,68006,65211,1248,1149,1270,261217,
89,Sudan,26865,36729,547,1102,760,710689,17.59


In [85]:
# Convert NaN values to 'XXXX'.
dfinal3 = dfinal3.fillna('XXXX').astype(str).copy()

dfinal3

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%)
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8,2
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX,XXXX
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3
3,Norway,526951,579422,94660,108729,94540,148449,3.23
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58
...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02
87,Nepal,44179,39406,1397,1337,1340,56827,10.92
88,Myanmar,68006,65211,1248,1149,1270,261217,XXXX
89,Sudan,26865,36729,547,1102,760,710689,17.59


In [86]:
# Create .csv file.
dfinal3.to_csv('highest gdp - CIA.csv', index = False)

# Read .csv file.
dfinal3 = pd.read_csv ('highest gdp - CIA.csv', sep = ',')

In [87]:
# Connect to webpage.
url6 = 'https://www.worlddata.info/populationgrowth.php'

page6 = requests.get(url6)

# Extract site data.
soup6 = BeautifulSoup(page6.text, 'html')

In [88]:
# Isolate site data.
table6 = soup6.find('main')

table6

<main id="main"><div id="intro1"><div id="blurbg" style="background-image:url(//cdn.worlddata.info/pics/crowds4.jpg);"></div><div id="intro3"><img alt="Population growth" class="deko" height="320" src="//cdn.worlddata.info/pics/crowds4.jpg" width="320"/><div class="floater"><h1>Population growth by country</h1> Global population growth is the result of birth rate and death rate. The world population is rising steadily. In 2022, it reached a total population of 7.952 billion people on our planet with a growth rate of 0.8%.<p>With a growth rate of about 0.6% in the last decade, the <a href="/america/usa/populationgrowth.php">United States</a> is well in the middle of the global comparison. The last slight increase in it's growth rate was in 90s.</p><p>By contrast, the countries of <a href="/asia/qatar/populationgrowth.php">Qatar</a> and <a href="/asia/oman/populationgrowth.php">Oman</a> (both in the Middle East) have led the field by far in recent decades. Here, growth rates of over 10% 

In [89]:
# Create list of column titles.
col_titles6 = table6.find_all('th')[:3]

col_titles6

[<th class="left">Country/Region</th>,
 <th class="right">Ø Growth/year</th>,
 <th class="right">Growth 2013-2022</th>]

In [90]:
# Clean list of column titles.
col_table_titles6 = [title.text.strip() for title in col_titles6]

col_table_titles6

['Country/Region', 'Ø Growth/year', 'Growth 2013-2022']

In [91]:
# Create Seres from list.
col_table_titles6 = pd.Series(col_table_titles6) 

col_table_titles6

0      Country/Region
1       Ø Growth/year
2    Growth 2013-2022
dtype: object

In [92]:
# Locate and extract table data.
row_dat6 = table6.find_all('table')

row_dat6

[<table class="std100 hover tabsort sticky"><tr class="no-sort"><th class="left">Country/Region</th><th class="right">Ø Growth/year</th><th class="right">Growth 2013-2022</th></tr><tr><td><a class="fl_af fl_GNQ" href="/africa/equatorial-guinea/populationgrowth.php">Equatorial Guinea</a></td><td>5.08%</td><td>55.60%</td></tr><tr><td><a class="fl_af fl_NER" href="/africa/niger/populationgrowth.php">Niger</a></td><td>3.95%</td><td>41.63%</td></tr><tr><td><a class="fl_af fl_COD" href="/africa/congo-kinshasa/populationgrowth.php">Democratic Republic of the Congo</a></td><td>3.68%</td><td>38.75%</td></tr><tr><td><a class="fl_af fl_GMB" href="/africa/gambia/populationgrowth.php">Gambia</a></td><td>3.60%</td><td>37.80%</td></tr><tr><td><a class="fl_af fl_AGO" href="/africa/angola/populationgrowth.php">Angola</a></td><td>3.55%</td><td>36.80%</td></tr><tr><td><a class="fl_af fl_MLI" href="/africa/mali/populationgrowth.php">Mali</a></td><td>3.53%</td><td>37.35%</td></tr><tr><td><a class="fl_as fl

In [93]:
# Create a structured list.
x6 = []

for row in row_dat6:
    row_data6 = row.find_all('td')
    ind_row_data6 = [data.text.strip() for data in row_data6]  
    ind_row_data6 = [data.text.split(',') for data in row_data6]  
    ind_row_data6 = [data.text.strip() for data in row_data6]
    x6.append(ind_row_data6)
    
    print(ind_row_data6)


['Equatorial Guinea', '5.08%', '55.60%', 'Niger', '3.95%', '41.63%', 'Democratic Republic of the Congo', '3.68%', '38.75%', 'Gambia', '3.60%', '37.80%', 'Angola', '3.55%', '36.80%', 'Mali', '3.53%', '37.35%', 'Iraq', '3.41%', '34.20%', 'Jordan', '3.40%', '32.48%', 'Burundi', '3.40%', '35.11%', 'Tanzania', '3.36%', '35.09%', 'Chad', '3.32%', '34.06%', 'Zambia', '3.30%', '34.11%', 'Somalia', '3.30%', '34.71%', 'Yemen', '3.28%', '34.00%', 'Seychelles', '3.26%', '33.27%', 'Benin', '3.22%', '33.47%', 'Burkina Faso', '3.19%', '32.81%', 'Uganda', '3.18%', '32.37%', 'Gabon', '3.18%', '31.47%', 'Maldives', '2.82%', '26.03%', 'Afghanistan', '2.82%', '27.45%', 'Nigeria', '2.71%', '27.23%', 'Cameroon', '2.65%', '26.44%', 'Sudan', '2.62%', '26.44%', 'Egypt', '2.55%', '25.55%', 'Palestine', '2.40%', '23.72%', 'Pakistan', '2.35%', '23.30%', 'Saudi Arabia', '2.26%', '21.15%', 'Qatar', '2.13%', '15.34%', 'Macao *', '2.12%', '20.40%', 'Luxembourg', '2.09%', '20.20%', 'Israel', '1.91%', '18.59%', 'Philip

In [94]:
# Isolate table data only.
v1 = 0
v2 = 3
v = [[]]

for item in range(104):
    v[0][v1:v2] = x6[0][v1:v2]
    v1 += 3
    v2 += 3

v[0]

['Equatorial Guinea',
 '5.08%',
 '55.60%',
 'Niger',
 '3.95%',
 '41.63%',
 'Democratic Republic of the Congo',
 '3.68%',
 '38.75%',
 'Gambia',
 '3.60%',
 '37.80%',
 'Angola',
 '3.55%',
 '36.80%',
 'Mali',
 '3.53%',
 '37.35%',
 'Iraq',
 '3.41%',
 '34.20%',
 'Jordan',
 '3.40%',
 '32.48%',
 'Burundi',
 '3.40%',
 '35.11%',
 'Tanzania',
 '3.36%',
 '35.09%',
 'Chad',
 '3.32%',
 '34.06%',
 'Zambia',
 '3.30%',
 '34.11%',
 'Somalia',
 '3.30%',
 '34.71%',
 'Yemen',
 '3.28%',
 '34.00%',
 'Seychelles',
 '3.26%',
 '33.27%',
 'Benin',
 '3.22%',
 '33.47%',
 'Burkina Faso',
 '3.19%',
 '32.81%',
 'Uganda',
 '3.18%',
 '32.37%',
 'Gabon',
 '3.18%',
 '31.47%',
 'Maldives',
 '2.82%',
 '26.03%',
 'Afghanistan',
 '2.82%',
 '27.45%',
 'Nigeria',
 '2.71%',
 '27.23%',
 'Cameroon',
 '2.65%',
 '26.44%',
 'Sudan',
 '2.62%',
 '26.44%',
 'Egypt',
 '2.55%',
 '25.55%',
 'Palestine',
 '2.40%',
 '23.72%',
 'Pakistan',
 '2.35%',
 '23.30%',
 'Saudi Arabia',
 '2.26%',
 '21.15%',
 'Qatar',
 '2.13%',
 '15.34%',
 'Macao *',
 

In [95]:
# Function to create structured list.
def split(list_g, chunk_size):

  for i in range(0, len(list_g), chunk_size):
    yield list_g[i:i + chunk_size]

chunk_size = 3

z6 = list(split(v[0], chunk_size))

z6

[['Equatorial Guinea', '5.08%', '55.60%'],
 ['Niger', '3.95%', '41.63%'],
 ['Democratic Republic of the Congo', '3.68%', '38.75%'],
 ['Gambia', '3.60%', '37.80%'],
 ['Angola', '3.55%', '36.80%'],
 ['Mali', '3.53%', '37.35%'],
 ['Iraq', '3.41%', '34.20%'],
 ['Jordan', '3.40%', '32.48%'],
 ['Burundi', '3.40%', '35.11%'],
 ['Tanzania', '3.36%', '35.09%'],
 ['Chad', '3.32%', '34.06%'],
 ['Zambia', '3.30%', '34.11%'],
 ['Somalia', '3.30%', '34.71%'],
 ['Yemen', '3.28%', '34.00%'],
 ['Seychelles', '3.26%', '33.27%'],
 ['Benin', '3.22%', '33.47%'],
 ['Burkina Faso', '3.19%', '32.81%'],
 ['Uganda', '3.18%', '32.37%'],
 ['Gabon', '3.18%', '31.47%'],
 ['Maldives', '2.82%', '26.03%'],
 ['Afghanistan', '2.82%', '27.45%'],
 ['Nigeria', '2.71%', '27.23%'],
 ['Cameroon', '2.65%', '26.44%'],
 ['Sudan', '2.62%', '26.44%'],
 ['Egypt', '2.55%', '25.55%'],
 ['Palestine', '2.40%', '23.72%'],
 ['Pakistan', '2.35%', '23.30%'],
 ['Saudi Arabia', '2.26%', '21.15%'],
 ['Qatar', '2.13%', '15.34%'],
 ['Macao *', 

In [96]:
# Convert list to dataframe and rename columns.
df6 = pd.DataFrame(z6, columns = col_table_titles6).astype(str)

df6.columns.values[0] = 'Country'
df6.columns.values[1] = 'Pop. Growth / Year (%)'
df6.columns.values[2] = 'Pop. Growth 2013-2022 (%)'

df6

Unnamed: 0,Country,Pop. Growth / Year (%),Pop. Growth 2013-2022 (%)
0,Equatorial Guinea,5.08%,55.60%
1,Niger,3.95%,41.63%
2,Democratic Republic of the Congo,3.68%,38.75%
3,Gambia,3.60%,37.80%
4,Angola,3.55%,36.80%
...,...,...,...
99,Puerto Rico *,-1.19%,-10.38%
100,Bulgaria,-1.20%,-11.01%
101,Saint Martin *,-1.51%,-12.80%
102,Ukraine,-1.72%,-16.46%


In [97]:
# Change 'United States of America' to 'United States'.
df6.loc[60, 'Country'] = 'United States'

df6

Unnamed: 0,Country,Pop. Growth / Year (%),Pop. Growth 2013-2022 (%)
0,Equatorial Guinea,5.08%,55.60%
1,Niger,3.95%,41.63%
2,Democratic Republic of the Congo,3.68%,38.75%
3,Gambia,3.60%,37.80%
4,Angola,3.55%,36.80%
...,...,...,...
99,Puerto Rico *,-1.19%,-10.38%
100,Bulgaria,-1.20%,-11.01%
101,Saint Martin *,-1.51%,-12.80%
102,Ukraine,-1.72%,-16.46%


In [98]:
# Remove Unnecessary characters from the dataframe.
df6 = df6.replace(r'[\*\%]', '', regex = True).copy()

df6

Unnamed: 0,Country,Pop. Growth / Year (%),Pop. Growth 2013-2022 (%)
0,Equatorial Guinea,5.08,55.60
1,Niger,3.95,41.63
2,Democratic Republic of the Congo,3.68,38.75
3,Gambia,3.60,37.80
4,Angola,3.55,36.80
...,...,...,...
99,Puerto Rico,-1.19,-10.38
100,Bulgaria,-1.20,-11.01
101,Saint Martin,-1.51,-12.80
102,Ukraine,-1.72,-16.46


In [99]:
# Merge dataframes.
dfinal4 = dfinal3.merge(df6, on='Country', how = 'left').copy()

dfinal4

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%)
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8,2,,
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX,XXXX,-0.19,-2.16
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49
3,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20
...,...,...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02,,
87,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49
88,Myanmar,68006,65211,1248,1149,1270,261217,XXXX,0.53,4.49
89,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44


In [100]:
# Convert NaN values to 'XXXX'
dfinal4 = dfinal4.fillna('XXXX').astype(str).copy()

dfinal4

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%)
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8,2,XXXX,XXXX
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX,XXXX,-0.19,-2.16
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49
3,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20
...,...,...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02,XXXX,XXXX
87,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49
88,Myanmar,68006,65211,1248,1149,1270,261217,XXXX,0.53,4.49
89,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44


In [101]:
# Create .csv file.
dfinal4.to_csv('highest gdp - worlddata2.csv', index = False)

# Read .csv file.
dfinal4 = pd.read_csv ('highest gdp - worlddata2.csv', sep = ',')

In [102]:
# Connect to webpage.
url7 = 'https://en.wikipedia.org/wiki/List_of_countries_by_exports'

page7 = requests.get(url7)

# Extract site data.
soup7 = BeautifulSoup(page7.text, 'html')

In [103]:
# Isolate site data.
table7 = soup7.find('main')

table7

<main class="mw-body" id="content">
<header class="mw-body-header vector-page-titlebar">
<nav aria-label="Contents" class="vector-toc-landmark">
<div class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" id="vector-page-titlebar-toc">
<input aria-haspopup="true" aria-label="Toggle the table of contents" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-vector-page-titlebar-toc" id="vector-page-titlebar-toc-checkbox" role="button" type="checkbox"/>
<label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" for="vector-page-titlebar-toc-checkbox" id="vector-page-titlebar-toc-label"><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span>
<span class="vector-dropdown-label-text">Toggle the table of contents</span>
</label>
<div class="vector-dropdown-content">
<div class="vector-unpinned-container" id="vector-page-ti

In [104]:
# Create list of column titles.
col_titles7 = table7.find_all('th')[:4]

col_titles7

[<th>Country
 </th>,
 <th>Exports
 </th>,
 <th>Year
 </th>,
 <th>Top export (2021)<sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup>
 </th>]

In [105]:
# Clean list of column titles.
col_table_titles7 = [title.text.strip() for title in col_titles7]

col_table_titles7

['Country', 'Exports', 'Year', 'Top export (2021)[2]']

In [106]:
# Convert list to Pandas dataframe.
countries7 = pd.DataFrame(columns = col_table_titles7).copy()

countries7

Unnamed: 0,Country,Exports,Year,Top export (2021)[2]


In [107]:
# Locate and extract table data.
row_dat7 = table7.find_all('table')

row_dat7

[<table class="wikitable sortable mw-datatable static-row-numbers">
 <caption>Exports of goods and services (US$ million) by country<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup>
 </caption>
 <tbody><tr>
 <th>Country
 </th>
 <th>Exports
 </th>
 <th>Year
 </th>
 <th>Top export (2021)<sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup>
 </th></tr>
 <tr>
 <td><span class="flagicon" style="display:inline-block;width:25px;"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/23px-Flag_of_the_People%27s_Republic_of_China.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Flag_of_the_People%27s_Republic_of_China.svg/35px-Flag_of_the_People%27s_Republic_of_China.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons

In [108]:
# Create a structured list.
x7 = []

for row in row_dat7:
    row_data7 = row.find_all('td')
    ind_row_data7 = [data.text.strip() for data in row_data7]  
    ind_row_data7 = [data.text.split(',') for data in row_data7]  
    ind_row_data7 = [data.text.strip() for data in row_data7]
    x7.append(ind_row_data7)
    
    print(ind_row_data7)


['China', '3,511,248', '2023', 'Broadcasting equipment', 'United States', '3,051,824', '2023', 'Petroleum', 'Germany', '2,104,251', '2023', 'Cars', 'United Kingdom', '1,074,781', '2023', 'Gold', 'France', '1,051,679', '2023', 'Packaged medications', 'Netherlands', '949,983', '2023', 'Petroleum', 'Japan', '920,737', '2023', 'Cars', 'India', '830,950', '2023', 'Petroleum', 'Italy', '793,588', '2023', 'Packaged medications', 'Singapore', '778,000', '2023', 'Intergated Circuits', 'South Korea', '769,534', '2023', 'Integrated circuits', 'United Arab Emirates', '753,000', '2022[3]', 'Petroleum', 'Ireland', '731,813', '2023', 'Blood', 'Canada', '717,677', '2023', 'Petroleum', 'Hong Kong', '673,305', '2023', 'Integrated circuits', 'Switzerland', '661,627', '2023', 'Gold', 'Mexico', '649,312', '2023', 'Cars', 'Spain', '615,829', '2023', 'Cars', 'Taiwan', '536,128', '2022[4]', 'Integrated circuits', 'Belgium', '535,173', '2023', 'Vaccines', 'Poland', '469,264', '2023', 'Car parts', 'Russia', '46

In [109]:
# Isolate table data only.
w1 = 0
w2 = 4
w = [[]]

for item in range(205):
    w[0][w1:w2] = x7[0][w1:w2]
    w1 += 4
    w2 += 4
  
w[0]

['China',
 '3,511,248',
 '2023',
 'Broadcasting equipment',
 'United States',
 '3,051,824',
 '2023',
 'Petroleum',
 'Germany',
 '2,104,251',
 '2023',
 'Cars',
 'United Kingdom',
 '1,074,781',
 '2023',
 'Gold',
 'France',
 '1,051,679',
 '2023',
 'Packaged medications',
 'Netherlands',
 '949,983',
 '2023',
 'Petroleum',
 'Japan',
 '920,737',
 '2023',
 'Cars',
 'India',
 '830,950',
 '2023',
 'Petroleum',
 'Italy',
 '793,588',
 '2023',
 'Packaged medications',
 'Singapore',
 '778,000',
 '2023',
 'Intergated Circuits',
 'South Korea',
 '769,534',
 '2023',
 'Integrated circuits',
 'United Arab Emirates',
 '753,000',
 '2022[3]',
 'Petroleum',
 'Ireland',
 '731,813',
 '2023',
 'Blood',
 'Canada',
 '717,677',
 '2023',
 'Petroleum',
 'Hong Kong',
 '673,305',
 '2023',
 'Integrated circuits',
 'Switzerland',
 '661,627',
 '2023',
 'Gold',
 'Mexico',
 '649,312',
 '2023',
 'Cars',
 'Spain',
 '615,829',
 '2023',
 'Cars',
 'Taiwan',
 '536,128',
 '2022[4]',
 'Integrated circuits',
 'Belgium',
 '535,173'

In [110]:
# Function to create structured list.
def split(list_h, chunk_size):

  for i in range(0, len(list_h), chunk_size):
    yield list_h[i:i + chunk_size]

chunk_size = 4

z7 = list(split(w[0], chunk_size))

z7

[['China', '3,511,248', '2023', 'Broadcasting equipment'],
 ['United States', '3,051,824', '2023', 'Petroleum'],
 ['Germany', '2,104,251', '2023', 'Cars'],
 ['United Kingdom', '1,074,781', '2023', 'Gold'],
 ['France', '1,051,679', '2023', 'Packaged medications'],
 ['Netherlands', '949,983', '2023', 'Petroleum'],
 ['Japan', '920,737', '2023', 'Cars'],
 ['India', '830,950', '2023', 'Petroleum'],
 ['Italy', '793,588', '2023', 'Packaged medications'],
 ['Singapore', '778,000', '2023', 'Intergated Circuits'],
 ['South Korea', '769,534', '2023', 'Integrated circuits'],
 ['United Arab Emirates', '753,000', '2022[3]', 'Petroleum'],
 ['Ireland', '731,813', '2023', 'Blood'],
 ['Canada', '717,677', '2023', 'Petroleum'],
 ['Hong Kong', '673,305', '2023', 'Integrated circuits'],
 ['Switzerland', '661,627', '2023', 'Gold'],
 ['Mexico', '649,312', '2023', 'Cars'],
 ['Spain', '615,829', '2023', 'Cars'],
 ['Taiwan', '536,128', '2022[4]', 'Integrated circuits'],
 ['Belgium', '535,173', '2023', 'Vaccines

In [111]:
# Convert list to dataframe as well as rename and drop columns.
df7 = pd.DataFrame(z7, columns = col_table_titles7).astype(str).copy()

df7.columns.values[0] = 'Country'
df7.columns.values[1] = 'Exports (Mn)'
df7.columns.values[3] = 'Top Export'

df7 = df7.drop(['Year'], axis = 1).copy() 

df7

Unnamed: 0,Country,Exports (Mn),Top Export
0,China,3511248,Broadcasting equipment
1,United States,3051824,Petroleum
2,Germany,2104251,Cars
3,United Kingdom,1074781,Gold
4,France,1051679,Packaged medications
...,...,...,...
200,Tonga,60,Shellfish
201,Nauru,31,Fish
202,Palau,12,Computers
203,Kiribati,11,Fish


In [112]:
# Merge dataframes.
dfinal5 = dfinal4.merge(df7, on = 'Country', how = 'left')

dfinal5

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8,2,XXXX,XXXX,,
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX,XXXX,-0.19,-2.16,,
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49,661627,Gold
3,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron
...,...,...,...,...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02,XXXX,XXXX,3292,Gold
87,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil
88,Myanmar,68006,65211,1248,1149,1270,261217,XXXX,0.53,4.49,17523,Clothing
89,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold


In [113]:
# Convert NaN values to 'XXXX'.
dfinal5 = dfinal5.fillna('XXXX').astype(str).copy()

dfinal5

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8,2,XXXX,XXXX,XXXX,XXXX
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX,XXXX,-0.19,-2.16,XXXX,XXXX
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49,661627,Gold
3,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron
...,...,...,...,...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02,XXXX,XXXX,3292,Gold
87,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil
88,Myanmar,68006,65211,1248,1149,1270,261217,XXXX,0.53,4.49,17523,Clothing
89,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold


In [114]:
# Create .csv file.
dfinal5.to_csv('highest gdp - wiki 3.csv', index = False)

# Read .csv file.
dfinal5 = pd.read_csv ('highest gdp - wiki 3.csv', sep = ',')

In [115]:
# Connect to webpage.
url8 = 'https://en.wikipedia.org/wiki/List_of_countries_by_imports'

page8 = requests.get(url8)

# Extract site data.
soup8 = BeautifulSoup(page8.text, 'html')

In [116]:
# Isolate site data.
table8 = soup8.find('main')

table8

<main class="mw-body" id="content">
<header class="mw-body-header vector-page-titlebar">
<nav aria-label="Contents" class="vector-toc-landmark">
<div class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" id="vector-page-titlebar-toc">
<input aria-haspopup="true" aria-label="Toggle the table of contents" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-vector-page-titlebar-toc" id="vector-page-titlebar-toc-checkbox" role="button" type="checkbox"/>
<label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" for="vector-page-titlebar-toc-checkbox" id="vector-page-titlebar-toc-label"><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span>
<span class="vector-dropdown-label-text">Toggle the table of contents</span>
</label>
<div class="vector-dropdown-content">
<div class="vector-unpinned-container" id="vector-page-ti

In [117]:
# Create list of column titles.
col_titles8 = table8.find_all('th')[1:4]

col_titles8

[<th>Country
 </th>,
 <th>Imports
 <p>(millions of <a href="/wiki/United_States_dollar" title="United States dollar">$</a>)
 </p>
 </th>,
 <th>Year
 </th>]

In [118]:
# Clean list of column titles.
col_table_titles8 = [title.text.strip() for title in col_titles8]

col_table_titles8

['Country', 'Imports\n(millions of $)', 'Year']

In [119]:
# Convert list to dataframe and rename column.
countries8 = pd.DataFrame(columns = col_table_titles8).copy()

countries8.columns.values[1] = 'Imports'

countries8

Unnamed: 0,Country,Imports,Year


In [120]:
# Locate and extract table data.
row_dat8 = table8.find_all('table')

row_dat8

[<table class="sidebar sidebar-collapse nomobile nowraplinks"><tbody><tr><td class="sidebar-pretitle"><i>Part of <a href="/wiki/Category:Trade" title="Category:Trade">a series</a> on</i></td></tr><tr><th class="sidebar-title-with-pretitle"><a href="/wiki/International_trade" title="International trade">World trade</a></th></tr><tr><td class="sidebar-image" style="padding: 0.4em 0 0.4em; border-bottom: 1px solid #ccc"><figure class="mw-halign-center notpageimage" typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Storck_Harbour_scene.jpg"><img class="mw-file-element" data-file-height="1599" data-file-width="1590" decoding="async" height="181" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cf/Storck_Harbour_scene.jpg/180px-Storck_Harbour_scene.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cf/Storck_Harbour_scene.jpg/270px-Storck_Harbour_scene.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cf/Storck_Harbour_scene.jpg/360px-Storck_Harbour_sc

In [121]:
# Create a structured list.
x8 = []

for row in row_dat8:
    row_data8 = row.find_all('td')
    ind_row_data8 = [data.text.strip() for data in row_data8]  
    ind_row_data8 = [data.text.split(',') for data in row_data8]  
    ind_row_data8 = [data.text.strip() for data in row_data8]
    x8.append(ind_row_data8)
    
    print(ind_row_data8)


['Part of a series on', '', 'Policy\nImport\nExport\nBalance of trade\nTrade law\nTrade pact\nTrade bloc\nTrade creation\nTrade diversion\nExport orientation\nImport substitution\nTrade finance\nTrade facilitation\nTrade route\nDomestic trade\nTax', 'Restrictions\nTrade barriers\nTariffs\nNon-tariff barriers\nImport quotas\nTariff-rate quotas\nImport licenses\nCustoms duties\nExport subsidies\nTechnical barriers\nBribery\nExchange rate controls\nEmbargo\nSafeguards\nCountervailing duties\nAnti-dumping duties\nVoluntary export restraints', 'History\nMercantilism\nProtectionism\nLaissez-faire\nFree trade\nEconomic nationalism\nEconomic integration', 'Organizations\nInternational Monetary Fund\nInternational Trade Centre\nWorld Trade Organization\nWorld Customs Organization\nInternational Chamber of Commerce', 'Economic integration\nPreferential trading area\nFree-trade area\nCurrency union\nCustoms union\nSingle market\nEconomic union\nFiscal union\nCustoms and monetary union\nEconomic a

In [122]:
# Isolate table data only.
x8[0] = x8[1].copy()

x8

[['United States',
  '3,375,948',
  '2022',
  'European Union[n 1]',
  '2,743,745[3]',
  '2022',
  'China',
  '2,715,999',
  '2022',
  'Germany',
  '1,571,057',
  '2022',
  'Japan',
  '905,099',
  '2022',
  'France',
  '830,300',
  '2022',
  'United Kingdom',
  '816,300',
  '2022',
  'Italy',
  '743,030',
  '2022',
  'South Korea',
  '731,366',
  '2022',
  'Netherlands',
  '712,801',
  '2022',
  'India',
  '677,240',
  '2023-24',
  'Hong Kong',
  '669,093',
  '2022',
  'Belgium',
  '624,289',
  '2022',
  'Mexico',
  '604,615',
  '2022',
  'Canada',
  '567,826',
  '2022',
  'Spain',
  '499,055',
  '2022',
  'Singapore',
  '475,516',
  '2022',
  'Taiwan',
  '437,335',
  '2022',
  'Vietnam',
  '364,052',
  '2022',
  'Turkey',
  '363,711',
  '2022',
  'Poland',
  '358,593',
  '2022',
  'Switzerland',
  '356,763',
  '2022',
  'Thailand',
  '306,260',
  '2022',
  'Malaysia',
  '295,092',
  '2022',
  'Australia',
  '290,113',
  '2022',
  'Brazil',
  '272,701',
  '2022',
  'United Arab Emirate

In [123]:
# Create a structured list.
a1 = 0
a2 = 3
a3 = [[]]

for item in range(1):
    a3[0][a1:2] = x8[a1:a2][a1:a2]
    a1 += 3
    a2 += 3
    
a3[0]

[['United States',
  '3,375,948',
  '2022',
  'European Union[n 1]',
  '2,743,745[3]',
  '2022',
  'China',
  '2,715,999',
  '2022',
  'Germany',
  '1,571,057',
  '2022',
  'Japan',
  '905,099',
  '2022',
  'France',
  '830,300',
  '2022',
  'United Kingdom',
  '816,300',
  '2022',
  'Italy',
  '743,030',
  '2022',
  'South Korea',
  '731,366',
  '2022',
  'Netherlands',
  '712,801',
  '2022',
  'India',
  '677,240',
  '2023-24',
  'Hong Kong',
  '669,093',
  '2022',
  'Belgium',
  '624,289',
  '2022',
  'Mexico',
  '604,615',
  '2022',
  'Canada',
  '567,826',
  '2022',
  'Spain',
  '499,055',
  '2022',
  'Singapore',
  '475,516',
  '2022',
  'Taiwan',
  '437,335',
  '2022',
  'Vietnam',
  '364,052',
  '2022',
  'Turkey',
  '363,711',
  '2022',
  'Poland',
  '358,593',
  '2022',
  'Switzerland',
  '356,763',
  '2022',
  'Thailand',
  '306,260',
  '2022',
  'Malaysia',
  '295,092',
  '2022',
  'Australia',
  '290,113',
  '2022',
  'Brazil',
  '272,701',
  '2022',
  'United Arab Emirate

In [124]:
# Convert to a single list.
a3 = a3[0][1]

a3

['United States',
 '3,375,948',
 '2022',
 'European Union[n 1]',
 '2,743,745[3]',
 '2022',
 'China',
 '2,715,999',
 '2022',
 'Germany',
 '1,571,057',
 '2022',
 'Japan',
 '905,099',
 '2022',
 'France',
 '830,300',
 '2022',
 'United Kingdom',
 '816,300',
 '2022',
 'Italy',
 '743,030',
 '2022',
 'South Korea',
 '731,366',
 '2022',
 'Netherlands',
 '712,801',
 '2022',
 'India',
 '677,240',
 '2023-24',
 'Hong Kong',
 '669,093',
 '2022',
 'Belgium',
 '624,289',
 '2022',
 'Mexico',
 '604,615',
 '2022',
 'Canada',
 '567,826',
 '2022',
 'Spain',
 '499,055',
 '2022',
 'Singapore',
 '475,516',
 '2022',
 'Taiwan',
 '437,335',
 '2022',
 'Vietnam',
 '364,052',
 '2022',
 'Turkey',
 '363,711',
 '2022',
 'Poland',
 '358,593',
 '2022',
 'Switzerland',
 '356,763',
 '2022',
 'Thailand',
 '306,260',
 '2022',
 'Malaysia',
 '295,092',
 '2022',
 'Australia',
 '290,113',
 '2022',
 'Brazil',
 '272,701',
 '2022',
 'United Arab Emirates',
 '248,418',
 '2022',
 'Indonesia',
 '237,447',
 '2022',
 'Czech Republic',


In [125]:
# Create a structured list.
def split(list_i, chunk_size):

  for i in range(0, len(list_i), chunk_size):
    yield list_i[i:i + chunk_size]

chunk_size = 3

z8 = list(split(a3, chunk_size))

z8

[['United States', '3,375,948', '2022'],
 ['European Union[n 1]', '2,743,745[3]', '2022'],
 ['China', '2,715,999', '2022'],
 ['Germany', '1,571,057', '2022'],
 ['Japan', '905,099', '2022'],
 ['France', '830,300', '2022'],
 ['United Kingdom', '816,300', '2022'],
 ['Italy', '743,030', '2022'],
 ['South Korea', '731,366', '2022'],
 ['Netherlands', '712,801', '2022'],
 ['India', '677,240', '2023-24'],
 ['Hong Kong', '669,093', '2022'],
 ['Belgium', '624,289', '2022'],
 ['Mexico', '604,615', '2022'],
 ['Canada', '567,826', '2022'],
 ['Spain', '499,055', '2022'],
 ['Singapore', '475,516', '2022'],
 ['Taiwan', '437,335', '2022'],
 ['Vietnam', '364,052', '2022'],
 ['Turkey', '363,711', '2022'],
 ['Poland', '358,593', '2022'],
 ['Switzerland', '356,763', '2022'],
 ['Thailand', '306,260', '2022'],
 ['Malaysia', '295,092', '2022'],
 ['Australia', '290,113', '2022'],
 ['Brazil', '272,701', '2022'],
 ['United Arab Emirates', '248,418', '2022'],
 ['Indonesia', '237,447', '2022'],
 ['Czech Republic',

In [126]:
# Convert list to dataframe.
df8 = pd.DataFrame(z8, columns = countries8.columns.values).astype(str).copy()

df8

Unnamed: 0,Country,Imports,Year
0,United States,3375948,2022
1,European Union[n 1],"2,743,745[3]",2022
2,China,2715999,2022
3,Germany,1571057,2022
4,Japan,905099,2022
...,...,...,...
229,Niue (NZ),31.70,2022
230,Guam (US),19.66,2019
231,Cocos Islands (AU),13.04,2022
232,Pitcairn Islands (UK),5.988,2019


In [127]:
# Remove unnecessary characters from datframe.
df8 = df8.replace(r'\[.*\]', '', regex = True).astype(str).copy()

df8

Unnamed: 0,Country,Imports,Year
0,United States,3375948,2022
1,European Union,2743745,2022
2,China,2715999,2022
3,Germany,1571057,2022
4,Japan,905099,2022
...,...,...,...
229,Niue (NZ),31.70,2022
230,Guam (US),19.66,2019
231,Cocos Islands (AU),13.04,2022
232,Pitcairn Islands (UK),5.988,2019


In [128]:
# drop the 'Year' column.
df8 = df8.drop(['Year'], axis = 1).copy() 

df8

Unnamed: 0,Country,Imports
0,United States,3375948
1,European Union,2743745
2,China,2715999
3,Germany,1571057
4,Japan,905099
...,...,...
229,Niue (NZ),31.70
230,Guam (US),19.66
231,Cocos Islands (AU),13.04
232,Pitcairn Islands (UK),5.988


In [129]:
# Merge dataframes.
dfinal6 = dfinal5.merge(df8, on='Country', how = 'left').copy()

dfinal6

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8,2,XXXX,XXXX,XXXX,XXXX,1371
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX,XXXX,-0.19,-2.16,XXXX,XXXX,
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49,661627,Gold,356763
3,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02,XXXX,XXXX,3292,Gold,9629
87,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716
88,Myanmar,68006,65211,1248,1149,1270,261217,XXXX,0.53,4.49,17523,Clothing,17403
89,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448


In [130]:
# Replace NaN values with 'XXXX'.
dfinal6 = dfinal6.fillna('XXXX').astype(str).copy()

dfinal6

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports
0,Monaco,XXXX,8772,XXXX,240862,186080,0.8,2,XXXX,XXXX,XXXX,XXXX,1371
1,Bermuda,XXXX,XXXX,XXXX,XXXX,125210,XXXX,XXXX,-0.19,-2.16,XXXX,XXXX,XXXX
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49,661627,Gold,356763
3,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02,XXXX,XXXX,3292,Gold,9629
87,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716
88,Myanmar,68006,65211,1248,1149,1270,261217,XXXX,0.53,4.49,17523,Clothing,17403
89,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448


In [131]:
# Convert all 'XXXX' entries to NaN values.
dfinal6.replace('XXXX', '0', inplace = True)
dfinal6 = dfinal6.replace('0', np.nan).copy()

dfinal6

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports
0,Monaco,,8772,,240862,186080,0.8,2,,,,,1371
1,Bermuda,,,,,125210,,,-0.19,-2.16,,,
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49,661627,Gold,356763
3,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02,,,3292,Gold,9629
87,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716
88,Myanmar,68006,65211,1248,1149,1270,261217,,0.53,4.49,17523,Clothing,17403
89,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448


In [132]:
# Remove unnecessary characters from the dataframe.
dfinal6 = dfinal6.replace(r',', '', regex = True).copy()

dfinal6

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports
0,Monaco,,8772,,240862,186080,0.8,2,,,,,1371
1,Bermuda,,,,,125210,,,-0.19,-2.16,,,
2,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49,661627,Gold,356763
3,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268
4,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,Kyrgyzstan,13599,10930,1922,1655,1440,77199,4.02,,,3292,Gold,9629
87,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716
88,Myanmar,68006,65211,1248,1149,1270,261217,,0.53,4.49,17523,Clothing,17403
89,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448


In [133]:
# Drop all countries with NaN values.
dfinal7 = dfinal6.dropna().reset_index(drop = True).copy()

dfinal7

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports
0,Switzerland,938458,818426,105669,93260,95490,15942,4.3,0.93,8.49,661627,Gold,356763
1,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268
2,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068
3,Ireland,564020,532415,106059,103983,79730,27458,4.48,1.09,10.89,731813,Blood,147913
4,United States,28781083,25744100,85373,76330,76770,3809525,3.65,0.61,4.41,3051824,Petroleum,3375948
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,53205,44341,1815,1564,1640,179942,3.78,2.65,26.44,7449,Petroleum,7971
62,Pakistan,338237,326796,1461,1589,1560,307373,5.6,2.35,23.30,38700,Clothing,71105
63,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716
64,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448


In [134]:
# Convert dataframe to numeric values.
dfinal7 = dfinal7.apply(partial(pd.to_numeric, errors = 'ignore'))

dfinal7

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports
0,Switzerland,938458,818426,105669,93260,95490,15942,4.30,0.93,8.49,661627,Gold,356763.0
1,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268.0
2,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068.0
3,Ireland,564020,532415,106059,103983,79730,27458,4.48,1.09,10.89,731813,Blood,147913.0
4,United States,28781083,25744100,85373,76330,76770,3809525,3.65,0.61,4.41,3051824,Petroleum,3375948.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,53205,44341,1815,1564,1640,179942,3.78,2.65,26.44,7449,Petroleum,7971.0
62,Pakistan,338237,326796,1461,1589,1560,307373,5.60,2.35,23.30,38700,Clothing,71105.0
63,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716.0
64,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448.0


In [135]:
# Calculate an estimated 2023 GDP and GDP per capita.
dfinal7['Est. 2023 GDP'] = ((dfinal7['Est. 2024 GDP(Bn)'] + dfinal7['2022 GDP (Bn)']) / 2).astype(int).copy()

dfinal7['Est. 2023 GDP per capita'] = ((dfinal7['Est. 2024 GDP per capita (Bn)'] + dfinal7['2022 GDP per capita (Bn)']) / 2).astype(int).copy()

dfinal7

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports,Est. 2023 GDP,Est. 2023 GDP per capita
0,Switzerland,938458,818426,105669,93260,95490,15942,4.30,0.93,8.49,661627,Gold,356763.0,878442,99464
1,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268.0,553186,101694
2,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068.0,85043,128195
3,Ireland,564020,532415,106059,103983,79730,27458,4.48,1.09,10.89,731813,Blood,147913.0,548217,105021
4,United States,28781083,25744100,85373,76330,76770,3809525,3.65,0.61,4.41,3051824,Petroleum,3375948.0,27262591,80851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,53205,44341,1815,1564,1640,179942,3.78,2.65,26.44,7449,Petroleum,7971.0,48773,1689
62,Pakistan,338237,326796,1461,1589,1560,307373,5.60,2.35,23.30,38700,Clothing,71105.0,332516,1525
63,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716.0,41792,1367
64,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448.0,31797,824


In [136]:
# Calculate an estimated 2022, 2023, 2024 popuations.
dfinal7['2022 Population'] = dfinal7['2022 GDP (Bn)'] / dfinal7['2022 GDP per capita (Bn)'].astype(int).copy()

dfinal7['2022 Population'] = (dfinal7['2022 Population'] * 10**6).astype(int).copy()

dfinal7['Est. 2023 Population'] = ((((dfinal7['2022 Population'] * (dfinal7['Pop. Growth / Year (%)']) / 100)) + dfinal7['2022 Population'])).astype(int).copy()                               

dfinal7['Est. 2024 Population'] = ((((dfinal7['Est. 2023 Population'] * (dfinal7['Pop. Growth / Year (%)']) / 100)) + dfinal7['Est. 2023 Population'])).astype(int).copy()

dfinal7

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports,Est. 2023 GDP,Est. 2023 GDP per capita,2022 Population,Est. 2023 Population,Est. 2024 Population
0,Switzerland,938458,818426,105669,93260,95490,15942,4.30,0.93,8.49,661627,Gold,356763.0,878442,99464,8775745,8857359,8939732
1,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268.0,553186,101694,5329047,5373810,5418950
2,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068.0,85043,128195,652208,665839,679755
3,Ireland,564020,532415,106059,103983,79730,27458,4.48,1.09,10.89,731813,Blood,147913.0,548217,105021,5120211,5176021,5232439
4,United States,28781083,25744100,85373,76330,76770,3809525,3.65,0.61,4.41,3051824,Petroleum,3375948.0,27262591,80851,337273680,339331049,341400968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,53205,44341,1815,1564,1640,179942,3.78,2.65,26.44,7449,Petroleum,7971.0,48773,1689,28351023,29102325,29873536
62,Pakistan,338237,326796,1461,1589,1560,307373,5.60,2.35,23.30,38700,Clothing,71105.0,332516,1525,205661422,210494465,215441084
63,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716.0,41792,1367,29473448,29841866,30214889
64,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448.0,31797,824,33329401,34202631,35098739


In [137]:
# Remove unnecessary characters from dataframe.
dfinal7 = dfinal7.replace(r'\[.*\]', '', regex = True).copy()

dfinal7

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports,Est. 2023 GDP,Est. 2023 GDP per capita,2022 Population,Est. 2023 Population,Est. 2024 Population
0,Switzerland,938458,818426,105669,93260,95490,15942,4.30,0.93,8.49,661627,Gold,356763.0,878442,99464,8775745,8857359,8939732
1,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268.0,553186,101694,5329047,5373810,5418950
2,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068.0,85043,128195,652208,665839,679755
3,Ireland,564020,532415,106059,103983,79730,27458,4.48,1.09,10.89,731813,Blood,147913.0,548217,105021,5120211,5176021,5232439
4,United States,28781083,25744100,85373,76330,76770,3809525,3.65,0.61,4.41,3051824,Petroleum,3375948.0,27262591,80851,337273680,339331049,341400968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,53205,44341,1815,1564,1640,179942,3.78,2.65,26.44,7449,Petroleum,7971.0,48773,1689,28351023,29102325,29873536
62,Pakistan,338237,326796,1461,1589,1560,307373,5.60,2.35,23.30,38700,Clothing,71105.0,332516,1525,205661422,210494465,215441084
63,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716.0,41792,1367,29473448,29841866,30214889
64,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448.0,31797,824,33329401,34202631,35098739


In [138]:
# Calculate 'Import / Export Ratio'.
dfinal7['Exports (Mn)'] = dfinal7['Exports (Mn)'].astype(int).copy()
dfinal7['Import / Export Ratio'] = (dfinal7['Imports'] / dfinal7['Exports (Mn)']).round(2).copy()

dfinal7

Unnamed: 0,Country,Est. 2024 GDP(Bn),2022 GDP (Bn),Est. 2024 GDP per capita (Bn),2022 GDP per capita (Bn),Income,Area (Sq.Mi.),Unemployment (%),Pop. Growth / Year (%),Pop. Growth 2013-2022 (%),Exports (Mn),Top Export,Imports,Est. 2023 GDP,Est. 2023 GDP per capita,2022 Population,Est. 2023 Population,Est. 2024 Population,Import / Export Ratio
0,Switzerland,938458,818426,105669,93260,95490,15942,4.30,0.93,8.49,661627,Gold,356763.0,878442,99464,8775745,8857359,8939732,0.54
1,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268.0,553186,101694,5329047,5373810,5418950,0.33
2,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068.0,85043,128195,652208,665839,679755,0.16
3,Ireland,564020,532415,106059,103983,79730,27458,4.48,1.09,10.89,731813,Blood,147913.0,548217,105021,5120211,5176021,5232439,0.20
4,United States,28781083,25744100,85373,76330,76770,3809525,3.65,0.61,4.41,3051824,Petroleum,3375948.0,27262591,80851,337273680,339331049,341400968,1.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,53205,44341,1815,1564,1640,179942,3.78,2.65,26.44,7449,Petroleum,7971.0,48773,1689,28351023,29102325,29873536,1.07
62,Pakistan,338237,326796,1461,1589,1560,307373,5.60,2.35,23.30,38700,Clothing,71105.0,332516,1525,205661422,210494465,215441084,1.84
63,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716.0,41792,1367,29473448,29841866,30214889,5.04
64,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448.0,31797,824,33329401,34202631,35098739,1.26


In [139]:
# Reposition and rename columns.
dfinal7 = dfinal7.rename(columns={'Est. 2024 GDP(Bn)' : 'Est. 2024 GDP', '2022 GDP (Bn)' : '2022 GDP', 
                                  'Est. 2024 GDP per capita (Bn)' : 'Est. 2024 GDP per capita', 
                                  '2022 GDP per capita (Bn)' : '2022 GDP per capita',
                                  'Area (Sq.Mi.)' : 'Area',
                                  'Unemployment (%)' : 'Unemployment', 
                                  'Pop. Growth / Year (%)' : 'Pop. Growth / Year', 
                                  'Pop. Growth 2013-2022 (%)' : 'Pop. Growth 2013-2022', 
                                  'Exports (Mn)' : 'Exports'}).copy()

dfinal7

Unnamed: 0,Country,Est. 2024 GDP,2022 GDP,Est. 2024 GDP per capita,2022 GDP per capita,Income,Area,Unemployment,Pop. Growth / Year,Pop. Growth 2013-2022,Exports,Top Export,Imports,Est. 2023 GDP,Est. 2023 GDP per capita,2022 Population,Est. 2023 Population,Est. 2024 Population,Import / Export Ratio
0,Switzerland,938458,818426,105669,93260,95490,15942,4.30,0.93,8.49,661627,Gold,356763.0,878442,99464,8775745,8857359,8939732,0.54
1,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,321076,Petroleum,107268.0,553186,101694,5329047,5373810,5418950,0.33
2,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,163585,Iron,26068.0,85043,128195,652208,665839,679755,0.16
3,Ireland,564020,532415,106059,103983,79730,27458,4.48,1.09,10.89,731813,Blood,147913.0,548217,105021,5120211,5176021,5232439,0.20
4,United States,28781083,25744100,85373,76330,76770,3809525,3.65,0.61,4.41,3051824,Petroleum,3375948.0,27262591,80851,337273680,339331049,341400968,1.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,53205,44341,1815,1564,1640,179942,3.78,2.65,26.44,7449,Petroleum,7971.0,48773,1689,28351023,29102325,29873536,1.07
62,Pakistan,338237,326796,1461,1589,1560,307373,5.60,2.35,23.30,38700,Clothing,71105.0,332516,1525,205661422,210494465,215441084,1.84
63,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,2722,Soybean oil,13716.0,41792,1367,29473448,29841866,30214889,5.04
64,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,5908,Gold,7448.0,31797,824,33329401,34202631,35098739,1.26


In [140]:
# Create .csv fili.
dfinal7.to_csv('highest gdp - wiki 4.csv', index = False)

# Read .csv file.
dfinal7 = pd.read_csv ('highest gdp - wiki 4.csv', sep = ',')

In [141]:
# Connect to webpage.
url9 = 'https://www.worldometers.info/world-population/population-by-country/'

page9 = requests.get(url9)

# Extract site data.
soup9 = BeautifulSoup(page9.text, 'html')

In [142]:
# Isolate site data.
table9 = soup9.find('table')

table9

<table cellspacing="0" class="table table-striped table-bordered" id="example2" width="100%"> <thead> <tr> <th>#</th> <th>Country (or dependency)</th> <th>Population<br/> (2023)</th> <th>Yearly<br/> Change</th> <th>Net<br/> Change</th> <th>Density<br/> (P/Km²)</th> <th>Land Area<br/> (Km²)</th> <th>Migrants<br/> (net)</th> <th>Fert.<br/> Rate</th> <th>Med.<br/> Age</th> <th>Urban<br/> Pop %</th> <th>World<br/> Share</th> </tr> </thead> <tbody> <tr> <td>1</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/india-population/">India</a></td> <td style="font-weight: bold;">1,428,627,663</td> <td>0.81 %</td> <td>11,454,490</td> <td>481</td> <td>2,973,190</td> <td>-486,136</td> <td>2.0</td> <td>28</td> <td>36 %</td> <td>17.76 %</td> </tr> <tr> <td>2</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/china-population/">China</a></td> <td style="font-weight: bold;">1,425,671,352</td> <td>-0.02 %</td> <td>-21

In [143]:
# Create list of column titles.
col_titles9 = table9.find_all('th')

col_titles9

[<th>#</th>,
 <th>Country (or dependency)</th>,
 <th>Population<br/> (2023)</th>,
 <th>Yearly<br/> Change</th>,
 <th>Net<br/> Change</th>,
 <th>Density<br/> (P/Km²)</th>,
 <th>Land Area<br/> (Km²)</th>,
 <th>Migrants<br/> (net)</th>,
 <th>Fert.<br/> Rate</th>,
 <th>Med.<br/> Age</th>,
 <th>Urban<br/> Pop %</th>,
 <th>World<br/> Share</th>]

In [144]:
# Clean list of column titles.
col_table_titles9 = [title.text.strip() for title in col_titles9]

col_table_titles9

['#',
 'Country (or dependency)',
 'Population (2023)',
 'Yearly Change',
 'Net Change',
 'Density (P/Km²)',
 'Land Area (Km²)',
 'Migrants (net)',
 'Fert. Rate',
 'Med. Age',
 'Urban Pop %',
 'World Share']

In [145]:
# Convert list to Pandas dataframe.
countries9 = pd.DataFrame(columns = col_table_titles9).copy()

countries9

Unnamed: 0,#,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share


In [146]:
# Isolate table data only.
row_dat9 = table9.find_all('tr')

row_dat9

[<tr> <th>#</th> <th>Country (or dependency)</th> <th>Population<br/> (2023)</th> <th>Yearly<br/> Change</th> <th>Net<br/> Change</th> <th>Density<br/> (P/Km²)</th> <th>Land Area<br/> (Km²)</th> <th>Migrants<br/> (net)</th> <th>Fert.<br/> Rate</th> <th>Med.<br/> Age</th> <th>Urban<br/> Pop %</th> <th>World<br/> Share</th> </tr>,
 <tr> <td>1</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/india-population/">India</a></td> <td style="font-weight: bold;">1,428,627,663</td> <td>0.81 %</td> <td>11,454,490</td> <td>481</td> <td>2,973,190</td> <td>-486,136</td> <td>2.0</td> <td>28</td> <td>36 %</td> <td>17.76 %</td> </tr>,
 <tr> <td>2</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/china-population/">China</a></td> <td style="font-weight: bold;">1,425,671,352</td> <td>-0.02 %</td> <td>-215,985</td> <td>152</td> <td>9,388,211</td> <td>-310,220</td> <td>1.2</td> <td>39</td> <td>65 %</td> <td>17.72 %</t

In [147]:
# Create a structured list.
x9 = []

for row in row_dat9[1:235]:
    row_data9 = row.find_all('td')
    ind_row_data9 = [data.text.strip() for data in row_data9]  
    ind_row_data9 = [data.text.split(',') for data in row_data9]  
    ind_row_data9 = [data.text.strip() for data in row_data9]
    x9.append(ind_row_data9)
    
    print(ind_row_data9)

x9

['1', 'India', '1,428,627,663', '0.81 %', '11,454,490', '481', '2,973,190', '-486,136', '2.0', '28', '36 %', '17.76 %']
['2', 'China', '1,425,671,352', '-0.02 %', '-215,985', '152', '9,388,211', '-310,220', '1.2', '39', '65 %', '17.72 %']
['3', 'United States', '339,996,563', '0.50 %', '1,706,706', '37', '9,147,420', '999,700', '1.7', '38', '83 %', '4.23 %']
['4', 'Indonesia', '277,534,122', '0.74 %', '2,032,783', '153', '1,811,570', '-49,997', '2.1', '30', '59 %', '3.45 %']
['5', 'Pakistan', '240,485,658', '1.98 %', '4,660,796', '312', '770,880', '-165,988', '3.3', '21', '35 %', '2.99 %']
['6', 'Nigeria', '223,804,632', '2.41 %', '5,263,420', '246', '910,770', '-59,996', '5.1', '17', '54 %', '2.78 %']
['7', 'Brazil', '216,422,446', '0.52 %', '1,108,948', '26', '8,358,140', '6,000', '1.6', '34', '88 %', '2.69 %']
['8', 'Bangladesh', '172,954,319', '1.03 %', '1,767,947', '1,329', '130,170', '-309,977', '1.9', '27', '41 %', '2.15 %']
['9', 'Russia', '144,444,359', '-0.19 %', '-268,955', 

[['1',
  'India',
  '1,428,627,663',
  '0.81 %',
  '11,454,490',
  '481',
  '2,973,190',
  '-486,136',
  '2.0',
  '28',
  '36 %',
  '17.76 %'],
 ['2',
  'China',
  '1,425,671,352',
  '-0.02 %',
  '-215,985',
  '152',
  '9,388,211',
  '-310,220',
  '1.2',
  '39',
  '65 %',
  '17.72 %'],
 ['3',
  'United States',
  '339,996,563',
  '0.50 %',
  '1,706,706',
  '37',
  '9,147,420',
  '999,700',
  '1.7',
  '38',
  '83 %',
  '4.23 %'],
 ['4',
  'Indonesia',
  '277,534,122',
  '0.74 %',
  '2,032,783',
  '153',
  '1,811,570',
  '-49,997',
  '2.1',
  '30',
  '59 %',
  '3.45 %'],
 ['5',
  'Pakistan',
  '240,485,658',
  '1.98 %',
  '4,660,796',
  '312',
  '770,880',
  '-165,988',
  '3.3',
  '21',
  '35 %',
  '2.99 %'],
 ['6',
  'Nigeria',
  '223,804,632',
  '2.41 %',
  '5,263,420',
  '246',
  '910,770',
  '-59,996',
  '5.1',
  '17',
  '54 %',
  '2.78 %'],
 ['7',
  'Brazil',
  '216,422,446',
  '0.52 %',
  '1,108,948',
  '26',
  '8,358,140',
  '6,000',
  '1.6',
  '34',
  '88 %',
  '2.69 %'],
 ['8',


In [148]:
# Convert list to Pandas dataframe as well as drop and rename columns.
df9 = pd.DataFrame(x9, columns = col_table_titles9).astype(str).copy()

df9 = df9.drop(columns = ['#', 'Population (2023)', 'Yearly Change', 'Net Change', 
                          'Density (P/Km²)', 'Land Area (Km²)', 'Migrants (net)',
                         'World Share']).copy()

df9 = df9.rename(columns = {'Country (or dependency)' : 'Country', 'Urban Pop %' : 'Urban Pop'})

df9

Unnamed: 0,Country,Fert. Rate,Med. Age,Urban Pop
0,India,2.0,28,36 %
1,China,1.2,39,65 %
2,United States,1.7,38,83 %
3,Indonesia,2.1,30,59 %
4,Pakistan,3.3,21,35 %
...,...,...,...,...
229,Montserrat,1.6,44,11 %
230,Falkland Islands,1.6,40,62 %
231,Niue,2.4,36,41 %
232,Tokelau,2.6,27,0 %


In [149]:
# Remove unnecessary characters from dataframe.
df9 = df9.replace(r'\%', '', regex = True).copy()

df9

Unnamed: 0,Country,Fert. Rate,Med. Age,Urban Pop
0,India,2.0,28,36
1,China,1.2,39,65
2,United States,1.7,38,83
3,Indonesia,2.1,30,59
4,Pakistan,3.3,21,35
...,...,...,...,...
229,Montserrat,1.6,44,11
230,Falkland Islands,1.6,40,62
231,Niue,2.4,36,41
232,Tokelau,2.6,27,0


In [150]:
# Merge dataframes.
dfinal8 = dfinal7.merge(df9, on = 'Country', how = 'left').copy()

# Drop all rows with 'N.A.' value in 'Urban Pop' column.
dfinal8 = dfinal8.drop(dfinal8[dfinal8['Urban Pop'] == 'N.A.'].index).copy()

dfinal8

Unnamed: 0,Country,Est. 2024 GDP,2022 GDP,Est. 2024 GDP per capita,2022 GDP per capita,Income,Area,Unemployment,Pop. Growth / Year,Pop. Growth 2013-2022,...,Imports,Est. 2023 GDP,Est. 2023 GDP per capita,2022 Population,Est. 2023 Population,Est. 2024 Population,Import / Export Ratio,Fert. Rate,Med. Age,Urban Pop
0,Switzerland,938458,818426,105669,93260,95490,15942,4.30,0.93,8.49,...,356763.0,878442,99464,8775745,8857359,8939732,0.54,1.5,42,75
1,Norway,526951,579422,94660,108729,94540,148449,3.23,0.84,7.43,...,107268.0,553186,101694,5329047,5373810,5418950,0.33,1.5,40,86
2,Luxembourg,88556,81530,131384,125006,89200,998,4.58,2.09,20.20,...,26068.0,85043,128195,652208,665839,679755,0.16,1.4,39,88
3,Ireland,564020,532415,106059,103983,79730,27458,4.48,1.09,10.89,...,147913.0,548217,105021,5120211,5176021,5232439,0.20,1.8,38,64
4,United States,28781083,25744100,85373,76330,76770,3809525,3.65,0.61,4.41,...,3375948.0,27262591,80851,337273680,339331049,341400968,1.11,1.7,38,83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,53205,44341,1815,1564,1640,179942,3.78,2.65,26.44,...,7971.0,48773,1689,28351023,29102325,29873536,1.07,4.3,18,58
62,Pakistan,338237,326796,1461,1589,1560,307373,5.60,2.35,23.30,...,71105.0,332516,1525,205661422,210494465,215441084,1.84,3.3,21,35
63,Nepal,44179,39406,1397,1337,1340,56827,10.92,1.25,13.49,...,13716.0,41792,1367,29473448,29841866,30214889,5.04,2.0,24,22
64,Sudan,26865,36729,547,1102,760,710689,17.59,2.62,26.44,...,7448.0,31797,824,33329401,34202631,35098739,1.26,4.3,19,35


In [151]:
# Reposition columns in dataframe.
dfinal9 = dfinal8.iloc[:,[0, 11, 10, 12, 18, 15,
                          2, 4, 16, 13, 14, 17, 1,
                          3, 9, 8, 21, 19, 20, 5, 7, 6 ]].copy()

dfinal9

Unnamed: 0,Country,Top Export,Exports,Imports,Import / Export Ratio,2022 Population,2022 GDP,2022 GDP per capita,Est. 2023 Population,Est. 2023 GDP,...,Est. 2024 GDP,Est. 2024 GDP per capita,Pop. Growth 2013-2022,Pop. Growth / Year,Urban Pop,Fert. Rate,Med. Age,Income,Unemployment,Area
0,Switzerland,Gold,661627,356763.0,0.54,8775745,818426,93260,8857359,878442,...,938458,105669,8.49,0.93,75,1.5,42,95490,4.30,15942
1,Norway,Petroleum,321076,107268.0,0.33,5329047,579422,108729,5373810,553186,...,526951,94660,7.43,0.84,86,1.5,40,94540,3.23,148449
2,Luxembourg,Iron,163585,26068.0,0.16,652208,81530,125006,665839,85043,...,88556,131384,20.20,2.09,88,1.4,39,89200,4.58,998
3,Ireland,Blood,731813,147913.0,0.20,5120211,532415,103983,5176021,548217,...,564020,106059,10.89,1.09,64,1.8,38,79730,4.48,27458
4,United States,Petroleum,3051824,3375948.0,1.11,337273680,25744100,76330,339331049,27262591,...,28781083,85373,4.41,0.61,83,1.7,38,76770,3.65,3809525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,Petroleum,7449,7971.0,1.07,28351023,44341,1564,29102325,48773,...,53205,1815,26.44,2.65,58,4.3,18,1640,3.78,179942
62,Pakistan,Clothing,38700,71105.0,1.84,205661422,326796,1589,210494465,332516,...,338237,1461,23.30,2.35,35,3.3,21,1560,5.60,307373
63,Nepal,Soybean oil,2722,13716.0,5.04,29473448,39406,1337,29841866,41792,...,44179,1397,13.49,1.25,22,2.0,24,1340,10.92,56827
64,Sudan,Gold,5908,7448.0,1.26,33329401,36729,1102,34202631,31797,...,26865,547,26.44,2.62,35,4.3,19,760,17.59,710689


In [152]:
# Convert all columns in dataframe that contain only 
# digits and periods (decimals) to numeric values.
dfinal9 = dfinal9.apply(partial(pd.to_numeric, errors = 'ignore')).copy()

dfinal9.info()

<class 'pandas.core.frame.DataFrame'>
Index: 65 entries, 0 to 65
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Country                   65 non-null     object 
 1   Top Export                65 non-null     object 
 2   Exports                   65 non-null     int64  
 3   Imports                   65 non-null     float64
 4   Import / Export Ratio     65 non-null     float64
 5   2022 Population           65 non-null     int64  
 6   2022 GDP                  65 non-null     int64  
 7   2022 GDP per capita       65 non-null     int64  
 8   Est. 2023 Population      65 non-null     int64  
 9   Est. 2023 GDP             65 non-null     int64  
 10  Est. 2023 GDP per capita  65 non-null     int64  
 11  Est. 2024 Population      65 non-null     int64  
 12  Est. 2024 GDP             65 non-null     int64  
 13  Est. 2024 GDP per capita  65 non-null     int64  
 14  Pop. Growth 2013-

In [153]:
# Creates .csv file.
dfinal9.to_csv('Highest GDP Countries and Wages.csv', index = False)

dfinal9

Unnamed: 0,Country,Top Export,Exports,Imports,Import / Export Ratio,2022 Population,2022 GDP,2022 GDP per capita,Est. 2023 Population,Est. 2023 GDP,...,Est. 2024 GDP,Est. 2024 GDP per capita,Pop. Growth 2013-2022,Pop. Growth / Year,Urban Pop,Fert. Rate,Med. Age,Income,Unemployment,Area
0,Switzerland,Gold,661627,356763.0,0.54,8775745,818426,93260,8857359,878442,...,938458,105669,8.49,0.93,75,1.5,42,95490,4.30,15942
1,Norway,Petroleum,321076,107268.0,0.33,5329047,579422,108729,5373810,553186,...,526951,94660,7.43,0.84,86,1.5,40,94540,3.23,148449
2,Luxembourg,Iron,163585,26068.0,0.16,652208,81530,125006,665839,85043,...,88556,131384,20.20,2.09,88,1.4,39,89200,4.58,998
3,Ireland,Blood,731813,147913.0,0.20,5120211,532415,103983,5176021,548217,...,564020,106059,10.89,1.09,64,1.8,38,79730,4.48,27458
4,United States,Petroleum,3051824,3375948.0,1.11,337273680,25744100,76330,339331049,27262591,...,28781083,85373,4.41,0.61,83,1.7,38,76770,3.65,3809525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Cameroon,Petroleum,7449,7971.0,1.07,28351023,44341,1564,29102325,48773,...,53205,1815,26.44,2.65,58,4.3,18,1640,3.78,179942
62,Pakistan,Clothing,38700,71105.0,1.84,205661422,326796,1589,210494465,332516,...,338237,1461,23.30,2.35,35,3.3,21,1560,5.60,307373
63,Nepal,Soybean oil,2722,13716.0,5.04,29473448,39406,1337,29841866,41792,...,44179,1397,13.49,1.25,22,2.0,24,1340,10.92,56827
64,Sudan,Gold,5908,7448.0,1.26,33329401,36729,1102,34202631,31797,...,26865,547,26.44,2.62,35,4.3,19,760,17.59,710689
