In [12]:
from tabulate import tabulate
import pandas as pd

In [13]:
# Scrape every HTML table on the Wikipedia page into a list of DataFrames
url = "https://en.wikipedia.org/wiki/List_of_continents_and_continental_subregions_by_population"
tables = pd.read_html(io=url)
# Report how many tables were parsed; later cells pick tables out of this list by position
print(f"Number of tables found: {len(tables)}")

Number of tables found: 28


In [14]:
# When extracting data from the Wikipedia page, the data is unstructured and needs to be cleaned

In [15]:
def promote_first_row_to_header(raw_table: pd.DataFrame) -> pd.DataFrame:
    """
    Return a new DataFrame in which the first row of ``raw_table`` is the header.

    The first row is removed from the data, the remaining rows are renumbered
    from 0, and the removed row's values become the column names.
    ``raw_table`` itself is not modified.

    Raises:
        IndexError: if ``raw_table`` has no rows.
    """
    header_row = raw_table.iloc[0]
    promoted = raw_table.iloc[1:].reset_index(drop=True)
    promoted.columns = header_row
    return promoted


# NOTE: the tables are selected by their position in `tables` (the list returned
# by pd.read_html), so these indices depend on the current layout of the page.
# Each raw table is written to CSV *before* the header row is promoted, so the
# CSV files contain the tables exactly as they were parsed.

# Table for continents
tables[1].to_csv("continent.csv", index=False)
continent = promote_first_row_to_header(tables[1])

# Table for continental regions
tables[2].to_csv("cont_region.csv", index=False)
cont_region = promote_first_row_to_header(tables[2])

# Table for all regions: every remaining table is stacked into one frame
region_tables = tables[3:]
region = pd.concat(region_tables, ignore_index=True)
region.to_csv("region.csv", index=False)


In [16]:
# Renaming the columns so that all three tables expose the same 'Population' column.
#
# The Wikipedia headers carry footnote markers that differ from table to table
# ('Population (2021)[1][2][4]' vs 'Population (2021)[1][2]'). Matching the exact
# text would silently rename nothing as soon as a footnote is added or removed,
# so the header is matched on its stable 'Population (' prefix instead.
def _normalise_population_header(label):
    """Return 'Population' for any 'Population (...)' header; other labels are returned unchanged."""
    return 'Population' if str(label).startswith('Population (') else label


continent = continent.rename(columns=_normalise_population_header)
cont_region = cont_region.rename(columns=_normalise_population_header)
# The per-region tables use the abbreviated header 'Pop.'
region = region.rename(columns={'Pop.': 'Population'})


In [17]:
# The region table does not have the same structure as the other tables:
# it has no region name, only Year and Population, so it needs to be structured first.
# Step one is to add a 'Region_nr' column that numbers the regions. The data for each
# region starts again at 1950, so a running count of the rows where Year is 1950
# gives every row the number of the region it belongs to (1, 2, 3, ...).
region['Region_nr'] = region['Year'].eq(1950).cumsum()

In [18]:
# Define the mapping between 'Region_nr' and the region names.
# The names are listed in region-number order; enumerate(..., start=1) assigns
# the keys 1..25 so they line up with the values of 'Region_nr'.
# NOTE(review): this assumes the order below matches the order of the region
# tables on the page - verify if the page layout changes.
region_mapping = dict(enumerate(
    [
        "Eastern Africa",
        "Middle Africa",
        "Northern Africa",
        "Southern Africa",
        "Western Africa",
        "Total Africa",
        "Total Americas",
        "Caribbean",
        "Central America",
        "North America",
        "Total North America",
        "Total South America",
        "Central Asia",
        "Eastern Asia",
        "South-Eastern Asia",
        "Southern Asia",
        "Western Asia",
        "Total Asia",
        "Eastern Europe",
        "North Europe",
        "Southern Europe",
        "Western Europe",
        "Total Europe",
        "Total Oceania",
        "Total World",
    ],
    start=1,
))

# Add a new 'Region' column by mapping each 'Region_nr' to its name
region['Region'] = region['Region_nr'].map(region_mapping)

In [19]:
# Render the continent table as a bordered text grid, using the column names as headers
continent_grid = tabulate(continent, headers='keys', tablefmt='grid')
print(continent_grid)

+----+---------------+--------------+-------------+-----------------------+---------------------------+--------------------------+--------------------------------------------+------------------------+
|    | Continent     |   Population | % (world)   | ±% p.a. (2010–2013)   |   Sovereign states (2024) |   De facto states (2024) |   Non-self-governing territory(ies) (2024) |   Other area(s) (2024) |
|  0 | World         |   7909295151 | 100%        | 1.17%                 |                       197 |                        8 |                                         17 |                     34 |
+----+---------------+--------------+-------------+-----------------------+---------------------------+--------------------------+--------------------------------------------+------------------------+
|  1 | Asia          |   4694576167 | 59.4%       | 1.04%                 |                        48 |                        4 |                                          0 |                     

In [20]:
# Render the continental-subregion table as a bordered text grid, using the column names as headers
cont_region_grid = tabulate(cont_region, headers='keys', tablefmt='grid')
print(cont_region_grid)

+----+------------------------------------+--------------+-------------+-----------------------+---------------------------+----------------------------+--------------------------------------------+------------------------+
|    | Continental subregion              |   Population | % (world)   | ±% p.a. (2010–2013)   |   Sovereign states (2024) |   De facto state(s) (2024) |   Non-self-governing territory(ies) (2024) |   Other area(s) (2024) |
|  0 | World                              |   7909295151 | 100%        | 1.17%                 |                       197 |                          8 |                                         17 |                     34 |
+----+------------------------------------+--------------+-------------+-----------------------+---------------------------+----------------------------+--------------------------------------------+------------------------+
|  1 | Southern Asia[c]                   |   1989452478 | 25.2%       | 1.32%                 |        

In [21]:
# Preview only the first rows: `region` is the concatenation of all region tables,
# so printing it in full floods the notebook output with hundreds of grid lines.
print(tabulate(region.head(10), headers='keys', tablefmt='grid'))

+-----+--------+--------------+-----------+-------------+---------------------+
|     |   Year |   Population | ±% p.a.   |   Region_nr | Region              |
|   0 |   1950 |     66923000 | —         |           1 | Eastern Africa      |
+-----+--------+--------------+-----------+-------------+---------------------+
|   1 |   1960 |     84305000 | +2.34%    |           1 | Eastern Africa      |
+-----+--------+--------------+-----------+-------------+---------------------+
|   2 |   1970 |    110428000 | +2.74%    |           1 | Eastern Africa      |
+-----+--------+--------------+-----------+-------------+---------------------+
|   3 |   1980 |    147512000 | +2.94%    |           1 | Eastern Africa      |
+-----+--------+--------------+-----------+-------------+---------------------+
|   4 |   1990 |    198232000 | +3.00%    |           1 | Eastern Africa      |
+-----+--------+--------------+-----------+-------------+---------------------+
|   5 |   2000 |    259373000 | +2.72%  

CLASS REGION

In [22]:
class Region:
    """Thin wrapper around a region population DataFrame with a lookup-and-print helper."""

    def __init__(self, data):
        """
        Store the DataFrame to query; it is expected to carry the columns
        Year, Population, and Region.
        """
        self.data = data

    def display_population(self, region_name, year):
        """
        Print the population recorded for ``region_name`` in ``year``.

        If no row matches both values, a "No data available" message is printed
        instead. If several rows match, the first one is reported.
        """
        is_region = self.data['Region'] == region_name
        is_year = self.data['Year'] == year
        matches = self.data[is_region & is_year]

        # Guard clause: nothing recorded for this region/year combination.
        if matches.empty:
            print(f"No data available for {region_name} in {year}.")
            return

        population = matches['Population'].iloc[0]
        print(f"Population of {region_name} in {year}: {population:,}")



In [23]:
# Wrap the combined region table in a Region instance
region_manager = Region(data=region)

# Look up and print the population for one region/year combination
region_manager.display_population(region_name="Eastern Africa", year=1950)


Population of Eastern Africa in 1950: 66,923,000
