# Creation of the countries total area using web scraping

In [120]:
# Import libraries
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup

In [3]:
URL = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area" # wikipedia URL to get the information

r = requests.get(URL)

soup = BeautifulSoup(r.text, 'html.parser')

In [31]:
byCountries = soup.find_all('tr') # each rows for each country is between 'tr' tags
byCountries = byCountries[2:-7]   # remove rows containing no usefull information
byCountries[0]

<tr>
<td><span data-sort-value="5000000000000000000♠" style="display:none"></span> –</td>
<td style="text-align:left;"><span class="flagicon" style="padding-left:25px;"> </span><a href="/wiki/World" title="World">World</a></td>
<td><span data-sort-value="7014510072000000000♠"></span>510,072,000<br/>(196,940,000)</td>
<td><span data-sort-value="7014148940000000000♠"></span>148,940,000<br/>(57,510,000)</td>
<td><span data-sort-value="7014361132000000000♠"></span>361,132,000<br/>(139,434,000)</td>
<td>70.8</td>
<td style="text-align:left;">
</td></tr>

In [243]:
country_areas = []
for r in byCountries:
    tds = r.find_all('td')  
    country = r.find('a').get('title') # Get the name of the country
    values = []
    for p in tds[2:5]: # The values of interest are stored in the second to fourth td tags
        if(len(p.contents) > 0):
            if isinstance(p.contents[1], str): # consider the column if it is a string (and not a tag or something else)
                values += [p.contents[1]]
            else:
                values += "Not determined"
        else:
            values += ["-"]
    country_areas.append({"Country" : country, "Total(km2)" : values[0], "Land(km2)" : values[1], "Water(km2)" : values[2]})

In [244]:
country_areas = pd.DataFrame.from_dict(country_areas)
country_areas

Unnamed: 0,Country,Total(km2),Land(km2),Water(km2)
0,World,510072000,148940000,361132000
1,Russia,17098246,16377742,720500
2,Antarctica,14000000,14000000,0
3,Canada,9984670,9093507,891163
4,China,9596961,9326410,270550
...,...,...,...,...
257,Clipperton Island,6,2,4
258,Ashmore and Cartier Islands,5,5,0
259,Spratly Islands,< 5,< 5,0
260,Coral Sea Islands,< 3,< 3,0


In [245]:
country_areas["Total(km2)"] = country_areas["Total(km2)"].apply(lambda x: re.sub(r"[^0-9]", "", x))
country_areas["Land(km2)"] = country_areas["Land(km2)"].apply(lambda x: re.sub(r"[^0-9]", "", x))
country_areas["Water(km2)"] = country_areas["Water(km2)"].apply(lambda x: re.sub(r"[^0-9]", "", x))

country_areas["Total(km2)"] = country_areas["Total(km2)"].apply(lambda x: 0 if x == '' else int(x))
country_areas["Land(km2)"] = country_areas["Land(km2)"].apply(lambda x: 0 if x == '' else int(x))
country_areas["Water(km2)"] = country_areas["Water(km2)"].apply(lambda x: 0 if x == '' else int(x))

country_areas = country_areas.drop_duplicates(subset="Country", keep="first").sort_values(by="Total(km2)", ascending=False)

In [246]:
country_areas

Unnamed: 0,Country,Total(km2),Land(km2),Water(km2)
0,World,510072000,148940000,361132000
1,Russia,17098246,16377742,720500
2,Antarctica,14000000,14000000,0
3,Canada,9984670,9093507,891163
4,China,9596961,9326410,270550
...,...,...,...,...
256,Gibraltar,6,65,0
257,Clipperton Island,6,2,4
258,Ashmore and Cartier Islands,5,5,0
259,Spratly Islands,5,5,0


In [252]:
export_path = r'Data\country_areas.csv'
export_csv = country_areas.to_csv(export_path, index = None, header=True)