In [19]:
import pandas as pd

In [20]:
# URL of the page the data will be read from
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"


In [21]:
# pd.read_html returns a list with one DataFrame per table found on the page
tables = pd.read_html(url)

# take the first table on the page and inspect it
df = tables[0]

df

Unnamed: 0,Location,Population,% of world,Date,Source (official or from the United Nations),Notes
0,World,8232000000,100%,13 Jun 2025,UN projection[1][3],
1,India,1413324000,17.3%,1 Mar 2025,Official projection[4],[b]
2,China,1408280000,17.2%,31 Dec 2024,Official estimate[5],[c]
3,United States,340110988,4.2%,1 Jul 2024,Official estimate[6],[d]
4,Indonesia,282477584,3.5%,30 Jun 2024,National annual projection[7],
...,...,...,...,...,...,...
235,Niue (New Zealand),1681,0%,11 Nov 2022,2022 Census[246],
236,Tokelau (New Zealand),1647,0%,1 Jan 2019,2019 Census[247],
237,Vatican City,882,0%,31 Dec 2024,Official figure[248],[ah]
238,Cocos (Keeling) Islands (Australia),593,0%,30 Jun 2020,2021 Census[249],


In [22]:
# check the column labels of the scraped table
df.columns

Index(['Location', 'Population', '% of world', 'Date',
       'Source (official or from the United Nations)', 'Notes'],
      dtype='object')

In [23]:
# Rename the columns by label rather than by position: assigning a bare list to
# df.columns silently mislabels the data if the page ever adds or reorders a
# column. Columns not named in the mapping keep their existing names, and
# re-running the cell is a harmless no-op.
df = df.rename(
    columns={
        'Location': 'Country',
        '% of world': 'World%',
        'Source (official or from the United Nations)': 'Source',
    }
)


In [24]:
# confirm the new column names on the first two rows
df.head(2)

Unnamed: 0,Country,Population,World%,Date,Source,Notes
0,World,8232000000,100%,13 Jun 2025,UN projection[1][3],
1,India,1413324000,17.3%,1 Mar 2025,Official projection[4],[b]


In [25]:
# Clean Population and World%.
# Cast to str first so the .str accessor works whether a column was parsed as
# text or as numbers, then strip the thousands separator / percent sign.
# regex=False: ',' and '%' are literal characters, and the default value of
# `regex` differs between pandas versions.
# 'int64' instead of int: where the default integer is 32-bit (e.g. Windows with
# NumPy < 2) plain int cannot hold populations above about 2.1 billion.
df['Population'] = df['Population'].astype(str).str.replace(',', '', regex=False).astype('int64')
df['World%'] = df['World%'].astype(str).str.replace('%', '', regex=False).astype(float)

In [26]:
# check the converted Population and World% values on the first ten rows
df.head(10)

Unnamed: 0,Country,Population,World%,Date,Source,Notes
0,World,8232000000,100.0,13 Jun 2025,UN projection[1][3],
1,India,1413324000,17.3,1 Mar 2025,Official projection[4],[b]
2,China,1408280000,17.2,31 Dec 2024,Official estimate[5],[c]
3,United States,340110988,4.2,1 Jul 2024,Official estimate[6],[d]
4,Indonesia,282477584,3.5,30 Jun 2024,National annual projection[7],
5,Pakistan,241499431,3.0,1 Mar 2023,2023 census result[8],[e]
6,Nigeria,223800000,2.7,1 Jul 2023,Official projection[9],
7,Brazil,212583750,2.6,1 Jul 2024,Official projection[10],
8,Bangladesh,169828911,2.1,14 Jun 2022,2022 census result[11],[f]
9,Russia,146028325,1.8,1 Jan 2025,Official estimate[13],[g]


In [27]:
# Strip footnote markers such as [1] or [b] from the country names.
import re

# Compiled once; matches a bracketed run of word characters.
FOOTNOTE_PATTERN = re.compile(r'\[\w+\]')

# The vectorised .str methods leave missing values as NaN, whereas calling
# re.sub on each cell raises TypeError as soon as one cell is not a string.
df['Country'] = df['Country'].str.replace(FOOTNOTE_PATTERN, '', regex=True).str.strip()

In [28]:
#lets see the cleaned version
# display the table after cleaning
df

Unnamed: 0,Country,Population,World%,Date,Source,Notes
0,World,8232000000,100.0,13 Jun 2025,UN projection[1][3],
1,India,1413324000,17.3,1 Mar 2025,Official projection[4],[b]
2,China,1408280000,17.2,31 Dec 2024,Official estimate[5],[c]
3,United States,340110988,4.2,1 Jul 2024,Official estimate[6],[d]
4,Indonesia,282477584,3.5,30 Jun 2024,National annual projection[7],
...,...,...,...,...,...,...
235,Niue (New Zealand),1681,0.0,11 Nov 2022,2022 Census[246],
236,Tokelau (New Zealand),1647,0.0,1 Jan 2019,2019 Census[247],
237,Vatican City,882,0.0,31 Dec 2024,Official figure[248],[ah]
238,Cocos (Keeling) Islands (Australia),593,0.0,30 Jun 2020,2021 Census[249],


In [31]:
# Drop the first row: the "World" aggregate, which is a total rather than a country.
# Filtering on the value (instead of df.drop(0, inplace=True)) makes the cell safe
# to re-run, since dropping label 0 a second time raises KeyError. .copy() gives
# an independent frame so later modifications of df do not trigger
# SettingWithCopyWarning.
df = df.loc[df['Country'] != 'World'].copy()

In [32]:
#show table
# display the table again; the "World" row (index 0) is no longer present
df

Unnamed: 0,Country,Population,World%,Date,Source,Notes
1,India,1413324000,17.3,1 Mar 2025,Official projection[4],[b]
2,China,1408280000,17.2,31 Dec 2024,Official estimate[5],[c]
3,United States,340110988,4.2,1 Jul 2024,Official estimate[6],[d]
4,Indonesia,282477584,3.5,30 Jun 2024,National annual projection[7],
5,Pakistan,241499431,3.0,1 Mar 2023,2023 census result[8],[e]
...,...,...,...,...,...,...
235,Niue (New Zealand),1681,0.0,11 Nov 2022,2022 Census[246],
236,Tokelau (New Zealand),1647,0.0,1 Jan 2019,2019 Census[247],
237,Vatican City,882,0.0,31 Dec 2024,Official figure[248],[ah]
238,Cocos (Keeling) Islands (Australia),593,0.0,30 Jun 2020,2021 Census[249],


In [33]:

# Preview the table indexed by country name. set_index returns a new DataFrame
# and the result is not assigned here, so df itself keeps 'Country' as a column.
df.set_index('Country')

Unnamed: 0_level_0,Population,World%,Date,Source,Notes
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
India,1413324000,17.3,1 Mar 2025,Official projection[4],[b]
China,1408280000,17.2,31 Dec 2024,Official estimate[5],[c]
United States,340110988,4.2,1 Jul 2024,Official estimate[6],[d]
Indonesia,282477584,3.5,30 Jun 2024,National annual projection[7],
Pakistan,241499431,3.0,1 Mar 2023,2023 census result[8],[e]
...,...,...,...,...,...
Niue (New Zealand),1681,0.0,11 Nov 2022,2022 Census[246],
Tokelau (New Zealand),1647,0.0,1 Jan 2019,2019 Census[247],
Vatican City,882,0.0,31 Dec 2024,Official figure[248],[ah]
Cocos (Keeling) Islands (Australia),593,0.0,30 Jun 2020,2021 Census[249],


In [34]:
# df still has its integer index and a 'Country' column
df

Unnamed: 0,Country,Population,World%,Date,Source,Notes
1,India,1413324000,17.3,1 Mar 2025,Official projection[4],[b]
2,China,1408280000,17.2,31 Dec 2024,Official estimate[5],[c]
3,United States,340110988,4.2,1 Jul 2024,Official estimate[6],[d]
4,Indonesia,282477584,3.5,30 Jun 2024,National annual projection[7],
5,Pakistan,241499431,3.0,1 Mar 2023,2023 census result[8],[e]
...,...,...,...,...,...,...
235,Niue (New Zealand),1681,0.0,11 Nov 2022,2022 Census[246],
236,Tokelau (New Zealand),1647,0.0,1 Jan 2019,2019 Census[247],
237,Vatican City,882,0.0,31 Dec 2024,Official figure[248],[ah]
238,Cocos (Keeling) Islands (Australia),593,0.0,30 Jun 2020,2021 Census[249],


In [36]:
# look up the row for Kenya to read off its population
df.loc[df['Country'].eq("Kenya")]

Unnamed: 0,Country,Population,World%,Date,Source,Notes
27,Kenya,52428290,0.6,1 Jul 2024,Official projection[31],


In [37]:
# Drop only the rows that are missing an essential value.
# A bare dropna() removes every row with a NaN in *any* column, and 'Notes' is
# empty for most countries (Kenya, Indonesia, Nigeria, ...), so it would discard
# the bulk of the table merely because a row has no footnote.
df = df.dropna(subset=['Country', 'Population'])

In [41]:
# find population for Tanzania
df[df['Country'] == "Tanzania"]


Unnamed: 0,Country,Population,World%,Date,Source,Notes
22,Tanzania,66278276,0.8,1 Jul 2024,Official projection[26],[j]
