In [105]:
# importing all the necessaries library
import pandas as pd
import plotly.express as px

In [106]:
# opening the dataset and quick look on the dataset
data= pd.read_csv(r'C:\Users\johan\Anaconda3\envs\minimal_ds\conda-meta\os\Portofolio\World Population.csv')
data

Unnamed: 0,Rank,Country,Region,Population,Percentage,Date
0,1,China,Asia,1411778724,17.80%,20-Nov
1,2,India,Asia,1386141732,17.50%,21-Dec
2,3,United States,Americas,332960297,4.20%,21-Dec
3,4,Indonesia,Asia,271350000,3.43%,20-Dec
4,5,Pakistan,Asia,225200000,2.84%,21-Jul
...,...,...,...,...,...,...
236,237,Niue(NewZealand),Oceania,1549,0%,21-Jul
237,238,Tokelau(NewZealand),Oceania,1501,0%,21-Jul
238,239,VaticanCity,Europe,825,0%,19-Feb
239,240,Cocos(Keeling)Islands(Australia),Oceania,573,0%,20-Jun


### Data Cleaning

In [107]:
data.info()
# based on this info we could conclude that there is no null value

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Rank        241 non-null    int64 
 1   Country     241 non-null    object
 2   Region      241 non-null    object
 3   Population  241 non-null    int64 
 4   Percentage  241 non-null    object
 5   Date        241 non-null    object
dtypes: int64(2), object(4)
memory usage: 11.4+ KB


In [108]:
# checking if there is any duplication in the dataset
sum(data.duplicated())

0

In [109]:
# as we could see there are several column with the wrong dtypes
# first we are goint to make a new column where the percentage is float
data['Percentage']= data['Percentage'].str.strip('%').astype(float)/100.0
data.head()

Unnamed: 0,Rank,Country,Region,Population,Percentage,Date
0,1,China,Asia,1411778724,0.178,20-Nov
1,2,India,Asia,1386141732,0.175,21-Dec
2,3,United States,Americas,332960297,0.042,21-Dec
3,4,Indonesia,Asia,271350000,0.0343,20-Dec
4,5,Pakistan,Asia,225200000,0.0284,21-Jul


In [110]:
# now we could drop the unnecessary column such as rank and date
data_simple= data.drop(['Rank', 'Date'], axis=1)
data_simple.head()

Unnamed: 0,Country,Region,Population,Percentage
0,China,Asia,1411778724,0.178
1,India,Asia,1386141732,0.175
2,United States,Americas,332960297,0.042
3,Indonesia,Asia,271350000,0.0343
4,Pakistan,Asia,225200000,0.0284


### Exploratory Data Analysis

1. What is the population of each region ?

In [111]:
# first we are going to check the Region unique values
data['Region'].value_counts()
# as we could see we need to check the dataset, because as we could see, there are several country who have Europe,Asia and Asia,Europe region

Africa         56
Asia           53
Europe         52
Americas       51
Oceania        27
Europe,Asia     1
Asia,Europe     1
Name: Region, dtype: int64

In [112]:
data_filter= data_simple[data_simple['Region']=='Europe,Asia']
data_filter
# turns out the Europe Asia is Rusia, and Asia Europe is Turkey
# since both country located partly in Europe and Asia, we are going to combine both region
# and since both country are located partly in Europe and Asia, we are going to specify each region population, based on the dataset
data_simple['Region'].replace('Europe,Asia', 'Asia,Europe', inplace=True) 

In [113]:
# checking on how many population are there in Russia and Turkey
data_simple[(data_simple['Country']=='Russia') | (data_simple['Country']=='Turkey')]


Unnamed: 0,Country,Region,Population,Percentage
8,Russia,"Asia,Europe",146171015,0.0185
17,Turkey,"Asia,Europe",83614362,0.0106


In [114]:
# checking the total population worldwide based on the dataset
data_simple['Population'].sum()

7789333911

In [115]:
# based on worldometers, it is stated that 75% population of Russia live in Europe, while the rest (25%) live in Asia
# so for detailed matter, I will clean this data by adding Russia (Europe) and Russia (Asia) with their respectable population and percentage as well
data_simple.loc[8]= 'Russia (Europe)', 'Europe', 109628261 , 0.0141
data_simple[data_simple.Country=='Russia (Europe)']

Unnamed: 0,Country,Region,Population,Percentage
8,Russia (Europe),Europe,109628261,0.0141


In [116]:
# after changing the Russia into Russia (Europe), including their Region, Population and Percentage
# we are going to add another row with named Russia (Asia)
data_added= data_simple.append({'Country': 'Russia (Asia)', 'Region': 'Asia', 'Population':36542754, 'Percentage': 0.0047}, ignore_index=True)
data_added[(data_added.Country=='Russia (Europe)')|(data_added.Country=='Russia (Asia)')]

Unnamed: 0,Country,Region,Population,Percentage
8,Russia (Europe),Europe,109628261,0.0141
241,Russia (Asia),Asia,36542754,0.0047


In [117]:
# now we are going to do the same thing with Turkey
# based on wikipedia, 10% of their population are European and the rest are included as Asian, so we are going to differintiate the country into Turkey (Asia) and Turkey (Europe)
data_added.loc[17]= 'Turkey (Asia)', 'Asia', 75252926, 0.0097
data_added[data_added.Country=='Turkey (Asia)']

Unnamed: 0,Country,Region,Population,Percentage
17,Turkey (Asia),Asia,75252926,0.0097


In [126]:
# we are going to add a new row as well named Turkey (Europe)
data_added_again= data_added.append({'Country':'Turkey (Europe)', 'Region': 'Europe', 'Population':836146, 'Percentage': 0.0011}, ignore_index=True)
data_added_again[(data_added_again.Country=='Turkey (Asia)')|(data_added_again.Country=='Turkey (Europe)')]

Unnamed: 0,Country,Region,Population,Percentage
17,Turkey (Asia),Asia,75252926,0.0097
242,Turkey (Europe),Europe,836146,0.0011


In [127]:
# double checking if the Region already separated and no region called Europe Asia again
data_added_again['Region'].value_counts()

Africa      56
Asia        55
Europe      54
Americas    51
Oceania     27
Name: Region, dtype: int64

In [128]:
# now we could make the visualization for each region's population
px.histogram(data_added_again, x= 'Region', y= 'Population', histfunc= 'sum').update_xaxes(categoryorder='total descending')
# as we could see, Asia has the highest population in the world, following by Africa, then America, Europe and Oceania

2. Which country has the most population in each region ?

In [129]:
# to find the country with the most population in each region, we are going to grouping the dataset based on the Region
# after that we could determine the country with the highest population
max=data_added_again.groupby("Region").head(1)
max


Unnamed: 0,Country,Region,Population,Percentage
0,China,Asia,1411778724,0.178
2,United States,Americas,332960297,0.042
6,Nigeria,Africa,211401000,0.0267
8,Russia (Europe),Europe,109628261,0.0141
52,Australia,Oceania,25922849,0.0033


In [121]:
# based on the graph we could see that China has the most contribution on the population of the world
px.histogram(max, 'Country', 'Population')

3. What is the percentage of the first 10 countries?

In [130]:
# now we are going to sort the data based on the highest percentage
sorted= data_added_again.nlargest(10, 'Percentage')
sorted

Unnamed: 0,Country,Region,Population,Percentage
0,China,Asia,1411778724,0.178
1,India,Asia,1386141732,0.175
2,United States,Americas,332960297,0.042
3,Indonesia,Asia,271350000,0.0343
4,Pakistan,Asia,225200000,0.0284
5,Brazil,Americas,214143381,0.027
6,Nigeria,Africa,211401000,0.0267
7,Bangladesh,Asia,171950056,0.0217
9,Mexico,Americas,126014024,0.0159
10,Japan,Asia,125470000,0.0158


In [123]:
# for easiear visualization, we are going to add another column where they showed Percentage (%)
sorted['Percentage (%)']= sorted['Percentage']*100
sorted

Unnamed: 0,Country,Region,Population,Percentage,Percentage (%)
0,China,Asia,1411778724,0.178,17.8
1,India,Asia,1386141732,0.175,17.5
2,United States,Americas,332960297,0.042,4.2
3,Indonesia,Asia,271350000,0.0343,3.43
4,Pakistan,Asia,225200000,0.0284,2.84
5,Brazil,Americas,214143381,0.027,2.7
6,Nigeria,Africa,211401000,0.0267,2.67
7,Bangladesh,Asia,171950056,0.0217,2.17
9,Mexico,Americas,126014024,0.0159,1.59
10,Japan,Asia,125470000,0.0158,1.58


In [124]:
# based on the graph, we could see that the region included in top 10 highest population is mostly from Asia, then followed up with Americas and Africa
# the highest percentage of population still goes to China whereas it has 17.8% global population, the runner up is India with 17.5%
px.line(sorted, 'Country', 'Percentage (%)')