#Data Wrangling a Population of Countries Dataset by Dr Alvin Ang

https://www.alvinang.sg/s/Population-of-Countries-in-2000.csv



---



#Step 1: Import the Dataset

In [7]:
import pandas as pd

# Load the CSV straight from its URL into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='https://www.alvinang.sg/s/Population-of-Countries-in-2000.csv'
)

# Show one randomly chosen row as a quick sanity check
# (no seed is set, so the row shown differs between runs).
df.sample(n=1)

Unnamed: 0.1,Unnamed: 0,country,country_isocode,year,pop,xrat,tcgdp,cc,cg
5,5,South Africa,ZAF,2000,45064.098,6.93983,227242.36949,72.71871,5.726546




---



#Step 2: Preview the Dataset

In [2]:
# (number of rows, number of columns)
df.shape

(8, 9)

In [6]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       8 non-null      int64  
 1   country          8 non-null      object 
 2   country_isocode  8 non-null      object 
 3   year             8 non-null      int64  
 4   pop              8 non-null      float64
 5   xrat             8 non-null      float64
 6   tcgdp            8 non-null      float64
 7   cc               8 non-null      float64
 8   cg               8 non-null      float64
dtypes: float64(5), int64(2), object(2)
memory usage: 704.0+ bytes


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,year,pop,xrat,tcgdp,cc,cg
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,3.5,2000.0,176382.6,16.415811,1606312.0,71.404995,8.145477
std,2.44949,0.0,347922.3,22.758175,3397025.0,5.318015,3.383397
min,0.0,2000.0,3219.793,0.9995,5026.222,64.436451,5.108068
25%,1.75,2000.0,10379.77,1.543623,103254.4,66.963157,5.689611
50%,3.5,2000.0,28194.42,5.50858,261157.3,72.532882,6.376276
75%,5.25,2000.0,104341.1,20.310094,838389.6,74.959919,10.614755
max,7.0,2000.0,1006300.0,59.543808,9898700.0,78.97874,14.072206




---



#Step 3: Check for Missing Data

In [8]:
# One boolean per column: True if that column holds at least one missing value.
# Every column comes back False here, so there's no missing data!
df.isna().any(axis = 'index')

Unnamed: 0         False
country            False
country_isocode    False
year               False
pop                False
xrat               False
tcgdp              False
cc                 False
cg                 False
dtype: bool

In [9]:
# Count the missing values in each column (isna is the same check as isnull).
# Every count is 0, so there are no NaNs!
df.isna().sum()

Unnamed: 0         0
country            0
country_isocode    0
year               0
pop                0
xrat               0
tcgdp              0
cc                 0
cg                 0
dtype: int64



---



#Step 4: Deal with Missing Data

##4a) Drop NaNs

###4a)(i) Drop off Columns that have NaNs

In [10]:
# Drop every column that contains at least one NaN.
# The result is only displayed, not assigned, so df itself is unchanged.
# This dataset has no NaNs, so all columns are kept.
df.dropna(axis = 'columns')

Unnamed: 0.1,Unnamed: 0,country,country_isocode,year,pop,xrat,tcgdp,cc,cg
0,0,Argentina,ARG,2000,37335.653,0.9995,295072.2,75.716805,5.578804
1,1,Australia,AUS,2000,19053.186,1.72483,541804.7,67.759026,6.720098
2,2,India,IND,2000,1006300.297,44.9416,1728144.0,64.575551,14.072206
3,3,Israel,ISR,2000,6114.57,4.07733,129253.9,64.436451,10.266688
4,4,Malawi,MWI,2000,11801.505,59.543808,5026.222,74.707624,11.658954
5,5,South Africa,ZAF,2000,45064.098,6.93983,227242.4,72.71871,5.726546
6,6,United States,USA,2000,282171.957,1.0,9898700.0,72.347054,6.032454
7,7,Uruguay,URY,2000,3219.793,12.099592,25255.96,78.97874,5.108068




---



###4a)(ii) Drop off Rows that have NaNs

In [11]:
# Drop every row that contains at least one NaN.
# 'index' is the documented name for the row axis ('rows' is only an
# undocumented alias of it), so the result is the same.
# The result is only displayed, not assigned, so df itself is unchanged.
df.dropna(axis = 'index')

Unnamed: 0.1,Unnamed: 0,country,country_isocode,year,pop,xrat,tcgdp,cc,cg
0,0,Argentina,ARG,2000,37335.653,0.9995,295072.2,75.716805,5.578804
1,1,Australia,AUS,2000,19053.186,1.72483,541804.7,67.759026,6.720098
2,2,India,IND,2000,1006300.297,44.9416,1728144.0,64.575551,14.072206
3,3,Israel,ISR,2000,6114.57,4.07733,129253.9,64.436451,10.266688
4,4,Malawi,MWI,2000,11801.505,59.543808,5026.222,74.707624,11.658954
5,5,South Africa,ZAF,2000,45064.098,6.93983,227242.4,72.71871,5.726546
6,6,United States,USA,2000,282171.957,1.0,9898700.0,72.347054,6.032454
7,7,Uruguay,URY,2000,3219.793,12.099592,25255.96,78.97874,5.108068




---



##4b) Fill Up NaNs with Other Values

###4b)(i) Fill Up NaNs with 0

In [12]:
# Replace every NaN with 0.
# The filled copy is only displayed; df itself is unchanged.
df.fillna(value = 0)

Unnamed: 0.1,Unnamed: 0,country,country_isocode,year,pop,xrat,tcgdp,cc,cg
0,0,Argentina,ARG,2000,37335.653,0.9995,295072.2,75.716805,5.578804
1,1,Australia,AUS,2000,19053.186,1.72483,541804.7,67.759026,6.720098
2,2,India,IND,2000,1006300.297,44.9416,1728144.0,64.575551,14.072206
3,3,Israel,ISR,2000,6114.57,4.07733,129253.9,64.436451,10.266688
4,4,Malawi,MWI,2000,11801.505,59.543808,5026.222,74.707624,11.658954
5,5,South Africa,ZAF,2000,45064.098,6.93983,227242.4,72.71871,5.726546
6,6,United States,USA,2000,282171.957,1.0,9898700.0,72.347054,6.032454
7,7,Uruguay,URY,2000,3219.793,12.099592,25255.96,78.97874,5.108068




---



###4b)(ii) Forward Fill

In [13]:
# Forward fill: replace each NaN with the last valid value above it in the
# same column. DataFrame.ffill() is the supported spelling; passing
# method='ffill' to fillna() is deprecated in recent pandas releases.
# The filled copy is only displayed; df itself is unchanged.
df.ffill()

Unnamed: 0.1,Unnamed: 0,country,country_isocode,year,pop,xrat,tcgdp,cc,cg
0,0,Argentina,ARG,2000,37335.653,0.9995,295072.2,75.716805,5.578804
1,1,Australia,AUS,2000,19053.186,1.72483,541804.7,67.759026,6.720098
2,2,India,IND,2000,1006300.297,44.9416,1728144.0,64.575551,14.072206
3,3,Israel,ISR,2000,6114.57,4.07733,129253.9,64.436451,10.266688
4,4,Malawi,MWI,2000,11801.505,59.543808,5026.222,74.707624,11.658954
5,5,South Africa,ZAF,2000,45064.098,6.93983,227242.4,72.71871,5.726546
6,6,United States,USA,2000,282171.957,1.0,9898700.0,72.347054,6.032454
7,7,Uruguay,URY,2000,3219.793,12.099592,25255.96,78.97874,5.108068




---



###4b)(iii) Backward Fill

In [14]:
# Backward fill: replace each NaN with the next valid value below it in the
# same column. DataFrame.bfill() is the supported spelling; passing
# method='bfill' to fillna() is deprecated in recent pandas releases.
# The filled copy is only displayed; df itself is unchanged.
df.bfill()

Unnamed: 0.1,Unnamed: 0,country,country_isocode,year,pop,xrat,tcgdp,cc,cg
0,0,Argentina,ARG,2000,37335.653,0.9995,295072.2,75.716805,5.578804
1,1,Australia,AUS,2000,19053.186,1.72483,541804.7,67.759026,6.720098
2,2,India,IND,2000,1006300.297,44.9416,1728144.0,64.575551,14.072206
3,3,Israel,ISR,2000,6114.57,4.07733,129253.9,64.436451,10.266688
4,4,Malawi,MWI,2000,11801.505,59.543808,5026.222,74.707624,11.658954
5,5,South Africa,ZAF,2000,45064.098,6.93983,227242.4,72.71871,5.726546
6,6,United States,USA,2000,282171.957,1.0,9898700.0,72.347054,6.032454
7,7,Uruguay,URY,2000,3219.793,12.099592,25255.96,78.97874,5.108068




---



#Step 5: Renaming Columns

In [15]:
# Look at the current column names before renaming anything.

df.columns

Index(['Unnamed: 0', 'country', 'country_isocode', 'year', 'pop', 'xrat',
       'tcgdp', 'cc', 'cg'],
      dtype='object')

In [16]:
# Rename only the leftover CSV index column, addressing it by name.
# A keyed rename cannot mislabel other columns if their order or number ever
# changes, and re-running this cell leaves any other renamed columns alone,
# unlike overwriting the whole df.columns list positionally.
df = df.rename(columns = {'Unnamed: 0': 'S/N'})

In [18]:
# Show one random row to confirm the new column name.
df.sample(n = 1)

Unnamed: 0,S/N,country,country_isocode,year,pop,xrat,tcgdp,cc,cg
1,1,Australia,AUS,2000,19053.186,1.72483,541804.6521,67.759026,6.720098


In [19]:
# Rename a single column by name; every other column keeps its label.
df = df.rename({"country_isocode": "blablabla"}, axis = 'columns')

In [20]:
# Confirm the column names after both renames.
df.columns

Index(['S/N', 'country', 'blablabla', 'year', 'pop', 'xrat', 'tcgdp', 'cc',
       'cg'],
      dtype='object')



---



#Step 6: Filtering a Row

In [21]:
# Keep only the row(s) whose country is United States.
is_united_states = df['country'].eq('United States')

df[is_united_states]

Unnamed: 0,S/N,country,blablabla,year,pop,xrat,tcgdp,cc,cg
6,6,United States,USA,2000,282171.957,1.0,9898700.0,72.347054,6.032454


In [22]:
# Keep only the row(s) whose country is Australia.
is_australia = df['country'].eq('Australia')

df[is_australia]

Unnamed: 0,S/N,country,blablabla,year,pop,xrat,tcgdp,cc,cg
1,1,Australia,AUS,2000,19053.186,1.72483,541804.6521,67.759026,6.720098




---



#Step 7: Filtering a Column

In [23]:
# Keep only the country and year columns, in that order.
wanted_columns = ['country', 'year']

df.filter(items = wanted_columns)

Unnamed: 0,country,year
0,Argentina,2000
1,Australia,2000
2,India,2000
3,Israel,2000
4,Malawi,2000
5,South Africa,2000
6,United States,2000
7,Uruguay,2000


In [24]:
# Alternative: select the columns by label (all rows, year first, then country).

df.loc[:, ['year', 'country']]

Unnamed: 0,year,country
0,2000,Argentina
1,2000,Australia
2,2000,India
3,2000,Israel
4,2000,Malawi
5,2000,South Africa
6,2000,United States
7,2000,Uruguay




---



#Step 8: Searching Out Using REGEX

In [None]:
#axis = 1 --> match against the column names
#axis = 0 --> match against the row labels (the index)

- regex = Regular Expression
- A regex is a string of text that defines a pattern used to search for and match text.
- The `$` anchor matches the end of the string.
- So `r$` means that 'r' must be the last character of the name (in this case, yea'r').
- If you use `regex='y$'`, it returns 'country', because that is the only column name ending in 'y'.

In [25]:
# Keep the columns whose name ends with 'r'.
df.filter(regex = 'r$', axis = 'columns')

Unnamed: 0,year
0,2000
1,2000
2,2000
3,2000
4,2000
5,2000
6,2000
7,2000


In [27]:
# Keep the columns whose name ends with 'y'.
df.filter(regex = 'y$', axis = 'columns')

Unnamed: 0,country
0,Argentina
1,Australia
2,India
3,Israel
4,Malawi
5,South Africa
6,United States
7,Uruguay




---



#Step 9: Searching Out Using LIKE

In [28]:
# like = 'p' keeps every column whose name contains the substring 'p'.
df.filter(like = 'p', axis = 'columns')

Unnamed: 0,pop,tcgdp
0,37335.653,295072.2
1,19053.186,541804.7
2,1006300.297,1728144.0
3,6114.57,129253.9
4,11801.505,5026.222
5,45064.098,227242.4
6,282171.957,9898700.0
7,3219.793,25255.96




---
#THE END


---


