# Imports básicos

In [117]:
import requests
import os
import pandas as pd
import zipfile

# Descarga del Dataset

In [118]:
# World Bank API endpoint for the SP.POP.TOTL indicator, requested in the
# "csv" download format. The next cells save the response as a .zip archive.
csv_url = "https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv"

In [119]:
# Working folder where the downloaded archive and the extracted CSVs are stored.
ruta_carpeta = "/content/CSV's"

# makedirs(exist_ok=True) creates any missing parent directory as well and is a
# no-op when the folder already exists, so the cell can be re-run safely and
# does not need a separate (racy) existence check.
os.makedirs(ruta_carpeta, exist_ok=True)

In [120]:
# Local path of the downloaded archive; its presence is used as a download cache.
nombre_arch = ruta_carpeta + '/population.zip'

# Download only when the archive is not already on disk.
if not os.path.isfile(nombre_arch):
    # Stream into a temporary ".part" file first: if the transfer is interrupted,
    # no truncated population.zip is left behind to satisfy the isfile() guard
    # on the next run.
    nombre_arch_tmp = nombre_arch + '.part'

    # timeout keeps the cell from hanging forever on a stalled connection.
    with requests.get(csv_url, stream=True, timeout=60) as arch_csv:
        # Fail loudly on 4xx/5xx instead of saving an error page as the zip file.
        arch_csv.raise_for_status()

        with open(nombre_arch_tmp, "wb") as archivo_zip:
            for chunk in arch_csv.iter_content(chunk_size=1024):
                # Skip empty keep-alive chunks.
                if chunk:
                    archivo_zip.write(chunk)

    # Publish the finished download under its final name in a single step.
    os.replace(nombre_arch_tmp, nombre_arch)

# Descomprimimos el Dataset

In [121]:
# Unpack the downloaded archive with the zipfile library.
# Every member of the archive is extracted into its own "population" folder.
nombre_dir_csv = f"{ruta_carpeta}/population"

with zipfile.ZipFile(nombre_arch, 'r') as archivo_zip:
    archivo_zip.extractall(nombre_dir_csv)

# Cargamos el Dataset

In [122]:
# The archive is downloaded without pinning a release, so the numeric suffix of
# the data file (e.g. "..._v2_3731322.csv") is not guaranteed to stay the same.
# Locate the data file by its stable prefix instead of hard-coding one release.
candidatos_csv = sorted(
    nombre
    for nombre in os.listdir(nombre_dir_csv)
    if nombre.startswith('API_SP.POP.TOTL') and nombre.endswith('.csv')
)

if not candidatos_csv:
    raise FileNotFoundError(
        "No 'API_SP.POP.TOTL*.csv' file found in " + nombre_dir_csv
    )

# If several releases were extracted into the folder, take the last one in sorted order.
path_csv = nombre_dir_csv + '/' + candidatos_csv[-1]

# skiprows=4: the first four lines of the file are skipped before the header row.
df_pob = pd.read_csv(filepath_or_buffer=path_csv, skiprows=4)

In [123]:
# Preview the first five rows of the freshly loaded (still wide) frame.
df_pob.head(5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54208.0,55434.0,56234.0,56699.0,57029.0,57357.0,...,102565.0,103165.0,103776.0,104339.0,104865.0,105361.0,105846.0,106310.0,106766.0,
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130836765.0,134159786.0,137614644.0,141202036.0,144920186.0,148769974.0,...,547482863.0,562601578.0,578075373.0,593871847.0,609978946.0,626392880.0,643090131.0,660046272.0,677243299.0,
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8996967.0,9169406.0,9351442.0,9543200.0,9744772.0,9956318.0,...,31161378.0,32269592.0,33370804.0,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0,38928341.0,
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,96396419.0,98407221.0,100506960.0,102691339.0,104953470.0,107289875.0,...,370243017.0,380437896.0,390882979.0,401586651.0,412551299.0,423769930.0,435229381.0,446911598.0,458803476.0,
4,Angola,AGO,"Population, total",SP.POP.TOTL,5454938.0,5531451.0,5608499.0,5679409.0,5734995.0,5770573.0,...,25107925.0,26015786.0,26941773.0,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0,32866268.0,


# Transformamos el Dataset

In [124]:
# Columns that are not needed for the analysis.
columnas_descartadas = ['Country Code', 'Indicator Name', 'Indicator Code']

# pandas auto-names header-less columns "Unnamed: N"; N depends on how many
# year columns the file carries, so detect them instead of hard-coding
# 'Unnamed: 65' (which raises KeyError as soon as the column count changes).
columnas_descartadas += [
    columna for columna in df_pob.columns if str(columna).startswith('Unnamed')
]

# Wide -> long: one row per (country, year) with the population in 'Pob'.
# A single chain that builds a new frame replaces the in-place mutations.
df_pob = (
    df_pob
    .drop(columns=columnas_descartadas)
    .melt(id_vars=['Country Name'], var_name='Year', value_name='Pob')
    .rename(columns={'Country Name': 'Country'})
)

In [125]:
# Render floats with thousands separators and one decimal place from here on.
pd.set_option('display.float_format', '{:,.1f}'.format)

# Preview the first five rows of the reshaped (long) frame.
df_pob.head()

Unnamed: 0,Country,Year,Pob
0,Aruba,1960,54208.0
1,Africa Eastern and Southern,1960,130836765.0
2,Afghanistan,1960,8996967.0
3,Africa Western and Central,1960,96396419.0
4,Angola,1960,5454938.0


# Hacemos las conversiones correspondientes

In [126]:
# Inspect the column dtypes before converting 'Year'.
df_pob.dtypes

Country     object
Year        object
Pob        float64
dtype: object

In [127]:
# Turn every year label into a string, then store the column as a Categorical.
anios_como_texto = df_pob['Year'].astype(str)
df_pob['Year'] = pd.Categorical(anios_como_texto)

In [128]:
# Check the dtypes again after the 'Year' conversion.
df_pob.dtypes

Country      object
Year       category
Pob         float64
dtype: object

# Creamos un SubSet

In [129]:
# Boolean mask that is True for the rows belonging to the selected countries.
paises_seleccionados = ['Mexico', 'Hong Kong SAR, China', 'Ireland']
idx_filtro = df_pob['Country'].isin(paises_seleccionados)

In [130]:
# Subset of the data holding only the selected countries,
# obtained by applying the boolean mask row-wise.
df_ejemplo = df_pob.loc[idx_filtro]

In [131]:
# Display the filtered subset (still carrying the original integer index).
df_ejemplo

Unnamed: 0,Country,Year,Pob
96,"Hong Kong SAR, China",1960,3075605.0
111,Ireland,1960,2828600.0
154,Mexico,1960,37771861.0
362,"Hong Kong SAR, China",1961,3168100.0
377,Ireland,1961,2824400.0
...,...,...,...
15805,Ireland,2019,4934340.0
15848,Mexico,2019,127575529.0
16056,"Hong Kong SAR, China",2020,7481800.0
16071,Ireland,2020,4994724.0


In [135]:
# Index the subset by (Country, Year) and sort it by country and year.
# It is rebuilt from the filtered rows rather than from df_ejemplo itself so the
# cell stays re-runnable: re-applying set_index to an already indexed
# df_ejemplo would raise KeyError because the columns are gone.
df_ejemplo = (
    df_pob[idx_filtro]
    .set_index(['Country', 'Year'])
    .sort_index()
)

In [137]:
# Display the subset, now indexed and sorted by (Country, Year).
df_ejemplo

Unnamed: 0_level_0,Unnamed: 1_level_0,Pob
Country,Year,Unnamed: 2_level_1
"Hong Kong SAR, China",1960,3075605.0
"Hong Kong SAR, China",1961,3168100.0
"Hong Kong SAR, China",1962,3305200.0
"Hong Kong SAR, China",1963,3420900.0
"Hong Kong SAR, China",1964,3504600.0
...,...,...
Mexico,2016,123333379.0
Mexico,2017,124777326.0
Mexico,2018,126190782.0
Mexico,2019,127575529.0


In [None]:
# Pick one country and a specific range of years (multi-index selection):
# first take the 'Mexico' block, then slice its year labels.
df_mexico = df_ejemplo.loc['Mexico']
df_mexico.loc['1997':'2020', :]

In [159]:
# Cross-section on the 'Year' level: the rows of every selected country for '1990'.
df_ejemplo.xs(key='1990', level='Year')

Unnamed: 0_level_0,Pob
Country,Unnamed: 1_level_1
"Hong Kong SAR, China",5704500.0
Ireland,3513974.0
Mexico,83943135.0


# Categorizamos el DataSet

In [163]:
# Full dataset indexed by (Country, Year), with both index levels sorted ascending.
df_paises_indexado = df_pob.set_index(['Country', 'Year'])
df_paises = df_paises_indexado.sort_index(ascending=[True, True])

In [164]:
# Display the full frame indexed by (Country, Year).
df_paises

Unnamed: 0_level_0,Unnamed: 1_level_0,Pob
Country,Year,Unnamed: 2_level_1
Afghanistan,1960,8996967.0
Afghanistan,1961,9169406.0
Afghanistan,1962,9351442.0
Afghanistan,1963,9543200.0
Afghanistan,1964,9744772.0
...,...,...
Zimbabwe,2016,14030338.0
Zimbabwe,2017,14236599.0
Zimbabwe,2018,14438812.0
Zimbabwe,2019,14645473.0


In [170]:
# IndexSlice gives slice syntax for selecting on several index levels at once.
ids = pd.IndexSlice

# Country labels from 'Macedonia' through 'Mexico', year labels '2000' through '2020'.
rango_paises_anios = ids['Macedonia':'Mexico', '2000':'2020']
df_paises.loc[rango_paises_anios, :].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pob
Country,Year,Unnamed: 2_level_1
Madagascar,2000,15766806.0
Madagascar,2001,16260933.0
Madagascar,2002,16765122.0
Madagascar,2003,17279139.0
Madagascar,2004,17802992.0
...,...,...
Mexico,2016,123333379.0
Mexico,2017,124777326.0
Mexico,2018,126190782.0
Mexico,2019,127575529.0
