In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer


import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

In [2]:
df= pd.read_csv('pair_nulos.csv', index_col=0)

Gestion de Nulos

En este pair programming usaremos el csv que generamos en el pair programming de ayer. El objetivo del ejercicio es identificar las columnas que contienen valores nulos y aplicar un proceso de imputación para manejarlos de manera adecuada.

Instrucciones:

Identificación de Valores Nulos: Identifica todas las columnas que contengan valores nulos en el DataFrame.

Selección de Método de Imputación: Discute en tu compañera cuál sería la mejor estrategia para manejar los valores nulos en cada una de las columnas identificadas en el paso anterior.

Imputación de Valores Nulos: Implementa el método de imputación seleccionado en el paso 2 para llenar los valores nulos en las columnas.

Informe: Añade al final de un jupyter una explicación breve que describa las columnas que tenían valores nulos, cómo decidiste imputarlos y cualquier observación adicional que consideres importante sobre el proceso de limpieza de datos.

Nota: Puedes utilizar cualquier método o estrategia de imputación que consideres adecuado para los datos y discutir las ventajas y desventajas de tu elección en el informe.

In [3]:
df.columns

Index(['country', 'density', 'abbreviation', 'agriculturalland', 'landarea',
       'armedforcessize', 'birthrate', 'callingcode', 'capital/majorcity',
       'co2-emissions', 'cpi', 'cpichange', 'currency-code', 'fertilityrate',
       'forestedarea', 'gasolineprice', 'gdp',
       'grossprimaryeducationenrollment', 'grosstertiaryeducationenrollment',
       'infantmortality', 'largestcity', 'lifeexpectancy',
       'maternalmortalityratio', 'minimumwage', 'officiallanguage',
       'outofpockethealthexpenditure', 'physiciansperthousand', 'population',
       'populationlaborforceparticipation', 'taxrevenue', 'totaltaxrate',
       'unemploymentrate', 'urban_population', 'lattitude', 'longitude',
       'continente'],
      dtype='object')

In [4]:
df.head(3)

Unnamed: 0,country,density,abbreviation,agriculturalland,landarea,armedforcessize,birthrate,callingcode,capital/majorcity,co2-emissions,cpi,cpichange,currency-code,fertilityrate,forestedarea,gasolineprice,gdp,grossprimaryeducationenrollment,grosstertiaryeducationenrollment,infantmortality,largestcity,lifeexpectancy,maternalmortalityratio,minimumwage,officiallanguage,outofpockethealthexpenditure,physiciansperthousand,population,populationlaborforceparticipation,taxrevenue,totaltaxrate,unemploymentrate,urban_population,lattitude,longitude,continente
0,Afghanistan,60.0,AF,58.1,652230.0,323000.0,32.49,93.0,Kabul,8672.0,149.9,2.3,AFN,4.47,2.1,0.7,19101350000.0,104.0,9.7,47.9,Kabul,64.5,638.0,0.43,Pashto,78.4,0.28,38041754.0,48.9,9.3,71.4,11.12,9797273.0,33.93911,67.709953,Asia
1,Albania,105.0,AL,43.1,28748.0,9000.0,11.78,355.0,Tirana,4536.0,119.05,1.4,ALL,1.62,28.1,1.36,15278080000.0,107.0,55.0,7.8,Tirana,78.5,15.0,1.12,Albanian,56.9,1.2,2854191.0,55.7,18.6,36.6,12.33,1747593.0,41.153332,20.168331,Europa
2,Algeria,18.0,DZ,17.4,2381741.0,317000.0,24.28,213.0,Algiers,150006.0,151.36,2.0,DZD,3.02,0.8,0.28,169988200000.0,109.9,51.4,20.1,Algiers,76.7,112.0,0.95,Arabic,28.1,1.72,43053054.0,41.2,37.2,66.1,11.7,31510100.0,28.033886,1.659626,África


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195 entries, 0 to 194
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   country                            195 non-null    object 
 1   density                            195 non-null    float64
 2   abbreviation                       188 non-null    object 
 3   agriculturalland                   188 non-null    float64
 4   landarea                           194 non-null    float64
 5   armedforcessize                    171 non-null    float64
 6   birthrate                          189 non-null    float64
 7   callingcode                        194 non-null    float64
 8   capital/majorcity                  192 non-null    object 
 9   co2-emissions                      188 non-null    float64
 10  cpi                                178 non-null    float64
 11  cpichange                          179 non-null    float64

In [6]:
(df.isnull().sum() / df.shape[0]) * 100

country                               0.000000
density                               0.000000
abbreviation                          3.589744
agriculturalland                      3.589744
landarea                              0.512821
armedforcessize                      12.307692
birthrate                             3.076923
callingcode                           0.512821
capital/majorcity                     1.538462
co2-emissions                         3.589744
cpi                                   8.717949
cpichange                             8.205128
currency-code                         7.692308
fertilityrate                         3.589744
forestedarea                          3.589744
gasolineprice                        10.256410
gdp                                   1.025641
grossprimaryeducationenrollment       3.589744
grosstertiaryeducationenrollment      6.153846
infantmortality                       3.076923
largestcity                           3.076923
lifeexpectanc

In [7]:
#Filtramos el dataframe solo para quedarnos con los que son nulos
df_nulos = pd.DataFrame((df.isnull().sum() / df.shape[0]) * 100, columns = ["%_nulos"])

# filtramos el DataFrame para quedarnos solo con aquellas columnas que tengan nulos
df_nulos[df_nulos["%_nulos"] > 0]


Unnamed: 0,%_nulos
abbreviation,3.589744
agriculturalland,3.589744
landarea,0.512821
armedforcessize,12.307692
birthrate,3.076923
callingcode,0.512821
capital/majorcity,1.538462
co2-emissions,3.589744
cpi,8.717949
cpichange,8.205128


In [8]:
#Inspeccionamos las columnas que tienen nulos y  son categoricas y los nulos en las numericas
nulos_categoricas = df[df.columns[df.isnull().any()]].select_dtypes(include = "O").columns
nulos_categoricas

Index(['abbreviation', 'capital/majorcity', 'currency-code', 'largestcity',
       'officiallanguage'],
      dtype='object')

In [9]:
for col in nulos_categoricas:
    print(f"La distribución de las categorías para la columna {col.upper()}")
    display(df[col].value_counts() / df.shape[0])
    print("........................")

La distribución de las categorías para la columna ABBREVIATION


AF    0.005128
PY    0.005128
NE    0.005128
NG    0.005128
KP    0.005128
        ...   
GR    0.005128
GD    0.005128
GT    0.005128
GN    0.005128
ZW    0.005128
Name: abbreviation, Length: 188, dtype: float64

........................
La distribución de las categorías para la columna CAPITAL/MAJORCITY


Kabul                    0.005128
Tirana                   0.005128
Wellington               0.005128
Managua                  0.005128
Niamey                   0.005128
                           ...   
Athens                   0.005128
St. George's, Grenada    0.005128
Guatemala City           0.005128
Conakry                  0.005128
Harare                   0.005128
Name: capital/majorcity, Length: 192, dtype: float64

........................
La distribución de las categorías para la columna CURRENCY-CODE


EUR    0.117949
XOF    0.041026
XCD    0.030769
USD    0.030769
XAF    0.025641
         ...   
GMD    0.005128
FJD    0.005128
ETB    0.005128
ERN    0.005128
ZMW    0.005128
Name: currency-code, Length: 133, dtype: float64

........................
La distribución de las categorías para la columna LARGESTCITY


S����                    0.010256
Panama City              0.005128
Managua                  0.005128
Niamey                   0.005128
Lagos                    0.005128
                           ...   
Macedonia                0.005128
St. George's, Grenada    0.005128
Guatemala City           0.005128
Kankan                   0.005128
Harare                   0.005128
Name: largestcity, Length: 188, dtype: float64

........................
La distribución de las categorías para la columna OFFICIALLANGUAGE


English             0.158974
French              0.128205
Spanish             0.097436
Arabic              0.092308
Portuguese          0.035897
                      ...   
Albanian            0.005128
Jamaican English    0.005128
Lao                 0.005128
Latvian             0.005128
Shona               0.005128
Name: officiallanguage, Length: 76, dtype: float64

........................


In [10]:
#Ponemos la moda en los nulos de la columna officiallanguage
#Ponemos unknown en los nulos de las columnas de la lista columnas_desconocido
columnas_moda = ["officiallanguage"]
columnas_desconocido = ["largestcity", "currency-code", "capital/majorcity", "abbreviation"]

In [11]:
for col in columnas_moda:
    moda = df[col].mode()[0]    
    df[col] = df[col].fillna(moda)

print("Después del reemplazo usando 'fillna' quedan los siguientes nulos")

df[columnas_moda].isnull().sum()

Después del reemplazo usando 'fillna' quedan los siguientes nulos


officiallanguage    0
dtype: int64

In [12]:
for col in columnas_desconocido:

    df[col] = df[col].fillna("unknown")

print("Después del reemplazo usando 'fillna' quedan los siguientes nulos")
df[columnas_desconocido].isnull().sum()

Después del reemplazo usando 'fillna' quedan los siguientes nulos


largestcity          0
currency-code        0
capital/majorcity    0
abbreviation         0
dtype: int64

### Tratamiento de los nulos en las columnas numericas

In [13]:
nulos_numericos = df[df.columns[df.isnull().any()]].select_dtypes(include = np.number).columns
nulos_numericos

Index(['agriculturalland', 'landarea', 'armedforcessize', 'birthrate',
       'callingcode', 'co2-emissions', 'cpi', 'cpichange', 'fertilityrate',
       'forestedarea', 'gasolineprice', 'gdp',
       'grossprimaryeducationenrollment', 'grosstertiaryeducationenrollment',
       'infantmortality', 'lifeexpectancy', 'maternalmortalityratio',
       'minimumwage', 'outofpockethealthexpenditure', 'physiciansperthousand',
       'population', 'populationlaborforceparticipation', 'taxrevenue',
       'totaltaxrate', 'unemploymentrate', 'urban_population', 'lattitude',
       'longitude'],
      dtype='object')

In [14]:
dfnulosnum = df[nulos_numericos].isnull().sum() / df.shape[0]
dfnulosnum

agriculturalland                     0.035897
landarea                             0.005128
armedforcessize                      0.123077
birthrate                            0.030769
callingcode                          0.005128
co2-emissions                        0.035897
cpi                                  0.087179
cpichange                            0.082051
fertilityrate                        0.035897
forestedarea                         0.035897
gasolineprice                        0.102564
gdp                                  0.010256
grossprimaryeducationenrollment      0.035897
grosstertiaryeducationenrollment     0.061538
infantmortality                      0.030769
lifeexpectancy                       0.041026
maternalmortalityratio               0.071795
minimumwage                          0.230769
outofpockethealthexpenditure         0.035897
physiciansperthousand                0.035897
population                           0.005128
populationlaborforceparticipation 

In [15]:
#Los que estan por encima o igual de 5% de nulos les aplicamos los metodos iterative y KNN
dfnulosnum[dfnulosnum >= 0.05]

armedforcessize                      0.123077
cpi                                  0.087179
cpichange                            0.082051
gasolineprice                        0.102564
grosstertiaryeducationenrollment     0.061538
maternalmortalityratio               0.071795
minimumwage                          0.230769
populationlaborforceparticipation    0.097436
taxrevenue                           0.133333
totaltaxrate                         0.061538
unemploymentrate                     0.097436
dtype: float64

In [16]:
dfcopia = df.copy()
dfcopia.sample(2)

Unnamed: 0,country,density,abbreviation,agriculturalland,landarea,armedforcessize,birthrate,callingcode,capital/majorcity,co2-emissions,cpi,cpichange,currency-code,fertilityrate,forestedarea,gasolineprice,gdp,grossprimaryeducationenrollment,grosstertiaryeducationenrollment,infantmortality,largestcity,lifeexpectancy,maternalmortalityratio,minimumwage,officiallanguage,outofpockethealthexpenditure,physiciansperthousand,population,populationlaborforceparticipation,taxrevenue,totaltaxrate,unemploymentrate,urban_population,lattitude,longitude,continente
18,Benin,108.0,BJ,33.3,112622.0,12000.0,36.22,229.0,Porto-Novo,6476.0,110.71,-0.9,XOF,4.84,37.8,0.72,14390710000.0,122.0,12.3,60.5,Cotonou,61.5,397.0,0.39,French,40.5,0.08,11801151.0,70.9,10.8,48.9,2.23,5648149.0,9.30769,2.315834,África
171,Tajikistan,68.0,TJ,34.1,144100.0,17000.0,30.76,992.0,Dushanbe,5310.0,148.57,6.0,TJS,3.59,3.0,0.71,8116627000.0,100.9,31.3,30.4,Dushanbe,70.9,17.0,0.23,Persian,63.1,1.7,9321018.0,42.0,9.8,67.3,11.02,2545477.0,38.861034,71.276093,Asia


In [17]:
iterative = IterativeImputer(max_iter = 20, random_state = 42)

# ajustamos y tranformamos los datos
iterative_hecho = iterative.fit_transform(df[[
    'armedforcessize', 
    'cpi', 
    'cpichange', 
    'gasolineprice', 
    'grosstertiaryeducationenrollment', 
    'maternalmortalityratio', 
    'minimumwage', 
    'populationlaborforceparticipation', 
    'taxrevenue', 
    'totaltaxrate', 
    'unemploymentrate'
]])

iterative_hecho

array([[3.23000000e+05, 1.49900000e+02, 2.30000000e+00, ...,
        9.30000000e+00, 7.14000000e+01, 1.11200000e+01],
       [9.00000000e+03, 1.19050000e+02, 1.40000000e+00, ...,
        1.86000000e+01, 3.66000000e+01, 1.23300000e+01],
       [3.17000000e+05, 1.51360000e+02, 2.00000000e+00, ...,
        3.72000000e+01, 6.61000000e+01, 1.17000000e+01],
       ...,
       [4.00000000e+04, 1.57580000e+02, 8.10000000e+00, ...,
        1.67562804e+01, 2.66000000e+01, 1.29100000e+01],
       [1.60000000e+04, 2.12310000e+02, 9.20000000e+00, ...,
        1.62000000e+01, 1.56000000e+01, 1.14300000e+01],
       [5.10000000e+04, 1.05510000e+02, 9.00000000e-01, ...,
        2.07000000e+01, 3.16000000e+01, 4.95000000e+00]])

In [18]:
dfcopia[['armedforcessize_Ite', 'cpi_Ite', 'cpichange_Ite', 'gasolineprice_Ite', 'grosstertiaryeducationenrollment_Ite','maternalmortalityratio_Ite', 
          'minimumwage_Ite', 'populationlaborforceparticipation_Ite', 'taxrevenue_Ite','totaltaxrate_Ite', 'unemploymentrate_Ite']] = iterative_hecho


In [19]:
#comprobamos 
dfcopia.sample(5)

Unnamed: 0,country,density,abbreviation,agriculturalland,landarea,armedforcessize,birthrate,callingcode,capital/majorcity,co2-emissions,cpi,cpichange,currency-code,fertilityrate,forestedarea,gasolineprice,gdp,grossprimaryeducationenrollment,grosstertiaryeducationenrollment,infantmortality,largestcity,lifeexpectancy,maternalmortalityratio,minimumwage,officiallanguage,outofpockethealthexpenditure,physiciansperthousand,population,populationlaborforceparticipation,taxrevenue,totaltaxrate,unemploymentrate,urban_population,lattitude,longitude,continente,armedforcessize_Ite,cpi_Ite,cpichange_Ite,gasolineprice_Ite,grosstertiaryeducationenrollment_Ite,maternalmortalityratio_Ite,minimumwage_Ite,populationlaborforceparticipation_Ite,taxrevenue_Ite,totaltaxrate_Ite,unemploymentrate_Ite
131,Pakistan,287.0,PK,47.8,796095.0,936000.0,28.25,92.0,Islamabad,201150.0,182.32,10.6,PKR,3.51,1.9,0.79,304400000000.0,94.3,9.0,57.2,Karachi,67.1,140.0,0.69,Urdu,66.5,0.98,216565318.0,52.6,9.2,33.9,4.45,79927762.0,30.375321,69.345116,Asia,936000.0,182.32,10.6,0.79,9.0,140.0,0.69,52.6,9.2,33.9,4.45
22,Botswana,4.0,BW,45.6,581730.0,9000.0,24.82,267.0,Gaborone,6340.0,149.75,2.8,BWP,2.87,18.9,0.71,18340510000.0,103.2,24.9,30.0,Gaborone,69.3,144.0,0.29,English,5.3,0.37,2346179.0,70.8,19.5,25.1,18.19,1616550.0,-22.328474,24.684866,África,9000.0,149.75,2.8,0.71,24.9,144.0,0.29,70.8,19.5,25.1,18.19
103,Malaysia,99.0,MY,26.3,329847.0,136000.0,16.75,60.0,Kuala Lumpur,248289.0,121.46,0.7,MYR,2.0,67.6,0.45,364701500000.0,105.3,45.1,6.7,Johor Bahru,76.0,29.0,0.93,Malaysian language,36.7,1.51,32447385.0,64.3,12.0,38.7,3.32,24475766.0,4.210484,101.975766,Asia,136000.0,121.46,0.7,0.45,45.1,29.0,0.93,64.3,12.0,38.7,3.32
145,Saint Kitts and Nevis,205.0,KN,23.1,261.0,,12.6,1.0,Basseterre,238.0,104.57,-1.0,XCD,2.11,42.3,,1050993000.0,108.7,86.7,9.8,Basseterre,71.3,,3.33,English,56.6,2.52,52823.0,,18.5,49.7,,16269.0,17.357822,-62.782998,América del Norte,159275.11318,104.57,-1.0,1.14504,86.7,-32.186029,3.33,59.315013,18.5,49.7,6.875463
82,Israel,400.0,IL,24.6,20770.0,178000.0,20.8,972.0,Jerusalem,65166.0,108.15,0.8,ILS,3.09,7.7,1.57,395098700000.0,104.9,63.4,3.0,Jerusalem,82.8,3.0,7.58,Hebrew,24.4,4.62,9053300.0,64.0,23.1,25.3,3.86,8374393.0,31.046051,34.851612,Asia,178000.0,108.15,0.8,1.57,63.4,3.0,7.58,64.0,23.1,25.3,3.86


In [20]:
#Le hacemos el KNN
knn = KNNImputer(n_neighbors = 5)


knn_hecho = knn.fit_transform(dfcopia[[
    'armedforcessize', 
    'cpi', 
    'cpichange', 
    'gasolineprice', 
    'grosstertiaryeducationenrollment', 
    'maternalmortalityratio', 
    'minimumwage', 
    'populationlaborforceparticipation', 
    'taxrevenue', 
    'totaltaxrate', 
    'unemploymentrate'
]])


knn_hecho

array([[3.2300e+05, 1.4990e+02, 2.3000e+00, ..., 9.3000e+00, 7.1400e+01,
        1.1120e+01],
       [9.0000e+03, 1.1905e+02, 1.4000e+00, ..., 1.8600e+01, 3.6600e+01,
        1.2330e+01],
       [3.1700e+05, 1.5136e+02, 2.0000e+00, ..., 3.7200e+01, 6.6100e+01,
        1.1700e+01],
       ...,
       [4.0000e+04, 1.5758e+02, 8.1000e+00, ..., 2.1140e+01, 2.6600e+01,
        1.2910e+01],
       [1.6000e+04, 2.1231e+02, 9.2000e+00, ..., 1.6200e+01, 1.5600e+01,
        1.1430e+01],
       [5.1000e+04, 1.0551e+02, 9.0000e-01, ..., 2.0700e+01, 3.1600e+01,
        4.9500e+00]])

In [21]:
dfcopia[['armedforcessize_kN', 'cpi_kN', 'cpichange_kN', 'gasolineprice_kN', 'grosstertiaryeducationenrollment_kN','maternalmortalityratio_kN', 
          'minimumwage_kN', 'populationlaborforceparticipation_kN', 'taxrevenue_kN','totaltaxrate_kN', 'unemploymentrate_kN']] = knn_hecho


In [22]:
dfcopia.sample(5)

Unnamed: 0,country,density,abbreviation,agriculturalland,landarea,armedforcessize,birthrate,callingcode,capital/majorcity,co2-emissions,cpi,cpichange,currency-code,fertilityrate,forestedarea,gasolineprice,gdp,grossprimaryeducationenrollment,grosstertiaryeducationenrollment,infantmortality,largestcity,lifeexpectancy,maternalmortalityratio,minimumwage,officiallanguage,outofpockethealthexpenditure,physiciansperthousand,population,populationlaborforceparticipation,taxrevenue,totaltaxrate,unemploymentrate,urban_population,lattitude,longitude,continente,armedforcessize_Ite,cpi_Ite,cpichange_Ite,gasolineprice_Ite,grosstertiaryeducationenrollment_Ite,maternalmortalityratio_Ite,minimumwage_Ite,populationlaborforceparticipation_Ite,taxrevenue_Ite,totaltaxrate_Ite,unemploymentrate_Ite,armedforcessize_kN,cpi_kN,cpichange_kN,gasolineprice_kN,grosstertiaryeducationenrollment_kN,maternalmortalityratio_kN,minimumwage_kN,populationlaborforceparticipation_kN,taxrevenue_kN,totaltaxrate_kN,unemploymentrate_kN
58,Fiji,49.0,FJ,23.3,18274.0,4000.0,21.28,679.0,Suva,2046.0,132.3,1.8,FJD,2.77,55.9,0.82,5535549000.0,106.4,16.1,21.6,Suva,67.3,34.0,1.28,Fiji Hindi,21.4,0.84,889953.0,57.6,24.2,32.1,4.1,505048.0,-17.713371,178.065032,Oceanía,4000.0,132.3,1.8,0.82,16.1,34.0,1.28,57.6,24.2,32.1,4.1,4000.0,132.3,1.8,0.82,16.1,34.0,1.28,57.6,24.2,32.1,4.1
115,Montenegro,47.0,ME,19.0,13812.0,12000.0,11.73,382.0,Podgorica,2017.0,116.32,2.6,EUR,1.75,61.5,1.16,5494737000.0,100.0,56.1,2.3,Podgorica,76.8,6.0,1.23,Montenegrin language,31.8,2.76,622137.0,54.4,,22.2,14.88,417765.0,42.708678,19.37439,Europa,12000.0,116.32,2.6,1.16,56.1,6.0,1.23,54.4,19.114884,22.2,14.88,12000.0,116.32,2.6,1.16,56.1,6.0,1.23,54.4,20.72,22.2,14.88
184,United Arab Emirates,118.0,AE,5.5,83600.0,63000.0,10.33,971.0,Abu Dhabi,206324.0,114.52,-1.9,AED,1.41,4.6,0.49,421142300000.0,108.4,36.8,6.5,Dubai,77.8,3.0,,Arabic,17.8,2.53,9770529.0,82.1,0.1,15.9,2.35,8479744.0,23.424076,53.847818,Asia,63000.0,114.52,-1.9,0.49,36.8,3.0,2.346871,82.1,0.1,15.9,2.35,63000.0,114.52,-1.9,0.49,36.8,3.0,4.434,82.1,0.1,15.9,2.35
118,Myanmar,83.0,MM,19.5,676578.0,513000.0,17.55,95.0,Naypyidaw,25280.0,168.18,8.8,MMK,2.15,43.6,0.54,76085850000.0,112.3,18.8,36.8,Yangon,66.9,250.0,0.39,Burmese,73.9,0.68,54045420.0,61.7,5.4,31.2,1.58,16674093.0,21.916221,95.955974,Asia,513000.0,168.18,8.8,0.54,18.8,250.0,0.39,61.7,5.4,31.2,1.58,513000.0,168.18,8.8,0.54,18.8,250.0,0.39,61.7,5.4,31.2,1.58
66,Greece,81.0,GR,47.6,131957.0,146000.0,8.1,30.0,Athens,62434.0,101.87,0.2,EUR,1.35,31.7,1.54,209852800000.0,99.6,136.6,3.6,Macedonia,81.3,3.0,4.46,Greek,35.5,5.48,10716322.0,51.8,26.2,51.9,17.24,8507474.0,39.074208,21.824312,Europa,146000.0,101.87,0.2,1.54,136.6,3.0,4.46,51.8,26.2,51.9,17.24,146000.0,101.87,0.2,1.54,136.6,3.0,4.46,51.8,26.2,51.9,17.24


In [23]:
dfcopia.columns

Index(['country', 'density', 'abbreviation', 'agriculturalland', 'landarea',
       'armedforcessize', 'birthrate', 'callingcode', 'capital/majorcity',
       'co2-emissions', 'cpi', 'cpichange', 'currency-code', 'fertilityrate',
       'forestedarea', 'gasolineprice', 'gdp',
       'grossprimaryeducationenrollment', 'grosstertiaryeducationenrollment',
       'infantmortality', 'largestcity', 'lifeexpectancy',
       'maternalmortalityratio', 'minimumwage', 'officiallanguage',
       'outofpockethealthexpenditure', 'physiciansperthousand', 'population',
       'populationlaborforceparticipation', 'taxrevenue', 'totaltaxrate',
       'unemploymentrate', 'urban_population', 'lattitude', 'longitude',
       'continente', 'armedforcessize_Ite', 'cpi_Ite', 'cpichange_Ite',
       'gasolineprice_Ite', 'grosstertiaryeducationenrollment_Ite',
       'maternalmortalityratio_Ite', 'minimumwage_Ite',
       'populationlaborforceparticipation_Ite', 'taxrevenue_Ite',
       'totaltaxrate_Ite', 'une

In [25]:
#eliminamos las columnas
dfcopia.drop(['armedforcessize', 'cpi','cpichange', 
              'gasolineprice', 'grosstertiaryeducationenrollment',  'maternalmortalityratio', 'minimumwage', 'populationlaborforceparticipation',  'taxrevenue', 'totaltaxrate', 'unemploymentrate', 'armedforcessize',  
        'armedforcessize_Ite',
       'cpi_Ite', 'cpichange_Ite', 'gasolineprice_Ite',
       'grosstertiaryeducationenrollment_Ite', 'maternalmortalityratio_Ite',
       'minimumwage_Ite', 'populationlaborforceparticipation_Ite',
       'taxrevenue_Ite', 'totaltaxrate_Ite', 'unemploymentrate_Ite',
       'armedforcessize_kN', 'cpi_kN', 'cpichange_kN', 'gasolineprice_kN',
       'grosstertiaryeducationenrollment_kN', 'maternalmortalityratio_kN',
       'minimumwage_kN', 'populationlaborforceparticipation_kN',
       'taxrevenue_kN', 'totaltaxrate_kN', 'unemploymentrate_kN'], axis=1, inplace=True)

In [26]:
#remmplazamos por las columnas que hemos decidido.
nuevas_columnas = {'armedforcessize_kN': 'armedforcessize', 'cpi_kN': 'cpi', 'cpichange_kN':'cpichange', 'gasolineprice_kN': 'gasolineprice', 'grosstertiaryeducationenrollment_kN': 'grosstertiaryeducationenrollment','maternalmortalityratio_kN': 'maternalmortalityratio', 
          'minimumwage_kN':'minimumwage' , 'populationlaborforceparticipation_kN':'populationlaborforceparticipation','taxrevenue_kN':'totaltaxrate', 'unemploymentrate_kN':'unemploymentrate', 'taxrevenue_kN': 'taxrevenue'}
dfcopia.rename(columns = nuevas_columnas, inplace = True)


In [27]:
dfcopia.columns

Index(['country', 'density', 'abbreviation', 'agriculturalland', 'landarea',
       'birthrate', 'callingcode', 'capital/majorcity', 'co2-emissions',
       'currency-code', 'fertilityrate', 'forestedarea', 'gdp',
       'grossprimaryeducationenrollment', 'infantmortality', 'largestcity',
       'lifeexpectancy', 'officiallanguage', 'outofpockethealthexpenditure',
       'physiciansperthousand', 'population', 'urban_population', 'lattitude',
       'longitude', 'continente'],
      dtype='object')

In [28]:
#sacamos los que estan por debajo de 5% de nulos y les ponemos la moda o mediana
dfnulosnum[dfnulosnum < 0.05] 

agriculturalland                   0.035897
landarea                           0.005128
birthrate                          0.030769
callingcode                        0.005128
co2-emissions                      0.035897
fertilityrate                      0.035897
forestedarea                       0.035897
gdp                                0.010256
grossprimaryeducationenrollment    0.035897
infantmortality                    0.030769
lifeexpectancy                     0.041026
outofpockethealthexpenditure       0.035897
physiciansperthousand              0.035897
population                         0.005128
urban_population                   0.025641
lattitude                          0.005128
longitude                          0.005128
dtype: float64

In [29]:
columnas_a_tratar = ['agriculturalland', 'landarea', 'armedforcessize', 'birthrate',
       'callingcode', 'co2-emissions', 'cpi', 'cpichange', 'fertilityrate',
       'forestedarea', 'gasolineprice', 'gdp',
       'grossprimaryeducationenrollment', 'grosstertiaryeducationenrollment',
       'infantmortality', 'lifeexpectancy', 'maternalmortalityratio',
       'minimumwage', 'outofpockethealthexpenditure', 'physiciansperthousand',
       'population', 'populationlaborforceparticipation', 'taxrevenue',
       'totaltaxrate', 'unemploymentrate', 'urban_population', 'lattitude',
       'longitude']

In [30]:
for col in columnas_a_tratar:
    if col in df.columns:  
        media = df[col].describe()[["mean", "50%"]]
        display(col,media)

'agriculturalland'

mean    39.117553
50%     39.600000
Name: agriculturalland, dtype: float64

'landarea'

mean    689624.365979
50%     119511.000000
Name: landarea, dtype: float64

'armedforcessize'

mean    159274.853801
50%      31000.000000
Name: armedforcessize, dtype: float64

'birthrate'

mean    20.214974
50%     17.950000
Name: birthrate, dtype: float64

'callingcode'

mean    360.546392
50%     255.500000
Name: callingcode, dtype: float64

'co2-emissions'

mean    177799.239362
50%      12303.000000
Name: co2-emissions, dtype: float64

'cpi'

mean    190.460955
50%     125.340000
Name: cpi, dtype: float64

'cpichange'

mean    6.722346
50%     2.300000
Name: cpichange, dtype: float64

'fertilityrate'

mean    2.698138
50%     2.245000
Name: fertilityrate, dtype: float64

'forestedarea'

mean    32.015426
50%     32.000000
Name: forestedarea, dtype: float64

'gasolineprice'

mean    1.002457
50%     0.980000
Name: gasolineprice, dtype: float64

'gdp'

mean    4.772959e+11
50%     3.438723e+10
Name: gdp, dtype: float64

'grossprimaryeducationenrollment'

mean    102.470213
50%     102.550000
Name: grossprimaryeducationenrollment, dtype: float64

'grosstertiaryeducationenrollment'

mean    37.963388
50%     31.200000
Name: grosstertiaryeducationenrollment, dtype: float64

'infantmortality'

mean    21.332804
50%     14.000000
Name: infantmortality, dtype: float64

'lifeexpectancy'

mean    72.279679
50%     73.200000
Name: lifeexpectancy, dtype: float64

'maternalmortalityratio'

mean    160.392265
50%      53.000000
Name: maternalmortalityratio, dtype: float64

'minimumwage'

mean    2.194133
50%     1.045000
Name: minimumwage, dtype: float64

'outofpockethealthexpenditure'

mean    32.637234
50%     30.700000
Name: outofpockethealthexpenditure, dtype: float64

'physiciansperthousand'

mean    1.83984
50%     1.46000
Name: physiciansperthousand, dtype: float64

'population'

mean    3.938116e+07
50%     8.826588e+06
Name: population, dtype: float64

'populationlaborforceparticipation'

mean    62.738068
50%     62.450000
Name: populationlaborforceparticipation, dtype: float64

'taxrevenue'

mean    16.573964
50%     16.300000
Name: taxrevenue, dtype: float64

'totaltaxrate'

mean    40.822404
50%     37.200000
Name: totaltaxrate, dtype: float64

'unemploymentrate'

mean    6.886364
50%     5.360000
Name: unemploymentrate, dtype: float64

'urban_population'

mean    2.230454e+07
50%     4.678104e+06
Name: urban_population, dtype: float64

'lattitude'

mean    19.092351
50%     17.273849
Name: lattitude, dtype: float64

'longitude'

mean    20.232434
50%     20.972652
Name: longitude, dtype: float64

In [31]:
#Sustituimos los nulos inferiores a 5% de las columnas numericas con la mediana.

columnas = ['agriculturalland', 'landarea', 'birthrate',
       'callingcode', 'co2-emissions', 'fertilityrate',
       'forestedarea', 'gdp',
       'grossprimaryeducationenrollment', 
       'infantmortality', 'lifeexpectancy',
        'outofpockethealthexpenditure', 'physiciansperthousand',
       'population', 'urban_population', 'lattitude',
       'longitude']


for col in columnas:

    mediana = df[col].median()
    df[col] = df[col].fillna(mediana)
