## This notebook performs data wrangling for a demographic research project


In [36]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import numpy as np

%matplotlib inline
pd.options.display.max_columns = 100
pd.options.display.max_rows = 250
plt.rcParams["figure.figsize"] = [15, 5]
import warnings
warnings.filterwarnings('ignore')

In [3]:
## Load the consolidated sheet: column names are on the row at index 2 and
## only the first 217 data rows are read.
df = pd.read_csv(
    'Consolidated Data_Demographics.csv',
    header=2,
    nrows=217,
)

In [109]:
# Disabled export of the freshly loaded frame; kept for reference only.
#df.to_csv('df.csv')

In [4]:
# Show the first rows to check that the header row and columns were parsed as expected.
df.head()

Unnamed: 0,Country Name,region,sub-region,Country Code,Income group,Population aged 0 to 14 years old (percentage),Population aged 60+ years old (percentage),Population density,Population mid-year estimates (millions),Population mid-year estimates for females (millions),Population mid-year estimates for males (millions),Sex ratio (males per 100 females),GDP(USD - billion),Ethnicity,Proportion of health care facilities with basic hygiene services,Proportion of health care facilities with limited hygiene services,Proportion of health care facilities with no hygiene service,Capital health expenditure (% of GDP),Current health expenditure per capita (current US$),Current health expenditure (% of GDP),Domestic general government health expenditure (% of current health expenditure),Domestic general government health expenditure (% of GDP),Domestic general government health expenditure (% of general government expenditure),Domestic general government health expenditure per capita (current US$),Domestic private health expenditure (% of current health expenditure),Domestic private health expenditure per capita (current US$),External health expenditure (% of current health expenditure),External health expenditure per capita (current US$),"Hospital beds (per 1,000 people)","Hospital beds (per 1,000 people).1","Hospital beds (per 1,000 people).2",Total density per 100 000 population: Health posts,Total density per 100 000 population: Health centres,Total density per 100 000 population: District/rural hospitals,Total density per 100 000 population: Provincial hospitals,Total density per 100 000 population: Specialized hospitals,Total density per 100 000 population: Hospitals,Healthcare Workers,Doctor-Population Ratio,TOTAL TESTS,TOTAL TESTS_per thousand,Cumulative Cases,Cumulative Deaths,R0,Lockdown (Y/N),Lockdown Date,Lockdown Type
0,Afghanistan,Asia,Southern Asia,AFG,Low income,42.4723,4.1655,58.2694,38.0418,18.512,19.5297,105.4975,19.36297,,..,..,..,0.21333474,60.0888133,10.19867734,5.12806279,0.52299458,2.01440592,2.93575239,77.40100618,44.31111664,17.47093102,10.00189145,..,..,..,2.95,1.22,0.18,0.1,0.09,0.37,,,..,..,,,,1,24/03/2020,Full
1,Albania,Europe,Southern Europe,ALB,Upper middle income,17.3996,20.4779,105.143,2.8809,1.4141,1.4668,103.7236,15.102501,,..,..,..,0.01236091,264.4346027,6.69789475,41.35680773,2.77003546,9.50813332,112.301534,57.98090142,157.4431038,0.66229084,1.7984047,..,..,..,13.08,0,0.72,0.35,0.28,1.36,,,..,..,,,,1,08/03/2020,Full
2,Algeria,Africa,Northern Africa,DZA,Upper middle income,30.5504,9.6817,18.0763,43.0531,21.3034,21.7497,102.0949,173.757953,,..,..,..,0.01104053,290.5032401,6.648475,67.68694359,4.50014952,10.73426065,176.2653157,32.28954319,84.08603227,0.02351322,0.06123139,..,..,..,..,..,..,..,..,..,,,..,..,,,,1,24/03/2020,Full
3,American Samoa,Oceania,Polynesia,ASM,Upper middle income,..,..,276.56,0.0553,..,..,..,0.636,,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,,,..,..,,,,0,..,
4,Andorra,Europe,Southern Europe,AND,High income,..,..,164.1319,0.0771,..,..,..,3.236544,,..,..,..,..,3698.117574,10.36772767,49.12979925,5.09364379,14.01755583,1883.995436,50.87020075,1950.735145,..,..,..,..,..,..,..,..,..,..,..,,,..,..,,,,1,16/03/2020,Full


In [5]:
# Column dtypes and non-null counts of the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 47 columns):
Country Name                                                                            217 non-null object
region                                                                                  217 non-null object
sub-region                                                                              217 non-null object
Country Code                                                                            217 non-null object
Income group                                                                            217 non-null object
Population aged 0 to 14 years old (percentage)                                          217 non-null object
Population aged 60+ years old (percentage)                                              217 non-null object
Population density                                                                      217 non-null object
Population mid-year estimates (

In [6]:
## The source marks missing values with the literal string '..'; turn them into real NaN.
df = df.replace(to_replace='..', value=np.nan)


In [7]:
## Number of missing values in each column
df.isna().sum()

Country Name                                                                              0
region                                                                                    0
sub-region                                                                                0
Country Code                                                                              0
Income group                                                                              0
Population aged 0 to 14 years old (percentage)                                           25
Population aged 60+ years old (percentage)                                               25
Population density                                                                        3
Population mid-year estimates (millions)                                                  3
Population mid-year estimates for females (millions)                                     25
Population mid-year estimates for males (millions)                              

Apart from the country name, geographic and income features, the data also contains population features, GDP, health expenditure, health infrastructure and COVID-related features.

The covid-related features are cleaned separately as the data are time series.

Also, features that have less than 70% of their information available are dropped.

In [8]:
## Keep the covid-related features (plus the country name) in their own frame
## so they can be cleaned separately.
covid_columns = [
    'Country Name',
    'TOTAL TESTS',
    'TOTAL TESTS_per thousand',
    'Cumulative Cases',
    'Cumulative Deaths',
    'R0',
    'Lockdown (Y/N)',
    'Lockdown Date',
    'Lockdown Type',
]
dfcovid = df[covid_columns]

In [9]:
## Dropping columns: Ethnicity, Proportion of health care facilities with basic hygiene services, Proportion of health care facilities with limited hygiene services,
## Proportion of health care facilities with no hygiene service, Capital health expenditure (% of GDP), Hospital beds (per 1,000 people),
## Hospital beds (per 1,000 people).1, Hospital beds (per 1,000 people).2, all "Total density per 100 000 population: ..." columns,
## Healthcare Workers, Doctor-Population Ratio, and all covid-related features
## ***Covid-related features are to be cleaned in a separate dataframe

In [10]:
## Remove the health-infrastructure columns being discarded and the covid-related
## columns (the latter are kept in a separate frame).
columns_to_drop = [
    'Ethnicity',
    'Proportion of health care facilities with basic hygiene services',
    'Proportion of health care facilities with limited hygiene services',
    'Proportion of health care facilities with no hygiene service',
    'Capital health expenditure (% of GDP)',
    'Hospital beds (per 1,000 people)',
    'Hospital beds (per 1,000 people).1',
    'Hospital beds (per 1,000 people).2',
    'Total density per 100 000 population: Health posts',
    'Total density per 100 000 population: Health centres',
    'Total density per 100 000 population: District/rural hospitals',
    'Total density per 100 000 population: Provincial hospitals',
    'Total density per 100 000 population: Specialized hospitals',
    'Total density per 100 000 population: Hospitals',
    'Healthcare Workers',
    'Doctor-Population Ratio',
    'TOTAL TESTS',
    'TOTAL TESTS_per thousand',
    'Cumulative Cases',
    'Cumulative Deaths',
    'R0',
    'Lockdown (Y/N)',
    'Lockdown Date',
    'Lockdown Type',
]
df = df.drop(columns=columns_to_drop)

In [11]:
# Re-check dtypes and non-null counts now that the unwanted columns are gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 23 columns):
Country Name                                                                            217 non-null object
region                                                                                  217 non-null object
sub-region                                                                              217 non-null object
Country Code                                                                            217 non-null object
Income group                                                                            217 non-null object
Population aged 0 to 14 years old (percentage)                                          192 non-null object
Population aged 60+ years old (percentage)                                              192 non-null object
Population density                                                                      214 non-null object
Population mid-year estimates (

In [12]:
# Missing values per remaining column (these are what the imputation below fills).
df.isnull().sum()

Country Name                                                                             0
region                                                                                   0
sub-region                                                                               0
Country Code                                                                             0
Income group                                                                             0
Population aged 0 to 14 years old (percentage)                                          25
Population aged 60+ years old (percentage)                                              25
Population density                                                                       3
Population mid-year estimates (millions)                                                 3
Population mid-year estimates for females (millions)                                    25
Population mid-year estimates for males (millions)                                      25

In [13]:
## Treat 0.0 as missing so that it gets imputed later.
## NOTE(review): this runs before the columns are cast to float, so it presumably
## only touches columns pandas already parsed as numeric -- confirm.
df = df.replace(to_replace=0.0, value=np.nan)


In [14]:
## Cast the numeric feature columns to float (several were read as object dtype).
numeric_columns = [
    'Population aged 0 to 14 years old (percentage)',
    'Population aged 60+ years old (percentage)',
    'Population density',
    'Population mid-year estimates (millions)',
    'Population mid-year estimates for females (millions)',
    'Population mid-year estimates for males (millions)',
    'Sex ratio (males per 100 females)',
    'GDP(USD - billion)',
    'Current health expenditure per capita (current US$)',
    'Current health expenditure (% of GDP)',
    'Domestic general government health expenditure (% of current health expenditure)',
    'Domestic general government health expenditure (% of GDP)',
    'Domestic general government health expenditure (% of general government expenditure)',
    'Domestic general government health expenditure per capita (current US$)',
    'Domestic private health expenditure (% of current health expenditure)',
    'Domestic private health expenditure per capita (current US$)',
    'External health expenditure (% of current health expenditure)',
    'External health expenditure per capita (current US$)',
]
df[numeric_columns] = df[numeric_columns].astype('float')

In [15]:
## First-pass imputation of the general (population) features: fill each missing
## value with the median of countries sharing the same sub-region and income group.
## NOTE(review): total, female and male population are imputed independently, so an
## imputed row can end up with components that do not add up to its total -- confirm
## this is acceptable.
general_features = [
    'Population aged 0 to 14 years old (percentage)',
    'Population aged 60+ years old (percentage)',
    'Population density',
    'Population mid-year estimates (millions)',
    'Population mid-year estimates for females (millions)',
    'Population mid-year estimates for males (millions)',
    'Sex ratio (males per 100 females)',
]
for feature in general_features:
    subregion_median = df.groupby(['sub-region', 'Income group'])[feature].transform('median')
    df[feature] = df[feature].fillna(subregion_median)


In [16]:
## Treat 0.0 as missing before the next imputation pass.
## NOTE(review): this also blanks legitimate zeros in every numeric column -- confirm
## that no feature can validly be 0.
df = df.replace(to_replace=0.0, value=np.nan)


In [17]:
# Second-pass imputation of the general features: countries still missing a value
# (no usable sub-region peers) fall back to the region + income-group median.
general_features_region = [
    'Population aged 0 to 14 years old (percentage)',
    'Population aged 60+ years old (percentage)',
    'Population density',
    'Population mid-year estimates (millions)',
    'Population mid-year estimates for females (millions)',
    'Population mid-year estimates for males (millions)',
    'Sex ratio (males per 100 females)',
]
for feature in general_features_region:
    region_median = df.groupby(['region', 'Income group'])[feature].transform('median')
    df[feature] = df[feature].fillna(region_median)


In [18]:
## Treat 0.0 as missing before imputing the health-expenditure features.
df = df.replace(to_replace=0.0, value=np.nan)


In [19]:
# First-pass imputation of the health-expenditure features: fill each missing value
# with the median of countries in the same sub-region and income group.
health_features = [
    'Current health expenditure per capita (current US$)',
    'Current health expenditure (% of GDP)',
    'Domestic general government health expenditure (% of current health expenditure)',
    'Domestic general government health expenditure (% of GDP)',
    'Domestic general government health expenditure (% of general government expenditure)',
    'Domestic general government health expenditure per capita (current US$)',
    'Domestic private health expenditure (% of current health expenditure)',
    'Domestic private health expenditure per capita (current US$)',
    'External health expenditure (% of current health expenditure)',
    'External health expenditure per capita (current US$)',
]
for feature in health_features:
    subregion_median = df.groupby(['sub-region', 'Income group'])[feature].transform('median')
    df[feature] = df[feature].fillna(subregion_median)


In [20]:
## Treat 0.0 as missing before the region-level health-expenditure pass.
df = df.replace(to_replace=0.0, value=np.nan)

In [21]:
# Second-pass imputation of the health-expenditure features: countries without usable
# sub-region peers fall back to the region + income-group median.
health_features_region = [
    'Current health expenditure per capita (current US$)',
    'Current health expenditure (% of GDP)',
    'Domestic general government health expenditure (% of current health expenditure)',
    'Domestic general government health expenditure (% of GDP)',
    'Domestic general government health expenditure (% of general government expenditure)',
    'Domestic general government health expenditure per capita (current US$)',
    'Domestic private health expenditure (% of current health expenditure)',
    'Domestic private health expenditure per capita (current US$)',
    'External health expenditure (% of current health expenditure)',
    'External health expenditure per capita (current US$)',
]
for feature in health_features_region:
    region_median = df.groupby(['region', 'Income group'])[feature].transform('median')
    df[feature] = df[feature].fillna(region_median)


In [22]:
# Check dtypes and remaining nulls after both imputation passes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 23 columns):
Country Name                                                                            217 non-null object
region                                                                                  217 non-null object
sub-region                                                                              217 non-null object
Country Code                                                                            217 non-null object
Income group                                                                            217 non-null object
Population aged 0 to 14 years old (percentage)                                          217 non-null float64
Population aged 60+ years old (percentage)                                              217 non-null float64
Population density                                                                      217 non-null float64
Population mid-year estimate

In [23]:
## Rows that still have no GDP value -- kept for now; their info can be looked at later
gdp_is_missing = df['GDP(USD - billion)'].isna()
df[gdp_is_missing]

Unnamed: 0,Country Name,region,sub-region,Country Code,Income group,Population aged 0 to 14 years old (percentage),Population aged 60+ years old (percentage),Population density,Population mid-year estimates (millions),Population mid-year estimates for females (millions),Population mid-year estimates for males (millions),Sex ratio (males per 100 females),GDP(USD - billion),Current health expenditure per capita (current US$),Current health expenditure (% of GDP),Domestic general government health expenditure (% of current health expenditure),Domestic general government health expenditure (% of GDP),Domestic general government health expenditure (% of general government expenditure),Domestic general government health expenditure per capita (current US$),Domestic private health expenditure (% of current health expenditure),Domestic private health expenditure per capita (current US$),External health expenditure (% of current health expenditure),External health expenditure per capita (current US$)
9,Aruba,Americas,Latin America and the Caribbean,ABW,High income,17.6204,21.0612,590.6333,0.1063,0.0559,0.0504,90.2912,,1127.826454,6.736425,55.630717,3.315918,13.370603,622.160918,44.329505,497.901696,0.076705,0.816007
21,Bermuda,Americas,Northern America,BMU,High income,17.1976,23.3762,1250.12,0.0625,92.54305,90.6949,98.22035,,7013.020147,13.804035,77.644649,10.855432,29.238011,5676.107343,22.355351,1487.871346,0.076705,0.816007
27,British Virgin Islands,Americas,Latin America and the Caribbean,VGB,High income,19.4943,20.012,200.2,0.03,0.2003,0.1892,93.4099,,1127.826454,6.736425,55.630717,3.315918,13.370603,622.160918,44.329505,497.901696,0.076705,0.816007
36,Cayman Islands,Americas,Latin America and the Caribbean,CYM,High income,19.4943,20.012,270.6167,0.0649,0.2003,0.1892,93.4099,,1127.826454,6.736425,55.630717,3.315918,13.370603,622.160918,44.329505,497.901696,0.076705,0.816007
39,Channel Islands,Europe,Western Europe,CHI,High income,15.0674,23.6272,906.6263,0.1723,0.087,0.0853,97.9818,,4604.453326,10.398473,81.096054,8.038747,16.405838,3687.040287,18.326754,815.651883,1.154384,72.387087
61,Eritrea,Africa,Sub-Saharan Africa,ERI,Low income,41.5849,6.4373,34.6249,3.4971,1.7444,1.7527,100.4761,,27.180653,2.959313,29.218831,0.864677,2.919503,8.733867,59.058986,17.653454,11.722183,3.503904
65,Faroe Islands,Europe,Northern Europe,FRO,High income,16.93815,25.7982,34.8696,0.0487,2.56105,2.5696,98.0243,,4318.63538,8.892286,78.792707,7.051128,15.590733,3307.233522,21.201546,907.883537,0.01005,0.397773
69,French Polynesia,Oceania,Polynesia,PYF,High income,22.5639,13.0704,76.3079,0.2793,0.1378,0.1415,102.6845,,1471.44771,11.687648,59.764133,6.985021,18.70771,1000.505325,24.878769,416.492969,15.357097,257.091615
75,Gibraltar,Europe,Southern Europe,GIB,High income,14.3288,27.8954,3370.1,0.0337,5.3329,4.8367,96.392,,2249.910049,8.935493,71.237973,5.866349,13.47362,1469.082741,28.762027,605.915701,0.25582,3.864647
77,Greenland,Americas,Northern America,GRL,High income,17.1976,23.3762,0.1381,0.0567,92.54305,90.6949,98.22035,,7013.020147,13.804035,77.644649,10.855432,29.238011,5676.107343,22.355351,1487.871346,0.076705,0.816007


### There are about 19 countries without the GDP info 

In [24]:
## Saving the cleaned frame.
## The same file is read back in the "Data Scaling" section after GDP values were
## added by hand, so an existing copy is never overwritten: re-running the notebook
## must not destroy those manual entries. Delete the file to force a fresh export.
import os

cleaned_csv_path = 'covid_cleaned_general.csv'
if not os.path.exists(cleaned_csv_path):
    # index=False keeps the row index out of the file, so reading it back does not
    # produce an extra "Unnamed: 0" column.
    df.to_csv(cleaned_csv_path, index=False)

- I looked up the GDP figures for these 19 countries online and found all of them except for 5 countries.

## Data Scaling

In [25]:
## Reload the cleaned file from disk, decoding it as ISO-8859-1.
dfc = pd.read_csv(
    'covid_cleaned_general.csv',
    encoding="ISO-8859-1",
)

In [26]:
# Dtypes and non-null counts of the reloaded frame.
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 23 columns):
Country Name                                                                            217 non-null object
region                                                                                  217 non-null object
sub-region                                                                              217 non-null object
Country Code                                                                            217 non-null object
Income group                                                                            217 non-null object
Population aged 0 to 14 years old (percentage)                                          217 non-null float64
Population aged 60+ years old (percentage)                                              217 non-null float64
Population density                                                                      217 non-null float64
Population mid-year estimate

In [27]:
# First rows of the reloaded frame.
dfc.head()

Unnamed: 0,Country Name,region,sub-region,Country Code,Income group,Population aged 0 to 14 years old (percentage),Population aged 60+ years old (percentage),Population density,Population mid-year estimates (millions),Population mid-year estimates for females (millions),Population mid-year estimates for males (millions),Sex ratio (males per 100 females),GDP(USD - billion),Current health expenditure per capita (current US$),Current health expenditure (% of GDP),Domestic general government health expenditure (% of current health expenditure),Domestic general government health expenditure (% of GDP),Domestic general government health expenditure (% of general government expenditure),Domestic general government health expenditure per capita (current US$),Domestic private health expenditure (% of current health expenditure),Domestic private health expenditure per capita (current US$),External health expenditure (% of current health expenditure),External health expenditure per capita (current US$)
0,Afghanistan,Asia,Southern Asia,AFG,Low income,42.4723,4.1655,58.2694,38.0418,18.512,19.5297,105.4975,19.36297,60.088813,10.198677,5.128063,0.522995,2.014406,2.935752,77.401006,44.311117,17.470931,10.001891
1,Albania,Europe,Southern Europe,ALB,Upper middle income,17.3996,20.4779,105.143,2.8809,1.4141,1.4668,103.7236,15.102501,264.434603,6.697895,41.356808,2.770035,9.508133,112.301534,57.980901,157.443104,0.662291,1.798405
2,Algeria,Africa,Northern Africa,DZA,Upper middle income,30.5504,9.6817,18.0763,43.0531,21.3034,21.7497,102.0949,173.757953,290.50324,6.648475,67.686944,4.50015,10.734261,176.265316,32.289543,84.086032,0.023513,0.061231
3,American Samoa,Oceania,Polynesia,ASM,Upper middle income,36.46845,8.24515,276.56,0.0553,0.07365,0.07715,103.70785,0.636,222.897623,5.532094,76.380676,4.22545,9.014085,173.497543,12.788831,29.049634,12.156946,39.366339
4,Andorra,Europe,Southern Europe,AND,High income,14.3288,27.8954,164.1319,0.0771,5.3329,4.8367,96.392,3.236544,3698.117574,10.367728,49.129799,5.093644,14.017556,1883.995436,50.870201,1950.735145,0.25582,3.864647


In [28]:
## Names of the countries that still have no GDP info
dfc.loc[dfc['GDP(USD - billion)'].isna(), 'Country Name']

21                          Bermuda
39                  Channel Islands
172       Sint Maarten (Dutch part)
183      Saint Martin (French part)
212    United States Virgin Islands
Name: Country Name, dtype: object

In [29]:
## Dropping the countries that still have no GDP figure.
## The filter is restricted to the GDP column so that a stray NaN in any other
## column cannot silently remove additional countries.
dff = dfc.dropna(subset=['GDP(USD - billion)'])

In [30]:
# Confirm the row count and non-null counts after removing the incomplete rows.
dff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 0 to 216
Data columns (total 23 columns):
Country Name                                                                            212 non-null object
region                                                                                  212 non-null object
sub-region                                                                              212 non-null object
Country Code                                                                            212 non-null object
Income group                                                                            212 non-null object
Population aged 0 to 14 years old (percentage)                                          212 non-null float64
Population aged 60+ years old (percentage)                                              212 non-null float64
Population density                                                                      212 non-null float64
Population mid-year estimate

In [31]:
## Work on an independent copy: a plain `dfscale = dff` only creates a second name
## for the same object, so scaling `dfscale` would also overwrite the unscaled `dff`.
dfscale = dff.copy()

In [33]:
# Scalers from scikit-learn; only MinMaxScaler is used in the cells shown here.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# MinMaxScaler with default settings (rescales each fitted column to the [0, 1] range).
min_max_scaler = MinMaxScaler() ## scaling

In [37]:
## Fit the min-max scaler on the numeric columns of `dff` and store the scaled
## values in the same columns of `dfscale`.
columns_to_scale = [
    'Population aged 0 to 14 years old (percentage)',
    'Population aged 60+ years old (percentage)',
    'Population density',
    'Population mid-year estimates (millions)',
    'Population mid-year estimates for females (millions)',
    'Population mid-year estimates for males (millions)',
    'Sex ratio (males per 100 females)',
    'GDP(USD - billion)',
    'Current health expenditure per capita (current US$)',
    'Current health expenditure (% of GDP)',
    'Domestic general government health expenditure (% of current health expenditure)',
    'Domestic general government health expenditure (% of GDP)',
    'Domestic general government health expenditure (% of general government expenditure)',
    'Domestic general government health expenditure per capita (current US$)',
    'Domestic private health expenditure (% of current health expenditure)',
    'Domestic private health expenditure per capita (current US$)',
    'External health expenditure (% of current health expenditure)',
    'External health expenditure per capita (current US$)',
]
scaled_values = min_max_scaler.fit_transform(dff[columns_to_scale])
dfscale[columns_to_scale] = scaled_values


In [38]:
# Persist the scaled frame; with to_csv defaults the row index is written as the first column.
dfscale.to_csv('scaled.csv')