# Getting rid of the GDP null values 

# Purpose 
This notebook is for getting rid of GDP null values that plague our country dataset so far. 

# Datasets
Uses: <br>
**countries_250.csv** from 250-Removing_All_Countries_Without_Medals <br>
Creates: &emsp;
<br>
**countries_275.csv** CSV containing all the countries' information, with no GDP nulls except for the countries that will be fixed in notebook 400. 

In [1]:
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline
from bs4 import BeautifulSoup
import webbrowser
import urllib.request
from lxml import html
import zipfile
import re
import string
import sys, os
from IPython.display import Image

In [2]:
# Warn early (without stopping the run) when the input dataset is not where we expect it
countries_path = r"../../data/prep/Countries/countries_250.csv"
if not os.path.exists(countries_path):
    print("Missing dataset file")

In [3]:
# Load the country-level dataset into the working dataframe
df = pd.read_csv(
    r"../../data/prep/Countries/countries_250.csv",
    encoding="ISO-8859-1",
)

In [4]:
# Per-column null counts, to see how many missing values we are dealing with
column_nulls = df.isnull().sum()
column_nulls

Country                      0
Year                         0
Population                   0
Males                        0
Females                      0
Life_Expectancy            174
GDP                        511
Region                       0
Elevation                  315
Area_SqKM                  315
Centroid_Longitude         315
Centroid_Latitude          315
Population_Density         315
CO2_Emissions              453
Methane_Emissions          729
Nitrous_Oxide_Emisions     729
Total_Emissions              0
Emmisions_per_Capita      2793
Code                         0
dtype: int64

# Figuring out which countries have Null GDP values 

In [5]:
# Keep only the rows whose GDP is null and renumber them from 0,
# discarding the old row labels instead of carrying them along as a column
nullGDPdf = df[df['GDP'].isnull()].reset_index(drop=True)

In [6]:
# Alphabetical list of all the countries with at least one null GDP value.
# It is derived from the null-GDP rows rather than typed by hand, so it cannot
# drift out of step with the dataset when the input file changes.
gdpNcs = sorted(nullGDPdf['Country'].unique())

# Creating a dictionary with each country and their number of null values 

In [7]:
# Maps country -> number of rows (years) with a null GDP; starts out empty
nullDic = dict()

In [8]:
# Count the null-GDP rows per country in a single pass. Every row of nullGDPdf
# already has a null GDP, so the size of each country group is the number of
# rows/years with a missing GDP for that country.
# sort=False keeps the countries in order of first appearance.
null_years = nullGDPdf.groupby('Country', sort=False).size()
nullDic.update({name: int(count) for name, count in null_years.items()})

In [9]:
# Display the null dictionary:
# keys are country names, values are the number of rows (years) with a null GDP
nullDic

{'Afghanistan': 13,
 'Armenia': 8,
 'Azerbaijan': 8,
 'Bahrain': 6,
 'Barbados': 8,
 'Belarus': 8,
 'Bermuda': 2,
 'Bosnia and Herzegovina': 9,
 'Bulgaria': 5,
 'Croatia': 10,
 'Cuba': 4,
 'Cyprus': 4,
 'Czech Republic': 8,
 'Djibouti': 9,
 'Eritrea': 11,
 'Estonia': 10,
 'Ethiopia': 6,
 'Georgia': 2,
 'Germany': 3,
 'Grenada': 5,
 'Hong Kong SAR, China': 1,
 'Hungary': 8,
 'Iraq': 2,
 'Ireland': 3,
 'Jamaica': 2,
 'Jordan': 4,
 'Kazakhstan': 8,
 'Korea, Dem. People?s Rep.': 21,
 'Kosovo': 12,
 'Kuwait': 8,
 'Kyrgyz Republic': 7,
 'Latvia': 10,
 'Liechtenstein': 20,
 'Lithuania': 10,
 'Macedonia, FYR': 8,
 'Mauritius': 4,
 'Moldova': 10,
 'Mongolia': 6,
 'Montenegro': 11,
 'Morocco': 2,
 'Mozambique': 5,
 'Namibia': 5,
 'New Zealand': 5,
 'Niger': 21,
 'Poland': 8,
 'Puerto Rico': 2,
 'Qatar': 12,
 'Romania': 8,
 'Russian Federation': 8,
 'Samoa': 6,
 'Saudi Arabia': 2,
 'Serbia': 10,
 'Slovak Republic': 8,
 'Slovenia': 10,
 'Sri Lanka': 1,
 'Suriname': 4,
 'Switzerland': 5,
 'Syrian A

# Excluding nations that participated for different NOCs 
These are teams who competed for teams like the Unified Olympic Team, the Soviet Union, etc. Their nulls will be filled in Notebook 400. 

In [10]:
# Countries set aside for now because they competed for a different NOC;
# the list is only used for membership tests on the Country column
members = [
    'Montenegro',
    'Uzbekistan',
    'Slovak Republic',
    'Slovenia',
    'Turkmenistan',
    'Kazakhstan',
    'Serbia',
    'Azerbaijan',
    'Moldova',
    'Macedonia, FYR',
    'Czech Republic',
    'Kyrgyz Republic',
    'Belarus',
    'Ukraine',
    'Tajikistan',
    'Bosnia and Herzegovina',
    'Georgia',
    'Russian Federation',
    'Armenia',
    'Lithuania',
]

**We need to save the countries that we remove so we can add them back later and have their nulls removed in notebook 400. The GDP values for these countries are filled in notebook 400.**

In [11]:
# Set the rows of the `members` countries aside so they can be added back before saving.
# drop=True discards the old row labels instead of storing them in an extra 'index'
# column, which would otherwise reappear (mostly empty) once these rows are
# concatenated back onto the main dataframe.
removed = df[df['Country'].isin(members)].reset_index(drop=True)

In [12]:
# Keep only the countries that are not in `members`, to see how many nulls are left.
# reset_index() without drop keeps the old row labels as a leading 'index' column.
is_member = df['Country'].isin(members)
df = df[~is_member].reset_index()

In [13]:
# Per-column null counts once the `members` countries have been set aside
column_nulls = df.isnull().sum()
column_nulls

index                        0
Country                      0
Year                         0
Population                   0
Males                        0
Females                      0
Life_Expectancy            142
GDP                        350
Region                       0
Elevation                  315
Area_SqKM                  315
Centroid_Longitude         315
Centroid_Latitude          315
Population_Density         315
CO2_Emissions              259
Methane_Emissions          601
Nitrous_Oxide_Emisions     601
Total_Emissions              0
Emmisions_per_Capita      2373
Code                         0
dtype: int64

# Same process for checking nulls as above

In [14]:
# Rebuild the frame of null-GDP rows from the reduced dataframe, renumbered from 0
# and without carrying the old row labels along as an extra column
nullGDPdf = df[df['GDP'].isnull()].reset_index(drop=True)

In [15]:
# Start again from an empty country -> null-GDP-row-count mapping
nullDic = dict()

In [16]:
# Count the null-GDP rows per country in a single pass. Every row of nullGDPdf
# already has a null GDP, so the size of each country group is the number of
# rows/years with a missing GDP for that country.
# sort=False keeps the countries in order of first appearance.
null_years = nullGDPdf.groupby('Country', sort=False).size()
nullDic.update({name: int(count) for name, count in null_years.items()})

In [17]:
# Null-GDP row count per country, after the `members` countries were set aside
nullDic

{'Afghanistan': 13,
 'Bahrain': 6,
 'Barbados': 8,
 'Bermuda': 2,
 'Bulgaria': 5,
 'Croatia': 10,
 'Cuba': 4,
 'Cyprus': 4,
 'Djibouti': 9,
 'Eritrea': 11,
 'Estonia': 10,
 'Ethiopia': 6,
 'Germany': 3,
 'Grenada': 5,
 'Hong Kong SAR, China': 1,
 'Hungary': 8,
 'Iraq': 2,
 'Ireland': 3,
 'Jamaica': 2,
 'Jordan': 4,
 'Korea, Dem. People?s Rep.': 21,
 'Kosovo': 12,
 'Kuwait': 8,
 'Latvia': 10,
 'Liechtenstein': 20,
 'Mauritius': 4,
 'Mongolia': 6,
 'Morocco': 2,
 'Mozambique': 5,
 'Namibia': 5,
 'New Zealand': 5,
 'Niger': 21,
 'Poland': 8,
 'Puerto Rico': 2,
 'Qatar': 12,
 'Romania': 8,
 'Samoa': 6,
 'Saudi Arabia': 2,
 'Sri Lanka': 1,
 'Suriname': 4,
 'Switzerland': 5,
 'Syrian Arab Republic': 21,
 'Tanzania': 7,
 'Tonga': 6,
 'Tunisia': 2,
 'Uganda': 6,
 'United Arab Emirates': 4,
 'Venezuela, RB': 1,
 'Vietnam': 6,
 'Virgin Islands (U.S.)': 14}

# Removing Nulls in list above
* In order to remove the remaining nulls we sourced as many of the years as possible from this UN website http://data.un.org/, as we did in previous notebooks. However, even this website did not contain all the GDPs. It usually had values for the years 1970 - 2016. <br>  The way we removed the rest of the null GDPs of the countries above is by using GDPs from later years in that country. If a country has a null GDP value we take the GDP values for the 3 years after and get the average growth. We then multiply 1 - average growth by the GDP of the first year following the null GDP to get the GDP for this null year. We repeat the process to get the GDP for all the nulls. We iterate through the df backwards so there will always be GDP values for each country after the null year to which we can apply the process just mentioned.
* Once we have eliminated the possibility of null values for the years 2016, 2014 and 2012 for every country, every country will have at least three years that we can use to fill the remaining GDP nulls by the process explained above.

So as long as the years 2012 - 2016 are non-null we'll have no problems filling any other nulls.

In [18]:
# Countries that still have a null GDP in 2012 or later (i.e. 2012, 2014 or 2016)
temp = df.loc[df['GDP'].isnull()]
temp.loc[temp['Year'] >= 2012, 'Country'].unique()

array(['Bahrain', 'Bermuda', 'Cuba', 'Djibouti', 'Eritrea',
       'Korea, Dem. People?s Rep.', 'Liechtenstein', 'Niger',
       'Puerto Rico', 'Syrian Arab Republic', 'Venezuela, RB',
       'Virgin Islands (U.S.)'], dtype=object)

# There is no data for the year 2016 for Virgin Islands
This is the only year it is missing. 

In [19]:
# Remove the old df index, which reset_index() left behind as the 'index' column.
# It is dropped by name and its absence is ignored, so re-running this cell is harmless;
# dropping "the first column" by position would remove 'Country' on a second run.
df = df.drop('index', axis=1, errors='ignore')
# The Virgin Islands is just missing 2016,
# so we'll just find the 2016 value online and plug it in
df[df['Country'] == 'Virgin Islands (U.S.)'].tail(1)

Unnamed: 0,Country,Year,Population,Males,Females,Life_Expectancy,GDP,Region,Elevation,Area_SqKM,Centroid_Longitude,Centroid_Latitude,Population_Density,CO2_Emissions,Methane_Emissions,Nitrous_Oxide_Emisions,Total_Emissions,Emmisions_per_Capita,Code
2351,Virgin Islands (U.S.),2016,102951.0,49091.0,53860.0,,,South America,,,,,,,,,0.0,,ISV


This source https://www.bea.gov/newsreleases/general/terr/2017/vigdp_120117.pdf stated that GDP in Virgin Islands U.S. rose 1.5% from 2015 to 2016 after increasing 2.9% from 2014 to 2015 an overall increase of 4.4%. So we'll just apply this 4.4% increase to 2014 to get the 2016 value. 

In [20]:
# Set the 2016 GDP for Virgin Islands (U.S.) to the 2014 value plus the 4.4% overall
# increase quoted in the source cited above.
# The rows are located by country and year instead of hard-coded row labels, which
# would silently point at the wrong rows (or append a new row) if the input file
# ever gains or loses a row.
vi_rows = df['Country'] == 'Virgin Islands (U.S.)'
vi_gdp_2014 = df.loc[vi_rows & (df['Year'] == 2014), 'GDP']
if len(vi_gdp_2014) != 1:
    raise ValueError("Expected exactly one 2014 row for Virgin Islands (U.S.)")
base_gdp = vi_gdp_2014.iloc[0]
df.loc[vi_rows & (df['Year'] == 2016), 'GDP'] = (base_gdp * 0.044) + base_gdp

# Filling as many nulls as possible from 1970 to 2016
All of the cases below use the same process for filling the nulls for every country. 
First we load in the file containing GDP information from 1970 to 2016, then from each of these CSVs we create a dictionary of key/value pairs (key: Year, value: GDP). 
Afterwards we use a for loop to iterate through the country df and fill the values for each year if the column had a null value and is from the country we have data for. 

# Filling the Rest of these countries Nulls from 1970 to 2016 

# Korea

In [21]:
# Warn (without stopping the run) if the UN GDP file for Korea is absent
korea_gdp_path = r"../../data/raw/UNData/UN_Korea_GDP.csv"
if not os.path.exists(korea_gdp_path):
    print("Missing dataset file")

In [22]:
# Load the UN GDP data for Korea into its own dataframe
kGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Korea_GDP.csv",
    encoding="ISO-8859-1",
)

# Creating the Dictionary 

In [23]:
# Year -> GDP lookup for Korea, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(kGDP['Year'], kGDP['Value']))
    
    

# Filling the rows with nulls of same country 

In [24]:
# Fill Korea's null GDP values from 1970 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
korea_missing = (
    (df['Country'] == 'Korea, Dem. People?s Rep.')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[korea_missing, 'GDP'] = (
    df.loc[korea_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Niger

In [25]:
# Warn (without stopping the run) if the UN GDP file for Niger is absent
niger_gdp_path = r"../../data/raw/UNData/UN_Niger_GDP.csv"
if not os.path.exists(niger_gdp_path):
    print("Missing dataset file")

In [26]:
# Load the UN GDP data for Niger into its own dataframe
nGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Niger_GDP.csv",
    encoding="ISO-8859-1",
)

In [27]:
# Year -> GDP lookup for Niger, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(nGDP['Year'], nGDP['Value']))
    
    

In [28]:
# Fill Niger's null GDP values from 1970 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
niger_missing = (
    (df['Country'] == 'Niger')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[niger_missing, 'GDP'] = (
    df.loc[niger_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Bermuda 

In [29]:
# Warn (without stopping the run) if the UN GDP file for Bermuda is absent
bermuda_gdp_path = r"../../data/raw/UNData/UN_Bermuda_GDP.csv"
if not os.path.exists(bermuda_gdp_path):
    print("Missing dataset file")

In [30]:
# Load the UN GDP data for Bermuda into its own dataframe
bGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Bermuda_GDP.csv",
    encoding="ISO-8859-1",
)

In [31]:
# Year -> GDP lookup for Bermuda, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(bGDP['Year'], bGDP['Value']))
    
    

In [32]:
# Fill Bermuda's null GDP values from 1970 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
bermuda_missing = (
    (df['Country'] == 'Bermuda')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[bermuda_missing, 'GDP'] = (
    df.loc[bermuda_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Cuba 

In [33]:
# Warn (without stopping the run) if the UN GDP file for Cuba is absent
cuba_gdp_path = r"../../data/raw/UNData/UN_Cuba_GDP.csv"
if not os.path.exists(cuba_gdp_path):
    print("Missing dataset file")

In [34]:
# Load the UN GDP data for Cuba into its own dataframe
cGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Cuba_GDP.csv",
    encoding="ISO-8859-1",
)

In [35]:
# Year -> GDP lookup for Cuba, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(cGDP['Year'], cGDP['Value']))
    
    

In [36]:
# Fill Cuba's null GDP values from 1970 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
cuba_missing = (
    (df['Country'] == 'Cuba')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[cuba_missing, 'GDP'] = (
    df.loc[cuba_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Djibouti

In [37]:
# Warn (without stopping the run) if the UN GDP file for Djibouti is absent
djibouti_gdp_path = r"../../data/raw/UNData/UN_Djibouti_GDP.csv"
if not os.path.exists(djibouti_gdp_path):
    print("Missing dataset file")

In [38]:
# Load the UN GDP data for Djibouti into its own dataframe
dGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Djibouti_GDP.csv",
    encoding="ISO-8859-1",
)

In [39]:
# Year -> GDP lookup for Djibouti, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(dGDP['Year'], dGDP['Value']))
    
    

In [40]:
# Fill Djibouti's null GDP values from 1970 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
djibouti_missing = (
    (df['Country'] == 'Djibouti')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[djibouti_missing, 'GDP'] = (
    df.loc[djibouti_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Eritrea

In [41]:
# Warn (without stopping the run) if the UN GDP file for Eritrea is absent
eritrea_gdp_path = r"../../data/raw/UNData/UN_Eritrea_GDP.csv"
if not os.path.exists(eritrea_gdp_path):
    print("Missing dataset file")

In [42]:
# Load the UN GDP data for Eritrea into its own dataframe
eGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Eritrea_GDP.csv",
    encoding="ISO-8859-1",
)

In [43]:
# Year -> GDP lookup for Eritrea, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(eGDP['Year'], eGDP['Value']))
    
    

In [44]:
# Fill Eritrea's null GDP values from 1990 onwards with the GDP dictionary
# (GDP values are only given down to 1990).
# One masked assignment replaces the row-by-row loop over the whole dataframe.
eritrea_missing = (
    (df['Country'] == 'Eritrea')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[eritrea_missing, 'GDP'] = (
    df.loc[eritrea_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Liechtenstein 

In [45]:
# Warn (without stopping the run) if the UN GDP file for Liechtenstein is absent
liechtenstein_gdp_path = r"../../data/raw/UNData/UN_Liechtenstein_GDP.csv"
if not os.path.exists(liechtenstein_gdp_path):
    print("Missing dataset file")

In [46]:
# Load the UN GDP data for Liechtenstein into its own dataframe
lGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Liechtenstein_GDP.csv",
    encoding="ISO-8859-1",
)

In [47]:
# Year -> GDP lookup for Liechtenstein, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(lGDP['Year'], lGDP['Value']))
    
    

In [48]:
# Fill Liechtenstein's null GDP values from 1990 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
liechtenstein_missing = (
    (df['Country'] == 'Liechtenstein')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[liechtenstein_missing, 'GDP'] = (
    df.loc[liechtenstein_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Puerto Rico

In [49]:
# Warn (without stopping the run) if the UN GDP file for Puerto Rico is absent
puerto_rico_gdp_path = r"../../data/raw/UNData/UN_PuertoRico_GDP.csv"
if not os.path.exists(puerto_rico_gdp_path):
    print("Missing dataset file")

In [50]:
# Load the UN GDP data for Puerto Rico into its own dataframe
prGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_PuertoRico_GDP.csv",
    encoding="ISO-8859-1",
)

In [51]:
# Year -> GDP lookup for Puerto Rico, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(prGDP['Year'], prGDP['Value']))
    
    

In [52]:
# Fill Puerto Rico's null GDP values from 1990 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
puerto_rico_missing = (
    (df['Country'] == 'Puerto Rico')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[puerto_rico_missing, 'GDP'] = (
    df.loc[puerto_rico_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Syrian Arab Republic 

In [53]:
# Warn (without stopping the run) if the UN GDP file for Syrian Arab Republic is absent
syria_gdp_path = r"../../data/raw/UNData/UN_SyrianArabRep_GDP.csv"
if not os.path.exists(syria_gdp_path):
    print("Missing dataset file")

In [54]:
# Load the UN GDP data for Syrian Arab Republic into its own dataframe
sarGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_SyrianArabRep_GDP.csv",
    encoding="ISO-8859-1",
)

In [55]:
# Year -> GDP lookup for Syrian Arab Republic, built by pairing the 'Year' and 'Value'
# columns row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(sarGDP['Year'], sarGDP['Value']))
    
    

In [56]:
# Fill Syrian Arab Republic's null GDP values from 1990 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
syria_missing = (
    (df['Country'] == 'Syrian Arab Republic')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[syria_missing, 'GDP'] = (
    df.loc[syria_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

# Venezuela RB

In [57]:
# Warn (without stopping the run) if the UN GDP file for Venezuela is absent
venezuela_gdp_path = r"../../data/raw/UNData/UN_Venezuela_GDP.csv"
if not os.path.exists(venezuela_gdp_path):
    print("Missing dataset file")

In [58]:
# Load the UN GDP data for Venezuela into its own dataframe
vGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Venezuela_GDP.csv",
    encoding="ISO-8859-1",
)

In [59]:
# Year -> GDP lookup for Venezuela, built by pairing the 'Year' and 'Value' columns
# row by row (if a year occurs more than once, the last row wins)
gdpDict = dict(zip(vGDP['Year'], vGDP['Value']))
    
    

In [60]:
# Fill Venezuela's null GDP values from 1990 onwards with the GDP dictionary.
# One masked assignment replaces the row-by-row loop over the whole dataframe.
venezuela_missing = (
    (df['Country'] == 'Venezuela, RB')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
# A year that is absent from the dictionary is left null instead of raising KeyError
df.loc[venezuela_missing, 'GDP'] = (
    df.loc[venezuela_missing, 'Year']
    .map(lambda year: gdpDict.get(year, float('nan')))
    .astype(float)
)
        
        

In [61]:
# Show the current column layout of the working dataframe
df.columns

Index(['Country', 'Year', 'Population', 'Males', 'Females', 'Life_Expectancy',
       'GDP', 'Region', 'Elevation', 'Area_SqKM', 'Centroid_Longitude',
       'Centroid_Latitude', 'Population_Density', 'CO2_Emissions',
       'Methane_Emissions', 'Nitrous_Oxide_Emisions', 'Total_Emissions',
       'Emmisions_per_Capita', 'Code'],
      dtype='object')

# Making sure all null values for GDP between 2012 and 2016 are eliminated

In [62]:
# Countries that still have a null GDP in 2012 or later (i.e. 2012, 2014 or 2016)
temp = df.loc[df['GDP'].isnull()]
temp.loc[temp['Year'] >= 2012, 'Country'].unique()

array(['Bahrain'], dtype=object)

# Removing Nulls in list above
If a country has a null GDP value we take the GDP values for the 3 years after and get the average growth. If the average growth is positive we then multiply 1 - average growth by the GDP of the first year following the null GDP to get the GDP for this null year.<br> If the average growth is negative we divide the GDP of the year after by (1 + average growth), which gives a value higher than the GDP of the year after. So here we assume that the year with the null had a greater GDP than the next. We repeat the process to get the GDP for all the nulls. We iterate through the df backwards so there will always be GDP values for each country after the null year to which we can apply the process just mentioned. Like we said above, once we have eliminated the possibility of null values for the years 2016, 2014 and 2012 for every country, every country will have at least three years that we can use to fill the remaining GDP nulls by the process explained above.

# For Loop for filling the rest of the Nulls

In [63]:
# Estimate every remaining null GDP from three neighbouring rows of the SAME country.
# The dataframe is assumed to be ordered by country and then by ascending year
# (NOTE(review): confirm against countries_250.csv).


def _same_country_run(gdp, countries, pos, step):
    """Return the three GDP values next to row ``pos`` in direction ``step``, nearest first.

    ``step`` is +1 for the rows that follow ``pos`` and -1 for the rows that precede it.
    Returns None when those three rows do not all exist, do not all belong to the
    same country as row ``pos``, or do not all hold a positive GDP. This is what stops a
    country from borrowing the GDP of the next country in the dataframe.
    """
    neighbours = [pos + step * offset for offset in (1, 2, 3)]
    if min(neighbours) < 0 or max(neighbours) >= len(gdp):
        return None
    if any(countries[i] != countries[pos] for i in neighbours):
        return None
    values = [gdp[i] for i in neighbours]
    # `not v > 0` is also True for NaN, so nulls and zero/negative values are all rejected
    if any(not v > 0 for v in values):
        return None
    return values


def _estimate_previous_gdp(gdp_1, gdp_2, gdp_3):
    """Back-cast the GDP of the row before ``gdp_1`` from the three rows that follow it."""
    # Calculating the average growth across the three known rows
    diff1 = (gdp_2 - gdp_1) / gdp_1
    diff2 = (gdp_3 - gdp_2) / gdp_2
    avgGrowth = (diff1 + diff2) / 2

    # if the average growth percentage is between 0-100%
    if 0 <= avgGrowth <= 1:
        return gdp_1 * (1 - avgGrowth)

    # Negative growth: the null year is presumed better than the following year.
    # Growth above 100%: the null year must come out well BELOW the following year,
    # so the growth is undone by division (multiplying by the growth would instead
    # produce a value larger than the following year).
    return gdp_1 / (1 + avgGrowth)


def _estimate_next_gdp(gdp_1, gdp_2, gdp_3):
    """Project the GDP of the row after ``gdp_1`` from the three rows that precede it."""
    diff1 = (gdp_1 - gdp_2) / gdp_2
    diff2 = (gdp_2 - gdp_3) / gdp_3
    return gdp_1 * (1 + (diff1 + diff2) / 2)


# Work on plain positional arrays; astype() returns a copy, so df is only
# modified by the single assignment at the end
country_values = df['Country'].values
gdp_values = df['GDP'].values.astype(float)

# Pass 1 - iterate backwards so the rows after an early null are already filled
# by the time that null is reached
for pos in range(len(gdp_values) - 1, -1, -1):
    if pd.isnull(gdp_values[pos]):
        later = _same_country_run(gdp_values, country_values, pos, step=1)
        if later is not None:
            gdp_values[pos] = _estimate_previous_gdp(*later)

# Pass 2 - nulls in the last rows of a country have no three following rows of
# their own, so they are projected forwards from the three preceding rows instead
for pos in range(len(gdp_values)):
    if pd.isnull(gdp_values[pos]):
        earlier = _same_country_run(gdp_values, country_values, pos, step=-1)
        if earlier is not None:
            gdp_values[pos] = _estimate_next_gdp(*earlier)

# Rows that could not be estimated in either direction stay null
df['GDP'] = gdp_values

# Checking the nulls count again 

In [64]:
# Collect whatever null-GDP rows are left, renumbered from 0 and without
# carrying the old row labels along as an extra column
nullGDPdf = df[df['GDP'].isnull()].reset_index(drop=True)

In [65]:
# Start again from an empty country -> null-GDP-row-count mapping
nullDic = dict()

In [66]:
# Count the null-GDP rows per country in a single pass. Every row of nullGDPdf
# already has a null GDP, so the size of each country group is the number of
# rows/years with a missing GDP for that country.
# sort=False keeps the countries in order of first appearance.
null_years = nullGDPdf.groupby('Country', sort=False).size()
nullDic.update({name: int(count) for name, count in null_years.items()})

In [67]:
# Null-GDP row count per country; expected to be empty once every null has been filled
nullDic

{}

In [68]:
# Per-column null counts again: the GDP entry should now read 0
column_nulls = df.isnull().sum()
column_nulls

Country                      0
Year                         0
Population                   0
Males                        0
Females                      0
Life_Expectancy            142
GDP                          0
Region                       0
Elevation                  315
Area_SqKM                  315
Centroid_Longitude         315
Centroid_Latitude          315
Population_Density         315
CO2_Emissions              259
Methane_Emissions          601
Nitrous_Oxide_Emisions     601
Total_Emissions              0
Emmisions_per_Capita      2373
Code                         0
dtype: int64

# Adding back in the previously removed countries 
The rest of the nulls below will be fixed in Notebook 400. 

In [69]:
# Re-attach the countries that were set aside; their GDP values are added in notebook 400
frames = [df, removed]
df = pd.concat(frames, ignore_index=True)

In [70]:
# Number of rows whose GDP is still null after the re-attachment
int(df['GDP'].isnull().sum())

161

In [71]:
# Select the columns in their final order (any column not listed here is left out)
column_order = [
    'Country', 'Year', 'Population', 'Males', 'Females', 'Life_Expectancy',
    'GDP', 'Region', 'Elevation', 'Area_SqKM', 'Centroid_Longitude',
    'Centroid_Latitude', 'Population_Density', 'CO2_Emissions',
    'Methane_Emissions', 'Nitrous_Oxide_Emisions', 'Total_Emissions',
    'Emmisions_per_Capita', 'Code',
]
df = df[column_order]

In [72]:
# Write the result to countries_275.csv, leaving the row index out of the file
output_path = r"../../data/prep/Countries/countries_275.csv"
df.to_csv(output_path, index=False)