# Notebook for getting rid of the GDP null values 

In [226]:
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline
from bs4 import BeautifulSoup
import webbrowser
import urllib.request
from lxml import html
import zipfile
import re
import string
import sys, os
from IPython.display import Image

In [227]:
# Warn (the notebook keeps running) when the prepared countries CSV is not on disk
countries_csv_found = os.path.exists( r"../../data/prep/Countries/countries_250.csv" )
if not countries_csv_found:
    print("Missing dataset file")

In [228]:
# Load the prepared countries table; the file is decoded as ISO-8859-1
df = pd.read_csv(
    r"../../data/prep/Countries/countries_250.csv",
    encoding="ISO-8859-1",
)

In [229]:
# Tally the missing values in every column (column-wise sum of the null mask)
df.isnull().sum(axis=0)

Country                      0
Year                         0
Population                   0
Males                        0
Females                      0
Life_Expectancy            174
GDP                        438
Region                       0
Elevation                  315
Area_SqKM                  315
Centroid_Longitude         315
Centroid_Latitude          315
Population_Density         315
CO2_Emissions              453
Methane_Emissions          729
Nitrous_Oxide_Emisions     729
Total_Emissions              0
Emmisions_per_Capita      2793
Code                         0
dtype: int64

# Figuring out which countries have Null GDP values 

In [230]:
# Keep only the rows whose GDP is null and renumber them 0..n-1,
# discarding the old row labels instead of storing them as a column
nullGDPdf = df.loc[df['GDP'].isnull()].reset_index(drop=True)

In [231]:
# Countries that have at least one null GDP row, in order of first appearance.
# Derived from nullGDPdf so the list cannot drift from the data: the previous
# hand-pasted list contained 'New Zealand', which has no null GDP rows according
# to the per-country null counts computed below.
gdpNcs = nullGDPdf['Country'].unique().tolist()

In [232]:
# Creating a dictionary with each country and their null values 

In [233]:
# Start from an empty mapping: country -> number of null-GDP rows
nullDic = dict()

In [234]:
# Count the null-GDP rows of every country in a single grouped pass.
# sort=False keeps the countries in order of first appearance in nullGDPdf, which
# is the key order the row-by-row version produced.
null_rows_per_country = (
    nullGDPdf['GDP'].isnull().groupby(nullGDPdf['Country'], sort=False).sum()
)

# Plain ints keep the displayed dictionary free of numpy scalar reprs
nullDic.update(
    {country: int(n_null) for country, n_null in null_rows_per_country.items()}
)

In [235]:
# Display the null dictionary filled in by the previous cell.
# Keys are country names; values are the number of rows (years) whose GDP is null.
nullDic

{'Afghanistan': 7,
 'Argentina': 1,
 'Armenia': 8,
 'Azerbaijan': 8,
 'Bahrain': 5,
 'Barbados': 8,
 'Belarus': 8,
 'Bermuda': 2,
 'Bosnia and Herzegovina': 9,
 'Bulgaria': 5,
 'Croatia': 10,
 'Cuba': 4,
 'Cyprus': 4,
 'Czech Republic': 8,
 'Djibouti': 8,
 'Egypt, Arab Rep.': 2,
 'Eritrea': 11,
 'Estonia': 10,
 'Ethiopia': 6,
 'Georgia': 8,
 'Germany': 3,
 'Grenada': 5,
 'Hungary': 8,
 'Indonesia': 2,
 'Iran, Islamic Rep.': 1,
 'Iraq': 6,
 'Jordan': 2,
 'Kazakhstan': 8,
 'Korea, Dem. People?s Rep.': 21,
 'Kosovo': 12,
 'Kuwait': 2,
 'Kyrgyz Republic': 8,
 'Latvia': 10,
 'Liechtenstein': 4,
 'Lithuania': 10,
 'Macedonia, FYR': 8,
 'Mauritius': 4,
 'Moldova': 10,
 'Mongolia': 6,
 'Montenegro': 12,
 'Mozambique': 5,
 'Namibia': 5,
 'Niger': 21,
 'Paraguay': 2,
 'Poland': 8,
 'Puerto Rico': 2,
 'Qatar': 3,
 'Romania': 7,
 'Russian Federation': 8,
 'Samoa': 6,
 'Saudi Arabia': 2,
 'Serbia': 10,
 'Slovak Republic': 8,
 'Slovenia': 10,
 'Switzerland': 2,
 'Syrian Arab Republic': 5,
 'Tajikist

# Excluding nations that participated under different NOCs
These are the nations we briefly mentioned above. They competed for teams such as the Unified Team and the Soviet Union, so we fill their null values in notebook 400.

In [236]:
# Countries set aside in this notebook; they are filtered out of df below
members = [
    'Montenegro', 'Uzbekistan', 'Slovak Republic', 'Slovenia',
    'Turkmenistan', 'Kazakhstan', 'Serbia', 'Azerbaijan',
    'Moldova', 'Macedonia, FYR', 'Czech Republic', 'Kyrgyz Republic',
    'Belarus', 'Ukraine', 'Tajikistan', 'Bosnia and Herzegovina',
    'Georgia', 'Russian Federation', 'Armenia', 'Lithuania',
]

**We need to save the countries that we remove so we can add them back later. The GDP values for these countries are filled in notebook 400.**

In [237]:
# Set the member countries' rows aside so they can be appended again at the end.
# reset_index() keeps the old row labels in an extra 'index' column.
removed = df.loc[df['Country'].isin(members)].reset_index()
removed.head(5)

Unnamed: 0,index,Country,Year,Population,Males,Females,Life_Expectancy,GDP,Region,Elevation,Area_SqKM,Centroid_Longitude,Centroid_Latitude,Population_Density,CO2_Emissions,Methane_Emissions,Nitrous_Oxide_Emisions,Total_Emissions,Emmisions_per_Capita,Code
0,63,Armenia,1960,1874120.0,903002.0,971118.0,65.863463,,West and Central Asia,1792.034,30643.8,45.21657,40.30967,61.158211,,,,0.0,,ARM
1,64,Armenia,1964,2144998.0,1040243.0,1104755.0,67.565415,,West and Central Asia,1792.034,30643.8,45.21657,40.30967,69.997781,,,,0.0,,ARM
2,65,Armenia,1968,2401140.0,1170242.0,1230898.0,69.229268,,West and Central Asia,1792.034,30643.8,45.21657,40.30967,78.35647,,,,0.0,,ARM
3,66,Armenia,1972,2650484.0,1293462.0,1357022.0,70.367341,,West and Central Asia,1792.034,30643.8,45.21657,40.30967,86.49332,,1440.48,581.0237,2021.5037,,ARM
4,67,Armenia,1976,2889579.0,1408109.0,1481470.0,70.529415,,West and Central Asia,1792.034,30643.8,45.21657,40.30967,94.295714,,1590.73,664.0386,2254.7686,,ARM


In [238]:
# Drop the member countries from df to see how many nulls remain without them.
# reset_index() stores the previous row labels in a leading 'index' column.
df = df.loc[~df['Country'].isin(members)].reset_index()

# Dealing with the rest of the nulls 

There are only 84 nulls and 11 countries with GDP nulls when we take out the countries above. 
We will address them now. The way we have decided to do it is by taking the three GDP year values after a null value.

In [239]:
# Null tally per column once the member countries are gone
df.isnull().sum(axis=0)

index                        0
Country                      0
Year                         0
Population                   0
Males                        0
Females                      0
Life_Expectancy            142
GDP                        267
Region                       0
Elevation                  315
Area_SqKM                  315
Centroid_Longitude         315
Centroid_Latitude          315
Population_Density         315
CO2_Emissions              259
Methane_Emissions          601
Nitrous_Oxide_Emisions     601
Total_Emissions              0
Emmisions_per_Capita      2373
Code                         0
dtype: int64

In [240]:
# Rows of the reduced df whose GDP is still null, renumbered 0..n-1
# (the old row labels are discarded rather than stored as a column)
nullGDPdf = df.loc[df['GDP'].isnull()].reset_index(drop=True)

In [241]:
# Reset the mapping country -> number of null-GDP rows
nullDic = dict()

In [242]:
# Count the null-GDP rows of every remaining country in a single grouped pass.
# sort=False keeps the countries in order of first appearance in nullGDPdf.
null_rows_per_country = (
    nullGDPdf['GDP'].isnull().groupby(nullGDPdf['Country'], sort=False).sum()
)

# Plain ints keep the displayed dictionary free of numpy scalar reprs
nullDic.update(
    {country: int(n_null) for country, n_null in null_rows_per_country.items()}
)

In [243]:
# Null-GDP row count per country, computed after the member countries were removed
nullDic

{'Afghanistan': 7,
 'Argentina': 1,
 'Bahrain': 5,
 'Barbados': 8,
 'Bermuda': 2,
 'Bulgaria': 5,
 'Croatia': 10,
 'Cuba': 4,
 'Cyprus': 4,
 'Djibouti': 8,
 'Egypt, Arab Rep.': 2,
 'Eritrea': 11,
 'Estonia': 10,
 'Ethiopia': 6,
 'Germany': 3,
 'Grenada': 5,
 'Hungary': 8,
 'Indonesia': 2,
 'Iran, Islamic Rep.': 1,
 'Iraq': 6,
 'Jordan': 2,
 'Korea, Dem. People?s Rep.': 21,
 'Kosovo': 12,
 'Kuwait': 2,
 'Latvia': 10,
 'Liechtenstein': 4,
 'Mauritius': 4,
 'Mongolia': 6,
 'Mozambique': 5,
 'Namibia': 5,
 'Niger': 21,
 'Paraguay': 2,
 'Poland': 8,
 'Puerto Rico': 2,
 'Qatar': 3,
 'Romania': 7,
 'Samoa': 6,
 'Saudi Arabia': 2,
 'Switzerland': 2,
 'Syrian Arab Republic': 5,
 'Tanzania': 7,
 'Tonga': 4,
 'Tunisia': 2,
 'United Arab Emirates': 4,
 'Venezuela, RB': 1,
 'Vietnam': 7,
 'Virgin Islands (U.S.)': 5}

# Removing Nulls in list above
* In order to remove the remaining nulls we sourced as many of the years as possible from the UN website http://data.un.org/, as we did in previous notebooks. However, even this website did not contain GDPs for every year, usually those before 1970. So the way we removed the rest of the null GDPs of the countries above is by using GDPs from later years in that country. If a country has a null GDP value, we take the GDP values for the 3 years after it and compute the average growth. We then multiply (1 - average growth) by the GDP of the first year following the null to get the GDP for the null year. We repeat the process to fill all the nulls. We iterate through the df backwards so that there will always be GDP values already available for each country when a null is reached, to which we can apply the process just described.
* Once we have eliminated the possibility of null values for the years 2016, 2014 and 2012 for every country, every country will have at least three years that we can use to fill the remaining GDP nulls by the process explained above.

In [244]:
# Which countries still lack a GDP value in 2012 or later?
temp = df.loc[df['GDP'].isnull()]
temp.loc[temp['Year'] >= 2012, 'Country'].unique()

array(['Bermuda', 'Cuba', 'Djibouti', 'Eritrea',
       'Korea, Dem. People?s Rep.', 'Liechtenstein', 'Niger',
       'Puerto Rico', 'Syrian Arab Republic', 'Venezuela, RB',
       'Virgin Islands (U.S.)'], dtype=object)

In [245]:
# Remove the 'index' column left behind by the earlier reset_index().
# It is dropped by name, and errors='ignore' makes the cell safe to re-run:
# dropping "whatever is in position 0" would delete 'Country' on a second run.
df = df.drop('index', axis=1, errors='ignore')
# The Virgin Islands are just missing 2016,
# so we'll find the 2016 value online and plug it in
df[df['Country'] == 'Virgin Islands (U.S.)'].tail(2)

Unnamed: 0,Country,Year,Population,Males,Females,Life_Expectancy,GDP,Region,Elevation,Area_SqKM,Centroid_Longitude,Centroid_Latitude,Population_Density,CO2_Emissions,Methane_Emissions,Nitrous_Oxide_Emisions,Total_Emissions,Emmisions_per_Capita,Code
2350,Virgin Islands (U.S.),2014,104170.0,49716.0,54454.0,79.773171,3624000000.0,South America,,,,,,,,,0.0,,ISV
2351,Virgin Islands (U.S.),2016,102951.0,49091.0,53860.0,,,South America,,,,,,,,,0.0,,ISV


This source https://www.bea.gov/newsreleases/general/terr/2017/vigdp_120117.pdf stated that GDP in the U.S. Virgin Islands rose 1.5% from 2015 to 2016 after increasing 2.9% from 2014 to 2015, an overall increase of 4.4%. So we'll just apply this 4.4% increase to the 2014 value to get the 2016 value. 

In [246]:
# Set the 2016 GDP of the U.S. Virgin Islands to the 2014 value plus 4.4%.
# The rows are located by Country/Year instead of hard-coded row labels, so the
# assignment still hits the right rows if the frame's length or order changes.
usvi_rows = df['Country'] == 'Virgin Islands (U.S.)'
# .iloc[0] raises IndexError if there is no 2014 row to start from
usvi_gdp_2014 = df.loc[usvi_rows & (df['Year'] == 2014), 'GDP'].iloc[0]
df.loc[usvi_rows & (df['Year'] == 2016), 'GDP'] = (usvi_gdp_2014 * 0.044) + usvi_gdp_2014

# Filling the Rest of these countries Nulls from 1970 -> 2016 

# Korea

In [247]:
# Warn (the notebook keeps running) when the Korea GDP CSV is not on disk
korea_csv_found = os.path.exists( r"../../data/raw/UNData/UN_Korea_GDP.csv" )
if not korea_csv_found:
    print("Missing dataset file")

In [248]:
# Load the Korea GDP CSV from the UNData folder, decoded as ISO-8859-1
kGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Korea_GDP.csv",
    encoding="ISO-8859-1",
)

In [249]:
# Lookup table for Korea: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(kGDP['Year'], kGDP['Value']))
    
    

In [250]:
# Fill Korea's null GDP rows from 1970 onwards using gdpDict (Year -> GDP).
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
korea_gaps = (
    (df['Country'] == 'Korea, Dem. People?s Rep.')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
for row_label in df.index[korea_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Niger

In [251]:
# Warn (the notebook keeps running) when the Niger GDP CSV is not on disk
niger_csv_found = os.path.exists( r"../../data/raw/UNData/UN_Niger_GDP.csv" )
if not niger_csv_found:
    print("Missing dataset file")

In [252]:
# Load the Niger GDP CSV from the UNData folder, decoded as ISO-8859-1
nGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Niger_GDP.csv",
    encoding="ISO-8859-1",
)

In [253]:
# Lookup table for Niger: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(nGDP['Year'], nGDP['Value']))
    
    

In [254]:
# Fill Niger's null GDP rows from 1970 onwards using gdpDict (Year -> GDP).
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
niger_gaps = (
    (df['Country'] == 'Niger')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
for row_label in df.index[niger_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Bermuda 

In [255]:
# Warn (the notebook keeps running) when the Bermuda GDP CSV is not on disk
bermuda_csv_found = os.path.exists( r"../../data/raw/UNData/UN_Bermuda_GDP.csv" )
if not bermuda_csv_found:
    print("Missing dataset file")

In [256]:
# Load the Bermuda GDP CSV from the UNData folder, decoded as ISO-8859-1
bGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Bermuda_GDP.csv",
    encoding="ISO-8859-1",
)

In [257]:
# Lookup table for Bermuda: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(bGDP['Year'], bGDP['Value']))
    
    

In [258]:
# Fill Bermuda's null GDP rows from 1970 onwards using gdpDict (Year -> GDP).
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
bermuda_gaps = (
    (df['Country'] == 'Bermuda')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
for row_label in df.index[bermuda_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Cuba 

In [259]:
# Warn (the notebook keeps running) when the Cuba GDP CSV is not on disk
cuba_csv_found = os.path.exists( r"../../data/raw/UNData/UN_Cuba_GDP.csv" )
if not cuba_csv_found:
    print("Missing dataset file")

In [260]:
# Load the Cuba GDP CSV from the UNData folder, decoded as ISO-8859-1
cGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Cuba_GDP.csv",
    encoding="ISO-8859-1",
)

In [261]:
# Lookup table for Cuba: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(cGDP['Year'], cGDP['Value']))
    
    

In [262]:
# Fill Cuba's null GDP rows from 1970 onwards using gdpDict (Year -> GDP).
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
cuba_gaps = (
    (df['Country'] == 'Cuba')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
for row_label in df.index[cuba_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Djibouti

In [263]:
# Warn (the notebook keeps running) when the Djibouti GDP CSV is not on disk
djibouti_csv_found = os.path.exists( r"../../data/raw/UNData/UN_Djibouti_GDP.csv" )
if not djibouti_csv_found:
    print("Missing dataset file")

In [264]:
# Load the Djibouti GDP CSV from the UNData folder, decoded as ISO-8859-1
dGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Djibouti_GDP.csv",
    encoding="ISO-8859-1",
)

In [265]:
# Lookup table for Djibouti: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(dGDP['Year'], dGDP['Value']))
    
    

In [266]:
# Fill Djibouti's null GDP rows from 1970 onwards using gdpDict (Year -> GDP).
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
djibouti_gaps = (
    (df['Country'] == 'Djibouti')
    & df['GDP'].isnull()
    & (df['Year'] >= 1970)
)
for row_label in df.index[djibouti_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Eritrea

In [267]:
# Warn (the notebook keeps running) when the Eritrea GDP CSV is not on disk
eritrea_csv_found = os.path.exists( r"../../data/raw/UNData/UN_Eritrea_GDP.csv" )
if not eritrea_csv_found:
    print("Missing dataset file")

In [268]:
# Load the Eritrea GDP CSV from the UNData folder, decoded as ISO-8859-1
eGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Eritrea_GDP.csv",
    encoding="ISO-8859-1",
)

In [269]:
# Lookup table for Eritrea: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(eGDP['Year'], eGDP['Value']))
    
    

In [270]:
# Fill Eritrea's null GDP rows using gdpDict (Year -> GDP).
# GDP values are only given down to 1990, hence the later cutoff year.
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
eritrea_gaps = (
    (df['Country'] == 'Eritrea')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
for row_label in df.index[eritrea_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Liechtenstein 

In [271]:
# Warn (the notebook keeps running) when the Liechtenstein GDP CSV is not on disk
liechtenstein_csv_found = os.path.exists( r"../../data/raw/UNData/UN_Liechtenstein_GDP.csv" )
if not liechtenstein_csv_found:
    print("Missing dataset file")

In [272]:
# Load the Liechtenstein GDP CSV from the UNData folder, decoded as ISO-8859-1
lGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Liechtenstein_GDP.csv",
    encoding="ISO-8859-1",
)

In [273]:
# Lookup table for Liechtenstein: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(lGDP['Year'], lGDP['Value']))
    
    

In [274]:
# Fill Liechtenstein's null GDP rows from 1990 onwards using gdpDict (Year -> GDP).
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
# NOTE(review): the cutoff here is 1990 although the section heading says
# "1970 -> 2016" — confirm it matches the years covered by this country's CSV.
liechtenstein_gaps = (
    (df['Country'] == 'Liechtenstein')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
for row_label in df.index[liechtenstein_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Puerto Rico

In [275]:
# Warn (the notebook keeps running) when the Puerto Rico GDP CSV is not on disk
puerto_rico_csv_found = os.path.exists( r"../../data/raw/UNData/UN_PuertoRico_GDP.csv" )
if not puerto_rico_csv_found:
    print("Missing dataset file")

In [276]:
# Load the Puerto Rico GDP CSV from the UNData folder, decoded as ISO-8859-1
prGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_PuertoRico_GDP.csv",
    encoding="ISO-8859-1",
)

In [277]:
# Lookup table for Puerto Rico: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(prGDP['Year'], prGDP['Value']))
    
    

In [278]:
# Fill Puerto Rico's null GDP rows from 1990 onwards using gdpDict (Year -> GDP).
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
# NOTE(review): the cutoff here is 1990 although the section heading says
# "1970 -> 2016" — confirm it matches the years covered by this country's CSV.
puerto_rico_gaps = (
    (df['Country'] == 'Puerto Rico')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
for row_label in df.index[puerto_rico_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Syrian Arab Republic 

In [279]:
# Warn (the notebook keeps running) when the Syrian Arab Republic GDP CSV is not on disk
syria_csv_found = os.path.exists( r"../../data/raw/UNData/UN_SyrianArabRep_GDP.csv" )
if not syria_csv_found:
    print("Missing dataset file")

In [280]:
# Load the Syrian Arab Republic GDP CSV from the UNData folder, decoded as ISO-8859-1
sarGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_SyrianArabRep_GDP.csv",
    encoding="ISO-8859-1",
)

In [281]:
# Lookup table for the Syrian Arab Republic: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(sarGDP['Year'], sarGDP['Value']))
    
    

In [282]:
# Fill the Syrian Arab Republic's null GDP rows from 1990 onwards using gdpDict
# (Year -> GDP). Only the matching rows are visited, addressed by index label; a
# year that is absent from gdpDict raises KeyError rather than leaving the gap
# unnoticed.
# NOTE(review): the cutoff here is 1990 although the section heading says
# "1970 -> 2016" — confirm it matches the years covered by this country's CSV.
syria_gaps = (
    (df['Country'] == 'Syrian Arab Republic')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
for row_label in df.index[syria_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

# Venezuela RB

In [283]:
# Warn (the notebook keeps running) when the Venezuela GDP CSV is not on disk
venezuela_csv_found = os.path.exists( r"../../data/raw/UNData/UN_Venezuela_GDP.csv" )
if not venezuela_csv_found:
    print("Missing dataset file")

In [284]:
# Load the Venezuela GDP CSV from the UNData folder, decoded as ISO-8859-1
vGDP = pd.read_csv(
    r"../../data/raw/UNData/UN_Venezuela_GDP.csv",
    encoding="ISO-8859-1",
)

In [285]:
# Lookup table for Venezuela: Year -> GDP (read from the 'Value' column).
# If a year occurs more than once, the last row for that year wins.
gdpDict = dict(zip(vGDP['Year'], vGDP['Value']))
    
    

In [286]:
# Fill Venezuela's null GDP rows from 1990 onwards using gdpDict (Year -> GDP).
# Only the matching rows are visited, addressed by index label; a year that is
# absent from gdpDict raises KeyError rather than leaving the gap unnoticed.
# NOTE(review): the cutoff here is 1990 although the section heading says
# "1970 -> 2016" — confirm it matches the years covered by this country's CSV.
venezuela_gaps = (
    (df['Country'] == 'Venezuela, RB')
    & df['GDP'].isnull()
    & (df['Year'] >= 1990)
)
for row_label in df.index[venezuela_gaps]:
    df.loc[row_label, 'GDP'] = gdpDict[df.loc[row_label, 'Year']]
        
        

In [287]:
# Show the column labels currently in df
df.columns

Index(['Country', 'Year', 'Population', 'Males', 'Females', 'Life_Expectancy',
       'GDP', 'Region', 'Elevation', 'Area_SqKM', 'Centroid_Longitude',
       'Centroid_Latitude', 'Population_Density', 'CO2_Emissions',
       'Methane_Emissions', 'Nitrous_Oxide_Emisions', 'Total_Emissions',
       'Emmisions_per_Capita', 'Code'],
      dtype='object')

# Checking for any null GDP values for 2012 up

In [288]:
# Re-check: countries that still lack a GDP value in 2012 or later
temp = df.loc[df['GDP'].isnull()]
temp.loc[temp['Year'] >= 2012, 'Country'].unique()

array([], dtype=object)

# For loop for filling the rest of the Nulls

In [289]:
# Back-fill the remaining null GDP values from the three rows that follow each null.
# The frame is walked bottom-up so that a value estimated for a later year is already
# in place when an earlier null of the same country is reached.
# All access is positional (.iat), so the loop does not depend on the row labels.
gdp_col = df.columns.get_loc('GDP')
country_col = df.columns.get_loc('Country')
n_rows = len(df)
unfilled = []  # row positions for which no estimate could be made

for x in reversed(range(n_rows)):

    if not pd.isnull(df.iat[x, gdp_col]):
        continue

    country = df.iat[x, country_col]

    # The three reference rows must exist and belong to the same country;
    # otherwise the estimate would be built from another country's GDP.
    if x + 3 >= n_rows or any(df.iat[x + k, country_col] != country for k in (1, 2, 3)):
        unfilled.append(x)
        continue

    # GDP of the three rows after the null row
    gdp_1 = df.iat[x + 1, gdp_col]
    gdp_2 = df.iat[x + 2, gdp_col]
    gdp_3 = df.iat[x + 3, gdp_col]

    # Average of the two row-over-row growth rates
    diff1 = (gdp_2 - gdp_1) / gdp_1
    diff2 = (gdp_3 - gdp_2) / gdp_2
    avgGrowth = (diff1 + diff2) / 2

    # A non-finite rate (null or zero reference GDP) or a rate of -100% or less
    # cannot be inverted into an earlier value; leave the row null.
    if not np.isfinite(avgGrowth) or avgGrowth <= -1:
        unfilled.append(x)
        continue

    if 0 <= avgGrowth <= 1:
        # Growth between 0% and 100%: scale the following row down by the rate.
        # NOTE(review): gdp_1 * (1 - g) only approximates the exact inverse
        # gdp_1 / (1 + g) for small g and reaches 0 at g == 1 — confirm whether
        # the approximation is intended for large growth rates.
        gdp_curr = gdp_1 * (1 - avgGrowth)
    else:
        # Negative growth, or growth above 100%: undo the growth exactly, so the
        # estimate is above the following row when GDP was shrinking and below
        # it when GDP was growing.
        gdp_curr = gdp_1 / (1 + avgGrowth)

    df.iat[x, gdp_col] = gdp_curr

# Make skipped rows visible instead of leaving them silently null
if unfilled:
    print("GDP left null for", len(unfilled), "row(s) without usable reference rows")

# Checking the nulls count again 

In [290]:
# Rows whose GDP is still null after the fill, renumbered 0..n-1
# (the old row labels are discarded rather than stored as a column)
nullGDPdf = df.loc[df['GDP'].isnull()].reset_index(drop=True)

In [291]:
# Reset the mapping country -> number of null-GDP rows
nullDic = dict()

In [292]:
# Count the null-GDP rows of every country in a single grouped pass.
# sort=False keeps the countries in order of first appearance in nullGDPdf;
# an empty nullGDPdf simply leaves nullDic empty.
null_rows_per_country = (
    nullGDPdf['GDP'].isnull().groupby(nullGDPdf['Country'], sort=False).sum()
)

# Plain ints keep the displayed dictionary free of numpy scalar reprs
nullDic.update(
    {country: int(n_null) for country, n_null in null_rows_per_country.items()}
)

In [293]:
# An empty dictionary here means no row in df has a null GDP any more
nullDic

{}

In [294]:
# Null tally per column; GDP should now show zero
df.isnull().sum(axis=0)

Country                      0
Year                         0
Population                   0
Males                        0
Females                      0
Life_Expectancy            142
GDP                          0
Region                       0
Elevation                  315
Area_SqKM                  315
Centroid_Longitude         315
Centroid_Latitude          315
Population_Density         315
CO2_Emissions              259
Methane_Emissions          601
Nitrous_Oxide_Emisions     601
Total_Emissions              0
Emmisions_per_Capita      2373
Code                         0
dtype: int64

# Adding back in the previously removed countries 

In [295]:
# Append the member countries that were set aside earlier and renumber the rows.
# Their GDP values are added in notebook 400.
df = pd.concat(
    [df, removed],
    ignore_index=True,
)

In [296]:
# Null tally per column after the member countries were appended again
df.isnull().sum(axis=0)

Area_SqKM                  315
CO2_Emissions              453
Centroid_Latitude          315
Centroid_Longitude         315
Code                         0
Country                      0
Elevation                  315
Emmisions_per_Capita      2793
Females                      0
GDP                        171
Life_Expectancy            174
Males                        0
Methane_Emissions          729
Nitrous_Oxide_Emisions     729
Population                   0
Population_Density         315
Region                       0
Total_Emissions              0
Year                         0
index                     2394
dtype: int64

In [297]:
# Restore the intended column order; any column not listed here is left out
df = df.loc[:, [
    'Country', 'Year', 'Population', 'Males', 'Females', 'Life_Expectancy',
    'GDP', 'Region', 'Elevation', 'Area_SqKM', 'Centroid_Longitude',
    'Centroid_Latitude', 'Population_Density', 'CO2_Emissions',
    'Methane_Emissions', 'Nitrous_Oxide_Emisions', 'Total_Emissions',
    'Emmisions_per_Capita', 'Code',
]]

In [298]:
# Write the result for the next notebook; the row index is not saved
df.to_csv(
    r"../../data/prep/Countries/countries_275.csv",
    index=False,
)