DATASETS

1. World Bank Open Data - Forest coverage (API)
2. UN Data - Population by sex and urban/rural residence between 2000 - 2020 (csv)
3. Food and Agriculture Organization of the United Nations (FAO) - FAOSTAT


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

To access the World Bank Open Data API using Python, we installed the wbdata library

In [2]:
pip install wbdata

Note: you may need to restart the kernel to use updated packages.


Imported the data and looked at timeframe, countries and created the dataframe

In [3]:
import wbdata
import pandas as pd
import datetime 

# Set the indicator code for forest coverage
indicator_code = 'AG.LND.FRST.ZS'

# Set the start and end year for the data
start_year = 2000
end_year = 2020

# Set the countries to retrieve data for
countries = ['USA', 'CAN', 'MEX']

# Retrieve the data
start_date = datetime.datetime(start_year, 1, 1)
end_date = datetime.datetime(end_year, 12, 31)

forest_data = wbdata.get_data(indicator_code, country=countries, data_date=(start_date, end_date))

# Convert the data to a pandas DataFrame
df = pd.DataFrame(forest_data)
print(df)


                                            indicator  \
0   {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
1   {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
2   {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
3   {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
4   {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
..                                                ...   
58  {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
59  {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
60  {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
61  {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   
62  {'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...   

                                   country countryiso3code  date      value  \
0          {'id': 'CA', 'value': 'Canada'}             CAN  2020  38.695513   
1          {'id': 'CA', 'value': 'Canada'}             CAN  2019  38.699637   
2          {'id': 'CA', 'value': 'Canada'}             CAN  2018  38.703763   

In [4]:
df.head()


Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2020,38.695513,,,1
1,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2019,38.699637,,,1
2,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2018,38.703763,,,1
3,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2017,38.707888,,,1
4,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2016,38.712013,,,1


checking average forest coverage for selected countries

In [5]:

# Calculate the average forest coverage for each country
avg_forest_coverage = df.groupby('countryiso3code')['value'].mean()

# Print the average forest coverage for each country
print('Average forest coverage by country:')
print(avg_forest_coverage)


Average forest coverage by country:
countryiso3code
CAN    38.741693
MEX    34.463532
USA    33.631871
Name: value, dtype: float64


checking min and max coverage 

In [6]:

# Determine which country has the highest and lowest forest coverage
max_coverage = df.loc[df['value'].idxmax()]
min_coverage = df.loc[df['value'].idxmin()]

# Print the country with the highest and lowest forest coverage
print(f'The country with the highest forest coverage is {max_coverage["country"]} with a coverage of {max_coverage["value"]:.2f}%.')
print(f'The country with the lowest forest coverage is {min_coverage["country"]} with a coverage of {min_coverage["value"]:.2f}%.')

# Determine if any country has seen significant increases or decreases in forest coverage during the specified time period
for country in countries:
    country_data = df.loc[df['countryiso3code'] == country]
    initial_coverage = country_data.loc[country_data['date'] == str(start_year)]['value'].values[0]
    final_coverage = country_data.loc[country_data['date'] == str(end_year)]['value'].values[0]
    change = final_coverage - initial_coverage
    if abs(change) > 5:
        if change > 0:
            print(f'{country} has seen a significant increase in forest coverage of {change:.2f}% between {start_year} and {end_year}.')
        else:
            print(f'{country} has seen a significant decrease in forest coverage of {change:.2f}% between {start_year} and {end_year}.')




The country with the highest forest coverage is {'id': 'CA', 'value': 'Canada'} with a coverage of 38.79%.
The country with the lowest forest coverage is {'id': 'US', 'value': 'United States'} with a coverage of 33.13%.


In [7]:
# Retrieve the global forest coverage data for the specified time period
global_forest_data = wbdata.get_data(indicator_code,  data_date=(start_date, end_date))



In [8]:
global_forest_df = pd.DataFrame(global_forest_data)
global_forest_df.head()

Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2020,30.174186,,,1
1,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2019,30.391558,,,1
2,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2018,30.611444,,,1
3,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2017,30.824248,,,1
4,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2016,31.039613,,,1


In [9]:
# Calculate the average global forest coverage for the specified time period
global_avg_forest_coverage = df['value'].mean()
print(global_avg_forest_coverage)

# Compare the average forest coverage in the selected countries to the global average
if avg_forest_coverage.mean() > global_avg_forest_coverage:
    print('The selected countries have higher than average forest coverage compared to the global average.')
elif avg_forest_coverage.mean() < global_avg_forest_coverage:
    print('The selected countries have lower than average forest coverage compared to the global average.')
else:
    print('The selected countries have average forest coverage compared to the global average.')

35.61236560761129
The selected countries have higher than average forest coverage compared to the global average.


In [10]:
pip install pycountry pycountry_convert




In [11]:
import pycountry
import pycountry_convert as pc

def country_to_continent (country_alpha2):        
    continent_code = pc.country_alpha2_to_continent_code (country_alpha2)    
    continent_name = pc.convert_continent_code_to_continent_name(continent_code)
    return continent_name

# Create a lookup table mapping country codes to continents
country_code_to_continent = {}
for country in pycountry.countries:  
    try:
        continent = country_to_continent(country.alpha_2)        
        country_code_to_continent[country.alpha_3]= continent         
    except:
        pass
print(country_code_to_continent)


{'ABW': 'North America', 'AFG': 'Asia', 'AGO': 'Africa', 'AIA': 'North America', 'ALA': 'Europe', 'ALB': 'Europe', 'AND': 'Europe', 'ARE': 'Asia', 'ARG': 'South America', 'ARM': 'Asia', 'ASM': 'Oceania', 'ATG': 'North America', 'AUS': 'Oceania', 'AUT': 'Europe', 'AZE': 'Asia', 'BDI': 'Africa', 'BEL': 'Europe', 'BEN': 'Africa', 'BES': 'North America', 'BFA': 'Africa', 'BGD': 'Asia', 'BGR': 'Europe', 'BHR': 'Asia', 'BHS': 'North America', 'BIH': 'Europe', 'BLM': 'North America', 'BLR': 'Europe', 'BLZ': 'North America', 'BMU': 'North America', 'BOL': 'South America', 'BRA': 'South America', 'BRB': 'North America', 'BRN': 'Asia', 'BTN': 'Asia', 'BVT': 'Antarctica', 'BWA': 'Africa', 'CAF': 'Africa', 'CAN': 'North America', 'CCK': 'Asia', 'CHE': 'Europe', 'CHL': 'South America', 'CHN': 'Asia', 'CIV': 'Africa', 'CMR': 'Africa', 'COD': 'Africa', 'COG': 'Africa', 'COK': 'Oceania', 'COL': 'South America', 'COM': 'Africa', 'CPV': 'Africa', 'CRI': 'North America', 'CUB': 'North America', 'CUW': 'N

In [12]:
# add continent data to dataframe

global_forest_df['continent'] = global_forest_df['countryiso3code'].map(country_code_to_continent)
avg_forest_coverage_by_continent = global_forest_df.groupby('continent')['value'].mean()
print(avg_forest_coverage_by_continent)

continent
Africa           28.969330
Asia             22.960607
Europe           30.304611
North America    37.713132
Oceania          49.680202
South America    51.244379
Name: value, dtype: float64


UN Data on Population by sex and urban/rural residence between 2000 - 2020

In [13]:
population = pd.read_csv("population_by_sex_residence.csv", index_col = 0)
population.head()

Unnamed: 0_level_0,Year,Area,Sex,Record Type,Reliability,Source Year,Value,Value Footnotes
Country or Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,2020,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2021.0,31390171.0,1
Afghanistan,2019,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2019.0,30725560.0,1
Afghanistan,2018,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2019.0,30075018.0,1
Afghanistan,2017,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2018.0,28224323.0,1
Afghanistan,2016,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2017.0,27657145.0,1


In [14]:
population.columns
population.reset_index(inplace=True)
population.columns
population.head()

Unnamed: 0,Country or Area,Year,Area,Sex,Record Type,Reliability,Source Year,Value,Value Footnotes
0,Afghanistan,2020,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2021.0,31390171.0,1
1,Afghanistan,2019,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2019.0,30725560.0,1
2,Afghanistan,2018,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2019.0,30075018.0,1
3,Afghanistan,2017,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2018.0,28224323.0,1
4,Afghanistan,2016,Total,Both Sexes,Estimate - de facto,"Final figure, incomplete/questionable reliability",2017.0,27657145.0,1


In [15]:
def country_name_to_continent (country_name):        
    country_code = pc.country_name_to_country_alpha2(country_name)    
    continent_name = country_to_continent(country_code)
    return continent_name

population['continent'] = population['Country or Area'].map(country_name_to_continent(country_name_to_continent))
population.head()

TypeError: object of type 'function' has no len()

Third dataset on food production

In [16]:
food = pd.read_csv("food_production.csv", index_col = 0)
food.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
Area Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1975,1975,ha,0.0,E
2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1976,1976,ha,5900.0,E
2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1977,1977,ha,6000.0,E
2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1978,1978,ha,6000.0,E
2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1979,1979,ha,6000.0,E
