DATASETS

1. World Bank Open Data - Forest coverage (API)
2. UN Data - Population by sex and urban/rural residence between 2000 - 2020 (csv)
3. Food and Agriculture Organization of the United Nations (FAO) - FAOSTAT (csv)


In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re 
regex_pattern = re.compile(r'\s\((.*?)\)|(\s?,.*)')

To access the World Bank Open Data API using Python, we installed the wbdata library

In [2]:
pip install wbdata




Imported the data and looked at timeframe, countries and created the dataframe

In [3]:
import wbdata
import pandas as pd
import datetime 

# Set the indicator code for forest coverage
indicator_code = 'AG.LND.FRST.ZS'

# Set the start and end year for the data
start_year = 2000
end_year = 2020

# Set the countries to retrieve data for
countries = ['USA', 'CAN', 'MEX']

# Retrieve the data
start_date = datetime.datetime(start_year, 1, 1)
end_date = datetime.datetime(end_year, 12, 31)

forest_data = wbdata.get_data(indicator_code, country=countries, data_date=(start_date, end_date))

# Convert the data to a pandas DataFrame
df = pd.DataFrame(forest_data)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   indicator        63 non-null     object 
 1   country          63 non-null     object 
 2   countryiso3code  63 non-null     object 
 3   date             63 non-null     object 
 4   value            63 non-null     float64
 5   unit             63 non-null     object 
 6   obs_status       63 non-null     object 
 7   decimal          63 non-null     int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 4.1+ KB


In [4]:
df.head()


Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2020,38.695513,,,1
1,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2019,38.699637,,,1
2,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2018,38.703763,,,1
3,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2017,38.707888,,,1
4,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'CA', 'value': 'Canada'}",CAN,2016,38.712013,,,1


checking average forest coverage for selected countries

In [5]:

# Calculate the average forest coverage for each country
avg_forest_coverage = df.groupby('countryiso3code')['value'].mean()

# Print the average forest coverage for each country
print('Average forest coverage by country:')
print(avg_forest_coverage)


Average forest coverage by country:
countryiso3code
CAN    38.741693
MEX    34.463532
USA    33.631871
Name: value, dtype: float64


checking min and max coverage 

In [6]:

# Determine which country has the highest and lowest forest coverage
max_coverage = df.loc[df['value'].idxmax()]
min_coverage = df.loc[df['value'].idxmin()]

# Print the country with the highest and lowest forest coverage
print(f'The country with the highest forest coverage is {max_coverage["country"]} with a coverage of {max_coverage["value"]:.2f}%.')
print(f'The country with the lowest forest coverage is {min_coverage["country"]} with a coverage of {min_coverage["value"]:.2f}%.')

# Determine if any country has seen significant increases or decreases in forest coverage during the specified time period
for country in countries:
    country_data = df.loc[df['countryiso3code'] == country]
    initial_coverage = country_data.loc[country_data['date'] == str(start_year)]['value'].values[0]
    final_coverage = country_data.loc[country_data['date'] == str(end_year)]['value'].values[0]
    change = final_coverage - initial_coverage
    if abs(change) > 5:
        if change > 0:
            print(f'{country} has seen a significant increase in forest coverage of {change:.2f}% between {start_year} and {end_year}.')
        else:
            print(f'{country} has seen a significant decrease in forest coverage of {change:.2f}% between {start_year} and {end_year}.')




The country with the highest forest coverage is {'id': 'CA', 'value': 'Canada'} with a coverage of 38.79%.
The country with the lowest forest coverage is {'id': 'US', 'value': 'United States'} with a coverage of 33.13%.


In [7]:
# Retrieve the global forest coverage data for the specified time period
global_forest_data = wbdata.get_data(indicator_code,  data_date=(start_date, end_date))



In [8]:
global_forest_df = pd.DataFrame(global_forest_data)
global_forest_df.head()

Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2020,30.174186,,,1
1,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2019,30.391558,,,1
2,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2018,30.611444,,,1
3,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2017,30.824248,,,1
4,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2016,31.039613,,,1


In [9]:
# Calculate the average global forest coverage for the specified time period
global_avg_forest_coverage = df['value'].mean()
print(global_avg_forest_coverage)

# Compare the average forest coverage in the selected countries to the global average
if avg_forest_coverage.mean() > global_avg_forest_coverage:
    print('The selected countries have higher than average forest coverage compared to the global average.')
elif avg_forest_coverage.mean() < global_avg_forest_coverage:
    print('The selected countries have lower than average forest coverage compared to the global average.')
else:
    print('The selected countries have average forest coverage compared to the global average.')

35.61236560761129
The selected countries have higher than average forest coverage compared to the global average.


In [10]:
pip install pycountry pycountry_convert

Note: you may need to restart the kernel to use updated packages.


In [42]:
def clean_country_name (name):
    if name is None:
        return name
    return re.sub(regex_pattern, '', name)

In [43]:
import pycountry
import pycountry_convert as pc

def country_to_continent (country_alpha2):        
    continent_code = pc.country_alpha2_to_continent_code (country_alpha2)    
    continent_name = pc.convert_continent_code_to_continent_name(continent_code)
    return continent_name

def country_name_to_continent (country_name):        
    try:
        country_code = pc.country_name_to_country_alpha2(country_name)    
        continent_name = country_to_continent(country_code)
        return continent_name
    except:        
        pass
    

# Create a lookup table mapping country names to continents
country_name_to_continent = {}
for country in pycountry.countries:  
    try:
        continent = country_to_continent(country.alpha_2)        
        country_name_to_continent[clean_country_name(country.name)]= continent         
    except:
        pass
print(country_name_to_continent)




{'Aruba': 'North America', 'Afghanistan': 'Asia', 'Angola': 'Africa', 'Anguilla': 'North America', 'Åland Islands': 'Europe', 'Albania': 'Europe', 'Andorra': 'Europe', 'United Arab Emirates': 'Asia', 'Argentina': 'South America', 'Armenia': 'Asia', 'American Samoa': 'Oceania', 'Antigua and Barbuda': 'North America', 'Australia': 'Oceania', 'Austria': 'Europe', 'Azerbaijan': 'Asia', 'Burundi': 'Africa', 'Belgium': 'Europe', 'Benin': 'Africa', 'Bonaire': 'North America', 'Burkina Faso': 'Africa', 'Bangladesh': 'Asia', 'Bulgaria': 'Europe', 'Bahrain': 'Asia', 'Bahamas': 'North America', 'Bosnia and Herzegovina': 'Europe', 'Saint Barthélemy': 'North America', 'Belarus': 'Europe', 'Belize': 'North America', 'Bermuda': 'North America', 'Bolivia': 'South America', 'Brazil': 'South America', 'Barbados': 'North America', 'Brunei Darussalam': 'Asia', 'Bhutan': 'Asia', 'Bouvet Island': 'Antarctica', 'Botswana': 'Africa', 'Central African Republic': 'Africa', 'Canada': 'North America', 'Cocos Isla

In [12]:
def country_code_to_name (row):
    code = row ['countryiso3code']
    try:
        return pycountry.countries.get(alpha_3=code).name
    except:
        pass

global_forest_df['country_name']=global_forest_df.apply(lambda row:country_code_to_name(row), axis=1)
global_forest_df.head()

Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal,country_name
0,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2020,30.174186,,,1,
1,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2019,30.391558,,,1,
2,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2018,30.611444,,,1,
3,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2017,30.824248,,,1,
4,"{'id': 'AG.LND.FRST.ZS', 'value': 'Forest area...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2016,31.039613,,,1,


In [34]:
x=global_forest_df.filter(items=['country_name','date','value'])
x.head()
x['clean_country_name'] = x['country_name'].apply(lambda c:clean_country_name(c))

forest_valid_countries = x[x['clean_country_name'].isin(country_name_to_continent.keys())]


forest_valid_countries.head()
x.to_csv('forest.csv')

In [None]:
# add continent data to dataframe

forest_valid_countries['continent'] = forest_valid_countries['country_name'].map(country_name_to_continent)

avg_forest_coverage_by_continent = forest_valid_countries.groupby('continent')['value'].mean()
print(avg_forest_coverage_by_continent)
global_forest_df.info()

UN Data on Population by sex and urban/rural residence between 2000 - 2020

In [None]:
population = pd.read_csv("population_by_sex_residence.csv", index_col = 0)
population.head()

In [None]:
population.columns
population.reset_index(inplace=True)
population.columns
population.head()

In [None]:
population = population.filter(items=['Country or Area','Year','Reliability','Value']).loc[(population['Reliability']=='Final figure, complete')]
population['Year'] = population['Year'].astype(int)
population = population.loc[(population['Year']>=2000) & (population['Year']<2021)]
population.head()

In [None]:
population_valid_countries = population[population['Country or Area'].isin(country_name_to_continent.keys())]
population_valid_countries.head()

In [None]:
population['continent'] = population['Country or Area'].map(country_name_to_continent)
population=population.rename(columns={'Value':'population value'})
population.head()


In [None]:
population.to_csv("populationWithContinent.csv")

In [None]:
continent_population_data = population.groupby(['Year', 'continent']).sum('Value')
continent_population_data.reset_index(inplace=True)
continent_population_data.info()



In [None]:
continent_population_data['population value'] = (continent_population_data['population value'] / 1000000).round(2)
print(continent_population_data.values)

In [None]:
continent_forest_data = forest_valid_countries.groupby(['date', 'continent']).sum('value')
continent_forest_data.reset_index(inplace=True)
continent_forest_data['Year'] = continent_forest_data['date'].astype(int)
continent_forest_data=continent_forest_data.rename(columns={'value':'forest coverage'})
continent_forest_data.info()

In [None]:
merged_df = pd.merge(continent_population_data, continent_forest_data, left_on=['continent','Year'], right_on=['continent', 'Year'], how='inner')
print(merged_df.shape)
merged_df.head()


Third dataset on food production

In [None]:
food = pd.read_csv("food_production.csv", index_col = 0)
food.head()

In [None]:
filtered_food= food.filter(items=['Area','Item','Element','Year','Unit','Value','Flag']).loc[(food['Flag']=='A')& (food['Element']=='Area harvested') & (food['Year']>=2000) & (food['Year']<2021)]
food_summary=filtered_food.groupby(['Year', 'Area']).sum('Value')
food_summary.reset_index(inplace=True)
food_summary.head()


food_valid_countries = food_summary[food_summary['Area'].isin(country_name_to_continent.keys())]
print(food_valid_countries)

In [None]:
country_counts_food = food_valid_countries['Area'].value_counts()
filtered_food_valid_countries = food_valid_countries[food_valid_countries['Area'].isin(country_counts_food[country_counts_food == 21].index)]


print(filtered_food_valid_countries)

In [None]:

forest_valid_countries=forest_valid_countries.rename(columns={'value':'forest coverage %'})
forest_valid_countries['Year'] = forest_valid_countries['date'].astype(int)
forest_valid_countries.head()

In [None]:
country_counts_forest = forest_valid_countries['country_name'].value_counts()
filtered_forest_countries = forest_valid_countries[forest_valid_countries['country_name'].isin(country_counts_forest[country_counts_forest == 21].index)]


print(filtered_forest_countries)


In [None]:
filtered_forest_countries.groupby(['country_name']).count()

In [None]:
food_valid_countries['continent'] = food_valid_countries['Area'].map(country_name_to_continent)
food_valid_countries=food_valid_countries.rename(columns={'Value':'Area harvested - ha'})
food_valid_countries.head()

In [None]:
population_valid_countries = population_valid_countries.groupby(['Year', 'Country or Area']).mean('population value')
population_valid_countries.reset_index(inplace=True)
population_valid_countries.head()

In [None]:
country_counts_population = population_valid_countries['Country or Area'].value_counts()
filtered_countries = country_counts_population[country_counts_population == 21].index.tolist()


In [None]:

filtered_population_valid_countries = population_valid_countries[population_valid_countries['Country or Area'].isin(filtered_countries)]
filtered_population_valid_countries.head()

In [None]:
df_1 = pd.merge(forest_valid_countries, food_valid_countries, left_on=['country_name','Year'], right_on=['Area', 'Year'], how='inner')
print(df_1.shape)
df_1.head()

In [None]:
df_2 = pd.merge(df_1, population_valid_countries, left_on=['country_name','Year'], right_on=['Country or Area', 'Year'], how='inner')
print(df_2.shape)
df_2.head()

In [None]:
df_2.to_csv('merged_data.csv')

In [None]:
final_data= df_2.filter(items=['country_name','continent','Value','forest coverage %','Area harvested - ha', 'Year'])

print(final_data.shape)
final_data.head()

In [None]:
fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(data=final_data, y='Value', x='Year', hue='country_name')
plt.title('Population')

In [None]:
df_3 = pd.merge(filtered_forest_countries, filtered_food_valid_countries, left_on=['country_name','Year'], right_on=['Area', 'Year'], how='inner')
print(df_3.shape)
df_3.head()

In [None]:
df_4 = pd.merge(df_3, filtered_population_valid_countries, left_on=['country_name','Year'], right_on=['Country or Area', 'Year'], how='inner')
print(df_4.shape)
df_4.head()

In [None]:
df_4_final=df_4.rename(columns={'Value_x':'Area harvested - ha','Value_y':'Population' })
df_4_final.head()

In [None]:
CAN = df_4_final['country_name'] == 'Canada'
CAN_df_4 = df_4_final[CAN]

In [None]:
fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(data=CAN_df_4, y='Area harvested - ha', x='Year', hue='country_name')
plt.title('forest coverage %')

In [None]:
df_4.to_csv('df_4.csv')

In [None]:
df_4_final_continents = df_4_final.groupby(['Year', 'continent']).sum('value')
df_4_final_continents.reset_index(inplace=True)
df_4_final_continents.head()

In [None]:
fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(data=df_4_final_continents, y='Population', x='Year', hue='continent')
plt.title('Population')