In [None]:
# Region: 10 Countries with highest GDP
# Domain: GDP & Education Levels
# Research Question: GDP vs. Education for the Top Ten Countries with Highest GDP

# Import Datasets
import pandas as pd
import numpy as np

# Load the raw GDP export. The renames below show the layout this cell expects:
# a 'Country Name' column plus one '<year> [YR<year>]' column per year.
df1 = pd.read_csv('GDP_Data.csv')

# Data cleansing
df1.info()

# The ten years analysed (2013-2022). Defined once and reused by the
# conversion, averaging and scaling steps so the lists cannot drift apart.
gdp_columns = [str(year) for year in range(2013, 2023)]
convert_columns = gdp_columns

# rename column names
# 'Series Code' is deliberately left out: renaming it to 'Code' alongside
# 'Country Code' would create two columns carrying the same label.
new_columns = {
    'Series Name': 'GDP (current $US)',
    'Country Name': 'Country',
    'Country Code': 'Code',
    '1990 [YR1990]': '1990',
    '2000 [YR2000]': '2000',
    '2013 [YR2013]': '2013',
    '2014 [YR2014]': '2014',
    '2015 [YR2015]': '2015',
    '2016 [YR2016]': '2016',
    '2017 [YR2017]': '2017',
    '2018 [YR2018]': '2018',
    '2019 [YR2019]': '2019',
    '2020 [YR2020]': '2020',
    '2021 [YR2021]': '2021',
    '2022 [YR2022]': '2022'
}

df1 = df1.rename(columns=new_columns)

# Keep only the country label and the analysed years. The explicit .copy()
# makes df1 an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning.
df1 = df1[['Country'] + gdp_columns].copy()

# convert GDP values to float
# use pd.to_numeric with errors='coerce' to handle any non-converts
# (anything that cannot be parsed becomes NaN)
df1[convert_columns] = df1[convert_columns].apply(pd.to_numeric, errors='coerce', downcast='float')

# Handle missing values: drop every row that has at least one NaN
df1 = df1.dropna()

# Create Average GDP column (row-wise mean over the ten year columns)
df1['Average GDP 2013-2022'] = df1[gdp_columns].mean(axis=1)

# Rank rows from highest to lowest average GDP. sort_values returns a new
# frame, so df1 keeps the unscaled values while df holds the final table.
df = df1.sort_values(by='Average GDP 2013-2022', ascending=False)

columns_to_scale = gdp_columns + ['Average GDP 2013-2022']

# Divide selected columns by 10^9 to represent values in billions
df[columns_to_scale] = df[columns_to_scale] / 1e9

# Keep at most the first 193 rows of the ranking.
# NOTE(review): 193 presumably matches the number of countries expected in
# the export - confirm, and consider naming the constant.
df = df.head(193)

# The row index is written as the first (unnamed) column of the file.
df.to_csv('final_gdp.csv')

In [None]:
# Load the raw education-expenditure export. The renames below show the layout
# this cell expects: a 'Country Name' column plus one '<year> [YR<year>]'
# column per year.
df2 = pd.read_csv('Education_Expenditure_Data.csv')

# The ten years analysed (2013-2022), reused by every year-based step below.
year_columns = [str(year) for year in range(2013, 2023)]

# rename column names
# 'Series Code' is deliberately left out: renaming it to 'Code' alongside
# 'Country Code' would create two columns carrying the same label.
new_columns = {
    'Series Name': 'Government Expenditure on Education (% total GDP)',
    'Country Name': 'Country',
    'Country Code': 'Code',
    '1990 [YR1990]': '1990',
    '2000 [YR2000]': '2000',
    '2013 [YR2013]': '2013',
    '2014 [YR2014]': '2014',
    '2015 [YR2015]': '2015',
    '2016 [YR2016]': '2016',
    '2017 [YR2017]': '2017',
    '2018 [YR2018]': '2018',
    '2019 [YR2019]': '2019',
    '2020 [YR2020]': '2020',
    '2021 [YR2021]': '2021',
    '2022 [YR2022]': '2022'
}

df2 = df2.rename(columns=new_columns)

# Keep only the country label and the analysed years. The explicit .copy()
# makes df2 an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning.
df2 = df2[['Country'] + year_columns].copy()

# convert expenditure values to float
# use pd.to_numeric with errors='coerce' to handle any non-converts
# (anything that cannot be parsed becomes NaN)
df2[year_columns] = df2[year_columns].apply(pd.to_numeric, errors='coerce', downcast='float')

# Handle missing values
# Interpolate along each row (axis=1) so a gap is filled from the same
# country's adjacent years. The default axis=0 would fill it from whichever
# countries happen to sit in the rows above and below. Restricting the call to
# the year columns also keeps the string 'Country' column out of it.
# Gaps before a row's first reported year stay NaN; mean() below skips them.
df2[year_columns] = df2[year_columns].interpolate(method='linear', axis=1)

# Create Average column (row-wise mean over the ten year columns)
df2['Average Expenditure on Education (% of GDP) 2013-2022'] = df2[year_columns].mean(axis=1)

# Discard rows that have no country label
df2 = df2.dropna(subset=['Country'])

# Keep at most the first 216 rows, in file order, before ranking.
# NOTE(review): 216 presumably cuts off non-country rows at the end of the
# export - confirm against the raw file.
df2 = df2.head(216)

# Rank rows from highest to lowest average expenditure share
df2 = df2.sort_values(by='Average Expenditure on Education (% of GDP) 2013-2022', ascending=False)

# The row index is written as the first (unnamed) column of the file.
df2.to_csv('final_edu_v2.csv')

In [None]:
# Reload the cleaned tables written by the earlier cells.
edu_df = pd.read_csv('final_edu_v2.csv')
gdp_df = pd.read_csv('final_gdp.csv')

# Selecting by name drops the unnamed index column that to_csv added.
gdp_df = gdp_df[['Country', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', 'Average GDP 2013-2022']]

import matplotlib.pyplot as plt

# The chart is titled "Ten Wealthiest Countries", so plot only the ten largest
# averages instead of every row in the file. nlargest returns them ordered
# from highest to lowest regardless of the order stored in the CSV.
TOP_N = 10
top_gdp_df = gdp_df.nlargest(TOP_N, 'Average GDP 2013-2022')

# data for plotting
countries = top_gdp_df['Country']
average_gdp_values = top_gdp_df['Average GDP 2013-2022']

# custom colors: highlight the United States, grey for everyone else
colors = ['grey' if country != 'United States' else 'cadetblue' for country in countries]

# Distance (in y-axis units, i.e. billions) from the top of a bar down to its
# value label. NOTE(review): a bar shorter than this would get its label drawn
# below zero - fine for the top ten, revisit if smaller values are plotted.
LABEL_OFFSET = 500

# bar chart
fig, ax = plt.subplots(figsize=(16, 9))
bars = ax.bar(countries, average_gdp_values, color=colors)
ax.set_xlabel('Country')
ax.set_title('Average GDP 2013-2022 for Ten Wealthiest Countries (Billions)')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # rotate labels for readability
ax.set_yticks([])  # the value is printed on each bar, so the y axis is hidden

# write each bar's value (whole billions) in white just below its top edge
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height - LABEL_OFFSET, '$' + str(int(height)),
            ha='center', color='w', fontsize=11)

# remove plot frame
for spine in ax.spines.values():
    spine.set_visible(False)

fig.tight_layout()
plt.show()