<a href="https://colab.research.google.com/github/DataScienceLiam/DataScienceLiam.github.io/blob/main/colab/Tax_as_of_GDP_vs_GDP_growth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import pandas as pd
# Reuse the JSON exported for the "Tax vs Spending per capita" chart, since
# the data in it carries over to this analysis.
with open('Tax vs Spending per capita data.json') as source_file:
    data = json.load(source_file)

df = pd.DataFrame(data=data)


In [None]:
# Express total tax revenue as a percentage of GDP, row by row.
df["Tax as % of GDP"] = df["Total tax revenue"].div(df["GDP"]).mul(100)

# Show the last ten rows as a quick sanity check of the new column.
df.tail(10)



Unnamed: 0,Country,Population,GDP,GDP per capita,Country code,Total tax revenue,Government spending,Spending per Capita,Tax per Capita,Tax as % of GDP
181,United Kingdom,67081000.0,3186860000000.0,47508.0,GBR,863785000000.0,1103721000000.0,16453.556149,12876.746023,27.104579
182,United States,332183000.0,23315100000000.0,69227.0,USA,5923829000000.0,9818534000000.0,29557.605296,17833.028782,25.407693
183,Uruguay,3485152.0,59320000000.0,17021.0,URY,17690000000.0,19900000000.0,5709.937472,5075.818788,29.821308
184,Uzbekistan,34559000.0,69202000000.0,2002.0,UZB,20110000000.0,19920000000.0,576.405567,581.903412,29.059854
185,Vanuatu,314464.0,984000000.0,3128.0,VUT,204000000.0,207000000.0,658.262949,648.722906,20.731707
186,Venezuela,28704947.0,102084000000.0,3534.0,VEN,5521000000.0,10382000000.0,361.679818,192.336185,5.408291
187,Viet Nam,98168829.0,366201000000.0,3730.0,VNM,57837000000.0,78348000000.0,798.094475,589.1585,15.793785
188,Yemen,30490639.0,21062000000.0,691.0,YEM,3467000000.0,5232000000.0,171.593649,113.70703,16.460925
189,Zambia,18920657.0,21313000000.0,1126.0,ZMB,4895000000.0,7050000000.0,372.608626,258.711946,22.967203
190,Zimbabwe,15092171.0,26218000000.0,1737.0,ZWE,3600000000.0,4800000000.0,318.045694,238.534271,13.731024


In [None]:
# Scrape the 2011 GDP page and clean the table it contains.
# pd.read_html returns a list with one DataFrame per HTML table on the page.
GDP_2011 = pd.read_html(
    'https://countryeconomy.com/gdp?year=2011')

# The cleaned frame is built in one chain: assigning columns onto a slice
# (and again after dropna) is the pattern that triggers pandas'
# SettingWithCopyWarning, whereas assign() always works on a fresh copy.
df_GDP_2011 = (
    GDP_2011[0]
    .loc[:, ['Countries', 'Annual GDP']]
    .assign(**{
        # Strip spaces, '[', '+' and ']' from both ends of the name
        # (removes the trailing " [+]" marker).
        'Countries': lambda table: table['Countries'].str.strip(' [+]'),
        # Strip the '€' and 'M' characters from the ends and remove the
        # thousands separators so the value can later be cast to a number.
        'Annual GDP': lambda table: (
            table['Annual GDP']
            .str.strip('€')
            .str.strip('M')
            .str.replace(',', '', regex=False)
        ),
    })
    # Rows with a missing name or GDP figure cannot be used downstream.
    .dropna()
    .rename(columns={'Annual GDP': 'Annual GDP 2011'})
)

df_GDP_2011.head(10)


Unnamed: 0,Countries,Annual GDP 2011
0,United States,11209034
1,Euro zone,9746637
2,United Kingdom,1912869
3,Germany,2693560
4,France,2058369
5,Japan,4247574
6,Spain,1063763
7,Italy,1648756
8,Portugal,176096
9,Greece,203308


In [None]:
# Scrape the 2021 GDP page and clean it with the same steps used for 2011.
# pd.read_html returns a list with one DataFrame per HTML table on the page.
GDP_2021 = pd.read_html(
    'https://countryeconomy.com/gdp?year=2021')

# The cleaned frame is built in one chain: assigning columns onto a slice is
# the pattern that triggers pandas' SettingWithCopyWarning, whereas assign()
# always works on a fresh copy.
df_GDP_2021 = (
    GDP_2021[0]
    .loc[:, ['Countries', 'Annual GDP']]
    .assign(**{
        # Strip spaces, '[', '+' and ']' from both ends of the name
        # (removes the trailing " [+]" marker).
        'Countries': lambda table: table['Countries'].str.strip(' [+]'),
        # Strip the '€' and 'M' characters from the ends and remove the
        # thousands separators so the value can later be cast to a number.
        'Annual GDP': lambda table: (
            table['Annual GDP']
            .str.strip('€')
            .str.strip('M')
            .str.replace(',', '', regex=False)
        ),
    })
    # Rows with a missing name or GDP figure cannot be used downstream.
    .dropna()
    .rename(columns={'Annual GDP': 'Annual GDP 2021'})
)

df_GDP_2021.head(10)


Unnamed: 0,Countries,Annual GDP 2021
0,United States,19430123
1,Euro zone,12313472
2,United Kingdom,2648575
3,Germany,3601750
4,France,2500870
5,Japan,4167675
6,Spain,1206842
7,Italy,1782050
8,Portugal,214471
9,Greece,181675


In [None]:
# Join the 2011 and 2021 tables on the country name. The join is an inner
# join, so only countries present in both years are kept.
df_GDP = df_GDP_2011.merge(df_GDP_2021, how='inner', on='Countries')


In [None]:
import numpy as np

# Calculate the average yearly growth rate over the ten years, i.e. the
# compound annual growth rate: (GDP_2021 / GDP_2011) ** (1 / years) - 1,
# expressed as a percentage.
GROWTH_PERIOD_YEARS = 2021 - 2011

# Built as a single chain rather than with inplace/column assignment on a
# .loc slice, which avoids SettingWithCopyWarning and makes the cell's
# output depend only on its input frame.
df_GDP = (
    df_GDP
    .loc[:, ['Countries', 'Annual GDP 2011', 'Annual GDP 2021']]
    .dropna()
    # The scraped GDP figures are digit strings at this point; cast them to
    # integers before doing arithmetic.
    .astype({'Annual GDP 2011': int, 'Annual GDP 2021': int})
    .assign(**{
        'Average growth': lambda table: (
            np.power(table['Annual GDP 2021'] / table['Annual GDP 2011'],
                     1 / GROWTH_PERIOD_YEARS) - 1) * 100,
    })
)
df_GDP.head()

Unnamed: 0,Countries,Annual GDP 2011,Annual GDP 2021,Average growth
0,United States,11209034,19430123,5.655166
1,Euro zone,9746637,12313472,2.365255
2,United Kingdom,1912869,2648575,3.307703
3,Germany,2693560,3601750,2.948184
4,France,2058369,2500870,1.96633


In [None]:
%pip install pycountry
import pycountry

# Using pycountry to assign country codes
def do_fuzzy_search(country):
    """Return the alpha-3 code of pycountry's best fuzzy match for a name.

    Parameters
    ----------
    country : str
        Country name to look up.

    Returns
    -------
    str
        ``alpha_3`` of the first result returned by
        ``pycountry.countries.search_fuzzy``, or the sentinel string
        ``'Did not work'`` when the input is not a string or no match is
        found.
    """
    # Non-string values (e.g. NaN) cannot be searched; keep the best-effort
    # contract by returning the sentinel instead of raising.
    if not isinstance(country, str):
        return 'Did not work'
    try:
        result = pycountry.countries.search_fuzzy(country)
        return result[0].alpha_3
    except LookupError:
        # pycountry signals "no match" with LookupError (IndexError on an
        # empty result is a LookupError too). Catching only this keeps
        # KeyboardInterrupt, SystemExit and genuine bugs such as a missing
        # import from being silently turned into 'Did not work'.
        return 'Did not work'


# Look up each distinct country name only once, then attach the resulting
# codes to the frame as a new column.
iso_map = dict(
    (name, do_fuzzy_search(name))
    for name in df_GDP["Countries"].unique()
)
country_codes = df_GDP["Countries"].map(iso_map)
df_GDP["Country code"] = country_codes


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 4.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: pycountry
  Building wheel for pycountry (PEP 517) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681845 sha256=256be65961f237f75478c94658c37aa910658e3f60bdadb09705e26a08d6920d
  Stored in directory: /root/.cache/pip/wheels/e2/aa/0f/c224e473b464387170b83ca7c66947b4a7e33e8d903a679748
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-22.3.5


In [None]:
pd.set_option('display.max_rows', 500)

# Adding the few countries pycountry failed on. The rows are addressed by
# index label.
# NOTE(review): labels 40/100/179 depend on the row order of the scraped
# tables - confirm they still point at the intended countries after a re-scrape.
df_GDP.loc[40, 'Country code'] = 'COD'
df_GDP.loc[100, 'Country code'] = 'LAO'
df_GDP.loc[179, 'Country code'] = 'TUR'

# Remove row 1 (the "Euro zone" aggregate in the displayed merged table).
# errors='ignore' keeps the cell re-runnable: the previous in-place drop
# raised KeyError on a second run because label 1 was already gone.
df_GDP = df_GDP.drop(index=1, errors='ignore')


In [None]:
# Join the tax data with the GDP growth data on the country code, then tidy
# the result.
df_merged = (
    pd.merge(df, df_GDP, on='Country code')
    # Drop by name the columns previously removed by position ([2, 3, 10]):
    # the absolute GDP figures and the duplicate country-name column.
    # Names do not silently shift if a column is added upstream.
    .drop(columns=['GDP', 'GDP per capita', 'Countries'])
    # Drop any duplicates, keeping the first row per country code.
    .drop_duplicates(subset='Country code')
    .reset_index(drop=True)
)
df_merged.head()

Unnamed: 0,Country,Population,Country code,Total tax revenue,Government spending,Spending per Capita,Tax per Capita,Tax as % of GDP,Annual GDP 2011,Annual GDP 2021,Average growth
0,Afghanistan,39835428.0,AFG,1992000000.0,6636000000.0,166.585382,50.005739,9.892729,12855,17643,3.216716
1,Albania,2793592.0,ALB,3486000000.0,3765000000.0,1347.727227,1247.855807,19.090909,9268,15432,5.230981
2,Algeria,44616626.0,DZA,41474000000.0,58397000000.0,1308.861858,929.563791,25.489365,143722,137480,-0.44304
3,Andorra,79535.0,AND,1872000000.0,2060000000.0,25900.546929,23536.807695,56.216216,2607,2815,0.770575
4,Angola,33933611.0,AGO,11243000000.0,13004000000.0,383.218868,331.323419,14.954974,80326,63521,-2.319895


In [None]:
import json

# Export the final dataframe as JSON: a list with one {column: value}
# object per row.
exported_values = df_merged.to_dict(orient='records')

# The context manager flushes and closes the file even if serialisation
# fails; the previous bare open(...).write(...) never closed the handle.
with open('Tax % of GDP vs growth.json', 'w') as out_file:
    json.dump(exported_values, out_file)


67747