<a href="https://colab.research.google.com/github/DataScienceLiam/DataScienceLiam.github.io/blob/main/colab/Tax_vs_spending_per_capita.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from functools import reduce
pd.set_option('display.max_rows', 500)

# Scraping data for GDP per capita.
# read_html returns every table it finds on the page; only the first is used.

Pop_GDP = pd.read_html(
    'https://countryeconomy.com/countries/groups/united-nations')

# Work on a column subset of the first parsed table.
df_Pop_GDP = Pop_GDP[0].loc[
    :, ['Countries', 'Population', 'Annual GDP.1', 'GDP per capita.1']]

# Country names: strip the characters ' ', '[', '+' and ']' from both ends.
df_Pop_GDP["Countries"] = df_Pop_GDP["Countries"].str.strip(' [+]')

# Annual GDP: strip '$', then 'M', remove thousands separators, cast to int.
df_Pop_GDP["Annual GDP.1"] = (
    df_Pop_GDP["Annual GDP.1"]
    .str.strip('$')
    .str.strip('M')
    .str.replace(',', '')
    .astype(int)
)

# GDP per capita: strip '$', remove thousands separators, cast to int.
df_Pop_GDP["GDP per capita.1"] = (
    df_Pop_GDP["GDP per capita.1"]
    .str.strip('$')
    .str.replace(',', '')
    .astype(int)
)

# Remove the rows sitting at positions 15 and 193 of the scraped table.
# NOTE(review): these positions are tied to the page's current row order —
# TODO document which entries they are and why they are excluded.
df_Pop_GDP = df_Pop_GDP.drop(df_Pop_GDP.index[[15, 193]])
df_Pop_GDP.head(10)



Unnamed: 0,Countries,Population,Annual GDP.1,GDP per capita.1
0,Afghanistan,39835428,20136,517
1,Albania,2793592,18260,6536
2,Algeria,44616626,162711,3647
3,Andorra,79535,3330,41873
4,Angola,33933611,75179,2215
5,Antigua and Barbuda,98728,1471,14901
6,Argentina,45808747,491493,10729
7,Armenia,2968128,13928,4693
8,Australia,25767000,1635255,63464
9,Austria,8978929,477084,53332


In [None]:
# Scraping data for government spending and tax revenue

url = "https://en.wikipedia.org/wiki/List_of_countries_by_government_budget"

# Only tables whose text matches "Expenditures" are returned; use the first.
Spending = pd.read_html(url, match="Expenditures")
df_Spending = Spending[0].loc[:, ['Country', 'Revenues', 'Expenditures']]

# Rows dropped because they are not in USD, or are the total at the bottom.
# The second drop uses positions counted after the first removal.
# NOTE(review): positions depend on the table's current row order — confirm
# they still point at the intended rows when the page changes.
df_Spending = df_Spending.drop(df_Spending.index[[64]])
df_Spending = df_Spending.drop(df_Spending.index[[226, 227]])
df_Spending.head(10)

Unnamed: 0,Country,Revenues,Expenditures
0,USA,5923829.0,9818534.0
1,China,3622313.0,5388814.0
2,India,1729224.0,2038247.0
3,Germany,1666454.0,2362676.0
4,Japan,1334944.0,1609710.0
5,France,966407.0,1400776.0
6,United Kingdom,863785.0,1103721.0
7,Italy,620739.0,940771.0
8,Canada,598434.0,917271.0
9,Spain,481945.0,657750.0


In [None]:
import pandas as pd
%pip install pycountry
import pycountry
import numpy as np

# Adding country codes to make merging easier

def do_fuzzy_search(country):
    """Return the alpha-3 code of the best fuzzy match for ``country``.

    Parameters
    ----------
    country : str
        Country name passed to ``pycountry.countries.search_fuzzy``.

    Returns
    -------
    str or None
        The ``alpha_3`` attribute of the first search result, or ``None``
        when ``country`` is not a string or the search finds no match.
    """
    # Non-string values (e.g. NaN from an empty cell) cannot be searched.
    if not isinstance(country, str):
        return None
    try:
        result = pycountry.countries.search_fuzzy(country)
        return result[0].alpha_3
    except LookupError:
        # Raised by pycountry when nothing matches; LookupError also covers
        # the IndexError from indexing an empty result list. Unlike a bare
        # ``except``, KeyboardInterrupt and SystemExit still propagate.
        return None

# Look each distinct country name up once, then map the codes onto the rows.
spending_names = df_Spending["Country"].unique()
iso_map = dict(zip(spending_names, map(do_fuzzy_search, spending_names)))
df_Spending["Country code"] = df_Spending["Country"].map(iso_map)


# Adding missing values: codes set by hand, keyed by row label.
# NOTE(review): the labels are tied to the scraped table's row order —
# confirm they still point at the intended countries if the source changes.
manual_codes = {
    123: 'COD',
    125: 'LAO',
    136: 'COG',
    124: 'PRK',
    13: 'KOR',
    89: 'MMR',
}
for row_label, code in manual_codes.items():
    df_Spending.loc[row_label, 'Country code'] = code

df_Spending.head(10)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 4.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: pycountry
  Building wheel for pycountry (PEP 517) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681845 sha256=2e9ef7d16780b0e77460f99db0a53f63605628d5efeb572137537c42a71dea4c
  Stored in directory: /root/.cache/pip/wheels/e2/aa/0f/c224e473b464387170b83ca7c66947b4a7e33e8d903a679748
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-22.3.5


Unnamed: 0,Country,Revenues,Expenditures,Country code
0,USA,5923829.0,9818534.0,USA
1,China,3622313.0,5388814.0,CHN
2,India,1729224.0,2038247.0,IND
3,Germany,1666454.0,2362676.0,DEU
4,Japan,1334944.0,1609710.0,JPN
5,France,966407.0,1400776.0,FRA
6,United Kingdom,863785.0,1103721.0,GBR
7,Italy,620739.0,940771.0,ITA
8,Canada,598434.0,917271.0,CAN
9,Spain,481945.0,657750.0,ESP


In [None]:
# Doing same for the other dataframe: one lookup per distinct name, then map.
gdp_names = df_Pop_GDP["Countries"].unique()
iso_map = dict(zip(gdp_names, map(do_fuzzy_search, gdp_names)))
df_Pop_GDP["Country code"] = df_Pop_GDP["Countries"].map(iso_map)

# Codes set by hand, keyed by row label.
# NOTE(review): labels depend on the scraped table's row order — confirm.
for row_label, code in {44: 'COD', 90: 'LAO', 177: 'TUR'}.items():
    df_Pop_GDP.loc[row_label, 'Country code'] = code
df_Pop_GDP.head(10)

Unnamed: 0,Countries,Population,Annual GDP.1,GDP per capita.1,Country code
0,Afghanistan,39835428,20136,517,AFG
1,Albania,2793592,18260,6536,ALB
2,Algeria,44616626,162711,3647,DZA
3,Andorra,79535,3330,41873,AND
4,Angola,33933611,75179,2215,AGO
5,Antigua and Barbuda,98728,1471,14901,ATG
6,Argentina,45808747,491493,10729,ARG
7,Armenia,2968128,13928,4693,ARM
8,Australia,25767000,1635255,63464,AUS
9,Austria,8978929,477084,53332,AUT


In [None]:
from functools import reduce
# Merging the two dataframes: outer join on 'Country code', with the
# population/GDP frame on the left and the spending frame on the right.

dfs = [df_Pop_GDP, df_Spending]
df = pd.merge(*dfs, on=['Country code'], how='outer')

In [None]:
# Keep the first 195 merged rows, discard the spending table's name column
# and give the remaining columns readable names.
# NOTE(review): 195 is a hard-coded cut-off — presumably it trims rows that
# only exist in the spending table; TODO confirm.
df = (
    df.drop(df.index[195:])
    .drop(columns=['Country'])
    .rename(columns={
        'Countries': 'Country',
        'Annual GDP.1': 'GDP',
        'GDP per capita.1': 'GDP per capita',
        'Expenditures': 'Government spending',
        'Revenues': 'Total tax revenue',
    })
)

# Values were in millions.
for column in ("GDP", "Government spending", "Total tax revenue"):
    df[column] = df[column] * 1000000

# Calculating per capita values
df["Spending per Capita"] = df["Government spending"].div(df["Population"])
df["Tax per Capita"] = df["Total tax revenue"].div(df["Population"])

# Some duplicates present: keep the first row per country code and renumber.
df = df.drop_duplicates(subset='Country code').reset_index(drop=True)
df.head()

Unnamed: 0,Country,Population,GDP,GDP per capita,Country code,Total tax revenue,Government spending,Spending per Capita,Tax per Capita
0,Afghanistan,39835428.0,20136000000.0,517.0,AFG,1992000000.0,6636000000.0,166.585382,50.005739
1,Albania,2793592.0,18260000000.0,6536.0,ALB,3486000000.0,3765000000.0,1347.727227,1247.855807
2,Algeria,44616626.0,162711000000.0,3647.0,DZA,41474000000.0,58397000000.0,1308.861858,929.563791
3,Andorra,79535.0,3330000000.0,41873.0,AND,1872000000.0,2060000000.0,25900.546929,23536.807695
4,Angola,33933611.0,75179000000.0,2215.0,AGO,11243000000.0,13004000000.0,383.218868,331.323419


In [None]:
import json

# Exporting as a json: one object per row, keyed by column name.
exported_values = list(df.T.to_dict().values())

# NOTE(review): with default settings json writes missing float values as
# bare NaN tokens, which strict JSON parsers reject — confirm the consumer
# of this file tolerates that.
# The context manager closes the file handle; the original
# open(...).write(...) never closed it explicitly.
with open('Tax vs Spending per capita data.json', 'w') as json_file:
    json.dump(exported_values, json_file)


52562