<a href="https://colab.research.google.com/github/DataScienceLiam/DataScienceLiam.github.io/blob/main/colab/Taxes_of_the_World.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from typing import Any, Iterable, Set

import pandas as pd
from pandas import DataFrame


def remove_others(df: DataFrame, columns: Iterable[Any]) -> None:
    """Drop, in place, every column of ``df`` that is not listed in ``columns``.

    Args:
        df: Frame to prune. It is modified in place.
        columns: Labels of the columns to keep. Any iterable of hashable
            labels is accepted (set, list, tuple, ...); labels that are not
            present in ``df`` are ignored.

    Returns:
        None. The result is the mutated ``df``.
    """
    # Normalise to a set so callers are not forced to pass one, and so the
    # membership test below is O(1).
    keep: Set[Any] = set(columns)
    # Build the drop list from df.columns itself so its order is deterministic
    # and drop() receives a plain list rather than an unordered set.
    unwanted = [label for label in df.columns if label not in keep]
    df.drop(columns=unwanted, inplace=True)


# Load the country-code lookup table into a dataframe
# https://github.com/DataScienceLiam/DataScienceLiam.github.io/blob/main/Country%20codes.csv
df_Codes = pd.read_csv('/content/Country codes.csv')

# Removing unwanted columns: keep only the alpha-3 code and the country name
remove_others(df_Codes, {"alpha3", "name"})

# Give both remaining columns their display names in a single rename
df_Codes = df_Codes.rename(
    columns={"alpha3": "Country code", "name": "Country Name"}
)
df_Codes


Unnamed: 0,Country code,Country Name
0,afg,Afghanistan
1,ala,Åland Islands
2,alb,Albania
3,dza,Algeria
4,asm,American Samoa
...,...,...
244,wlf,Wallis and Futuna
245,esh,Western Sahara
246,yem,Yemen
247,zmb,Zambia


In [None]:
# Importing the world tax rates csv file and renaming some columns
# Downloaded from https://worldpopulationreview.com/country-rankings/highest-taxed-countries
df_Tax = pd.read_csv('/content/World Tax rates data.csv').rename(
    columns={
        "country": "Country Name",
        "incomeTax": "Income Tax",
        "salesTax": "Sales Tax",
        "corpTax": "Corporation Tax",
    }
)

df_Tax


Unnamed: 0,Country Name,Income Tax,Sales Tax,Corporation Tax
0,Ivory Coast,60,18.0,25.0
1,Finland,56,24.0,20.0
2,Japan,55,10.0,30.0
3,Austria,55,20.0,25.0
4,Denmark,55,25.0,22.0
...,...,...,...,...
146,Bahrain,0,5.0,0.0
147,Brunei,0,,18.0
148,Bahamas,0,12.0,0.0
149,Cayman Islands,0,,0.0


In [None]:
from functools import reduce

# Outer-join the country codes dataframe with the tax dataframe on
# "Country Name", so rows found in only one of the two are kept
dfs = [df_Codes, df_Tax]
df_merged = pd.merge(*dfs, on=['Country Name'], how='outer')


In [None]:
# Display the merged dataframe to check the join worked
df_merged

Unnamed: 0,Country code,Country Name,Income Tax,Sales Tax,Corporation Tax
0,afg,Afghanistan,20.0,10.0,20.0
1,ala,Åland Islands,,,
2,alb,Albania,23.0,20.0,15.0
3,dza,Algeria,35.0,19.0,26.0
4,asm,American Samoa,,,
...,...,...,...,...,...
263,,Russia,13.0,20.0,20.0
264,,Bolivia,13.0,13.0,25.0
265,,Moldova,12.0,20.0,12.0
266,,Macau,12.0,,12.0


In [None]:
# Dropping rows where all three tax columns are NaN, then renumbering
# the remaining rows from 0
df_clean = (
    df_merged
    .dropna(subset=['Income Tax', 'Sales Tax', 'Corporation Tax'], how='all')
    .reset_index(drop=True)
)
df_clean.head(5)


Unnamed: 0,Country code,Country Name,Income Tax,Sales Tax,Corporation Tax
0,afg,Afghanistan,20.0,10.0,20.0
1,alb,Albania,23.0,20.0,15.0
2,dza,Algeria,35.0,19.0,26.0
3,ago,Angola,25.0,14.0,25.0
4,arg,Argentina,35.0,21.0,25.0


# Using the CSV to match country codes caused ~20 countries to fail to match. I instead found a library called pycountry, which should perform better.

In [None]:
%pip install pycountry
import pycountry

# Using pycountry to assign country codes


def do_fuzzy_search(country):
    """Return the alpha-3 code that pycountry's fuzzy search finds for a name.

    Args:
        country: Country name to look up.

    Returns:
        The ``alpha_3`` attribute of the first fuzzy-search result, or the
        placeholder string ``'Did not work'`` when ``country`` is not a
        string or the search finds no match.
    """
    # Non-string input (e.g. NaN from a missing name) cannot be searched;
    # report it with the same placeholder instead of raising.
    if not isinstance(country, str):
        return 'Did not work'
    try:
        return pycountry.countries.search_fuzzy(country)[0].alpha_3
    except LookupError:
        # A failed search surfaces as LookupError. Catching only that (rather
        # than a bare except) keeps KeyboardInterrupt, SystemExit and genuine
        # bugs visible.
        return 'Did not work'


# Look up each distinct country name once, then map the results back onto
# every row, overwriting the existing "Country code" column.
iso_map = dict(
    (name, do_fuzzy_search(name))
    for name in df_clean["Country Name"].unique()
)
df_clean["Country code"] = df_clean["Country Name"].map(iso_map)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 3.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: pycountry
  Building wheel for pycountry (PEP 517) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681845 sha256=7c80552e79065f933f2716a653238dc4ae9fd71e28a89bf233713f0c7ff3275f
  Stored in directory: /root/.cache/pip/wheels/e2/aa/0f/c224e473b464387170b83ca7c66947b4a7e33e8d903a679748
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-22.3.5


In [None]:
# Checking where it failed: the frame was inspected manually, e.g. with
#   pd.set_option('display.max_rows', 500)
#   df_clean.head(200)
# It failed 3 times.

# Assigning the correct values manually.
# NOTE(review): the row labels below are hard-coded from that inspection and
# presumably depend on the current row order of df_clean; confirm them if
# the input data changes.
for row_label, alpha3 in ((132, 'CIV'), (142, 'LAO'), (149, 'MAC')):
    df_clean.loc[row_label, 'Country code'] = alpha3


In [None]:
# US sales tax depends on the state, so an average figure is filled in
# to avoid 0 being displayed.
# NOTE(review): row label 138 is presumably the United States row; confirm.
us_row_label = 138
df_clean.loc[us_row_label, 'Sales Tax'] = 6.35

In [None]:
# Write the final dataframe to a csv file, leaving out the index column
df_clean.to_csv(
    'Tax and country code data.csv',
    index=False,
    encoding='utf-8',
)