<center>

# $\textbf{Processing All in One}$

<center>

### $\textbf{Code}$

In [1]:
import time
import pandas as pd
import numpy as np

In [2]:
# Wall-clock start time of the run; the final cell subtracts it from the
# end time to print the total elapsed seconds.
inicio = time.time()

### $\textbf{Carregar os Datasets}$

In [3]:
# Read every raw input from the Files/ folder. All sources are CSV except
# the inflation data, which is an Excel workbook.
df_covid = pd.read_csv('Files/Covid.csv')
df_gdp = pd.read_csv('Files/GDP.csv')
df_inflation = pd.read_excel('Files/Inflation.xlsx')
df_migration, df_population, df_tax, df_unemployment = (
    pd.read_csv(f'Files/{stem}.csv')
    for stem in ('Migration', 'Population', 'Tax', 'Unemployment')
)

### $\textbf{Covid - Processamento}$

In [4]:
# Keep only the columns needed downstream.
selected_columns = ['location', 'date', 'total_cases']
covid = df_covid[selected_columns].copy()

# Parse the dates and order the rows chronologically within each location,
# so that tail(1) below picks the latest record of each year even if the
# file itself is not sorted.
covid['date'] = pd.to_datetime(covid['date'])
covid = covid.sort_values(['location', 'date'])

# One row per (year, location): the last available record of that year.
covid_last_day_of_year = covid.groupby([covid['date'].dt.year, 'location']).tail(1)

# Replace the full date by its year and use the column names shared by the
# other datasets ('country', 'year').
df_covid = (
    covid_last_day_of_year
    .assign(year=covid_last_day_of_year['date'].dt.year)
    .drop(columns=['date'])
    .rename(columns={'location': 'country', 'total_cases': 'covid_cases'})
)

# Harmonise country names with the spelling used by the other datasets.
df_covid['country'] = df_covid['country'].replace({
    'Micronesia (country)': 'Micronesia',
    'Saint Martin (French part)': 'Saint Martin',
    'Sint Maarten (Dutch part)': 'Sint Maarten',
    'Brunei': 'Brunei Darussalam',
    'Cape Verde': 'Cabo Verde',
    'Laos': 'Lao PDR',
    'United States Virgin Islands': 'Virgin Islands (U.S.)',
    'Turkey': 'Turkiye',
    'Syria': 'Syrian Arab Republic',
    'Timor': 'Timor-Leste',
    'Russia': 'Russian Federation',
    # The other cells keep the two Congos apart ('Republic of the Congo' vs
    # 'Democratic Republic of the Congo'); mapping plain 'Congo' onto the
    # DRC would merge them.
    # NOTE(review): assumes this source labels the DRC
    # 'Democratic Republic of Congo' -- confirm against Covid.csv.
    'Congo': 'Republic of the Congo',
    'Democratic Republic of Congo': 'Democratic Republic of the Congo',
    # Previously mapped to 'Slovenia', which folded Slovakia's figures into
    # a different country and produced duplicate (country, year) keys.
    # NOTE(review): 'Slovak Republic' is presumed to be the spelling used by
    # the other sources -- verify.
    'Slovakia': 'Slovak Republic',
})

# Aggregates (continents, income groups, world) that are not countries.
countries_to_remove = ['Africa', 'Europe', 'European Union', 'Asia',
                       'Lower middle income', 'Upper middle income', 'World']

# Drop the aggregate rows; copy so later column assignments act on an
# independent frame rather than a slice.
df_covid = df_covid[~df_covid['country'].isin(countries_to_remove)].copy()

### $\textbf{GDP - Processamento}$

In [5]:
# Wide -> long: one row per (country, year) with the GDP value, ordered by
# country and then year, using the column names shared by the other datasets.
df_gdp = (
    df_gdp
    .melt(id_vars='Country', var_name='Year', value_name='Value')
    .sort_values(by=['Country', 'Year'])
    .rename(columns={'Value': 'GDP', 'Country': 'country', 'Year': 'year'})
)

# Harmonise country names with the spelling used by the other datasets.
# Some source labels carry a trailing space; those keys are intentional.
df_gdp['country'] = df_gdp['country'].replace({
    'China, People\'s Republic of': 'China',
    'Gambia, The': 'Gambia',
    'Micronesia, Fed. States of': 'Micronesia',
    'South Sudan, Republic of': 'South Sudan',
    'Taiwan Province of China': 'Taiwan',
    'Türkiye, Republic of': 'Turkiye',
    'Bahamas, The': 'Bahamas',
    'Syria': 'Syrian Arab Republic',
    'Czech Republic': 'Czechia',
    'Korea, Republic of': 'South Korea',
    'Côte d\'Ivoire': 'Cote d\'Ivoire',
    'Hong Kong SAR': 'Hong Kong',
    'Lao P.D.R.': 'Lao PDR',
    'Congo, Republic of ': 'Republic of the Congo',
    'Congo, Dem. Rep. of the': 'Democratic Republic of the Congo',
    'Pacific Islands ': 'Pacific island small states',
    'North Macedonia ': 'North Macedonia',
})

# Regional / economic aggregates and the copyright footer row.
# 'North Macedonia' used to be listed here as well, which discarded the
# country right after the mapping above had normalised its name.
countries_to_remove = ['ASEAN-5',
                       'Advanced economies',
                       'Africa (Region)',
                       'Asia and Pacific',
                       'Australia and New Zealand',
                       'Central America',
                       'Central Asia and the Caucasus',
                       '©IMF, 2023',
                       'East Asia',
                       'Eastern Europe ',
                       'Emerging and Developing Asia',
                       'Emerging and Developing Europe',
                       'Emerging market and developing economies',
                       'Euro area',
                       'Europe',
                       'European Union',
                       'Latin America and the Caribbean',
                       'Macao SAR',
                       'Major advanced economies (G7)',
                       'Middle East (Region)',
                       'Middle East and Central Asia',
                       'North Africa',
                       'North America',
                       'Other advanced economies',
                       'South America',
                       'South Asia',
                       'Southeast Asia',
                       'Sub-Saharan Africa',
                       'Sub-Saharan Africa (Region)',
                       'Western Europe',
                       'Western Hemisphere (Region)',
                       'World']

# Drop the aggregate rows.
df_gdp = df_gdp[~df_gdp['country'].isin(countries_to_remove)]

# Drop rows without a country, as the other cells do; otherwise the later
# astype(str) would turn them into a literal 'nan' country.
df_gdp = df_gdp.dropna(subset=['country'])

### $\textbf{Inflation - Processamento}$

In [6]:
# The first column of the workbook is headed by the indicator title; it
# holds the country names, so rename it accordingly. Then go wide -> long
# (one row per country and year) ordered by country and year.
df_inflation = (
    df_inflation
    .rename(columns={'Inflation rate, average consumer prices (Annual percent change)': 'country'})
    .melt(id_vars='country', var_name='year', value_name='inflation')
    .sort_values(by=['country', 'year'])
)

# Harmonise country names with the spelling used by the other datasets.
# Some source labels carry a trailing space; those keys are intentional.
df_inflation['country'] = df_inflation['country'].replace({
    'China, People\'s Republic of': 'China',
    'Gambia, The': 'Gambia',
    'Micronesia, Fed. States of': 'Micronesia',
    'South Sudan, Republic of': 'South Sudan',
    'Taiwan Province of China': 'Taiwan',
    'Türkiye, Republic of': 'Turkiye',
    'Bahamas, The': 'Bahamas',
    'Czech Republic': 'Czechia',
    'Syria': 'Syrian Arab Republic',
    'Côte d\'Ivoire': 'Cote d\'Ivoire',
    'Hong Kong SAR': 'Hong Kong',
    'Lao P.D.R.': 'Lao PDR',
    'Korea, Republic of': 'South Korea',
    'Congo, Dem. Rep. of the': 'Democratic Republic of the Congo',
    'Congo, Republic of ': 'Republic of the Congo',
    'Pacific Islands ': 'Pacific island small states',
    'North Macedonia ': 'North Macedonia',
})

# Regional / economic aggregates and the copyright footer row.
# 'North Macedonia' used to be listed here as well, which discarded the
# country right after the mapping above had normalised its name.
countries_to_remove = ['ASEAN-5',
                       'Advanced economies',
                       'Africa (Region)',
                       'Asia and Pacific',
                       'Australia and New Zealand',
                       'Central America',
                       'Central Asia and the Caucasus',
                       '©IMF, 2023',
                       'East Asia',
                       'Eastern Europe ',
                       'Emerging and Developing Asia',
                       'Emerging and Developing Europe',
                       'Emerging market and developing economies',
                       'Euro area',
                       'Europe',
                       'European Union',
                       'Latin America and the Caribbean',
                       'Macao SAR',
                       'Major advanced economies (G7)',
                       'Middle East (Region)',
                       'Middle East and Central Asia',
                       'North Africa',
                       'North America',
                       'Other advanced economies',
                       'South America',
                       'South Asia',
                       'Southeast Asia',
                       'Sub-Saharan Africa',
                       'Sub-Saharan Africa (Region)',
                       'Western Europe',
                       'Western Hemisphere (Region)',
                       'World',
                       'nan']

# Drop the aggregate rows.
df_inflation = df_inflation[~df_inflation['country'].isin(countries_to_remove)]

# Remove rows with missing values in the 'country' column.
df_inflation = df_inflation.dropna(subset=['country'])

### $\textbf{Migration - Processamento}$

In [7]:
# Metadata columns that are not needed.
selected_columns = ['Series Name', 'Series Code', 'Country Code']

# Year headers look like '2000 [YR2000]'; map each one (2000-2025) to the
# bare year so it can later be cast to int.
year_labels = {f'{year} [YR{year}]': str(year) for year in range(2000, 2026)}

# Drop metadata, normalise headers, then go wide -> long (one row per
# country and year) ordered by country and year.
df_migration = (
    df_migration
    .drop(columns=selected_columns)
    .rename(columns={'Country Name': 'country', **year_labels})
    .melt(id_vars='country', var_name='year', value_name='migration')
    .sort_values(by=['country', 'year'])
)

# Harmonise country names with the spelling used by the other datasets.
df_migration['country'] = df_migration['country'].replace({
    'Bahamas, The': 'Bahamas',
    'Egypt, Arab Rep.': 'Egypt',
    'Micronesia, Fed. Sts.': 'Micronesia',
    'Sint Maarten (Dutch part)': 'Sint Maarten',
    'St. Martin (French part)': 'St. Martin',
    'Venezuela, RB': 'Venezuela',
    'Yemen, Rep.': 'Yemen',
    'Caribbean small states': 'Caribbean',
    'Gambia, The': 'Gambia',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Macao SAR, China': 'Macao SAR',
    'Korea, Rep.': 'South Korea',
    'Korea, Dem. People\'s Rep.': 'Korea',
    'Congo, Rep.': 'Republic of the Congo',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Viet Nam': 'Vietnam',
})

# Regional / income-group aggregates that are not countries.
# The previous list was missing a comma between 'Pacific island small states'
# and 'Post-demographic dividend', so Python concatenated them into a single
# string that matched nothing. 'Post-demographic dividend' is now a real
# entry. 'Pacific island small states' is deliberately NOT listed: it was
# never actually removed, and the merge cell renames it to 'Pacific Islands'.
countries_to_remove = ['Africa Eastern and Southern',
                       'Africa Western and Central',
                       'Central Europe and the Baltics',
                       'Early-demographic dividend',
                       'East Asia & Pacific',
                       'East Asia & Pacific (IDA & IBRD countries)',
                       'East Asia & Pacific (excluding high income)',
                       'Euro area',
                       'Europe & Central Asia',
                       'Europe & Central Asia (IDA & IBRD countries)',
                       'Europe & Central Asia (excluding high income)',
                       'European Union',
                       'Fragile and conflict affected situations',
                       'Heavily indebted poor countries (HIPC)',
                       'High income',
                       'IBRD only',
                       'IDA & IBRD total',
                       'IDA blend',
                       'IDA only',
                       'IDA total',
                       'Latin America & the Caribbean (IDA & IBRD countries)',
                       'Latin America & Caribbean (excluding high income)',
                       'Least developed countries: UN classification',
                       'Low & middle income',
                       'Low income',
                       'Lower middle income',
                       'Middle East & North Africa',
                       'Middle East & North Africa (IDA & IBRD countries)',
                       'Middle East & North Africa (excluding high income)',
                       'Middle income',
                       'Not classified',
                       'OECD members',
                       'Other small states',
                       'Post-demographic dividend',
                       'Pre-demographic dividend',
                       'Small states',
                       'South Asia',
                       'South Asia (IDA & IBRD)',
                       'Sub-Saharan Africa (IDA & IBRD countries)',
                       'Sub-Saharan Africa (excluding high income)',
                       'Upper middle income',
                       'World',
                       ]

# Drop the aggregate rows.
df_migration = df_migration[~df_migration['country'].isin(countries_to_remove)]

# Remove rows with missing values in the 'country' column.
df_migration = df_migration.dropna(subset=['country'])

### $\textbf{Population - Processamento}$

In [8]:
# Metadata columns that are not needed.
selected_columns = ['Series Name', 'Series Code', 'Country Code']

# Year headers look like '2000 [YR2000]'; map each one (2000-2025) to the
# bare year so it can later be cast to int.
year_labels = {f'{year} [YR{year}]': str(year) for year in range(2000, 2026)}

# Drop metadata, normalise headers, then go wide -> long (one row per
# country and year) ordered by country and year.
df_population = (
    df_population
    .drop(columns=selected_columns)
    .rename(columns={'Country Name': 'country', **year_labels})
    .melt(id_vars='country', var_name='year', value_name='population')
    .sort_values(by=['country', 'year'])
)

# Harmonise country names with the spelling used by the other datasets.
df_population['country'] = df_population['country'].replace({
    'Bahamas, The': 'Bahamas',
    'Egypt, Arab Rep.': 'Egypt',
    'Micronesia, Fed. Sts.': 'Micronesia',
    'Sint Maarten (Dutch part)': 'Sint Maarten',
    'St. Martin (French part)': 'St. Martin',
    'Venezuela, RB': 'Venezuela',
    'Yemen, Rep.': 'Yemen',
    'Caribbean small states': 'Caribbean',
    'Gambia, The': 'Gambia',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Congo, Rep.': 'Republic of the Congo',
    'Macao SAR, China': 'Macao SAR',
    'Korea, Rep.': 'South Korea',
    'Korea, Dem. People\'s Rep.': 'Korea',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Viet Nam': 'Vietnam',
})

# Regional / income-group aggregates that are not countries.
# The previous list was missing a comma between 'Pacific island small states'
# and 'Post-demographic dividend', so Python concatenated them into a single
# string that matched nothing. 'Post-demographic dividend' is now a real
# entry. 'Pacific island small states' is deliberately NOT listed: it was
# never actually removed, and the merge cell renames it to 'Pacific Islands'.
countries_to_remove = ['Africa Eastern and Southern',
                       'Africa Western and Central',
                       'Central Europe and the Baltics',
                       'Early-demographic dividend',
                       'East Asia & Pacific',
                       'East Asia & Pacific (IDA & IBRD countries)',
                       'East Asia & Pacific (excluding high income)',
                       'Euro area',
                       'Europe & Central Asia',
                       'Europe & Central Asia (IDA & IBRD countries)',
                       'Europe & Central Asia (excluding high income)',
                       'European Union',
                       'Fragile and conflict affected situations',
                       'Heavily indebted poor countries (HIPC)',
                       'High income',
                       'IBRD only',
                       'IDA & IBRD total',
                       'IDA blend',
                       'IDA only',
                       'IDA total',
                       'Latin America & the Caribbean (IDA & IBRD countries)',
                       'Latin America & Caribbean (excluding high income)',
                       'Least developed countries: UN classification',
                       'Low & middle income',
                       'Low income',
                       'Lower middle income',
                       'Middle East & North Africa',
                       'Middle East & North Africa (IDA & IBRD countries)',
                       'Middle East & North Africa (excluding high income)',
                       'Middle income',
                       'Not classified',
                       'OECD members',
                       'Other small states',
                       'Post-demographic dividend',
                       'Pre-demographic dividend',
                       'Small states',
                       'South Asia',
                       'South Asia (IDA & IBRD)',
                       'Sub-Saharan Africa (IDA & IBRD countries)',
                       'Sub-Saharan Africa (excluding high income)',
                       'Upper middle income',
                       'World',
                       ]

# Drop the aggregate rows.
df_population = df_population[~df_population['country'].isin(countries_to_remove)]

# Remove rows with missing values in the 'country' column.
df_population = df_population.dropna(subset=['country'])

### $\textbf{Tax - Processamento}$

In [9]:
# Metadata columns that are not needed.
columns_to_drop = ['Country Code', 'Indicator Code', 'Indicator Name']

# Drop metadata, rename the country column, go wide -> long (one row per
# country and year), order by country and year, and discard the
# (country, year) pairs that have no tax value.
df_tax = (
    df_tax
    .drop(columns=columns_to_drop)
    .rename(columns={'Country Name': 'country'})
    .melt(id_vars='country', var_name='year', value_name='tax')
    .sort_values(by=['country', 'year'])
    .dropna(subset=['tax'])
)

# Harmonise country names with the spelling used by the other datasets.
# NOTE(review): the removal list below matches the one used for the
# Migration/Population files, so this file presumably uses the same country
# labels; the second group of keys mirrors their mapping so those countries
# merge onto the same rows. Keys that do not occur in the data are no-ops.
df_tax['country'] = df_tax['country'].replace({
    'China, People\'s Republic of': 'China',
    'Gambia, The': 'Gambia',
    'Micronesia, Fed. States of': 'Micronesia',
    'South Sudan, Republic of': 'South Sudan',
    'Taiwan Province of China': 'Taiwan',
    # Target was 'Türkiye'; the Covid/GDP/Inflation cells use 'Turkiye'.
    'Türkiye, Republic of': 'Turkiye',
    'Korea, Republic of': 'South Korea',
    # NOTE(review): other cells map the Republic of the Congo separately;
    # confirm which country a bare 'Congo' denotes in this file.
    'Congo': 'Democratic Republic of the Congo',
    'Caribbean small states': 'Caribbean',
    # Same normalisation as the Migration/Population cells.
    'Bahamas, The': 'Bahamas',
    'Egypt, Arab Rep.': 'Egypt',
    'Micronesia, Fed. Sts.': 'Micronesia',
    'Sint Maarten (Dutch part)': 'Sint Maarten',
    'St. Martin (French part)': 'St. Martin',
    'Venezuela, RB': 'Venezuela',
    'Yemen, Rep.': 'Yemen',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Macao SAR, China': 'Macao SAR',
    'Korea, Rep.': 'South Korea',
    'Korea, Dem. People\'s Rep.': 'Korea',
    'Congo, Rep.': 'Republic of the Congo',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Viet Nam': 'Vietnam',
})

# Regional / income-group aggregates that are not countries.
# The previous list was missing a comma between 'Pacific island small states'
# and 'Post-demographic dividend', so Python concatenated them into a single
# string that matched nothing. 'Post-demographic dividend' is now a real
# entry. 'Pacific island small states' is deliberately NOT listed: it was
# never actually removed, and the merge cell renames it to 'Pacific Islands'.
countries_to_remove = ['Africa Eastern and Southern',
                       'Africa Western and Central',
                       'Central Europe and the Baltics',
                       'Early-demographic dividend',
                       'East Asia & Pacific',
                       'East Asia & Pacific (IDA & IBRD countries)',
                       'East Asia & Pacific (excluding high income)',
                       'Euro area',
                       'Europe & Central Asia',
                       'Europe & Central Asia (IDA & IBRD countries)',
                       'Europe & Central Asia (excluding high income)',
                       'European Union',
                       'Fragile and conflict affected situations',
                       'Heavily indebted poor countries (HIPC)',
                       'High income',
                       'IBRD only',
                       'IDA & IBRD total',
                       'IDA blend',
                       'IDA only',
                       'IDA total',
                       'Latin America & the Caribbean (IDA & IBRD countries)',
                       'Latin America & Caribbean (excluding high income)',
                       'Least developed countries: UN classification',
                       'Low & middle income',
                       'Low income',
                       'Lower middle income',
                       'Middle East & North Africa',
                       'Middle East & North Africa (IDA & IBRD countries)',
                       'Middle East & North Africa (excluding high income)',
                       'Middle income',
                       'Not classified',
                       'OECD members',
                       'Other small states',
                       'Post-demographic dividend',
                       'Pre-demographic dividend',
                       'Small states',
                       'South Asia',
                       'South Asia (IDA & IBRD)',
                       'Sub-Saharan Africa (IDA & IBRD countries)',
                       'Sub-Saharan Africa (excluding high income)',
                       'Upper middle income',
                       'World',
                       ]

# Drop the aggregate rows.
df_tax = df_tax[~df_tax['country'].isin(countries_to_remove)]

# Remove rows with missing values in the 'country' column.
df_tax = df_tax.dropna(subset=['country'])

### $\textbf{Unemployment - Processamento}$

In [10]:
# Metadata columns that are not needed.
columns_to_drop = ['Country Code', 'Indicator Code', 'Indicator Name']

# Drop metadata, rename the country column, go wide -> long (one row per
# country and year) and order by country and year.
df_unemployment = (
    df_unemployment
    .drop(columns=columns_to_drop)
    .rename(columns={'Country Name': 'country'})
    .melt(id_vars='country', var_name='year', value_name='unemployment')
    .sort_values(by=['country', 'year'])
)

# Harmonise country names with the spelling used by the other datasets.
# NOTE(review): the removal list below matches the one used for the
# Migration/Population files, so this file presumably uses the same country
# labels; the second group of keys mirrors their mapping so those countries
# merge onto the same rows. Keys that do not occur in the data are no-ops.
df_unemployment['country'] = df_unemployment['country'].replace({
    'China, People\'s Republic of': 'China',
    'Gambia, The': 'Gambia',
    'Micronesia, Fed. States of': 'Micronesia',
    'South Sudan, Republic of': 'South Sudan',
    'Taiwan Province of China': 'Taiwan',
    # Target was 'Türkiye'; the Covid/GDP/Inflation cells use 'Turkiye'.
    'Türkiye, Republic of': 'Turkiye',
    'Caribbean small states': 'Caribbean',
    'Hong Kong SAR': 'Hong Kong',
    # NOTE(review): other cells map the Republic of the Congo separately;
    # confirm which country a bare 'Congo' denotes in this file.
    'Congo': 'Democratic Republic of the Congo',
    'Viet Nam': 'Vietnam',
    # Same normalisation as the Migration/Population cells.
    'Bahamas, The': 'Bahamas',
    'Egypt, Arab Rep.': 'Egypt',
    'Micronesia, Fed. Sts.': 'Micronesia',
    'Sint Maarten (Dutch part)': 'Sint Maarten',
    'St. Martin (French part)': 'St. Martin',
    'Venezuela, RB': 'Venezuela',
    'Yemen, Rep.': 'Yemen',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Macao SAR, China': 'Macao SAR',
    'Korea, Rep.': 'South Korea',
    'Korea, Dem. People\'s Rep.': 'Korea',
    'Congo, Rep.': 'Republic of the Congo',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
})

# Regional / income-group aggregates that are not countries.
# The previous list was missing a comma between 'Pacific island small states'
# and 'Post-demographic dividend', so Python concatenated them into a single
# string that matched nothing. 'Post-demographic dividend' is now a real
# entry. 'Pacific island small states' is deliberately NOT listed: it was
# never actually removed, and the merge cell renames it to 'Pacific Islands'.
# The stray 'ther small states' entry (a truncated duplicate of
# 'Other small states') has been removed.
countries_to_remove = ['Africa Eastern and Southern',
                       'Africa Western and Central',
                       'Central Europe and the Baltics',
                       'Early-demographic dividend',
                       'East Asia & Pacific',
                       'East Asia & Pacific (IDA & IBRD countries)',
                       'East Asia & Pacific (excluding high income)',
                       'Euro area',
                       'Europe & Central Asia',
                       'Europe & Central Asia (IDA & IBRD countries)',
                       'Europe & Central Asia (excluding high income)',
                       'European Union',
                       'Fragile and conflict affected situations',
                       'Heavily indebted poor countries (HIPC)',
                       'High income',
                       'IBRD only',
                       'IDA & IBRD total',
                       'IDA blend',
                       'IDA only',
                       'IDA total',
                       'Latin America & the Caribbean (IDA & IBRD countries)',
                       'Latin America & Caribbean (excluding high income)',
                       'Least developed countries: UN classification',
                       'Low & middle income',
                       'Low income',
                       'Lower middle income',
                       'Middle East & North Africa',
                       'Middle East & North Africa (IDA & IBRD countries)',
                       'Middle East & North Africa (excluding high income)',
                       'Middle income',
                       'Not classified',
                       'OECD members',
                       'Other small states',
                       'Post-demographic dividend',
                       'Pre-demographic dividend',
                       'Small states',
                       'South Asia',
                       'South Asia (IDA & IBRD)',
                       'Sub-Saharan Africa (IDA & IBRD countries)',
                       'Sub-Saharan Africa (excluding high income)',
                       'Upper middle income',
                       'World',
                       ]

# Drop the aggregate rows.
df_unemployment = df_unemployment[~df_unemployment['country'].isin(countries_to_remove)]

# Remove rows with missing values in the 'country' column.
df_unemployment = df_unemployment.dropna(subset=['country'])

### $\textbf{Merge Files}$

In [11]:
def _prepare_for_merge(frame, first_year=2011, last_year=2023):
    """Return a copy of ``frame`` ready to be merged on (country, year).

    The 'country' column is cast to str and 'year' to int so the merge keys
    have the same dtype in every dataset, and only rows whose year lies in
    ``first_year``..``last_year`` (both inclusive) are kept.

    ``assign`` builds a new frame instead of writing into the filtered
    slices produced by the earlier cells, which avoids
    SettingWithCopyWarning.

    Raises:
        ValueError: if a 'year' value cannot be converted to int.
    """
    typed = frame.assign(country=frame['country'].astype(str),
                         year=frame['year'].astype(int))
    return typed[typed['year'].between(first_year, last_year)]


# Align key dtypes and keep the years 2011-2023 in every dataset.
(df_inflation, df_covid, df_tax, df_migration,
 df_population, df_gdp, df_unemployment) = (
    _prepare_for_merge(frame)
    for frame in (df_inflation, df_covid, df_tax, df_migration,
                  df_population, df_gdp, df_unemployment)
)

# Outer-merge everything on (country, year). The order determines the column
# order of the output file, so it is kept as: inflation, covid, migration,
# population, GDP, unemployment, tax.
merged_data = df_inflation
for other in (df_covid, df_migration, df_population,
              df_gdp, df_unemployment, df_tax):
    merged_data = merged_data.merge(other, on=['country', 'year'], how='outer')

# Sort the merged data by country and then by year.
merged_data = merged_data.sort_values(by=['country', 'year'])

# Countries, territories and aggregates excluded from the final file.
# (The name says "columns" but these are values of the 'country' column.)
columns_to_drop = ['t. Lucia',
                   'ali',
                   'alau',
                   'Anguilla',
                   'Bonaire Sint Eustatius and Saba',
                   'Cook Islands',
                   'American Samoa',
                   'Arab World',
                   'European Union',
                   'Faeroe Islands',
                   'Falkland Islands',
                   'Faroe Islands',
                   'French Guiana',
                   'Guadeloupe',
                   'Guernsey',
                   'High income',
                   'Isle of Man',
                   'Jersey',
                   'Kyrgyzstan',
                   'Late-demographic dividend',
                   'Latin America & Caribbean',
                   'Low income',
                   'Macao',
                   'Martinique',
                   'Mayotte',
                   'Montserrat',
                   'Niue',
                   'North America',
                   'North Korea',
                   'Northern Cyprus',
                   'Northern Ireland',
                   'Oceania',
                   'Western Sahara',
                   'Wales',
                   'Wallis and Futuna',
                   'Vatican',
                   'Tokelau',
                   'Sub-Saharan Africa',
                   'Sub-Saharan Africa (Region) ',
                   'St. Lucia',
                   'St. Martin',
                   'Sint Maarten',
                   'Sint Maarten (Dutch part)',
                   'Scotland',
                   'Saint Martin',
                   'Saint Lucia',
                   'Saint Helena',
                   'Reunion',
                   'Post-demographic dividend',
                   'Pitcairn',
                   'Palestine',
                   'England',
                   'Saint Pierre and Miquelon',
                   'Saint Barthelemy',
                   'South America',
                   'St. Martin (French part)'
                   ]

# Filter first, then rename: rows already labelled 'North Korea' are dropped
# above, and the rows labelled 'Korea' are renamed to 'North Korea' below.
# .copy() gives an independent frame so the assignment does not warn.
df = merged_data[~merged_data['country'].isin(columns_to_drop)].copy()

df['country'] = df['country'].replace({'Korea': 'North Korea',
                                       'Pacific island small states': 'Pacific Islands'})

# Treat the literal 'no data' placeholder as missing, then restore the
# country/year ordering after the renames.
df = df.replace('no data', np.nan).sort_values(by=['country', 'year'])

# Save the merged dataframe to a new CSV file.
df.to_csv('FinalFile.csv', index=False)

In [12]:
# Report the total wall-clock runtime in seconds, measured from the
# `inicio` timestamp captured near the top of the notebook.
fim = time.time()
final = fim - inicio
print(final)

4.421375751495361
