In [230]:
import pandas as pd
import numpy as np
import glob

In [231]:
df = pd.read_csv('estat_tour_dem_ttw.csv')
df.columns = df.columns.str.strip()
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df.head()

Unnamed: 0,freq,c_dest,purpose,duration,unit,geo,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,A,AFR,PER,N1-3,NR,AT,: u,: u,:,: u,: u,: u,: u,: u,:,: u,: u,: u
1,A,AFR,PER,N1-3,NR,BE,: u,: u,: u,: u,: u,: u,: u,: u,: u,: u,: u,: u
2,A,AFR,PER,N1-3,NR,BG,:,:,:,:,:,:,:,: u,:,:,:,: u
3,A,AFR,PER,N1-3,NR,CH,:,: u,: u,:,:,:,:,: u,:,:,: u,:
4,A,AFR,PER,N1-3,NR,CY,: u,: u,: u,: u,: u,:,: u,: u,: u,:,:,: u


In [232]:
path = '../ESTAT_CODELISTS/*.tsv'
files = glob.glob(path)
dataframes = {file.split('/')[-1].replace('.tsv', ''): pd.read_csv(
    file, sep='\t', on_bad_lines='error') for file in files}

In [233]:
for col in ["freq", "unit", "c_dest", "purpose", "duration", "geo"]:
    curr_codelist = dataframes[f"ESTAT_{col.upper()}"]
    df[col] = df[col].map(curr_codelist.set_index('CODE')['Label'])

df.rename(columns={'geo': 'country'}, inplace=True)
df.head()

Unnamed: 0,freq,c_dest,purpose,duration,unit,country,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Austria,: u,: u,:,: u,: u,: u,: u,: u,:,: u,: u,: u
1,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Belgium,: u,: u,: u,: u,: u,: u,: u,: u,: u,: u,: u,: u
2,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Bulgaria,:,:,:,:,:,:,:,: u,:,:,:,: u
3,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Switzerland,:,: u,: u,:,:,:,:,: u,:,:,: u,:
4,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Cyprus,: u,: u,: u,: u,: u,:,: u,: u,: u,:,:,: u


In [234]:
df.replace(": z", np.nan, inplace=True)
df.replace(": u", np.nan, inplace=True)
df.replace(":", np.nan, inplace=True)

In [235]:
year_cols = [col for col in df.columns if col.isdigit()]

for col in year_cols:
    df[col] = df[col].astype(str).str.extract(r'(\d+)').astype(float)

In [236]:
df.head()

Unnamed: 0,freq,c_dest,purpose,duration,unit,country,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Austria,,,,,,,,,,,,
1,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Belgium,,,,,,,,,,,,
2,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Bulgaria,,,,,,,,,,,,
3,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Switzerland,,,,,,,,,,,,
4,Annual,Africa,Personal reasons,From 1 to 3 nights,Number,Cyprus,,,,,,,,,,,,


In [237]:
for col in df.columns:
    if not col.isdigit() and df[col].nunique() == 1:
        df.drop(columns=[col], inplace=True)

In [238]:
df.drop(columns=['duration'], inplace=True)

In [239]:
df = df[df['purpose'] == "Total"]
df.drop(columns=['purpose'], inplace=True)

In [240]:
df.head()

Unnamed: 0,c_dest,country,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
286,Africa,Austria,,,,,,,,,,,,
287,Africa,Belgium,,,,,,,42919.0,43468.0,,,,
288,Africa,Bulgaria,,,,,,,,,,,,
289,Africa,Switzerland,,,,,,,,,,,,
290,Africa,Cyprus,,,,,,,,,,,,


In [241]:
data = []

for country in df['country'].unique():
	for year in year_cols:
		inbound = df[df['c_dest'] == country][year].sum()
		outbound = df[(df['country'] == country) & (df['c_dest']=="All countries of the world")][year].sum()
		data.append({"country": country, "year": year, "inbound": inbound, "outbound": outbound})

total = pd.DataFrame(data)
total
    

Unnamed: 0,country,year,inbound,outbound
0,Austria,2012,167917783.0,43844400.0
1,Austria,2013,172162182.0,44802455.0
2,Austria,2014,158960091.0,44940474.0
3,Austria,2015,158779952.0,43430850.0
4,Austria,2016,170161936.0,47122308.0
...,...,...,...,...
427,Serbia,2019,0.0,0.0
428,Serbia,2020,0.0,0.0
429,Serbia,2021,0.0,0.0
430,Serbia,2022,0.0,0.0


In [242]:
total.country.unique()

array(['Austria', 'Belgium', 'Bulgaria', 'Switzerland', 'Cyprus',
       'Czechia', 'Germany', 'Denmark',
       'Euro area – 20 countries (from 2023)', 'Estonia', 'Greece',
       'Spain', 'European Union - 27 countries (from 2020)', 'Finland',
       'France', 'Croatia', 'Hungary', 'Ireland', 'Italy', 'Lithuania',
       'Luxembourg', 'Latvia', 'Malta', 'Netherlands', 'Norway', 'Poland',
       'Portugal', 'Romania', 'Sweden', 'Slovenia', 'Slovakia',
       'United Kingdom', 'Albania', 'Montenegro', 'North Macedonia',
       'Serbia'], dtype=object)

In [243]:
total[['inbound', 'outbound']] = total[['inbound', 'outbound']].replace(0, np.nan)

In [244]:
total = total[(total.country != "Euro area - 20 countries (from 2023)") & (total.country != "European Union - 27 countries (from 2020)")]
total

Unnamed: 0,country,year,inbound,outbound
0,Austria,2012,167917783.0,43844400.0
1,Austria,2013,172162182.0,44802455.0
2,Austria,2014,158960091.0,44940474.0
3,Austria,2015,158779952.0,43430850.0
4,Austria,2016,170161936.0,47122308.0
...,...,...,...,...
427,Serbia,2019,,
428,Serbia,2020,,
429,Serbia,2021,,
430,Serbia,2022,,


In [245]:
total.to_csv("./clean/estat_tour_dem_ttw.csv", index=False)