# Preprocessing the Datasets


## Libraries

Libraries needed for the preprocessing step.

In [87]:
import pandas as pd
import os

Paths to the raw dataset files that are used throughout the rest of the project.

In [None]:
# Root folder that holds every dataset used by the project.
path = "Datasets"

# Raw input files (tab-separated), one per dataset:
# - extot: expenditure figures (its key column carries `expend`, `statinfo` and `unit`).
# - tnage: figures broken down by age group.
# - ttsex: figures broken down by sex.
extot_path = "Datasets/Raw/estat_tour_dem_extot.tsv"
tnage_path = "Datasets/Raw/estat_tour_dem_tnage.tsv"
ttsex_path = "Datasets/Raw/estat_tour_dem_ttsex.tsv"

In [89]:
# Load each raw tab-separated file; the `*_raw` names hold the frames as read.
extot_raw, tnage_raw, ttsex_raw = (
    pd.read_csv(tsv_file, sep="\t")
    for tsv_file in (extot_path, tnage_path, ttsex_path)
)

# Independent working copies of the three raw frames.
extot, tnage, ttsex = (
    raw_frame.copy() for raw_frame in (extot_raw, tnage_raw, ttsex_raw)
)

Shape (rows, columns) of each raw dataset

In [90]:
# Report the (rows, columns) shape of every raw frame, one per line.
print(
    f"ttsex: {ttsex_raw.shape}",
    f"extot: {extot_raw.shape}",
    f"tnage: {tnage_raw.shape}",
    sep="\n",
)

ttsex: (4679, 13)
extot: (31989, 13)
tnage: (14221, 13)


As we can see, the datasets are quite large. We will need to preprocess them to make them more manageable and suitable for analysis.

In [91]:
# Preview the first rows of each raw frame: a title line followed by the
# plain-text rendering of `.head()`.
print("\nHead of extot:", extot_raw.head(), sep="\n")

print("\nHead of tnage:", tnage_raw.head(), sep="\n")

print("\nHead of ttsex:", ttsex_raw.head(), sep="\n")


Head of extot:
  freq,purpose,duration,c_dest,expend,statinfo,unit,geo\TIME_PERIOD   2012   \
0                A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,AL                     :    
1                A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,AT                 66.02    
2                A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,BE                 36.76    
3                A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,BG                 4.58 b   
4                A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,CH                 39.76    

    2013    2014    2015    2016     2017    2018    2019    2020    2021   \
0      :       :       :       :     2.38       :    3.88    9.51       :    
1  50.64   46.01   51.48   54.07    55.03   56.24   56.22   51.82   50.29    
2  39.50   37.12   37.13   34.41   39.76 b  34.96   37.88   37.23   40.52    
3   5.42    6.66    6.40    6.41     5.85    5.60    8.66    9.62   12.74    
4  47.91   46.39   53.58   52.03    53.80   52.26   55.43   60.78   70.47    

    2022    2023   
0      :       :   


In [92]:
def split_dimensions(df):
    """Split the packed first column into one column per dimension.

    The *name* of the first column is a comma-separated list of dimension
    names, and every cell of that column holds the matching comma-separated
    codes. This function expands those codes into separate columns.

    The input frame is not modified: a new frame is returned, so the caller's
    (raw) data stays exactly as it was loaded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the packed dimension codes.

    Returns
    -------
    tuple[pandas.DataFrame, list[str]]
        A new frame without the packed column and with one column per
        dimension appended after the remaining columns, and the list of
        dimension names in header order.

    Raises
    ------
    ValueError
        If the cells do not split into exactly as many fields as the header
        names.
    """
    first_col = df.columns[0]
    dims = first_col.split(",")

    # One positional column (0, 1, 2, ...) per comma-separated field.
    parts = df[first_col].str.split(",", expand=True)
    if parts.shape[1] != len(dims):
        raise ValueError(
            f"Column {first_col!r} names {len(dims)} dimensions but its "
            f"values split into {parts.shape[1]} fields"
        )

    # `drop` and `assign` both return new frames, so `df` is never written to.
    dimension_columns = {name: parts[position] for position, name in enumerate(dims)}
    return df.drop(columns=[first_col]).assign(**dimension_columns), dims

In [93]:
# Split the packed dimension column of every dataset.
# A copy of each raw frame is passed so the `*_raw` frames stay exactly as
# loaded even if `split_dimensions` writes columns onto its argument; this
# also keeps the cell safe to re-run.
ttsex, dims_ttsex = split_dimensions(ttsex_raw.copy())
extot, dims_extot = split_dimensions(extot_raw.copy())
tnage, dims_tnage = split_dimensions(tnage_raw.copy())

In [94]:
# List the dimension names recovered from each dataset's packed header.
print(
    f"Dims (ttsex): {dims_ttsex}",
    f"Dims (extot): {dims_extot}",
    f"Dims (tnage): {dims_tnage}",
    sep="\n",
)


Dims (ttsex): ['freq', 'c_dest', 'purpose', 'duration', 'sex', 'unit', 'geo\\TIME_PERIOD']
Dims (extot): ['freq', 'purpose', 'duration', 'c_dest', 'expend', 'statinfo', 'unit', 'geo\\TIME_PERIOD']
Dims (tnage): ['freq', 'purpose', 'c_dest', 'duration', 'age', 'unit', 'geo\\TIME_PERIOD']


In [95]:
def melt_long(df, dims):
    """Reshape a wide frame (one column per year) into long format.

    Every column that is not listed in ``dims`` is treated as a period column
    and becomes one row per (dimension combination, period).

    The period column labels are stripped of surrounding whitespace first.
    The raw header pads them (e.g. ``"2012 "``), and without stripping that
    padding ends up in every ``year`` value written downstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame containing the ``dims`` columns plus one column per period.
        It is not modified.
    dims : list[str]
        Names of the identifier columns to keep as-is.

    Returns
    -------
    pandas.DataFrame
        Long frame with the ``dims`` columns followed by ``year`` (the
        stripped period label) and ``value`` (the cell content, untouched).
    """
    # Only string labels outside `dims` are renamed; everything else is kept.
    stripped_labels = {
        label: label.strip()
        for label in df.columns
        if isinstance(label, str) and label not in dims
    }
    tidy = df.rename(columns=stripped_labels)  # returns a new frame
    df_long = tidy.melt(id_vars=dims, var_name="year", value_name="value")
    return df_long

In [96]:
# Reshape every dataset from wide (one column per year) to long format,
# pairing each frame with its own dimension list.
ttsex_long, extot_long, tnage_long = (
    melt_long(wide_frame, dimension_names)
    for wide_frame, dimension_names in (
        (ttsex, dims_ttsex),
        (extot, dims_extot),
        (tnage, dims_tnage),
    )
)

ttsex_long.head()

Unnamed: 0,freq,c_dest,purpose,duration,sex,unit,geo\TIME_PERIOD,year,value
0,A,DOM,PER,N1-3,F,NR,AL,2012,:
1,A,DOM,PER,N1-3,F,NR,AT,2012,3155661
2,A,DOM,PER,N1-3,F,NR,BE,2012,756712
3,A,DOM,PER,N1-3,F,NR,BG,2012,1029215 b
4,A,DOM,PER,N1-3,F,NR,CH,2012,2156668


Clean the numeric values: strip status flags such as 'b', 'u' or 'p', and drop entries that are not numbers (for example the ':' missing-value marker).


In [97]:
def clean_values(df):
    """Convert the ``value`` column to numbers and drop non-numeric rows.

    Each cell is rendered as text and the first numeric token is extracted
    (optional minus sign, digits, optional decimal part, optional exponent).
    Trailing status flags such as ``"b"`` in ``"4.58 b"`` are ignored. Cells
    with no numeric token, such as the ``":"`` missing marker, become NaN and
    their rows are removed.

    Unlike deleting every character outside ``0-9.``, this keeps the sign of
    negative numbers. It also leaves the input frame untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``value`` column holding raw cell text. Not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a numeric ``value`` column, restricted to the rows
        whose value could be parsed. The original index labels are kept.
    """
    number_pattern = r"(-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
    # expand=False with a single group yields a Series aligned with `df`.
    extracted = df["value"].astype(str).str.extract(number_pattern, expand=False)
    numeric = pd.to_numeric(extracted, errors="coerce")
    # `assign` builds a new frame, so the caller's `value` column is preserved.
    return df.assign(value=numeric).dropna(subset=["value"])

In [98]:
# Turn the raw `value` text of every long-format frame into numbers and
# keep only the rows that hold a usable number.
ttsex_clean, extot_clean, tnage_clean = map(
    clean_values, (ttsex_long, extot_long, tnage_long)
)


Standardize column names


In [99]:
def clean_columns(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Labels are stripped of surrounding whitespace and lower-cased; then every
    space and every forward slash is replaced by an underscore. Other
    characters, including backslashes, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten (the same object is returned).

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, now carrying the normalised labels.
    """
    labels = df.columns.str.strip()
    labels = labels.str.lower()
    for separator in (" ", "/"):
        # Literal replacement, not a regular expression.
        labels = labels.str.replace(separator, "_", regex=False)
    df.columns = labels
    return df


In [100]:
# Normalise the column labels of all three cleaned frames.
ttsex_clean, extot_clean, tnage_clean = (
    clean_columns(cleaned_frame)
    for cleaned_frame in (ttsex_clean, extot_clean, tnage_clean)
)


Here we save each cleaned dataset separately, so that they are ready for the rest of the project (Neo4j knowledge-graph import).


In [101]:
# Output folder for the processed CSV files; created if it does not exist yet.
processed_path = os.path.join(path, "processed")
os.makedirs(processed_path, exist_ok=True)

# Write every cleaned frame to its own CSV file, without the index column.
for csv_name, cleaned_frame in (
    ("extot_clean.csv", extot_clean),
    ("ttsex_clean.csv", ttsex_clean),
    ("tnage_clean.csv", tnage_clean),
):
    cleaned_frame.to_csv(os.path.join(processed_path, csv_name), index=False)

# Confirmation message listing the files in the order they were written.
print(
    "✅ Separate cleaned datasets saved for Neo4j:",
    " - extot_clean.csv",
    " - ttsex_clean.csv",
    " - tnage_clean.csv",
    sep="\n",
)


✅ Separate cleaned datasets saved for Neo4j:
 - extot_clean.csv
 - ttsex_clean.csv
 - tnage_clean.csv


I also create a single dataset that combines all three cleaned datasets.

In [102]:
# Give each dataset's measurement column its own name so the three
# measurements can sit side by side in one table.
extot_ml = extot_clean.rename(columns={"value": "expenditure_value"})
ttsex_ml = ttsex_clean.rename(columns={"value": "sex_value"})
tnage_ml = tnage_clean.rename(columns={"value": "age_value"})

# Columns shared by all three frames, taken in extot's column order.
# A set intersection has no stable order between kernel sessions (string
# hashing is randomised). That made the printed key list vary from run to
# run, and could vary the row order of the merged file, because an outer
# merge sorts on the keys in the order given.
common_keys = [
    column
    for column in extot_ml.columns
    if column in ttsex_ml.columns and column in tnage_ml.columns
]

print("Common merge keys:", common_keys)

# NOTE(review): `unit` is one of the shared keys. In the previews shown
# earlier, extot rows use unit EUR while ttsex rows use NR; rows with
# different units can never match, so the outer merge may mostly stack the
# datasets rather than align them. Confirm whether `unit` should be a key.
# NOTE(review): the non-key dimensions (expend/statinfo, sex, age) differ per
# dataset, so any key group that does match is combined many-to-many.
merged = extot_ml.merge(
    ttsex_ml, on=common_keys, how="outer", suffixes=("", "_sex")
)
merged = merged.merge(
    tnage_ml, on=common_keys, how="outer", suffixes=("", "_age")
)

# Save merged dataset
merged.to_csv(os.path.join(processed_path, "tourism_merged.csv"), index=False)

print("✅ Merged dataset for ML saved as tourism_merged.csv")
print("Merged dataset shape:", merged.shape)
print(merged.head())


Common merge keys: ['unit', 'c_dest', 'year', 'duration', 'geo\\time_period', 'freq', 'purpose']
✅ Merged dataset for ML saved as tourism_merged.csv
Merged dataset shape: (499442, 14)
  freq purpose duration c_dest expend statinfo unit geo\time_period   year  \
0    A     PER     N1-3    DOM  ACCOM  AVG_NGT  EUR              AT  2012    
1    A     PER     N1-3    DOM  ACCOM  AVG_TRP  EUR              AT  2012    
2    A     PER     N1-3    DOM  ACCOM    TOTAL  EUR              AT  2012    
3    A     PER     N1-3    DOM    DUR  AVG_NGT  EUR              AT  2012    
4    A     PER     N1-3    DOM    DUR  AVG_TRP  EUR              AT  2012    

   expenditure_value  sex  sex_value  age  age_value  
0       6.602000e+01  NaN        NaN  NaN        NaN  
1       1.327200e+02  NaN        NaN  NaN        NaN  
2       7.667658e+08  NaN        NaN  NaN        NaN  
3       1.670000e+00  NaN        NaN  NaN        NaN  
4       3.360000e+00  NaN        NaN  NaN        NaN  
