# Preprocessing the Datasets


## Importing the Libraries

In [12]:
import pandas as pd
import os


In [4]:
df_extot = pd.read_csv("Datasets/Raw/estat_tour_dem_extot.tsv", sep="\t")
df_extot.head()

Unnamed: 0,"freq,purpose,duration,c_dest,expend,statinfo,unit,geo\TIME_PERIOD",2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,"A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,AL",:,:,:,:,:,2.38,:,3.88,9.51,:,:,:
1,"A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,AT",66.02,50.64,46.01,51.48,54.07,55.03,56.24,56.22,51.82,50.29,57.58,67.50
2,"A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,BE",36.76,39.50,37.12,37.13,34.41,39.76 b,34.96,37.88,37.23,40.52,44.16,48.16
3,"A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,BG",4.58 b,5.42,6.66,6.40,6.41,5.85,5.60,8.66,9.62,12.74,16.82,18.54
4,"A,PER,N1-3,DOM,ACCOM,AVG_NGT,EUR,CH",39.76,47.91,46.39,53.58,52.03,53.80,52.26,55.43,60.78,70.47,72.51,:


In [5]:
df_tnage = pd.read_csv("Datasets/Raw/estat_tour_dem_tnage.tsv", sep="\t")
df_tnage.head()

Unnamed: 0,"freq,purpose,c_dest,duration,age,unit,geo\TIME_PERIOD",2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,"A,PER,DOM,N1-3,Y15-24,NR,AL",:,:,:,:,:,815878,:,773740,486390,:,:,:
1,"A,PER,DOM,N1-3,Y15-24,NR,AT",1716783,1881068,1711578,1577217,1956490,1837386,1871739,2044466,1543619,1889960,2886705,2207345
2,"A,PER,DOM,N1-3,Y15-24,NR,BE",194617 u,287394,194586,378535,214340,409604 b,520075,535349,513967,698793,617141,507323
3,"A,PER,DOM,N1-3,Y15-24,NR,BG",989104 b,881981,691500,467668,475164,911906,733618,631363,352139,457406,491589,563691
4,"A,PER,DOM,N1-3,Y15-24,NR,CH",958967 u,799593 u,694026 u,856549 u,1007354 u,666201 u,456367 u,898414 u,1169447 u,1211978 u,1633178 u,:


In [9]:
paths = {
    "extot": "Datasets/Raw/estat_tour_dem_extot.tsv",
    "tnpur": "Datasets/Raw/estat_tour_dem_tnpur.tsv",
    "ttls": "Datasets/Raw/estat_tour_dem_ttls.tsv",
    "tnage": "Datasets/Raw/estat_tour_dem_tnage.tsv",
    "tninc": "Datasets/Raw/estat_tour_dem_tninc.tsv",
    "ttsex": "Datasets/Raw/estat_tour_dem_ttsex.tsv"
}

# Metadata column definitions (based on Eurostat docs and inspection)
schemas = {
    "extot": ['freq','purpose','duration','c_dest','expend','statinfo','unit','geo'],
    "tnpur": ['freq','purpose','duration','unit','geo'],
    "ttls":  ['freq','duration','purpose','unit','geo'],
    "tnage": ['freq','age','purpose','duration','unit','geo'],
    "tninc": ['freq','income','purpose','duration','unit','geo'],
    "ttsex": ['freq','sex','purpose','duration','unit','geo']
}

## Preprocess Function

In [10]:
def preprocess(file_path, meta_cols):
    df = pd.read_csv(file_path, sep="\t")
    # Split metadata into parts
    parts = df[df.columns[0]].str.split(",", expand=True)
    # Pad if needed
    for i in range(len(parts.columns), len(meta_cols)):
        parts[i] = None
    df[meta_cols] = parts.iloc[:, :len(meta_cols)]
    # Year columns
    year_cols = [c for c in df.columns if c.strip().isdigit()]
    # Melt into long format
    df_long = df.melt(
        id_vars=meta_cols,
        value_vars=year_cols,
        var_name="year",
        value_name="value"
    )
    # Clean values
    df_long['year'] = df_long['year'].str.strip()
    df_long['value'] = df_long['value'].astype(str).str.replace(":", "", regex=False)
    df_long['value'] = df_long['value'].str.replace(r"[^\d\.\-]", "", regex=True)
    df_long['value'] = pd.to_numeric(df_long['value'], errors='coerce')
    df_long = df_long.dropna(subset=['value'])
    return df_long

Here we process all datasets with the preprocessing function


In [17]:
cleaned = {}
for name, path in paths.items():
    cleaned[name] = preprocess(path, schemas[name])


In [18]:
out_dir = "Datasets/Preprocessed"
os.makedirs(out_dir, exist_ok=True)
out_paths = {}
for name, df in cleaned_fixed.items():
    out_path = os.path.join(out_dir, f"{name}_cleaned.csv")
    df.to_csv(out_path, index=False)
    out_paths[name] = out_path

Overview of cleaned datasets


In [19]:
out_paths, cleaned["tnage"].head(10)

({'extot': 'Datasets/Preprocessed\\extot_cleaned.csv',
  'tnpur': 'Datasets/Preprocessed\\tnpur_cleaned.csv',
  'ttls': 'Datasets/Preprocessed\\ttls_cleaned.csv',
  'tnage': 'Datasets/Preprocessed\\tnage_cleaned.csv',
  'tninc': 'Datasets/Preprocessed\\tninc_cleaned.csv',
  'ttsex': 'Datasets/Preprocessed\\ttsex_cleaned.csv'},
    freq  age purpose duration    unit geo  year        value
 1     A  PER     DOM     N1-3  Y15-24  NR  2012    1716783.0
 2     A  PER     DOM     N1-3  Y15-24  NR  2012     194617.0
 3     A  PER     DOM     N1-3  Y15-24  NR  2012     989104.0
 4     A  PER     DOM     N1-3  Y15-24  NR  2012     958967.0
 5     A  PER     DOM     N1-3  Y15-24  NR  2012     252897.0
 6     A  PER     DOM     N1-3  Y15-24  NR  2012    4442405.0
 7     A  PER     DOM     N1-3  Y15-24  NR  2012   27422539.0
 8     A  PER     DOM     N1-3  Y15-24  NR  2012    7465010.0
 9     A  PER     DOM     N1-3  Y15-24  NR  2012  100835041.0
 10    A  PER     DOM     N1-3  Y15-24  NR  2012   

In [20]:
previews = {name: df.describe() for name, df in cleaned.items()}
previews

{'extot':               value
 count  2.181480e+05
 mean   1.137473e+09
 std    1.009742e+10
 min    0.000000e+00
 25%    2.052000e+01
 50%    1.089900e+02
 75%    2.311279e+07
 max    5.542959e+11,
 'tnpur':               value
 count  2.309800e+04
 mean   9.583558e+07
 std    3.833842e+08
 min    4.535000e+03
 25%    1.561194e+06
 50%    6.667105e+06
 75%    3.558578e+07
 max    5.717134e+09,
 'ttls':               value
 count  3.189600e+04
 mean   1.290318e+07
 std    5.936336e+07
 min    1.077000e+03
 25%    1.689805e+05
 50%    7.771140e+05
 75%    4.183584e+06
 max    1.143077e+09,
 'tnage':               value
 count  9.435900e+04
 mean   4.271674e+07
 std    2.019837e+08
 min    3.081000e+03
 25%    7.708255e+05
 50%    3.204786e+06
 75%    1.468253e+07
 max    5.717134e+09,
 'tninc':               value
 count  2.545100e+04
 mean   4.358925e+07
 std    1.291152e+08
 min    8.909000e+03
 25%    1.402792e+06
 50%    6.219921e+06
 75%    2.759901e+07
 max    1.498547e+09,
 'ttse

Here we export the cleaned datasets to CSV files and store the paths in a dictionary

In [21]:
export_paths = {}
for name, df in cleaned.items():
    path = os.path.join("Datasets/Preprocessed/", f"{name}_cleaned.csv")
    df.to_csv(path, index=False)
    export_paths[name] = path

export_paths


{'extot': 'Datasets/Preprocessed/extot_cleaned.csv',
 'tnpur': 'Datasets/Preprocessed/tnpur_cleaned.csv',
 'ttls': 'Datasets/Preprocessed/ttls_cleaned.csv',
 'tnage': 'Datasets/Preprocessed/tnage_cleaned.csv',
 'tninc': 'Datasets/Preprocessed/tninc_cleaned.csv',
 'ttsex': 'Datasets/Preprocessed/ttsex_cleaned.csv'}

We need to add unique IDs to each row in the datasets for better tracking and referencing. We will create a new column 'id' for each dataset, with a prefix based on the dataset name.

These ids are necessary for the knowledge graph construction because we need unique identifiers for each entry.

In [22]:
input_dir = "Datasets/Preprocessed"

# Output folder
output_dir = "Datasets/Preprocessed_withIDs"
os.makedirs(output_dir, exist_ok=True)

Function to add unique IDs based on dataset type

In [23]:
def add_ids(name, df):
    if name == "extot":
        df["trip_id"] = (
            df["year"].astype(str) + "_" +
            df["geo"].astype(str) + "_" +
            df["purpose"].astype(str) + "_" +
            df["duration"].astype(str) + "_" +
            df["expend"].astype(str) + "_" +
            df["statinfo"].astype(str)
        )
    elif name == "tnpur":
        df["purpose_id"] = (
            df["year"].astype(str) + "_" +
            df["geo"].astype(str) + "_" +
            df["purpose"].astype(str)
        )
    elif name == "ttls":
        df["duration_id"] = (
            df["year"].astype(str) + "_" +
            df["geo"].astype(str) + "_" +
            df["duration"].astype(str)
        )
    elif name == "tnage":
        df["agegroup_id"] = (
            df["year"].astype(str) + "_" +
            df["geo"].astype(str) + "_" +
            df["age"].astype(str)
        )
    elif name == "tninc":
        df["income_id"] = (
            df["year"].astype(str) + "_" +
            df["geo"].astype(str) + "_" +
            df["income"].astype(str)
        )
    elif name == "ttsex":
        df["sex_id"] = (
            df["year"].astype(str) + "_" +
            df["geo"].astype(str) + "_" +
            df["sex"].astype(str)
        )
    return df


We define the files that we need to preprocess

In [25]:
# Files that we want to process
files = {
    "extot": "extot_cleaned.csv",
    "tnpur": "tnpur_cleaned.csv",
    "ttls": "ttls_cleaned.csv",
    "tnage": "tnage_cleaned.csv",
    "tninc": "tninc_cleaned.csv",
    "ttsex": "ttsex_cleaned.csv"
}


In [26]:
# Process and save
for name, fname in files.items():
    path = os.path.join(input_dir, fname)
    df = pd.read_csv(path)
    df = add_ids(name, df)
    out_path = os.path.join(output_dir, fname.replace("_cleaned", "_withID"))
    df.to_csv(out_path, index=False)
    print(f"Saved {out_path}")


Saved Datasets/Preprocessed_withIDs\extot_withID.csv
Saved Datasets/Preprocessed_withIDs\tnpur_withID.csv
Saved Datasets/Preprocessed_withIDs\ttls_withID.csv
Saved Datasets/Preprocessed_withIDs\tnage_withID.csv
Saved Datasets/Preprocessed_withIDs\tninc_withID.csv
Saved Datasets/Preprocessed_withIDs\ttsex_withID.csv
