# Preprocessing the Datasets


## Importing the Libraries

In [31]:
import pandas as pd
import os


In [32]:
# Base folder for the dataset files; a later cell builds the output folder from it.
path = "Datasets/"

# Locations of the three raw tab-separated exports.
extot_path = "Datasets/Raw/estat_tour_dem_extot.tsv"
tnage_path = "Datasets/Raw/estat_tour_dem_tnage.tsv"
ttsex_path = "Datasets/Raw/estat_tour_dem_ttsex.tsv"

# Read every file in full (not just a preview) as a tab-separated table.
extot_raw, tnage_raw, ttsex_raw = [
    pd.read_csv(tsv_file, sep="\t")
    for tsv_file in (extot_path, tnage_path, ttsex_path)
]


In [33]:

# Report the (rows, columns) size of each raw table.
print("Shapes:")
for label, frame in (("ttsex", ttsex_raw), ("extot", extot_raw), ("tnage", tnage_raw)):
    print(label + ":", frame.shape)

Shapes:
ttsex: (4679, 13)
extot: (31989, 13)
tnage: (14221, 13)


In [34]:
# === Split dimension column into multiple columns (Eurostat style) ===
def split_dimensions(df):
    """Split the packed first column into one column per dimension.

    The header of the first column is read as a comma-separated list of
    dimension names, and every cell of that column as the matching
    comma-separated codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table whose first column packs the dimensions. The frame is
        left untouched, so the raw table can still be inspected or re-used
        after this call.

    Returns
    -------
    tuple[pandas.DataFrame, list[str]]
        A new frame without the packed column and with the dimension
        columns appended after the remaining ones, and the list of
        dimension names taken from the header.

    Raises
    ------
    ValueError
        If the cells split into a different number of parts than the
        header names.
    """
    first_col = df.columns[0]
    dims = first_col.split(",")
    parts = df[first_col].str.split(",", expand=True)
    if parts.shape[1] != len(dims):
        raise ValueError(
            f"Column {first_col!r} names {len(dims)} dimensions but its "
            f"values split into {parts.shape[1]} parts."
        )
    # Drop first so the new columns are written to a fresh frame instead of
    # being added to the caller's table.
    result = df.drop(columns=[first_col])
    result[dims] = parts
    return result, dims

# Unpack the dimension column of every raw table, keeping each table's
# dimension names for the reshaping step.
(ttsex, dims_ttsex), (extot, dims_extot), (tnage, dims_tnage) = map(
    split_dimensions, (ttsex_raw, extot_raw, tnage_raw)
)

# Show which dimensions each table carries.
for tag, names in (("ttsex", dims_ttsex), ("extot", dims_extot), ("tnage", dims_tnage)):
    print(f"Dims ({tag}):", names)


Dims (ttsex): ['freq', 'c_dest', 'purpose', 'duration', 'sex', 'unit', 'geo\\TIME_PERIOD']
Dims (extot): ['freq', 'purpose', 'duration', 'c_dest', 'expend', 'statinfo', 'unit', 'geo\\TIME_PERIOD']
Dims (tnage): ['freq', 'purpose', 'c_dest', 'duration', 'age', 'unit', 'geo\\TIME_PERIOD']


In [35]:
# === Melt wide year-columns into long format ===
def melt_long(df, dims):
    """Reshape a wide table (one column per period) into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table holding the identifier columns named in ``dims`` plus
        one column per period. It is not modified.
    dims : list[str]
        Names of the identifier columns to keep on every output row.

    Returns
    -------
    pandas.DataFrame
        One row per (identifiers, period) pair with the columns ``dims``,
        ``"year"`` (the former column header, whitespace-trimmed) and
        ``"value"`` (the cell content, unchanged).
    """
    id_columns = set(dims)
    # Period headers read from the TSV may carry padding (e.g. "2012 ").
    # Left in place it would end up in every "year" value, and from there in
    # join keys and exported files. Identifier names are deliberately not
    # touched so they keep matching ``dims``.
    tidy = df.rename(
        columns=lambda name: name.strip()
        if isinstance(name, str) and name not in id_columns
        else name
    )
    return tidy.melt(id_vars=dims, var_name="year", value_name="value")

# Reshape each table to long format using its own dimension names.
ttsex_long, extot_long, tnage_long = (
    melt_long(table, dim_names)
    for table, dim_names in (
        (ttsex, dims_ttsex),
        (extot, dims_extot),
        (tnage, dims_tnage),
    )
)

# Preview the first rows of one reshaped table.
ttsex_long.head()


Unnamed: 0,freq,c_dest,purpose,duration,sex,unit,geo\TIME_PERIOD,year,value
0,A,DOM,PER,N1-3,F,NR,AL,2012,:
1,A,DOM,PER,N1-3,F,NR,AT,2012,3155661
2,A,DOM,PER,N1-3,F,NR,BE,2012,756712
3,A,DOM,PER,N1-3,F,NR,BG,2012,1029215 b
4,A,DOM,PER,N1-3,F,NR,CH,2012,2156668


In [36]:
# === Clean numeric values (remove flags like 'b', 'u', 'p') ===
def clean_values(df):
    """Convert the ``value`` column to numbers and drop rows without one.

    Each cell is expected to hold a number optionally followed by flag
    letters (for example ``"1029215 b"``), or a placeholder such as ``":"``
    when no figure is available.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with a ``value`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``value`` numeric and every row lacking a number
        removed. The original index labels of the kept rows are preserved.
    """
    raw = df["value"]
    if pd.api.types.is_numeric_dtype(raw):
        # Already numeric: a text round-trip would strip minus signs and
        # mangle exponent notation, so take the values as they are.
        numeric = pd.to_numeric(raw, errors="coerce")
    else:
        # Pull out the first signed decimal number and ignore whatever
        # surrounds it (flags, padding). Deleting every non-digit character
        # instead would also delete the minus sign of negative figures.
        token = raw.astype(str).str.extract(
            r"([-+]?(?:\d+\.?\d*|\.\d+))", expand=False
        )
        numeric = pd.to_numeric(token, errors="coerce")
    # assign() builds a new frame, leaving the caller's table as it was.
    return df.assign(value=numeric).dropna(subset=["value"])

# Turn the value column of every long table into numbers, dropping rows without one.
ttsex_clean, extot_clean, tnage_clean = map(
    clean_values, (ttsex_long, extot_long, tnage_long)
)


In [37]:
# === Standardize column names ===
def clean_columns(df):
    """Return a copy of ``df`` with normalised column names.

    Names are trimmed of surrounding whitespace, lower-cased, and have
    spaces and forward slashes replaced by underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column names should be normalised. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` carrying the normalised column names.
    """
    # NOTE(review): backslashes are not replaced, so a header such as
    # "geo\TIME_PERIOD" becomes "geo\time_period". Left as is because files
    # already written with that name may be read elsewhere -- confirm before
    # changing.
    tidy_names = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "_", regex=False)
    )
    # Relabel a copy so the caller's frame keeps its original headers.
    renamed = df.copy()
    renamed.columns = tidy_names
    return renamed

# Normalise the column names of every cleaned table.
ttsex_clean, extot_clean, tnage_clean = [
    clean_columns(frame) for frame in (ttsex_clean, extot_clean, tnage_clean)
]


In [39]:
# === 1. Save separate cleaned datasets (for Neo4j KG import) ===
# The output folder sits under the dataset base folder; create it when missing.
processed_path = os.path.join(path, "processed")
os.makedirs(processed_path, exist_ok=True)

# File name -> cleaned table, listed in write order.
exports = {
    "extot_clean.csv": extot_clean,
    "ttsex_clean.csv": ttsex_clean,
    "tnage_clean.csv": tnage_clean,
}
for file_name, table in exports.items():
    table.to_csv(os.path.join(processed_path, file_name), index=False)

# Confirm what was written.
print("✅ Separate cleaned datasets saved for Neo4j:")
for file_name in exports:
    print(" - " + file_name)


✅ Separate cleaned datasets saved for Neo4j:
 - extot_clean.csv
 - ttsex_clean.csv
 - tnage_clean.csv


In [40]:
# === 2. Create merged dataset (for ML models) ===

# Give each table's measure its own name so all three can sit side by side
# after the merge.
extot_ml = extot_clean.rename(columns={"value": "expenditure_value"})
ttsex_ml = ttsex_clean.rename(columns={"value": "sex_value"})
tnage_ml = tnage_clean.rename(columns={"value": "age_value"})

# Columns present in all three tables become the merge keys. They are listed
# in extot's column order instead of being taken from a set: the iteration
# order of a set of strings can differ between kernel sessions, and an outer
# merge sorts its rows by the keys in the order given, so a set-derived list
# made the row order of the saved file vary from run to run.
common_keys = [
    col
    for col in extot_ml.columns
    if col in ttsex_ml.columns and col in tnage_ml.columns
]
if not common_keys:
    raise ValueError("The three datasets share no columns to merge on.")

print("Common merge keys:", common_keys)

# Merge all three on the common keys; outer joins keep rows that appear in
# only one or two of the tables.
merged = extot_ml.merge(
    ttsex_ml, on=common_keys, how="outer", suffixes=("", "_sex")
)
merged = merged.merge(
    tnage_ml, on=common_keys, how="outer", suffixes=("", "_age")
)

# Save merged dataset
merged.to_csv(os.path.join(processed_path, "tourism_merged.csv"), index=False)

print("✅ Merged dataset for ML saved as tourism_merged.csv")
print("Merged dataset shape:", merged.shape)
print(merged.head())


Common merge keys: ['unit', 'c_dest', 'year', 'duration', 'geo\\time_period', 'freq', 'purpose']
✅ Merged dataset for ML saved as tourism_merged.csv
Merged dataset shape: (499442, 14)
  freq purpose duration c_dest expend statinfo unit geo\time_period   year  \
0    A     PER     N1-3    DOM  ACCOM  AVG_NGT  EUR              AT  2012    
1    A     PER     N1-3    DOM  ACCOM  AVG_TRP  EUR              AT  2012    
2    A     PER     N1-3    DOM  ACCOM    TOTAL  EUR              AT  2012    
3    A     PER     N1-3    DOM    DUR  AVG_NGT  EUR              AT  2012    
4    A     PER     N1-3    DOM    DUR  AVG_TRP  EUR              AT  2012    

   expenditure_value  sex  sex_value  age  age_value  
0       6.602000e+01  NaN        NaN  NaN        NaN  
1       1.327200e+02  NaN        NaN  NaN        NaN  
2       7.667658e+08  NaN        NaN  NaN        NaN  
3       1.670000e+00  NaN        NaN  NaN        NaN  
4       3.360000e+00  NaN        NaN  NaN        NaN  
