In [11]:
import pandas as pd
import os
import pickle
import numpy as np
import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load, combine and delete original csv's

In [12]:
# List of file names
files = [
    "barcelona_weekdays.csv", "budapest_weekends.csv", "london_weekends.csv", "vienna_weekdays.csv",
    "amsterdam_weekdays.csv", "barcelona_weekends.csv", "paris_weekdays.csv", "vienna_weekends.csv",
    "amsterdam_weekends.csv", "berlin_weekdays.csv", "lisbon_weekdays.csv", "paris_weekends.csv",
    "athens_weekdays.csv", "berlin_weekends.csv", "lisbon_weekends.csv", "rome_weekdays.csv",
    "athens_weekends.csv", "budapest_weekdays.csv", "london_weekdays.csv", "rome_weekends.csv"
]

# Directory containing files
directory = "../data/"

# Initialize an empty list to hold dataframes
dataframes = []

# Read each file and append to the list
for file in files:
    # Create the full path to the file
    file_path = os.path.join(directory, file)
    # Read the CSV file
    df = pd.read_csv(file_path)
    # Add a column to identify the file (city and weekday/weekend)
    city, period = file.replace('.csv', '').rsplit('_', 1)
    df['city'] = city.capitalize()
    df['period'] = period.capitalize()
    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Delete all the original files
for file in files:
    os.remove(os.path.join(directory, file))  # Delete each file

# Add GDP

In [13]:
# GDP from wiki - United Nations estimate
# https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita
gdp_mapping = {
'Barcelona' : 30.058,       
'Budapest'  : 18.728,   
'London'    : 46.542,   
'Vienna'    : 53.840,   
'Amsterdam' : 57.871,       
'Paris'     : 44.229,   
'Berlin'    : 51.073,   
'Lisbon'    : 24.651,   
'Athens'    : 20.571,   
'Rome'      : 37.150}

# Apply the mapping to create a new 'GDP' column
combined_df['GDP'] = combined_df['city'].map(gdp_mapping)

# Remove very expensive rentals (1500 EUR pr 2 nights)

In [14]:
# filter out price over 1500
data = combined_df[combined_df['realSum'] < 1500]

# print message
removed = combined_df.shape[0]-data.shape[0]
percent = removed/combined_df.shape[0]
print(f'{removed} rows and {round(percent,4)}% of the data removed')

258 rows and 0.005% of the data removed


# Remove irrelevant columns

In [15]:
data = data[['realSum', 'room_type', 'person_capacity', 'biz', 'bedrooms', 'dist', 'metro_dist', 'city', 'period', 'attr_index', 'GDP']]

# Save df

In [16]:
data.to_csv('../data/clean_data.csv', index=False)

# Split data

In [17]:
X = data[['room_type', 'person_capacity', 'biz', 'bedrooms', 'dist', 'metro_dist', 'city', 'period', 'attr_index', 'GDP']]
y = data[['realSum']]

# 15% saved for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, 
                                                    random_state=42)

# 15% of the remaining saved for validation
X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                  y_train,
                                                  test_size=X_test.shape[0] / X_train.shape[0],  
                                                  random_state=42)

# Encoding and scaling

In [18]:
def one_hot_encode(df, categorical_cols):
    return pd.get_dummies(df, columns=categorical_cols)

def normalize(df, numerical_cols, scaler=None):
    if not scaler:
        scaler = MinMaxScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    else:
        df[numerical_cols] = scaler.transform(df[numerical_cols])
    return df, scaler

def preprocess_data(X_train, X_val, X_test, categorical_cols, numerical_cols, y_train, y_val, y_test):
    # Original data (no preprocessing)
    original = (X_train.copy(), X_val.copy(), X_test.copy(), y_train.copy(), y_val.copy(), y_test.copy())
    
    # One-hot encoded data
    X_train_hot = one_hot_encode(X_train.copy(), categorical_cols)
    X_val_hot = one_hot_encode(X_val.copy(), categorical_cols)
    X_test_hot = one_hot_encode(X_test.copy(), categorical_cols)
    
    hot = (X_train_hot, X_val_hot, X_test_hot, y_train.copy(), y_val.copy(), y_test.copy())
    
    # Normalized data (excluding categorical columns)
    X_train_norm, scaler_X = normalize(X_train.copy(), numerical_cols)
    X_val_norm, _ = normalize(X_val.copy(), numerical_cols, scaler=scaler_X)
    X_test_norm, _ = normalize(X_test.copy(), numerical_cols, scaler=scaler_X)
    
    y_train_norm, scaler_y = normalize(y_train.copy(), ['realSum'])
    y_val_norm, _ = normalize(y_val.copy(), ['realSum'], scaler=scaler_y)
    y_test_norm, _ = normalize(y_test.copy(), ['realSum'], scaler=scaler_y)
    
    norm = (X_train_norm, X_val_norm, X_test_norm, y_train_norm, y_val_norm, y_test_norm)
    
    # One hot and normalised
    X_train_hot_norm = one_hot_encode(X_train_norm.copy(), categorical_cols)
    X_val_hot_norm = one_hot_encode(X_val_norm.copy(), categorical_cols)
    X_test_hot_norm = one_hot_encode(X_test_norm.copy(), categorical_cols)

    # Collect the final datasets
    hot_norm = (X_train_hot_norm, X_val_hot_norm, X_test_hot_norm, y_train_norm, y_val_norm, y_test_norm)
    
    return original, hot, norm, hot_norm, scaler_X, scaler_y


In [19]:
# Ensure directories exist
os.makedirs('../data/original', exist_ok=True)
os.makedirs('../data/hot', exist_ok=True)
os.makedirs('../data/norm', exist_ok=True)
os.makedirs('../data/hot_norm', exist_ok=True)

def save_datasets(dataset, folder_name):
    X_train, X_val, X_test, y_train, y_val, y_test = dataset
    with open(f'../data/{folder_name}/train.pkl', 'wb') as f:
        pickle.dump((X_train.to_numpy(), y_train.to_numpy()), f)
    with open(f'../data/{folder_name}/val.pkl', 'wb') as f:
        pickle.dump((X_val.to_numpy(), y_val.to_numpy()), f)
    with open(f'../data/{folder_name}/test.pkl', 'wb') as f:
        pickle.dump((X_test.to_numpy(), y_test.to_numpy()), f)

original, hot, norm, hot_norm, scaler_X, scaler_y = preprocess_data(X_train, X_val, X_test, 
                                                                    categorical_cols=['room_type', 'city', 'period'], 
                                                                    numerical_cols=['person_capacity', 'biz', 'bedrooms', 'dist', 'metro_dist', 'attr_index', 'GDP'], 
                                                                    y_train=y_train, y_val=y_val, y_test=y_test)

# Save datasets
save_datasets(original, 'original')
save_datasets(hot, 'hot')
save_datasets(norm, 'norm')
save_datasets(hot_norm, 'hot_norm')

# Save scalers
with open('../data/norm/scaler_X.pkl', 'wb') as f:
    pickle.dump(scaler_X, f)
with open('../data/norm/scaler_y.pkl', 'wb') as f:
    pickle.dump(scaler_y, f)
# Save scalers
with open('../data/hot_norm/scaler_X.pkl', 'wb') as f:
    pickle.dump(scaler_X, f)
with open('../data/hot_norm/scaler_y.pkl', 'wb') as f:
    pickle.dump(scaler_y, f)
