In [1]:
import pandas as pd
import os
import pickle
import numpy as np
import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load, combine and delete the original CSVs

In [None]:
# Raw inputs, named "<city>_<period>.csv".
# The list order fixes the row order of the combined frame.
files = [
    "barcelona_weekdays.csv", "budapest_weekends.csv",
    "london_weekends.csv", "vienna_weekdays.csv",
    "amsterdam_weekdays.csv", "barcelona_weekends.csv",
    "paris_weekdays.csv", "vienna_weekends.csv",
    "amsterdam_weekends.csv", "berlin_weekdays.csv",
    "lisbon_weekdays.csv", "paris_weekends.csv",
    "athens_weekdays.csv", "berlin_weekends.csv",
    "lisbon_weekends.csv", "rome_weekdays.csv",
    "athens_weekends.csv", "budapest_weekdays.csv",
    "london_weekdays.csv", "rome_weekends.csv",
]

# Folder holding the raw CSVs
directory = "../data/"

# Collect one tagged frame per input file
dataframes = []
for file in files:
    # "<city>_<period>.csv" -> ("<city>", "<period>")
    city, period = file.replace('.csv', '').rsplit('_', 1)
    # Read the file and tag every row with its capitalized city and period
    df = pd.read_csv(os.path.join(directory, file)).assign(
        city=city.capitalize(),
        period=period.capitalize(),
    )
    dataframes.append(df)

# Stack all frames into one with a fresh, continuous index
combined_df = pd.concat(dataframes, ignore_index=True)

# WARNING: destructive step. The raw CSVs are removed from disk here, so this
# cell cannot be re-run unless the files are restored first.
for file in files:
    os.remove(os.path.join(directory, file))

# Add GDP

In [None]:
# GDP per capita (United Nations estimate) keyed by city, taken from the
# country list at
# https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita
# NOTE(review): the values look like thousands of USD — confirm against the source.
gdp_mapping = dict(
    Barcelona=30.058,
    Budapest=18.728,
    London=46.542,
    Vienna=53.840,
    Amsterdam=57.871,
    Paris=44.229,
    Berlin=51.073,
    Lisbon=24.651,
    Athens=20.571,
    Rome=37.150,
)

# Look up each row's city to fill the new 'GDP' column
combined_df['GDP'] = combined_df['city'].map(gdp_mapping)

# Remove very expensive rentals (1500 EUR or more per 2 nights)

In [None]:
# Keep only listings priced strictly below 1500. Rows whose 'realSum' is
# missing also fail the comparison and are therefore dropped.
data = combined_df[combined_df['realSum'] < 1500]

# Report how much was filtered out. `percent` is a real percentage (0-100) so
# that the number printed next to the '%' sign is correct; previously the raw
# 0-1 fraction was printed with a '%' label.
total = combined_df.shape[0]
removed = total - data.shape[0]
percent = 100 * removed / total if total else 0.0  # guard against an empty frame
print(f'{removed} rows ({percent:.2f}% of the data) removed')

# Keep only relevant columns

In [None]:
# Narrow the frame to the target ('realSum') plus the feature columns used later
data = data.loc[:, [
    'realSum',
    'room_type',
    'person_capacity',
    'biz',
    'multi',
    'bedrooms',
    'dist',
    'metro_dist',
    'city',
    'period',
    'attr_index',
    'rest_index',
    'GDP',
]]

# Save df

In [None]:
# Persist the cleaned frame; the row index is not written to the file
data.to_csv(path_or_buf='../data/clean_data.csv', index=False)

In [22]:
# Reload the cleaned data written by the previous section
data = pd.read_csv('../data/clean_data.csv')

# Categorical columns that will be one-hot encoded
hot_cols = ['room_type', 'city', 'period']

# Numerical columns that will be min-max normalized
norm_cols = [
    'person_capacity', 'bedrooms',
    'dist', 'metro_dist',
    'attr_index', 'rest_index',
    'GDP',
]

# Split data

In [23]:
# Target kept as a one-column frame; every other column is a feature
y = data.loc[:, ['realSum']]
X = data.drop(columns=['realSum'])

# Hold out 15% of all rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Carve the validation set out of the remaining rows. The fraction is
# (test rows / remaining rows), so validation ends up roughly the same size
# as the test set rather than 15% of the remainder.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=len(X_test) / len(X_train),
    random_state=42,
)

# Encoding and scaling

In [24]:
def one_hot_encode(df, categorical_cols):
    """Return `df` with each column in `categorical_cols` expanded into indicator columns.

    Thin wrapper around `pd.get_dummies`: the listed columns are replaced by
    integer 0/1 columns, all other columns are passed through unchanged.
    """
    encoded = pd.get_dummies(data=df, columns=categorical_cols, dtype=int)
    return encoded

def normalize(df, numerical_cols, scaler=None):
    """Scale `numerical_cols` of `df`, fitting a new MinMaxScaler when none is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place and also returned, so pass a
        copy if the original values must be kept.
    numerical_cols : list of str
        Names of the columns to scale.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. When omitted (``None``), a new
        ``MinMaxScaler`` is fitted on ``df[numerical_cols]``.

    Returns
    -------
    tuple
        ``(df, scaler)``: the scaled frame and the scaler that was used.
    """
    # Compare against None explicitly: a supplied scaler must be reused even
    # if the object happens to evaluate as falsy (e.g. it defines __len__).
    if scaler is None:
        scaler = MinMaxScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    else:
        df[numerical_cols] = scaler.transform(df[numerical_cols])
    return df, scaler

def preprocess_data(X_train, X_val, X_test, categorical_cols, numerical_cols, y_train, y_val, y_test):
    """One-hot encode the three splits and build an additional min-max scaled variant.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames of the three splits. They are copied, not modified.
    categorical_cols : list of str
        Columns to one-hot encode.
    numerical_cols : list of str
        Feature columns to scale in the normalized variant.
    y_train, y_val, y_test : pandas.DataFrame
        Target frames; each must contain a 'realSum' column.

    Returns
    -------
    tuple
        ``(hot, hot_norm, scaler_X, scaler_y)`` where ``hot`` and ``hot_norm``
        are 6-tuples ``(X_train, X_val, X_test, y_train, y_val, y_test)`` with
        the encoded and the encoded-plus-scaled data respectively, and the two
        scalers are the ones fitted on the training features and targets.

    Raises
    ------
    KeyError
        If the encoded training frame has no 'period_Weekdays' column.
    """
    # One-hot encode each split on its own copy
    X_train_hot = one_hot_encode(X_train.copy(), categorical_cols)
    X_val_hot = one_hot_encode(X_val.copy(), categorical_cols)
    X_test_hot = one_hot_encode(X_test.copy(), categorical_cols)

    # Drop the 'period_Weekdays' indicator from the training features
    X_train_hot = X_train_hot.drop(columns=['period_Weekdays'])

    # Each split was encoded independently, so a category that is absent from
    # val/test would yield a different column set or order. Align both to the
    # training layout: indicators missing from a split become 0, columns the
    # training data does not have (including 'period_Weekdays') are discarded.
    # This keeps the feature order identical once the frames become arrays.
    X_val_hot = X_val_hot.reindex(columns=X_train_hot.columns, fill_value=0)
    X_test_hot = X_test_hot.reindex(columns=X_train_hot.columns, fill_value=0)

    hot = (X_train_hot, X_val_hot, X_test_hot, y_train.copy(), y_val.copy(), y_test.copy())

    # Scale the numerical features; the scaler is fitted on the training split
    # only and then reused for validation and test.
    X_train_hot_norm, scaler_X = normalize(X_train_hot.copy(), numerical_cols)
    X_val_hot_norm, _ = normalize(X_val_hot.copy(), numerical_cols, scaler=scaler_X)
    X_test_hot_norm, _ = normalize(X_test_hot.copy(), numerical_cols, scaler=scaler_X)

    # Same for the target, with its own scaler fitted on the training targets
    y_train_hot_norm, scaler_y = normalize(y_train.copy(), ['realSum'])
    y_val_hot_norm, _ = normalize(y_val.copy(), ['realSum'], scaler=scaler_y)
    y_test_hot_norm, _ = normalize(y_test.copy(), ['realSum'], scaler=scaler_y)

    # Collect the final datasets
    hot_norm = (X_train_hot_norm, X_val_hot_norm, X_test_hot_norm, y_train_hot_norm, y_val_hot_norm, y_test_hot_norm)

    return hot, hot_norm, scaler_X, scaler_y

def save_datasets(dataset, folder_name):
    """Pickle the train/val/test splits as (X, y) pairs of NumPy arrays.

    `dataset` is the 6-tuple (X_train, X_val, X_test, y_train, y_val, y_test).
    Each split is written to '../data/<folder_name>/<split>.pkl', in the order
    train, val, test. The target folder must already exist.
    """
    X_train, X_val, X_test, y_train, y_val, y_test = dataset
    splits = (
        ('train', X_train, y_train),
        ('val', X_val, y_val),
        ('test', X_test, y_test),
    )
    for split_name, features, targets in splits:
        with open(f'../data/{folder_name}/{split_name}.pkl', 'wb') as f:
            pickle.dump((features.to_numpy(), targets.to_numpy()), f)


In [25]:
# The output folders must exist before save_datasets writes into them
for out_dir in ('../data/hot', '../data/hot_norm'):
    os.makedirs(out_dir, exist_ok=True)

# Build the one-hot encoded data and its min-max scaled counterpart
hot, hot_norm, scaler_X, scaler_y = preprocess_data(
    X_train, X_val, X_test,
    hot_cols, norm_cols,
    y_train, y_val, y_test,
)

# Write both variants to their own folder
save_datasets(hot, 'hot')
save_datasets(hot_norm, 'hot_norm')

# Store the fitted feature and target scalers next to the data
for pkl_path, fitted_scaler in (
    ('../data/scaler_X.pkl', scaler_X),
    ('../data/scaler_y.pkl', scaler_y),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_scaler, f)


In [33]:
# Inspect the normalized validation features (hot_norm[1] is X_val_hot_norm).
# NOTE(review): the saved output below still lists a 'period_Weekdays' column,
# which preprocess_data drops — the output appears stale; re-run to refresh.
hot_norm[1]

Unnamed: 0,person_capacity,biz,multi,bedrooms,dist,metro_dist,attr_index,rest_index,GDP,room_type_Entire home/apt,...,city_Barcelona,city_Berlin,city_Budapest,city_Lisbon,city_London,city_Paris,city_Rome,city_Vienna,period_Weekdays,period_Weekends
16286,0.75,0,1,0.1,0.082412,0.066246,0.024209,0.025750,0.897019,1,...,0,0,0,0,0,0,0,1,0,1
48936,1.00,0,1,0.2,0.016653,0.021776,0.119495,0.209619,0.470633,1,...,0,0,0,0,0,0,1,0,0,1
30747,0.00,0,0,0.1,0.037896,0.034385,0.069173,0.113481,0.151317,1,...,0,0,0,1,0,0,0,0,0,1
12394,0.00,0,1,0.1,0.108667,0.021010,0.058686,0.090109,0.289451,0,...,1,0,0,0,0,0,0,0,0,1
30779,0.00,1,0,0.1,0.040370,0.026289,0.112791,0.133064,0.151317,1,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7850,0.50,0,1,0.1,0.071023,0.023772,0.122111,0.148927,0.710574,1,...,0,0,0,0,1,0,0,0,0,1
39514,0.50,1,0,0.1,0.056345,0.009299,0.041659,0.038630,0.047084,1,...,0,0,0,0,0,0,0,0,0,1
30669,0.50,0,1,0.1,0.208925,0.292355,0.021360,0.023917,0.151317,1,...,0,0,0,1,0,0,0,0,0,1
39195,0.25,1,0,0.0,0.052348,0.084538,0.022956,0.028482,0.047084,1,...,0,0,0,0,0,0,0,0,0,1
