In [3]:
import pandas as pd
import os
import pickle
import numpy as np
import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load, combine and delete original csv's

In [4]:
# List of file names
files = [
    "barcelona_weekdays.csv", "budapest_weekends.csv", "london_weekends.csv", "vienna_weekdays.csv",
    "amsterdam_weekdays.csv", "barcelona_weekends.csv", "paris_weekdays.csv", "vienna_weekends.csv",
    "amsterdam_weekends.csv", "berlin_weekdays.csv", "lisbon_weekdays.csv", "paris_weekends.csv",
    "athens_weekdays.csv", "berlin_weekends.csv", "lisbon_weekends.csv", "rome_weekdays.csv",
    "athens_weekends.csv", "budapest_weekdays.csv", "london_weekdays.csv", "rome_weekends.csv"
]

# Directory containing files
directory = "../data/"

# Initialize an empty list to hold dataframes
dataframes = []

# Read each file and append to the list
for file in files:
    # Create the full path to the file
    file_path = os.path.join(directory, file)
    # Read the CSV file
    df = pd.read_csv(file_path)
    # Add a column to identify the file (city and weekday/weekend)
    city, period = file.replace('.csv', '').rsplit('_', 1)
    df['city'] = city.capitalize()
    df['period'] = period.capitalize()
    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Delete all the original files
for file in files:
    os.remove(os.path.join(directory, file))  # Delete each file

# Add GDP

In [5]:
# GDP from wiki - United Nations estimate
# https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita
gdp_mapping = {
'Barcelona' : 30.058,       
'Budapest'  : 18.728,   
'London'    : 46.542,   
'Vienna'    : 53.840,   
'Amsterdam' : 57.871,       
'Paris'     : 44.229,   
'Berlin'    : 51.073,   
'Lisbon'    : 24.651,   
'Athens'    : 20.571,   
'Rome'      : 37.150}

# Apply the mapping to create a new 'GDP' column
combined_df['GDP'] = combined_df['city'].map(gdp_mapping)

# Remove very expensive rentals (1500 EUR pr 2 nights)

In [6]:
# filter out price over 1500
data = combined_df[combined_df['realSum'] < 1500]

# print message
removed = combined_df.shape[0]-data.shape[0]
percent = removed/combined_df.shape[0]
print(f'{removed} rows and {round(percent,4)}% of the data removed')

258 rows and 0.005% of the data removed


# Remove irrelevant columns

In [7]:
data = data[['realSum', 'room_type', 'person_capacity', 'biz', 'bedrooms', 'dist', 'metro_dist', 'city', 'period', 'attr_index', 'GDP']]

# Save df

In [8]:
data.to_csv('../data/clean_data.csv', index=False)

# Split data

In [10]:
X = data[['room_type', 'person_capacity', 'biz', 'bedrooms', 'dist', 'metro_dist', 'city', 'period', 'attr_index', 'GDP']]
y = data[['realSum']]

# 15% saved for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, 
                                                    random_state=42)

# 15% of the remaining saved for validation
X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                  y_train,
                                                  test_size=X_test.shape[0] / X_train.shape[0],  
                                                  random_state=42)

In [31]:
# Save datasets
with open('../data/original/train.pkl', 'wb') as f:
    pickle.dump((X_train.to_numpy(), y_train.to_numpy()), f)
with open('../data/original/val.pkl', 'wb') as f:
    pickle.dump((X_val.to_numpy(), y_val.to_numpy()), f)
with open('../data/original/test.pkl', 'wb') as f:
    pickle.dump((X_test.to_numpy(), y_test.to_numpy()), f)

# One hot

In [15]:
# Look at factor variables
print(X_train['room_type'].unique())
print(X_train['city'].unique())
print(X_train['period'].unique())

['Entire home/apt' 'Private room' 'Shared room']
['Lisbon' 'Athens' 'Rome' 'Paris' 'Budapest' 'Barcelona' 'London'
 'Amsterdam' 'Berlin' 'Vienna']
['Weekends' 'Weekdays']


In [20]:
def one_hot(df):
    # Apply one-hot encoding to the categorical columns
    df_encoded = pd.get_dummies(df, columns=['room_type', 'city', 'period'])
    
    # Drop one of the period columns to avoid perfect multicollinearity
    if 'period_Weekdays' in df_encoded.columns:
        df_encoded.drop('period_Weekdays', axis=1, inplace=True)

    # Convert all Boolean columns to integers (0 and 1)
    for col in df_encoded.columns:
        if df_encoded[col].dtype == bool:
            df_encoded[col] = df_encoded[col].astype(int)

    return df_encoded.to_numpy()


In [25]:
X_train_hot = one_hot(X_train)
X_val_hot = one_hot(X_val)
X_test_hot = one_hot(X_test)

In [None]:
# Save datasets
with open('../data/hot/train.pkl', 'wb') as f:
    pickle.dump((X_train_hot, y_train.to_numpy()), f)
with open('../data/hot/val.pkl', 'wb') as f:
    pickle.dump((X_val_hot, y_val.to_numpy()), f)
with open('../data/hot/test.pkl', 'wb') as f:
    pickle.dump((X_test_hot, y_test.to_numpy()), f)

# Normalise AND one-hot

In [28]:
# Fit and transform
scaler_x = MinMaxScaler()
X_train_hot_norm = scaler_x.fit_transform(X_train_hot)

# Only transform
X_val_hot_norm = scaler_x.transform(X_val_hot)
X_test_hot_norm = scaler_x.transform(X_test_hot)


In [None]:
# Fit and transform
scaler_y = MinMaxScaler()
y_train_norm = scaler_y.fit_transform(y_train)

# Only transform
y_val_norm = scaler_x.transform(y_val)
y_test_norm = scaler_x.transform(y_test)

In [None]:
# Save datasets
with open('../data/hot_norm/train.pkl', 'wb') as f:
    pickle.dump((X_train_hot_norm, y_train_norm), f)
with open('../data/hot_norm/val.pkl', 'wb') as f:
    pickle.dump((X_val_hot_norm, y_val_norm), f)
with open('../data/hot_norm/test.pkl', 'wb') as f:
    pickle.dump((X_test_hot_norm, y_test_norm), f)