In [5]:
import pandas as pd
import os
import pickle
import numpy as np
import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load, combine and delete original csv's

In [10]:
# List of file names
files = [
    "barcelona_weekdays.csv", "budapest_weekends.csv", "london_weekends.csv", "vienna_weekdays.csv",
    "amsterdam_weekdays.csv", "barcelona_weekends.csv", "paris_weekdays.csv", "vienna_weekends.csv",
    "amsterdam_weekends.csv", "berlin_weekdays.csv", "lisbon_weekdays.csv", "paris_weekends.csv",
    "athens_weekdays.csv", "berlin_weekends.csv", "lisbon_weekends.csv", "rome_weekdays.csv",
    "athens_weekends.csv", "budapest_weekdays.csv", "london_weekdays.csv", "rome_weekends.csv"
]

# Directory containing files
directory = "../data/"

# Initialize an empty list to hold dataframes
dataframes = []

# Read each file and append to the list
for file in files:
    # Create the full path to the file
    file_path = os.path.join(directory, file)
    # Read the CSV file
    df = pd.read_csv(file_path)
    # Add a column to identify the file (city and weekday/weekend)
    city, period = file.replace('.csv', '').rsplit('_', 1)
    df['city'] = city.capitalize()
    df['period'] = period.capitalize()
    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all dataframes into one
data_full = pd.concat(dataframes, ignore_index=True)

# Delete all the original files
#for file in files:
#    os.remove(os.path.join(directory, file))  # Delete each file

# Keep only relevant columns

In [11]:
data_full = data_full[['realSum', 'room_type', 'person_capacity', 'biz', 'multi', 'bedrooms', 'dist', 'metro_dist', 'city', 'period', 'attr_index_norm', 'rest_index_norm']]

# Make "single" variable

In [12]:
# When biz and multi are both 0, single must be 1
data_full.loc[(data_full['biz'] == 0) & (data_full['multi'] == 0), 'single'] = 1
# Change to int 
data_full['single'] = data_full['single'].fillna(0).astype(int)

# Save df

In [13]:
#data_full.to_csv('../data/clean_data_full.csv', index=False)

# Remove very expensive rentals (1500 EUR pr 2 nights)

In [14]:
# filter out price over 1500
data = data_full[data_full['realSum'] < 1500]

# print message
removed = data_full.shape[0]-data.shape[0]
percent = removed/data_full.shape[0]
print(f'{removed} rows and {round(percent,4)}% of the data removed')

258 rows and 0.005% of the data removed


# Save df

### This is the data we use for vizzing

In [11]:
#data.to_csv('../data/clean_data.csv', index=False)

In [12]:
# load the data
data = pd.read_csv('../data/clean_data.csv')

hot_cols = ['room_type', 'city', 'period'] # Columns to one-hot encode

norm_cols = ['person_capacity', # Columns to normalize
            'bedrooms',
            'dist',
            'metro_dist',
            'attr_index',
            'rest_index',
            'GDP'] 

# Split data

In [13]:
y = data[['realSum']]
X = data.drop(columns='realSum')

# 15% saved for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, 
                                                    random_state=42)

# 15% of the remaining saved for validation
X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                  y_train,
                                                  test_size=X_test.shape[0] / X_train.shape[0],  
                                                  random_state=42)

# Encoding and scaling

In [14]:
def one_hot_encode(df, categorical_cols):
    return pd.get_dummies(df, columns=categorical_cols, dtype=int)

def normalize(df, numerical_cols, scaler=None):
    if not scaler:
        scaler = MinMaxScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    else:
        df[numerical_cols] = scaler.transform(df[numerical_cols])
    return df, scaler

def preprocess_data(X_train, X_val, X_test, categorical_cols, numerical_cols, y_train, y_val, y_test):
    # One-hot encoded data
    X_train_hot = one_hot_encode(X_train.copy(), categorical_cols)
    X_val_hot = one_hot_encode(X_val.copy(), categorical_cols)
    X_test_hot = one_hot_encode(X_test.copy(), categorical_cols)
    
    # delete the column ['period_Weekdays']
    X_train_hot = X_train_hot.drop(columns=['period_Weekdays'])
    X_val_hot = X_val_hot.drop(columns=['period_Weekdays'])
    X_test_hot = X_test_hot.drop(columns=['period_Weekdays'])

    
    hot = (X_train_hot, X_val_hot, X_test_hot, y_train.copy(), y_val.copy(), y_test.copy())
    
    # Normalized data (excluding categorical columns)
    scaler_X = MinMaxScaler()
    X_train_hot_norm = scaler_X.fit_transform(X_train_hot.copy())
    X_val_hot_norm = scaler_X.transform(X_val_hot.copy())
    X_test_hot_norm = scaler_X.transform(X_test_hot.copy())

    scaler_y = MinMaxScaler()
    y_train_hot_norm = scaler_y.fit_transform(y_train.copy())
    y_val_hot_norm = scaler_y.transform(y_val.copy())
    y_test_hot_norm = scaler_y.transform(y_test.copy())
    
    # Collect the final datasets
    hot_norm = (X_train_hot_norm, X_val_hot_norm, X_test_hot_norm, y_train_hot_norm, y_val_hot_norm, y_test_hot_norm)
    
    return hot, hot_norm, scaler_X, scaler_y


def save_datasets(dataset, folder_name):
    X_train, X_val, X_test, y_train, y_val, y_test = dataset
    with open(f'../data/{folder_name}/train.pkl', 'wb') as f:
        pickle.dump((X_train, y_train), f)
    with open(f'../data/{folder_name}/val.pkl', 'wb') as f:
        pickle.dump((X_val, y_val), f)
    with open(f'../data/{folder_name}/test.pkl', 'wb') as f:
        pickle.dump((X_test, y_test), f)


In [15]:
# Ensure directories exist
os.makedirs('../data/hot', exist_ok=True)
os.makedirs('../data/hot_norm', exist_ok=True)

# Preprocess data
hot, hot_norm, scaler_X, scaler_y = preprocess_data(X_train=X_train, X_val=X_val, X_test=X_test,
                                                    y_train=y_train, y_val=y_val, y_test=y_test ,
                                                    categorical_cols=hot_cols, 
                                                    numerical_cols=norm_cols)

# Save datasets
save_datasets(hot, 'hot')
save_datasets(hot_norm, 'hot_norm')

# Save scalers
with open('../data/scaler_X.pkl', 'wb') as f:
    pickle.dump(scaler_X, f)
with open('../data/scaler_y.pkl', 'wb') as f:
    pickle.dump(scaler_y, f)
