In [1]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load, combine and delete original csv's

In [2]:
# List of file names
files = [
    "barcelona_weekdays.csv", "budapest_weekends.csv", "london_weekends.csv", "vienna_weekdays.csv",
    "amsterdam_weekdays.csv", "barcelona_weekends.csv", "paris_weekdays.csv", "vienna_weekends.csv",
    "amsterdam_weekends.csv", "berlin_weekdays.csv", "lisbon_weekdays.csv", "paris_weekends.csv",
    "athens_weekdays.csv", "berlin_weekends.csv", "lisbon_weekends.csv", "rome_weekdays.csv",
    "athens_weekends.csv", "budapest_weekdays.csv", "london_weekdays.csv", "rome_weekends.csv"
]

# Directory containing files
directory = "../data/"

# Initialize an empty list to hold dataframes
dataframes = []

# Read each file and append to the list
for file in files:
    # Create the full path to the file
    file_path = os.path.join(directory, file)
    # Read the CSV file
    df = pd.read_csv(file_path)
    # Add a column to identify the file (city and weekday/weekend)
    city, period = file.replace('.csv', '').rsplit('_', 1)
    df['city'] = city.capitalize()
    df['period'] = period.capitalize()
    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Delete all the original files
for file in files:
    os.remove(os.path.join(directory, file))  # Delete each file

# Add GDP

In [3]:
# GDP from wiki - United Nations estimate
# https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita
gdp_mapping = {
'Barcelona' : 30.058,       
'Budapest'  : 18.728,   
'London'    : 46.542,   
'Vienna'    : 53.840,   
'Amsterdam' : 57.871,       
'Paris'     : 44.229,   
'Berlin'    : 51.073,   
'Lisbon'    : 24.651,   
'Athens'    : 20.571,   
'Rome'      : 37.150}

# Apply the mapping to create a new 'GDP' column
combined_df['GDP'] = combined_df['city'].map(gdp_mapping)

In [4]:
# GDP WEIRD?
combined_df['GDP']


0        30.058
1        30.058
2        30.058
3        30.058
4        30.058
          ...  
51702    37.150
51703    37.150
51704    37.150
51705    37.150
51706    37.150
Name: GDP, Length: 51707, dtype: float64

# Remove very expensive rentals (1500 EUR pr night)

In [5]:
# filter out price over 1500
data = combined_df[combined_df['realSum'] < 1500]

# print message
removed = combined_df.shape[0]-data.shape[0]
percent = removed/combined_df.shape[0]
print(f'{removed} rows and {round(percent,4)}% of the data removed')

258 rows and 0.005% of the data removed


# Save df

In [6]:
data['GDP']

0        30.058
1        30.058
2        30.058
3        30.058
4        30.058
          ...  
51702    37.150
51703    37.150
51704    37.150
51705    37.150
51706    37.150
Name: GDP, Length: 51449, dtype: float64

In [7]:
data.to_csv('../data/data.csv', index=False)

# One hot encode

In [8]:
# look at all collumns
list(data)

['Unnamed: 0',
 'realSum',
 'room_type',
 'room_shared',
 'room_private',
 'person_capacity',
 'host_is_superhost',
 'multi',
 'biz',
 'cleanliness_rating',
 'guest_satisfaction_overall',
 'bedrooms',
 'dist',
 'metro_dist',
 'attr_index',
 'attr_index_norm',
 'rest_index',
 'rest_index_norm',
 'lng',
 'lat',
 'city',
 'period',
 'GDP']

In [9]:
# Look at factor variables
print(data['room_type'].unique())
print(data['city'].unique())
print(data['period'].unique())

['Entire home/apt' 'Private room' 'Shared room']
['Barcelona' 'Budapest' 'London' 'Vienna' 'Amsterdam' 'Paris' 'Berlin'
 'Lisbon' 'Athens' 'Rome']
['Weekdays' 'Weekends']


In [10]:
# Start by selecting only the chosen columns
filtered_data = data[['room_type', 'person_capacity', 'biz', 'bedrooms', 'dist', 'metro_dist', 'city', 'period', 'attr_index', 'GDP']]

# Apply one-hot encoding to the categorical columns 'room_type', 'city', and 'period'
one_hot_encoded_data = pd.get_dummies(filtered_data, columns=['room_type', 'city', 'period'])

# Convert all Boolean columns to integers (0 and 1)
for col in one_hot_encoded_data.columns:
    if one_hot_encoded_data[col].dtype == bool:
        one_hot_encoded_data[col] = one_hot_encoded_data[col].astype(int)


In [11]:
# GDP WEIRD?
one_hot_encoded_data['GDP']

0        30.058
1        30.058
2        30.058
3        30.058
4        30.058
          ...  
51702    37.150
51703    37.150
51704    37.150
51705    37.150
51706    37.150
Name: GDP, Length: 51449, dtype: float64

# Normalise

In [12]:
# Normalize data
scaler_norm = MinMaxScaler()
X_data_normalized = scaler_norm.fit_transform(one_hot_encoded_data)
y_data_normalized = scaler_norm.fit_transform(data.loc[:, ['realSum']])

X = X_data_normalized
y = y_data_normalized

In [13]:
# GDP WEIRD?
len(np.unique(X[-1]))

7

# Split the data

In [14]:
# 15% saved for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, 
                                                    random_state=42)

# 15% of the remaining saved for validation
X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                  y_train,
                                                  test_size=X_test.shape[0] / X_train.shape[0],  
                                                  random_state=42)

# Save split data

In [15]:
# Save datasets
with open('../data/train_data.pkl', 'wb') as f:
    pickle.dump((X_train, y_train), f)
with open('../data/validation_data.pkl', 'wb') as f:
    pickle.dump((X_val, y_val), f)
with open('../data/test_data.pkl', 'wb') as f:
    pickle.dump((X_test, y_test), f)