In [4]:
import re
import pandas as pd
import numpy as np
from IPython.display import display

from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestRegressor

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [5]:
# Load the CSV file
data = pd.read_csv("data/listings/listings.csv")

# display(data[['bathrooms', 'bathrooms_text']])

# Select important columns
important_columns = [
    'id',
    'price',
    'neighbourhood_cleansed',
    'room_type',
    'bedrooms',
    'bathrooms',
    'accommodates',
    'amenities',
    'minimum_nights',
    'number_of_reviews',
    'review_scores_rating',
    'name',
    'description'
]# Save the filtered data to a new CSV file

df_listing = data[important_columns].copy() # Use copy to be independent of the original data

# PREPROCESSING

## Helper Functions

In [6]:
def append_sentiment_score(listings_df: pd.DataFrame, reviews_df: pd.DataFrame):
    """
    Add sentiment score directly to the listings_df based on reviews_df.
    This modifies listings_df in place.
    """
    listings_df['id'] = listings_df['id'].astype(int)
    reviews_df['listing_id'] = reviews_df['listing_id'].astype(int)

    # Convert sentiment labels to binary scores
    reviews_df['sentiment_score'] = reviews_df['sentiment'].map({'POSITIVE': 1, 'NEGATIVE': 0})

    # Aggregate average sentiment score
    aggregated_sentiment = (
        reviews_df.groupby('listing_id')['sentiment_score']
        .mean()
        .reset_index()
    )
    sentiment_map = dict(zip(aggregated_sentiment['listing_id'], aggregated_sentiment['sentiment_score']))
    listings_df['sentiment_score'] = listings_df['id'].map(sentiment_map)

def embedd_amenity(amenities: list[str]):
    return embedding_model.encode(amenities)

def itemize_amenities(listings_df: pd.DataFrame):
    """
    Converts the 'amenities' column of stringified lists into multi-hot encoded columns.
    Modifies listings_df in place by appending amenity columns.
    """

    listings_df['embedded_amenities'] = listings_df['amenities'].apply(embedd_amenity)



## Preprocessing the Data

In [22]:
# Price
df_listing["price"] = df_listing["price"].replace('[\$,]', '', regex=True).astype(float)
df_listing = df_listing.dropna(subset=['price'])
df_listing = df_listing[df_listing['price'] != 0]

df_reviews = pd.read_csv("data/sentiment_reviews.csv")
append_sentiment_score(df_listing, df_reviews)
itemize_amenities(df_listing)

display(df_listing)


Unnamed: 0,id,price,neighbourhood_cleansed,room_type,bedrooms,bathrooms,accommodates,amenities,minimum_nights,number_of_reviews,review_scores_rating,name,description,sentiment_score,embedded_amenities
0,164448,944.0,Södermalms,Private room,1.0,1.0,2,"[""Hair dryer"", ""Hangers"", ""Long term stays all...",2,430,4.86,Double room in central Stockholm with Wi-Fi,I am renting out a nice double room on the top...,1.000000,"[-0.020701448, -0.024009565, 0.058711186, 0.00..."
1,220851,414.0,Kungsholmens,Private room,1.0,1.0,1,"[""Hangers"", ""Apple sound system with aux"", ""P...",2,64,4.68,One room in appartement,Welcome!,0.981818,"[-0.0023745005, -0.06360794, 0.043378565, -0.0..."
2,238411,1320.0,Norrmalms,Entire home/apt,1.0,1.0,2,"[""Washer"", ""Hair dryer"", ""Hangers"", ""Long term...",1,110,4.79,Cozy apartment in central Stockholm,,1.000000,"[-0.03775145, -0.02206401, 0.06119034, 0.02586..."
3,242188,814.0,Södermalms,Private room,1.0,1.0,1,"[""Hair dryer"", ""Hangers"", ""Fast wifi \u2013 28...",2,414,4.90,Single room in central Stockholm with Wi-Fi,I am renting out a nice single room on the top...,1.000000,"[-0.018808702, -0.044104837, 0.071508996, 0.00..."
4,273906,2750.0,Södermalms,Entire home/apt,3.0,2.0,4,"[""Washer"", ""TV with standard cable"", ""Dedicate...",6,4,5.00,Penthouse in central Stockholm,,1.000000,"[-0.01589693, -0.0668648, 0.027026793, -0.0588..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5218,1321121257587188985,800.0,Östermalms,Entire home/apt,2.0,1.0,4,"[""First aid kit"", ""Smoke alarm"", ""Air conditio...",30,0,,Perfekt familjelägenhet,"Cozy and fresh three with two bedrooms, open p...",,"[0.010715313, 0.075288564, 0.052025743, -0.020..."
5219,1321592072491619841,1014.0,Skarpnäcks,Entire home/apt,1.0,1.0,2,"[""Washer"", ""Wine glasses"", ""Hangers"", ""Wifi"", ...",3,0,,Designlägenhet,"This unique accommodation of 22 sqm, has large...",,"[-0.039519697, -0.011447098, 1.3068041e-05, 0...."
5220,1321811206140917325,360.0,Farsta,Private room,1.0,0.5,1,"[""Hair dryer"", ""Hangers"", ""Host greets you"", ""...",1,0,,Best south stockholm,Green unique area with the first pedestrian st...,,"[0.0025318966, -0.019209959, 0.04600891, 0.031..."
5221,1321863150641245338,1400.0,Älvsjö,Entire home/apt,2.0,1.0,2,"[""Washer"", ""Hair dryer"", ""Hangers"", ""First aid...",1,0,,Comfort de Luxe Älvsjö,Bring the whole family to this amazing place w...,,"[-0.006820489, -0.018918872, 0.0042910096, 0.0..."


# Train the Model

In [8]:
numerical_features = df_listing[['bedrooms', 'bathrooms', 'accommodates', 'sentiment_score']].values
embedded_features = np.array(df_listing['embedded_amenities'].tolist())

X = np.hstack((embedded_features, numerical_features))
y = df_listing['price'].values

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)


# Test the Model

In [21]:
new_amenities = ['Hair dryer', 'Shampoo', 'WiFi']

new_amenities_embedding = np.array(embedding_model.encode(new_amenities))

new_amenities_embedding = np.mean(new_amenities_embedding, axis=0).reshape(1, -1)

new_numerical_features = np.array([[1, 2, 2, 0.2]]) # 1 bedroom, 2 bathrooms, accommodates 2 people, sentiment score 0.6

new_features = np.hstack((new_amenities_embedding, new_numerical_features))

# 5. Predict
predicted_price = model.predict(new_features)
print(predicted_price)

[2446.08619048]
