In [None]:
import re
import pandas as pd
import numpy as np
from IPython.display import display

from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestRegressor

# Pretrained sentence-transformers checkpoint used for every amenity embedding
# in this notebook (training and prediction must share the same model).
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Read the raw listings export; `data` keeps every column found in the CSV.
data = pd.read_csv("data/listings/listings.csv")

# Columns carried forward into preprocessing and modelling.
important_columns = [
    'id', 'price', 'neighbourhood_cleansed', 'room_type',
    'bedrooms', 'bathrooms', 'accommodates', 'amenities',
    'minimum_nights', 'number_of_reviews', 'review_scores_rating',
    'name', 'description',
]

# Work on an explicit copy so later column assignments never write through to `data`.
df_listing = data.loc[:, important_columns].copy()

# PREPROCESSING

## Helper Functions

In [None]:
def append_sentiment_score(listings_df: pd.DataFrame, reviews_df: pd.DataFrame):
    """
    Add a per-listing average sentiment score to ``listings_df``.

    Each review's ``sentiment`` label is mapped to 1 (``'POSITIVE'``) or
    0 (``'NEGATIVE'``); the scores are averaged per ``listing_id`` and written
    to a new ``sentiment_score`` column of ``listings_df``.

    Parameters
    ----------
    listings_df : pd.DataFrame
        Needs an ``id`` column. Modified in place: ``id`` is cast to ``int``
        and ``sentiment_score`` is added.
    reviews_df : pd.DataFrame
        Needs ``listing_id`` and ``sentiment`` columns. Left untouched: all
        intermediate values live in local Series, so the caller's frame keeps
        its original columns and dtypes.

    Returns
    -------
    None

    Notes
    -----
    * Labels other than ``'POSITIVE'`` / ``'NEGATIVE'`` map to NaN and are
      skipped by the mean.
    * Listings without any review get NaN as their ``sentiment_score``.
    """
    listings_df['id'] = listings_df['id'].astype(int)

    # Cast/mapped copies only -- never assign back into reviews_df.
    review_listing_ids = reviews_df['listing_id'].astype(int)
    review_scores = reviews_df['sentiment'].map({'POSITIVE': 1, 'NEGATIVE': 0})

    # Series indexed by listing id holding the mean score of its reviews.
    mean_score_by_listing = review_scores.groupby(review_listing_ids).mean()

    # Series.map looks each listing id up in the index of mean_score_by_listing.
    listings_df['sentiment_score'] = listings_df['id'].map(mean_score_by_listing)

def embedd_amenity(amenities: list[str]):
    """
    Embed amenity text with the notebook-level ``embedding_model``.

    The argument is forwarded unchanged to ``embedding_model.encode`` and the
    encoder's result is returned as-is. Per the sentence-transformers API, a
    list of strings yields one vector per entry, whereas a single string is
    encoded as one sentence and yields a single vector.
    """
    embeddings = embedding_model.encode(amenities)
    return embeddings

def itemize_amenities(listings_df: pd.DataFrame):
    """
    Add an ``embedded_amenities`` column holding one embedding vector per listing.

    The ``amenities`` column is expected to contain stringified JSON lists
    (e.g. ``'["Wifi", "Kitchen"]'``). Each listing's amenities are parsed,
    every individual amenity is embedded, and the amenity vectors are
    mean-pooled into a single fixed-size vector. This matches how the
    prediction cell builds its amenity feature (encode each amenity, then
    average), so training and inference features are comparable.

    Parameters
    ----------
    listings_df : pd.DataFrame
        Needs an ``amenities`` column. Modified in place by adding
        ``embedded_amenities`` (one 1-D numpy array per row).

    Returns
    -------
    None

    Notes
    -----
    * Missing (NaN) or empty amenity lists produce an all-zero vector.
    * A string that is not a valid JSON list is treated as a single amenity.
    * Each distinct amenity is embedded only once and then looked up per
      listing, instead of re-encoding the same text for every row.
    """
    import json  # local import: keeps this helper self-contained

    def _parse(raw) -> list[str]:
        """Turn one raw ``amenities`` cell into a list of amenity strings."""
        if isinstance(raw, (list, tuple)):
            return [str(item) for item in raw]
        if not isinstance(raw, str) or not raw.strip():
            return []  # NaN / None / blank cell
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            return [raw]
        if isinstance(parsed, list):
            return [str(item) for item in parsed]
        return [raw]

    parsed_amenities = listings_df['amenities'].apply(_parse)

    # Embed every distinct amenity exactly once (sorted for a deterministic order).
    vocabulary = sorted({item for items in parsed_amenities for item in items})
    if vocabulary:
        vectors = np.asarray(embedd_amenity(vocabulary))
        vector_by_amenity = dict(zip(vocabulary, vectors))
        empty_vector = np.zeros(vectors.shape[1], dtype=vectors.dtype)
    else:
        vector_by_amenity = {}
        empty_vector = np.zeros(
            embedding_model.get_sentence_embedding_dimension(), dtype=np.float32
        )

    def _pool(items: list[str]):
        """Mean-pool the vectors of one listing's amenities."""
        if not items:
            return empty_vector.copy()
        return np.mean([vector_by_amenity[item] for item in items], axis=0)

    listings_df['embedded_amenities'] = parsed_amenities.apply(_pool)



## Preprocessing the Data

In [None]:
# Price: strip the currency symbol and thousands separators, then convert to float.
# The pattern is a raw string so that `\$` is not parsed as an (invalid) string escape.
df_listing["price"] = df_listing["price"].replace(r'[\$,]', '', regex=True).astype(float)

# Drop listings without a usable target value (missing or zero price).
df_listing = df_listing.dropna(subset=['price'])
# Explicit copy: the helpers below add columns in place, which would otherwise
# operate on a filtered slice and raise SettingWithCopyWarning.
df_listing = df_listing[df_listing['price'] != 0].copy()

# Review sentiment -> per-listing `sentiment_score` column.
df_reviews = pd.read_csv("data/sentiment_reviews.csv")
append_sentiment_score(df_listing, df_reviews)

# Amenities -> `embedded_amenities` column.
itemize_amenities(df_listing)

display(df_listing)


# Train the Model

In [None]:

# Numeric feature columns, in the exact order the prediction cell must reproduce.
numerical_columns = ['bedrooms', 'bathrooms', 'accommodates', 'minimum_nights']

numerical_frame = df_listing[numerical_columns]
# Only `price` was cleaned during preprocessing, so these columns may still hold NaN.
# Impute with the column median (falling back to 0 when a column is entirely empty)
# so that fitting does not depend on the estimator's NaN handling, which differs
# between scikit-learn versions.
numerical_features = (
    numerical_frame
    .fillna(numerical_frame.median())
    .fillna(0)
    .to_numpy(dtype=float)
)

# One fixed-size amenity embedding per listing, stacked into a 2-D array.
embedded_features = np.array(df_listing['embedded_amenities'].tolist())

# Feature layout: [amenity embedding | bedrooms, bathrooms, accommodates, minimum_nights]
X = np.hstack((embedded_features, numerical_features))
y = df_listing['price'].values

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)


# Test the Model

In [None]:
# Amenities of the hypothetical listing whose price we want to estimate.
new_amenities = [
    "Hair dryer", "Hangers", "Long term stays allowed", "Host greets you",
    "Bathtub", "Luggage dropoff allowed", "Iron", "Essentials",
    "Free washer – In building", "Elevator", "Free dryer – In building",
    "Courtyard view", "Smoke alarm", "TV", "Garden view",
    "Dishes and silverware", "Shared backyard – Not fully fenced",
    "Outdoor playground", "Heating", "Hot water", "Shampoo", "Bed linens",
    "Extra pillows and blankets", "Lock on bedroom door",
    "Fast wifi – 399 Mbps", "Park view", "Refrigerator", "Microwave",
    "Coffee maker",
]

# Embed each amenity separately, then mean-pool into a single (1, dim) row vector.
new_amenities_embedding = np.asarray(embedding_model.encode(new_amenities))
new_amenities_embedding = new_amenities_embedding.mean(axis=0, keepdims=True)

# Numeric features, listed in the same order as the training columns:
# bedrooms, bathrooms, accommodates, minimum_nights.
new_listing = {
    'bedrooms': 2,
    'bathrooms': 1,
    'accommodates': 2,
    'minimum_nights': 2,
}
new_numerical_features = np.array([list(new_listing.values())])

# Same layout as the training matrix: amenity embedding first, numeric features last.
new_features = np.concatenate([new_amenities_embedding, new_numerical_features], axis=1)

# Predict the price for the new listing.
predicted_price = model.predict(new_features)
print(predicted_price)