In [6]:
from datetime import datetime, timedelta
import os

import altair as alt
import pandas as pd
from torch import float16
from transformers import AutoTokenizer, pipeline
from superlinked import framework as sl

# --- Display configuration -------------------------------------------------
# Show long text columns (e.g. listing descriptions) without truncation, lift
# Altair's row cap, and use the renderer supplied by Superlinked.
pd.set_option("display.max_colwidth", 1000)
alt.data_transformers.disable_max_rows()
alt.renderers.enable(sl.get_altair_renderer())

# --- Notebook constants ----------------------------------------------------
# Number of results requested from each query.
TOP_N = 8
# Fixed "now" passed to the executor context below.
# NOTE(review): the name says "start of 2024" but the date is January 2nd, and
# a naive datetime is interpreted in the machine's local timezone by
# `.timestamp()` — confirm both are intended.
START_OF_2024_TS = int(datetime(2024, 1, 2).timestamp())
EXECUTOR_DATA = {
    sl.CONTEXT_COMMON: {
        sl.CONTEXT_COMMON_NOW: START_OF_2024_TS,
    },
}


Path to dataset files: /Users/amlk/.cache/kagglehub/datasets/henriquejborges/house-prices-in-sweden/versions/9


In [13]:
import pandas as pd

# Location of the listings CSV. Defaults to the original path but can be
# overridden with the LISTINGS_CSV_PATH environment variable so the notebook
# is not tied to one machine's directory layout.
LISTINGS_CSV_PATH = os.environ.get("LISTINGS_CSV_PATH", "/home/jovyan/MLOps/listings.csv")


def _parse_price(value):
    """Convert a price cell to a float.

    Strings such as "$1,234.00" have the currency sign and thousands
    separators removed before conversion. Unlike slicing off the first
    character, this does not drop a leading digit when the "$" is absent.
    Non-string values (floats, including NaN) are returned unchanged.
    """
    if isinstance(value, str):
        return float(value.replace("$", "").replace(",", "").strip())
    return value


airbnb_data = pd.read_csv(LISTINGS_CSV_PATH)

# The price column may hold both strings and floats; normalise to numbers.
airbnb_data["price"] = airbnb_data["price"].apply(_parse_price)

# Encode the 't' flag as 1 and anything else (including missing values) as 0.
airbnb_data["host_is_superhost"] = airbnb_data["host_is_superhost"].eq("t").astype("int64")

# NOTE(review): this column is later mapped onto the schema's Timestamp field;
# confirm the parser accepts pandas datetimes rather than epoch integers.
airbnb_data["last_scraped"] = pd.to_datetime(airbnb_data["last_scraped"])

# Assign the filled column back instead of `fillna(..., inplace=True)` on a
# column selection: the in-place form is chained assignment, which warns on
# recent pandas and does not update the frame under Copy-on-Write.
airbnb_data["review_scores_rating"] = airbnb_data["review_scores_rating"].fillna(0)
                          

11549

In [8]:
# Show the column labels of the loaded listings frame, to see which fields are
# available for the schema and the parser mapping.
airbnb_data.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [12]:
class DataSchema(sl.Schema):
    """Superlinked schema describing one listing row.

    The DataFrame parser in this notebook explicitly maps `id` to the "id"
    column and `date` to the "last_scraped" column.
    NOTE(review): the remaining fields have no explicit mapping and presumably
    resolve to identically named DataFrame columns — confirm.
    """

    # Unique identifier of the listing.
    id: sl.IdField
    # Timestamp used by the recency space.
    date: sl.Timestamp
    name: sl.String
    # Free text that is chunked and embedded by the text-similarity space.
    description: sl.String
    review_scores_communication: sl.Float
    review_scores_location: sl.Float
    review_scores_cleanliness: sl.Float
    bedrooms: sl.Float
    beds: sl.Float
    bathrooms: sl.Float
    bathrooms_text: sl.String
    # Disabled field. NOTE(review): the columns listing shows `room_type`, not
    # `room_text` — confirm the intended name before re-enabling.
    # room_text: sl.String
    # Rating used by the number space (configured there with range 0.0-5.0).
    review_scores_rating: sl.Float
    # Stored as 1/0 after the cleaning step.
    host_is_superhost: sl.Integer
    price: sl.Float
    amenities: sl.StringList
    number_of_reviews: sl.Integer

In [None]:
# Schema instance; its fields (e.g. `airbnb.description`, `airbnb.date`) are
# what the spaces, the parser mapping and the query refer to.
airbnb = DataSchema()

In [38]:
%%capture
# Text relevance: embed the listing description in overlapping chunks.
description_chunks = sl.chunk(airbnb.description, chunk_size=100, chunk_overlap=20)
relevance_space = sl.TextSimilaritySpace(
    text=description_chunks,
    model="sentence-transformers/all-mpnet-base-v2",
)

# Recency: a single 300-day period over the listing's timestamp field.
recency_periods = [sl.PeriodTime(timedelta(days=300))]
recency_space = sl.RecencySpace(
    timestamp=airbnb.date,
    period_time_list=recency_periods,
)

# Review quality: overall rating on a 0.0-5.0 range, in MAXIMUM mode.
review_space = sl.NumberSpace(
    number=airbnb.review_scores_rating,
    min_value=0.0,
    max_value=5.0,
    mode=sl.Mode.MAXIMUM,
)


In [None]:
# Combine the three spaces into one index.
airbnb_index = sl.Index([relevance_space, recency_space, review_space])

# Only `id` and `date` need an explicit column mapping.
column_mapping = {airbnb.id: "id", airbnb.date: "last_scraped"}
airbnb_parser = sl.DataFrameParser(airbnb, mapping=column_mapping)
source: sl.InMemorySource = sl.InMemorySource(airbnb, parser=airbnb_parser)

# Start the in-memory executor with the fixed "now" context, then ingest the
# listings frame through the source.
executor = sl.InMemoryExecutor(
    sources=[source],
    indices=[airbnb_index],
    context_data=EXECUTOR_DATA,
)
app = executor.run()
source.put([airbnb_data])

In [None]:
# Per-space weights are supplied at query time through named parameters.
space_weights = {
    relevance_space: sl.Param("relevance_weight"),
    recency_space: sl.Param("recency_weight"),
    review_space: sl.Param("review_weight"),
}

# Parameterised query: search text against the relevance space, returning all
# schema fields, capped at a caller-supplied limit.
knowledgebase_query = sl.Query(airbnb_index, weights=space_weights)
knowledgebase_query = knowledgebase_query.find(airbnb)
knowledgebase_query = knowledgebase_query.similar(relevance_space, sl.Param("search_query"))
knowledgebase_query = knowledgebase_query.select_all()
knowledgebase_query = knowledgebase_query.limit(sl.Param("limit"))

In [None]:
# Natural-language search text passed to the query as `search_query`.
query_text: str = "Where is the apartment in Stockholm in old town?"

In [None]:
# Relevance-only run: recency and review weights are zeroed so ranking depends
# solely on text similarity to `query_text`.
relevance_only_params = {
    "relevance_weight": 1,
    "recency_weight": 0,
    "review_weight": 0,
    "search_query": query_text,
    "limit": TOP_N,
}
only_relevance_result = app.query(knowledgebase_query, **relevance_only_params)

# Convert the query result to a DataFrame for inspection.
df = sl.PandasConverter.to_pandas(only_relevance_result)
# Date-column formatting is currently disabled:
# sl.PandasConverter.format_date_column(df, "date", "last_fetched")