file version v0.2

## A recommender system based on listing numeric data

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

RANDOM_STATE= 42

from sklearn.metrics.pairwise import cosine_similarity

### Prepare data

#### Basic additional cleaning

In [2]:
# load dataset that has already been cleaned
raw_df = pd.read_pickle('../data/data_cleaned/cleaned_listing_and_review_with_polarity.zip')
print(raw_df.shape)

(4933, 73)


In [3]:
raw_df.columns

Index(['listing_id', 'listing_url', 'last_scraped', 'listing_name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bedrooms',
       'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights',
       'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'av

Some columns we decided to remove with the reason in comments

In [4]:
feature_to_remove = ['host_total_listings_count','host_listings_count', # same description with different values, use calculated_host_listings_count instead
                    'minimum_minimum_nights', 'maximum_minimum_nights', # those num are from calender
                    'minimum_maximum_nights', 'maximum_maximum_nights', # they're constantly changing 
                    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', # so do not make much sense

                    'has_availability','availability_30',
                    'availability_60', 'availability_90','availability_365',
                    #'reviewer_count', #TODO: remove it, already has it # REMOVED
                    #'host_number_of_year', # TODO: rename it as host_operating_years #RENAMEED
                    ]

In [5]:
df_model = raw_df.drop(columns=feature_to_remove)
df_model.shape

(4933, 60)

In [6]:
df_model.isna().sum()

listing_id                                        0
listing_url                                       0
last_scraped                                      0
listing_name                                      0
description                                       0
neighborhood_overview                             0
picture_url                                       0
host_id                                           0
host_url                                          0
host_name                                         0
host_since                                        0
host_location                                     0
host_about                                        0
host_response_time                                0
host_response_rate                                0
host_acceptance_rate                              0
host_is_superhost                                 0
host_picture_url                                  0
host_neighbourhood                                0
host_verific

In [7]:
# check if any columns have Nan... 
df_model.columns[df_model.isna().any()].tolist()

['comments', 'polarity']

In [8]:
# remove na polarity rows, polarity is numeric, goal is to get all numeric columns
df_model = df_model.dropna()
print(df_model.columns[df_model.isna().any()].tolist(), df_model.shape)

[] (4086, 60)


In [9]:
df_model.head(5)

Unnamed: 0,listing_id,listing_url,last_scraped,listing_name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_picture_url,host_neighbourhood,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,has_license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,bathrooms_count,amenities_count,host_response_time_encoded,host_operate_years,comments,polarity
0,49113826,https://www.airbnb.com/rooms/49113826,2022-12-24,Private Seattle Getaway Home Near Lake Washing...,A Seattle vacation destination for those from ...,SeaTac International Airport: 14 min drive<br ...,https://a0.muscache.com/pictures/miso/Hosting-...,188538325,https://www.airbnb.com/users/show/188538325,Xiao,2018-05-08,"Seattle, WA",,within an hour,1.0,1.0,1,https://a0.muscache.com/im/pictures/user/32fd3...,Oakland,"['email', 'phone']",1,1,Rainier View,Rainier Valley,47.51028,-122.24668,Entire home,Entire home/apt,8,4.0,4.0,"[""Hot water"", ""Dining table"", ""Toaster"", ""Refr...",280.0,3,30,18,7,1,2021-06-22,2022-11-29,4.83,4.89,4.94,4.89,4.89,4.61,4.72,1,0,1,1,0,0,0.98,2.0,53,1,4,Unexpectedly the place was like new home since...,0.817365
1,7455832,https://www.airbnb.com/rooms/7455832,2022-12-24,Classic remodeled in Georgetown,Enjoy your Seattle stay in the vibrant & artis...,Georgetown is a cool neighborhood about three ...,https://a0.muscache.com/pictures/miso/Hosting-...,2144954,https://www.airbnb.com/users/show/2144954,Ryan,2012-04-15,"Seattle, WA",,within an hour,1.0,1.0,0,https://a0.muscache.com/im/pictures/user/a6711...,Greater Duwamish,"['email', 'phone', 'work_email']",1,0,Georgetown,Other neighborhoods,47.544739,-122.319786,Entire home,Entire home/apt,6,3.0,3.0,"[""Hot water"", ""Crib - available upon request"",...",156.0,1,150,27,27,3,2022-06-09,2022-12-13,4.89,4.96,4.89,4.93,4.96,4.93,4.93,1,1,2,2,0,0,4.07,2.0,55,1,10,The house is so comfortable and clean. It has ...,0.913648
2,42313537,https://www.airbnb.com/rooms/42313537,2022-12-24,Beautifully renovated cottage studio at Alki,"Welcome to your private, quiet retreat, featur...","Only 1-block away, Alki Beach is perfect for a...",https://a0.muscache.com/pictures/6ba9d3d0-68d2...,10181843,https://www.airbnb.com/users/show/10181843,Kristin & Stuart,2013-11-21,"Seattle, WA",New Mexico and Texas natives who couldn't wait...,within an hour,1.0,1.0,1,https://a0.muscache.com/im/pictures/user/c5d48...,West Seattle,"['email', 'phone']",1,1,Alki,West Seattle,47.5738,-122.41552,Entire guesthouse,Entire home/apt,2,1.0,1.0,"[""Hot water"", ""Toaster"", ""Refrigerator"", ""Mini...",135.0,1,31,177,82,4,2021-02-24,2022-12-21,4.99,4.99,5.0,5.0,5.0,4.99,4.94,1,0,1,1,0,0,7.94,1.0,40,1,9,Kristin and Stuart’s cottage is the perfect va...,0.881181
4,46727219,https://www.airbnb.com/rooms/46727219,2022-12-24,"Classic, two-story home w/ high-speed WiFi, fu...",<b>The space</b><br />Northlake Excellence<br ...,,https://a0.muscache.com/pictures/prohost-api/H...,111812937,https://www.airbnb.com/users/show/111812937,Vacasa Washington,2017-01-16,,Vacasa\nVacation Home Management\n\nVacasa unl...,within an hour,0.97,0.99,0,https://a0.muscache.com/im/pictures/user/4a227...,Capitol Hill,"['email', 'phone', 'work_email']",1,1,Wallingford,Other neighborhoods,47.64952,-122.33891,Entire home,Entire home/apt,6,3.0,3.0,"[""Hot water"", ""Refrigerator"", ""Bathtub"", ""Dish...",130.0,2,1125,34,20,0,2021-02-06,2022-08-28,4.74,4.76,4.82,4.88,4.71,4.94,4.65,1,1,27,27,0,0,1.48,1.0,32,1,5,Good location. A great spot in the trendiest p...,0.780712
6,16962405,https://www.airbnb.com/rooms/16962405,2022-12-24,Spectacular Apt in 5-STAR Home -Mt. & Sunset v...,* PRIVATE entry to 2+ B/R Luxury Apt. with ow...,"Queen Anne is a quiet, residential walking nei...",https://a0.muscache.com/pictures/2d7918da-4568...,40706640,https://www.airbnb.com/users/show/40706640,Cathy,2015-08-06,"Seattle, WA",,within an hour,1.0,0.95,1,https://a0.muscache.com/im/users/40706640/prof...,Queen Anne,"['email', 'phone']",1,1,West Queen Anne,Queen Anne,47.63194,-122.36813,Entire guest suite,Entire home/apt,6,2.0,2.0,"[""Hot water"", ""Refrigerator"", ""Dishwasher"", ""L...",286.0,3,1125,98,23,3,2017-04-16,2022-12-21,4.94,4.96,4.91,4.99,4.99,4.97,4.9,1,0,1,1,0,0,1.41,1.0,46,1,7,We loved staying at this clean and convenientl...,0.920071


Reset index so easier to debug later

In [10]:
df_model.reset_index(inplace=True, drop=True)
df_model.head(5)

Unnamed: 0,listing_id,listing_url,last_scraped,listing_name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_picture_url,host_neighbourhood,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,has_license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,bathrooms_count,amenities_count,host_response_time_encoded,host_operate_years,comments,polarity
0,49113826,https://www.airbnb.com/rooms/49113826,2022-12-24,Private Seattle Getaway Home Near Lake Washing...,A Seattle vacation destination for those from ...,SeaTac International Airport: 14 min drive<br ...,https://a0.muscache.com/pictures/miso/Hosting-...,188538325,https://www.airbnb.com/users/show/188538325,Xiao,2018-05-08,"Seattle, WA",,within an hour,1.0,1.0,1,https://a0.muscache.com/im/pictures/user/32fd3...,Oakland,"['email', 'phone']",1,1,Rainier View,Rainier Valley,47.51028,-122.24668,Entire home,Entire home/apt,8,4.0,4.0,"[""Hot water"", ""Dining table"", ""Toaster"", ""Refr...",280.0,3,30,18,7,1,2021-06-22,2022-11-29,4.83,4.89,4.94,4.89,4.89,4.61,4.72,1,0,1,1,0,0,0.98,2.0,53,1,4,Unexpectedly the place was like new home since...,0.817365
1,7455832,https://www.airbnb.com/rooms/7455832,2022-12-24,Classic remodeled in Georgetown,Enjoy your Seattle stay in the vibrant & artis...,Georgetown is a cool neighborhood about three ...,https://a0.muscache.com/pictures/miso/Hosting-...,2144954,https://www.airbnb.com/users/show/2144954,Ryan,2012-04-15,"Seattle, WA",,within an hour,1.0,1.0,0,https://a0.muscache.com/im/pictures/user/a6711...,Greater Duwamish,"['email', 'phone', 'work_email']",1,0,Georgetown,Other neighborhoods,47.544739,-122.319786,Entire home,Entire home/apt,6,3.0,3.0,"[""Hot water"", ""Crib - available upon request"",...",156.0,1,150,27,27,3,2022-06-09,2022-12-13,4.89,4.96,4.89,4.93,4.96,4.93,4.93,1,1,2,2,0,0,4.07,2.0,55,1,10,The house is so comfortable and clean. It has ...,0.913648
2,42313537,https://www.airbnb.com/rooms/42313537,2022-12-24,Beautifully renovated cottage studio at Alki,"Welcome to your private, quiet retreat, featur...","Only 1-block away, Alki Beach is perfect for a...",https://a0.muscache.com/pictures/6ba9d3d0-68d2...,10181843,https://www.airbnb.com/users/show/10181843,Kristin & Stuart,2013-11-21,"Seattle, WA",New Mexico and Texas natives who couldn't wait...,within an hour,1.0,1.0,1,https://a0.muscache.com/im/pictures/user/c5d48...,West Seattle,"['email', 'phone']",1,1,Alki,West Seattle,47.5738,-122.41552,Entire guesthouse,Entire home/apt,2,1.0,1.0,"[""Hot water"", ""Toaster"", ""Refrigerator"", ""Mini...",135.0,1,31,177,82,4,2021-02-24,2022-12-21,4.99,4.99,5.0,5.0,5.0,4.99,4.94,1,0,1,1,0,0,7.94,1.0,40,1,9,Kristin and Stuart’s cottage is the perfect va...,0.881181
3,46727219,https://www.airbnb.com/rooms/46727219,2022-12-24,"Classic, two-story home w/ high-speed WiFi, fu...",<b>The space</b><br />Northlake Excellence<br ...,,https://a0.muscache.com/pictures/prohost-api/H...,111812937,https://www.airbnb.com/users/show/111812937,Vacasa Washington,2017-01-16,,Vacasa\nVacation Home Management\n\nVacasa unl...,within an hour,0.97,0.99,0,https://a0.muscache.com/im/pictures/user/4a227...,Capitol Hill,"['email', 'phone', 'work_email']",1,1,Wallingford,Other neighborhoods,47.64952,-122.33891,Entire home,Entire home/apt,6,3.0,3.0,"[""Hot water"", ""Refrigerator"", ""Bathtub"", ""Dish...",130.0,2,1125,34,20,0,2021-02-06,2022-08-28,4.74,4.76,4.82,4.88,4.71,4.94,4.65,1,1,27,27,0,0,1.48,1.0,32,1,5,Good location. A great spot in the trendiest p...,0.780712
4,16962405,https://www.airbnb.com/rooms/16962405,2022-12-24,Spectacular Apt in 5-STAR Home -Mt. & Sunset v...,* PRIVATE entry to 2+ B/R Luxury Apt. with ow...,"Queen Anne is a quiet, residential walking nei...",https://a0.muscache.com/pictures/2d7918da-4568...,40706640,https://www.airbnb.com/users/show/40706640,Cathy,2015-08-06,"Seattle, WA",,within an hour,1.0,0.95,1,https://a0.muscache.com/im/users/40706640/prof...,Queen Anne,"['email', 'phone']",1,1,West Queen Anne,Queen Anne,47.63194,-122.36813,Entire guest suite,Entire home/apt,6,2.0,2.0,"[""Hot water"", ""Refrigerator"", ""Dishwasher"", ""L...",286.0,3,1125,98,23,3,2017-04-16,2022-12-21,4.94,4.96,4.91,4.99,4.99,4.97,4.9,1,0,1,1,0,0,1.41,1.0,46,1,7,We loved staying at this clean and convenientl...,0.920071


#### TODO: Add in categorical values using one hot encoding

One hot encoding is a common strategy to turn a categorical column into multiple numeric columns so we can more easily use numeric methods on them such as cosine_similarity later.

In [10]:
df_model.shape

(4086, 60)

In [12]:
# X = df_model.copy()
# X = pd.get_dummies(data=X)
# X.shape

#### TODO: Should we also add in datetime types too since they are kind of numeric?

In [13]:
# Might need to get date objects into our model too?

#### TODO: Should we also add in clustering results as a column?

In [14]:
# If we decide to add in the cluster id, would be great if we had a 
# cleaned data set that had the cluster id once the clustering research
# is complete

#### Take all the numerical columns as features for our model

TODO: should we take all the numeric features?

In [15]:
df_model.shape

(4086, 60)

In [16]:
df_model.dtypes

listing_id                                               int64
listing_url                                             object
last_scraped                                    datetime64[ns]
listing_name                                            object
description                                             object
neighborhood_overview                                   object
picture_url                                             object
host_id                                                  int64
host_url                                                object
host_name                                               object
host_since                                      datetime64[ns]
host_location                                           object
host_about                                              object
host_response_time                                      object
host_response_rate                                     float64
host_acceptance_rate                                   

In [18]:
number_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df_model_num = df_model.select_dtypes(include=number_types)
df_model_num.shape

(4086, 37)

In [19]:
df_model_num.dtypes

listing_id                                        int64
host_id                                           int64
host_response_rate                              float64
host_acceptance_rate                            float64
host_is_superhost                                 int64
host_has_profile_pic                              int64
host_identity_verified                            int64
latitude                                        float64
longitude                                       float64
accommodates                                      int64
bedrooms                                        float64
beds                                            float64
price                                           float64
minimum_nights                                    int64
maximum_nights                                    int64
number_of_reviews                                 int64
number_of_reviews_ltm                             int64
number_of_reviews_l30d                          

### Find cosine similarity between all rental properties

#### Build similarity matrix

In [20]:
df_model_num.head(5)

Unnamed: 0,listing_id,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,has_license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,bathrooms_count,amenities_count,host_response_time_encoded,host_operate_years,polarity
0,49113826,188538325,1.0,1.0,1,1,1,47.51028,-122.24668,8,4.0,4.0,280.0,3,30,18,7,1,4.83,4.89,4.94,4.89,4.89,4.61,4.72,1,0,1,1,0,0,0.98,2.0,53,1,4,0.817365
1,7455832,2144954,1.0,1.0,0,1,0,47.544739,-122.319786,6,3.0,3.0,156.0,1,150,27,27,3,4.89,4.96,4.89,4.93,4.96,4.93,4.93,1,1,2,2,0,0,4.07,2.0,55,1,10,0.913648
2,42313537,10181843,1.0,1.0,1,1,1,47.5738,-122.41552,2,1.0,1.0,135.0,1,31,177,82,4,4.99,4.99,5.0,5.0,5.0,4.99,4.94,1,0,1,1,0,0,7.94,1.0,40,1,9,0.881181
4,46727219,111812937,0.97,0.99,0,1,1,47.64952,-122.33891,6,3.0,3.0,130.0,2,1125,34,20,0,4.74,4.76,4.82,4.88,4.71,4.94,4.65,1,1,27,27,0,0,1.48,1.0,32,1,5,0.780712
6,16962405,40706640,1.0,0.95,1,1,1,47.63194,-122.36813,6,2.0,2.0,286.0,3,1125,98,23,3,4.94,4.96,4.91,4.99,4.99,4.97,4.9,1,0,1,1,0,0,1.41,1.0,46,1,7,0.920071


In [21]:
similarity = cosine_similarity(df_model_num)
similarity

array([[1.        , 0.50980478, 0.4714846 , ..., 0.99273633, 0.99243226,
        0.25208508],
       [0.50980478, 1.        , 0.99903284, ..., 0.60960351, 0.40030876,
        0.96102129],
       [0.4714846 , 0.99903284, 1.        , ..., 0.57415848, 0.35962819,
        0.97224846],
       ...,
       [0.99273633, 0.60960351, 0.57415848, ..., 1.        , 0.97045024,
        0.36667893],
       [0.99243226, 0.40030876, 0.35962819, ..., 0.97045024, 1.        ,
        0.13134957],
       [0.25208508, 0.96102129, 0.97224846, ..., 0.36667893, 0.13134957,
        1.        ]])

In [22]:
similarity[500]

array([0.99904379, 0.47170479, 0.43247758, ..., 0.98652701, 0.9968519 ,
       0.20953526])

#### Recommend top n similar properties

Let's determine what columns should we show in our app

In [23]:
model_columns_all = list(df_model.columns.values)
model_columns_all[:10]

['listing_id',
 'listing_url',
 'last_scraped',
 'listing_name',
 'description',
 'neighborhood_overview',
 'picture_url',
 'host_id',
 'host_url',
 'host_name']

In [24]:
ui_display_columns = ['listing_id', 'listing_url', 
                    'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
                    'longitude', 'property_type', 'room_type', 'accommodates', 'bedrooms',
                    'beds', 'amenities', 'price', 'review_scores_rating',
                    'bathrooms_count', 'amenities_count', 'host_response_time_encoded',
                    'host_operate_years', 'polarity']

Since we're using iloc in our recommender to retrieve the rows, we need to also use the numeric index for the columns.

In [25]:
iloc_cols = [model_columns_all.index(x) for x in ui_display_columns]
iloc_cols

[0, 1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 40, 54, 55, 56, 57, 59]

Get the recommendations

In [26]:
def get_recommendations(df, similarity, n, listing_id=None, listing_url=None, query_element=None):

    # convert query into and a similarity matrix row index
    item_index = None
    try:    
        if listing_id is not None:
            item_index = df['listing_id'].tolist().index(listing_id)
        elif listing_url is not None:
            item_index = df['listing_url'].tolist().index(listing_url)
        elif query_element is not None:
            item_index = query_element
    except ValueError as error:
        print(error)

    # get the top n similar items
    top_idx = np.argsort(similarity[item_index])[::-1][:n]
    print(top_idx)
    result_df = df.iloc[top_idx, iloc_cols]

    # add in similarity score as a column
    top_scores = [similarity[item_index][x] for x in top_idx]
    result_df.insert(loc=2, column='similarity', value=top_scores)

    return result_df


### Try the recommender system

Search for properties similar to the property at index == 500

In [27]:
df_recs = get_recommendations(df_model, similarity, 5, query_element=500)

[ 500  345 1741  360 1680]


The similarities seem so close! **Too good to be true?** Probably because we haven't included our categorical values yet. Or we need feature reduction maybe. Have to display with 10 digit precision just to see the difference!

In [28]:
with pd.option_context('display.float_format', '{:0.10f}'.format):
    display(df_recs)

Unnamed: 0,listing_id,listing_url,similarity,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,amenities,price,review_scores_rating,bathrooms_count,amenities_count,host_response_time_encoded,host_operate_years,polarity
594,26258898,https://www.airbnb.com/rooms/26258898,1.0,Greenwood,Other neighborhoods,47.68454,-122.35851,Entire home,Entire home/apt,5,3.0,4.0,"[""Hot water"", ""Gas stove"", ""Refrigerator"", ""Ba...",262.0,5.0,1.5,50,1,5,0.9563
419,41218952,https://www.airbnb.com/rooms/41218952,0.9999999998,Bitter Lake,Other neighborhoods,47.7119,-122.35053,Entire guest suite,Entire home/apt,2,1.0,1.0,"[""Hot water"", ""Blender"", ""Shared patio or balc...",77.0,4.97,1.0,55,1,4,0.93568
2098,4129218,https://www.airbnb.com/rooms/4129218,0.999999963,Broadway,Capitol Hill,47.62233,-122.31947,Entire rental unit,Entire home/apt,3,2.0,2.0,"[""Hot water"", ""32\"" HDTV with standard cable"",...",140.0,4.83,1.0,42,1,8,0.8422106557
435,50159554,https://www.airbnb.com/rooms/50159554,0.9999999578,First Hill,Downtown,47.61189,-122.32485,Entire rental unit,Entire home/apt,3,1.0,2.0,"[""Hot water"", ""Private entrance"", ""Bay view"", ...",110.0,4.5,0.0,21,5,3,0.97415
2025,51263519,https://www.airbnb.com/rooms/51263519,0.9999997656,Mid-Beacon Hill,Beacon Hill,47.5589,-122.30933,Entire home,Entire home/apt,10,5.0,5.0,"[""Hot water"", ""Refrigerator"", ""Bathtub"", ""Dish...",153.0,4.7,3.0,43,1,3,0.6816222222


Search for the simular properties using listing_id

In [28]:
df_recs = get_recommendations(df_model, similarity, 5, listing_id=26258898)
with pd.option_context('display.float_format', '{:0.10f}'.format):
    display(df_recs)

[ 500  345 1741  360 1680]


Unnamed: 0,listing_id,listing_url,similarity,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,amenities,price,review_scores_rating,bathrooms_count,amenities_count,host_response_time_encoded,host_operate_years,polarity
500,26258898,https://www.airbnb.com/rooms/26258898,1.0,Greenwood,Other neighborhoods,47.68454,-122.35851,Entire home,Entire home/apt,5,3.0,4.0,"[""Hot water"", ""Gas stove"", ""Refrigerator"", ""Ba...",262.0,5.0,1.5,50,1,5,0.9563
345,41218952,https://www.airbnb.com/rooms/41218952,0.9999999998,Bitter Lake,Other neighborhoods,47.7119,-122.35053,Entire guest suite,Entire home/apt,2,1.0,1.0,"[""Hot water"", ""Blender"", ""Shared patio or balc...",77.0,4.97,1.0,55,1,4,0.93568
1741,4129218,https://www.airbnb.com/rooms/4129218,0.999999963,Broadway,Capitol Hill,47.62233,-122.31947,Entire rental unit,Entire home/apt,3,2.0,2.0,"[""Hot water"", ""32\"" HDTV with standard cable"",...",140.0,4.83,1.0,42,1,8,0.8422106557
360,50159554,https://www.airbnb.com/rooms/50159554,0.9999999578,First Hill,Downtown,47.61189,-122.32485,Entire rental unit,Entire home/apt,3,1.0,2.0,"[""Hot water"", ""Private entrance"", ""Bay view"", ...",110.0,4.5,0.0,21,5,3,0.97415
1680,51263519,https://www.airbnb.com/rooms/51263519,0.9999997656,Mid-Beacon Hill,Beacon Hill,47.5589,-122.30933,Entire home,Entire home/apt,10,5.0,5.0,"[""Hot water"", ""Refrigerator"", ""Bathtub"", ""Dish...",153.0,4.7,3.0,43,1,3,0.6816222222


Search for simular properties by listing_url

In [29]:
df_recs = get_recommendations(df_model, similarity, 5,
                              listing_url='https://www.airbnb.com/rooms/26258898')
with pd.option_context('display.float_format', '{:0.10f}'.format):
    display(df_recs)

[ 500  345 1741  360 1680]


Unnamed: 0,listing_id,listing_url,similarity,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,amenities,price,review_scores_rating,bathrooms_count,amenities_count,host_response_time_encoded,host_operate_years,polarity
500,26258898,https://www.airbnb.com/rooms/26258898,1.0,Greenwood,Other neighborhoods,47.68454,-122.35851,Entire home,Entire home/apt,5,3.0,4.0,"[""Hot water"", ""Gas stove"", ""Refrigerator"", ""Ba...",262.0,5.0,1.5,50,1,5,0.9563
345,41218952,https://www.airbnb.com/rooms/41218952,0.9999999998,Bitter Lake,Other neighborhoods,47.7119,-122.35053,Entire guest suite,Entire home/apt,2,1.0,1.0,"[""Hot water"", ""Blender"", ""Shared patio or balc...",77.0,4.97,1.0,55,1,4,0.93568
1741,4129218,https://www.airbnb.com/rooms/4129218,0.999999963,Broadway,Capitol Hill,47.62233,-122.31947,Entire rental unit,Entire home/apt,3,2.0,2.0,"[""Hot water"", ""32\"" HDTV with standard cable"",...",140.0,4.83,1.0,42,1,8,0.8422106557
360,50159554,https://www.airbnb.com/rooms/50159554,0.9999999578,First Hill,Downtown,47.61189,-122.32485,Entire rental unit,Entire home/apt,3,1.0,2.0,"[""Hot water"", ""Private entrance"", ""Bay view"", ...",110.0,4.5,0.0,21,5,3,0.97415
1680,51263519,https://www.airbnb.com/rooms/51263519,0.9999997656,Mid-Beacon Hill,Beacon Hill,47.5589,-122.30933,Entire home,Entire home/apt,10,5.0,5.0,"[""Hot water"", ""Refrigerator"", ""Bathtub"", ""Dish...",153.0,4.7,3.0,43,1,3,0.6816222222


Quick double check that our original table with all the juicy data is the same number of rows as our numeric dataframe that we used to calculate cosine_similarity. (Otherwise we might be pulling the wrong listings in our UI) Yep, both are 4086.

In [30]:
df_model.shape, df_model_num.shape

((4086, 60), (4086, 37))

In [31]:
top_idx = np.argsort(similarity[500])[::-1][:5]
#top_idx = np.argsort(similarity[500])[:5]
top_idx

array([ 500,  345, 1741,  360, 1680])

In [32]:
similarity[500][345]

0.9999999998216362

In [33]:
[similarity[500][x] for x in top_idx]

[1.0000000000000009,
 0.9999999998216362,
 0.9999999629771724,
 0.9999999578233008,
 0.9999997656263523]

In [34]:
len(similarity), len(similarity[0]), similarity[0][345]

(4086, 4086, 0.9990430489087069)

In [35]:
df_model.iloc[[345]]

Unnamed: 0,listing_id,listing_url,last_scraped,listing_name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_picture_url,host_neighbourhood,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,has_license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,bathrooms_count,amenities_count,host_response_time_encoded,host_operate_years,comments,polarity
345,41218952,https://www.airbnb.com/rooms/41218952,2022-12-24,Jewel Box studio ~ an art-filled North Seattle...,This peaceful and colorful north Seattle studi...,My area of the North Seattle neighborhood of B...,https://a0.muscache.com/pictures/e0d6bfd8-f193...,192365131,https://www.airbnb.com/users/show/192365131,Joline,2018-05-29,"Seattle, WA","I’m a professional artist, a gardener, a lover...",within an hour,1.0,0.96,1,https://a0.muscache.com/im/pictures/user/0546a...,Bitter Lake,"['email', 'phone']",1,1,Bitter Lake,Other neighborhoods,47.7119,-122.35053,Entire guest suite,Entire home/apt,2,1.0,1.0,"[""Hot water"", ""Blender"", ""Shared patio or balc...",77.0,2,180,70,31,2,2020-02-24,2022-12-18,4.97,5.0,4.99,4.99,4.99,4.9,4.97,1,0,1,1,0,0,2.03,1.0,55,1,4,The Jewel Box is such a wonderful space. It wa...,0.93568


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f2a50dc6-ff6a-45ff-9dbe-d7a35bd1e393' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>