## Introduction

Notebook ini digunakan untuk membuat recommender system berdasarkan preferensi pelanggan AirBnB

## Import Libraries

In [88]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from feature_engine.outliers import Winsorizer
from sklearn.compose import ColumnTransformer

## Exploratory Data Analysis

In [2]:
df = pd.read_csv('listings_clustered.csv')
df.head()

Unnamed: 0,id,listing_url,picture_url,name,description,property_type,room_type,accommodates,number_of_reviews,bedrooms,beds,price,review_scores_rating,city,latitude,longitude,bathrooms,cluster
0,52438122,https://www.airbnb.com/rooms/52438122,https://a0.muscache.com/pictures/miso/Hosting-...,Stunning New Cottage minutes to downtown Ashev...,Come relax in this brand new cottage with tast...,Entire cottage,Entire home/apt,4,63,2.0,2.0,225.0,4.98,Asheville,35.56967,-82.63193,1.5,1
1,22119778,https://www.airbnb.com/rooms/22119778,https://a0.muscache.com/pictures/bac6ce5d-d2ff...,Large king suite with private balcony and soak...,Whether you're looking for a romantic Ashevill...,Private room in bed and breakfast,Private room,3,1,2.0,2.0,306.0,5.0,Asheville,35.60284,-82.56727,1.0,1
2,47812966,https://www.airbnb.com/rooms/47812966,https://a0.muscache.com/pictures/324713f3-ea1c...,Blue Ridge Magic: Artist's Hideaway (Brand New),New listing: Blue Ridge Magic named for our pa...,Entire rental unit,Entire home/apt,4,25,1.0,2.0,108.0,5.0,Asheville,35.58475,-82.57182,1.0,1
3,46284932,https://www.airbnb.com/rooms/46284932,https://a0.muscache.com/pictures/miso/Hosting-...,Simple and Cozy Bedroom in Central Area,Looking for a simple bedroom and bathroom to r...,Private room in bungalow,Private room,2,3,1.0,1.0,70.0,5.0,Asheville,35.5787,-82.61582,1.0,1
4,48366092,https://www.airbnb.com/rooms/48366092,https://a0.muscache.com/pictures/prohost-api/H...,"Hot tub, Fire pit, 5 miles to downtown Asheville","Beautiful 2 bedroom oasis. Custom built, fully...",Entire cottage,Entire home/apt,4,143,2.0,2.0,114.0,4.92,Asheville,35.55106,-82.51424,1.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28923 entries, 0 to 28922
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    28923 non-null  int64  
 1   listing_url           28923 non-null  object 
 2   picture_url           28923 non-null  object 
 3   name                  28923 non-null  object 
 4   description           28923 non-null  object 
 5   property_type         28923 non-null  object 
 6   room_type             28923 non-null  object 
 7   accommodates          28923 non-null  int64  
 8   number_of_reviews     28923 non-null  int64  
 9   bedrooms              28923 non-null  float64
 10  beds                  28923 non-null  float64
 11  price                 28923 non-null  float64
 12  review_scores_rating  28923 non-null  float64
 13  city                  28923 non-null  object 
 14  latitude              28923 non-null  float64
 15  longitude          

In [11]:
tfidf_vectorizer1 = TfidfVectorizer(stop_words='english')
tfidf_matrix1 = tfidf_vectorizer1.fit_transform(df['description'])

tfidf_vectorizer2 = TfidfVectorizer(stop_words='english')
tfidf_matrix2 = tfidf_vectorizer2.fit_transform(df['city'])

tfidf_matrix_combined = hstack([tfidf_matrix1, tfidf_matrix2])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28923 entries, 0 to 28922
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    28923 non-null  int64  
 1   listing_url           28923 non-null  object 
 2   picture_url           28923 non-null  object 
 3   name                  28923 non-null  object 
 4   description           28923 non-null  object 
 5   property_type         28923 non-null  object 
 6   room_type             28923 non-null  object 
 7   accommodates          28923 non-null  int64  
 8   number_of_reviews     28923 non-null  int64  
 9   bedrooms              28923 non-null  float64
 10  beds                  28923 non-null  float64
 11  price                 28923 non-null  float64
 12  review_scores_rating  28923 non-null  float64
 13  city                  28923 non-null  object 
 14  latitude              28923 non-null  float64
 15  longitude          

In [59]:
df.nunique()

id                      28923
listing_url             28923
picture_url             28767
name                    28567
description             28184
property_type              97
room_type                   4
accommodates               16
number_of_reviews         571
bedrooms                   13
beds                       25
price                    1272
review_scores_rating      160
city                       31
latitude                27200
longitude               27093
bathrooms                  22
cluster                     3
amenities               28297
dtype: int64

In [82]:
X = df[['description','city','accommodates','number_of_reviews','bedrooms','beds','price','review_scores_rating','bathrooms']].copy()

X_rec = X.drop(['description'], axis=1).copy()

In [83]:
num_col = ['accommodates','number_of_reviews','bedrooms','beds','price','review_scores_rating','bathrooms']

pipeline = Pipeline([
    ('winsorizer', Winsorizer(capping_method='iqr', tail='both', fold=3, variables=num_col)),
    ('scaler', RobustScaler())
    ])

num_scaled = pipeline.fit_transform(X[num_col])

# Add to the feature matrix
feature_matrix = hstack([tfidf_matrix_combined, num_scaled])

X_rec[num_col] = pipeline.fit_transform(X_rec[num_col])

In [115]:
X_rec = pd.get_dummies(X_rec)

In [74]:
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

In [76]:
df_rec = pd.DataFrame(cosine_sim, index=df['name'], columns=df['name'])

def sorting(name):
  tmp = df_rec[name].drop(index=name).sort_values(ascending=False).iloc[:5]
  print(f'You like {name}, so based on our recommender system, We recommend you to stay in:')
  for i,name in enumerate(tmp.index):
    print(f'{i+1}. {name}')

In [92]:
df.review_scores_rating.median()

4.88

In [119]:
def get_recommendations(name=None, city=None, price=150, accommodates=2, beds=1, bedrooms=1, bathrooms=1):

    # Create a user profile
    user_profile = {'name': name, 'city': city, 'price': price, 'accommodates':accommodates, 'beds':beds,
                    'bedrooms':bedrooms, 'bathrooms':bathrooms}

    # Convert the user profile to a DataFrame
    user_df = pd.DataFrame([user_profile])

    user_df = pd.get_dummies(user_df)

    # Compute similarity scores
    sim_scores = cosine_similarity(user_df, X_rec)



In [103]:
sorting('House in West Asheville/River Arts District')

You like House in West Asheville/River Arts District, so based on our recommender system, We recommend you to stay in:
1. Large 3BR Asheville Home - Quiet Neighborhood!
2. Comfortable Modern West Asheville home
3. Modern 3-bedroom house in a great location!
4. Cozy Asheville get away 15 minutes to downtown !!
5. Perfect vacation home 2


In [120]:
get_recommendations(city='New York City')

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 6 while Y.shape[1] == 38