## Setting up the Notebook

In [67]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise 

In [68]:
%load_ext autoreload
%autoreload 2
from cos_similarity import similarity

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the Data
Load preprocessed data

In [69]:
# Read the preprocessed listings from disk and preview the first few rows.
df_sample = pd.read_csv(filepath_or_buffer="data/clean_data.csv")
df_sample.head()

Unnamed: 0,property_type,tenure,built_year,num_beds,num_baths,size_sqft,lat,lng,price
0,0,0,1988.0,3.0,2.0,1115,1.414399,103.837196,514500.0
1,0,0,1992.0,4.0,2.0,1575,1.372597,103.875625,995400.0
2,2,2,2022.0,4.0,6.0,3070,1.298773,103.895798,8485000.0
3,2,2,2023.0,3.0,2.0,958,1.312364,103.803271,2626000.0
4,2,0,2026.0,2.0,1.0,732,1.273959,103.843635,1764000.0


## Binary Utility Matrix:
### Generate input data 
The 50 viewing records are generated in the following way: 
- 45 property records are generated by using the Pairwise Item Similarity recommender system: 45 records are chosen at random from the top 100 recommendations.
- 5 property records are selected from the rest of the dataset randomly.

In [70]:
# Query listing (row 10) and the size of the candidate pool requested from `similarity`.
row_id, k = 10, 100
# NOTE(review): `similarity` is a project helper imported from cos_similarity;
# presumably it returns the k rows of df_sample most similar to the query — confirm.
top_100 = similarity(row_id, df_sample, k)

In [71]:
# Fix NumPy's global seed so the draw is repeatable (pandas falls back to the
# global NumPy RNG when no random_state is passed), then keep 45 of the candidate
# rows as the "relevant" part of the viewing history.
np.random.seed(seed=0)
relavent = top_100.sample(n=45)

In [72]:
# Re-seed, then draw 5 rows from what remains once the relevant rows are removed.
# drop_duplicates(keep=False) compares row *values* and discards every row that
# occurs more than once in the concatenation, so any df_sample row equal to a
# relevant row (or duplicated inside df_sample itself) is excluded from the pool.
# NOTE(review): only the 45 sampled rows are excluded here, not the whole top_100
# pool — confirm that is the intended meaning of "the rest of the dataset".
np.random.seed(seed=0)
unrelavent = (
    pd.concat([df_sample, relavent])
    .drop_duplicates(keep=False)
    .sample(n=5)
)

In [73]:
# Stack the relevant rows on top of the unrelated rows to form the viewing history.
history_binary = pd.concat(objs=[relavent, unrelavent], axis=0)
history_binary

Unnamed: 0,property_type,tenure,built_year,num_beds,num_baths,size_sqft,lat,lng,price
12357,2,2,2023.0,2.0,1.0,667,1.312364,103.803271,1838600.0
16535,2,2,2022.0,2.0,1.0,635,1.344334,103.87869,1545600.0
3772,2,2,2023.0,2.0,1.0,646,1.329703,103.905683,1374900.0
14186,2,2,2023.0,2.0,1.0,753,1.312076,103.804055,2089500.0
3904,2,2,2022.0,2.0,1.0,635,1.344334,103.87869,1522500.0
1126,2,2,2022.0,2.0,1.0,635,1.344334,103.87869,1545600.0
15909,2,2,2023.0,2.0,1.0,689,1.308766,103.809065,1680000.0
2770,2,2,2022.0,2.0,1.0,635,1.344334,103.87869,1470000.0
12448,2,2,2023.0,2.0,1.0,581,1.312364,103.803271,1459500.0
12863,2,2,2022.0,2.0,1.0,635,1.344334,103.87869,1545600.0


### Calculate User Profile

In [74]:
# Standardise every viewed listing with the dataset-wide column mean and standard
# deviation, so all features are on a comparable scale.
history_binary_normalized = history_binary.sub(df_sample.mean()).div(df_sample.std())
# The user profile is the column-wise average of the standardised history,
# wrapped in a one-row DataFrame so it can be passed to `similarity`.
profile_binary = pd.DataFrame([history_binary_normalized.mean()])
profile_binary

Unnamed: 0,property_type,tenure,built_year,num_beds,num_baths,size_sqft,lat,lng,price
0,0.762509,1.293592,0.757003,-0.726826,-0.979221,-0.076721,-0.064416,-0.001504,-0.012176


### Get top k recommendations

In [75]:
# Number of recommendations requested for the averaged user profile.
k = 3
result_binary = similarity(
    profile_binary,
    df_sample,
    k,
)
result_binary

Unnamed: 0,property_type,tenure,built_year,num_beds,num_baths,size_sqft,lat,lng,price
7412,2,2,2023.0,2.0,1.0,657,1.308766,103.809065,1869000.0
8035,2,2,2023.0,2.0,1.0,657,1.308766,103.809065,1869000.0
1268,2,2,2023.0,2.0,1.0,657,1.308766,103.809065,1876600.0


### Evaluate the performance by calculating intra-list similarity

In [76]:
# Pairwise cosine distances between the recommended rows; the diagonal
# (each row against itself) contributes zero to the sum.
result_binary_intra = pairwise.pairwise_distances(result_binary, metric='cosine')
# Average over the ordered off-diagonal pairs. The pair count is taken from the
# result itself rather than from the global `k`, which is reused with other values
# elsewhere in the notebook and would give a wrong denominator if this cell ran
# out of order or if fewer rows were returned than requested.
# The value is a mean *distance*: numbers close to 0 mean the recommended rows
# point in almost the same direction, i.e. the list is highly self-similar.
# NOTE(review): the distance is computed on the raw, unstandardised columns, so
# large-magnitude columns such as price likely dominate it — confirm this is intended.
n_recommended = len(result_binary)
result_binary_intra.sum() / (n_recommended * (n_recommended - 1))

7.097655796428626e-12

## Real-Valued Utility Matrix
### Generate input data 
The 50 viewing records are generated in the following way: 
- 25 property records are generated by using the Pairwise Item Similarity recommender system: 25 records are chosen at random from the top 50 recommendations. Each is assigned a random integer 'stay_time' from 120 up to, but not including, 180.

- 25 property records are selected randomly from the rest of the dataset, each with a random integer 'stay_time' from 30 up to, but not including, 90.

In [77]:
# Same query listing (row 10), this time with a 50-row candidate pool.
row_id, k = 10, 50
# NOTE(review): `similarity` is a project helper imported from cos_similarity;
# presumably it returns the k rows of df_sample most similar to the query — confirm.
top_50 = similarity(row_id, df_sample, k)

In [78]:
# Seed once; the row sample and the stay times below draw from the same stream,
# in that order.
np.random.seed(seed=0)
# Keep 25 of the candidate rows and standardise them with the dataset-wide
# column mean and standard deviation.
interst = top_50.sample(n=25).sub(df_sample.mean()).div(df_sample.std())
# One simulated stay time per row; randint's upper bound (180) is exclusive.
interst['stay_time'] = np.random.randint(low=120, high=180, size=len(interst))

In [79]:
# Seed once; the row sample and the stay times below draw from the same stream,
# in that order.
np.random.seed(seed=0)
# Remove every row whose values also appear in top_50 (drop_duplicates with
# keep=False discards all copies of a repeated row), sample 25 of the remainder,
# and standardise them with the dataset-wide column mean and standard deviation.
uninterst = (
    pd.concat([df_sample, top_50])
    .drop_duplicates(keep=False)
    .sample(n=25)
    .sub(df_sample.mean())
    .div(df_sample.std())
)
# One simulated stay time per row; randint's upper bound (90) is exclusive.
uninterst['stay_time'] = np.random.randint(low=30, high=90, size=len(uninterst))

In [80]:
# Stack the long-stay rows on top of the short-stay rows to form the viewing history.
history_realvalue = pd.concat(objs=[interst, uninterst], axis=0)

### Calculate the user rating based on the time the user spends on each property page.

In [81]:
# Turn stay time into a rating by z-scoring it across the viewing history:
# above-average stays get a positive rating, below-average stays a negative one.
stay_time_col = history_realvalue['stay_time']
history_realvalue['rating'] = stay_time_col.sub(stay_time_col.mean()).div(stay_time_col.std())

### Generate user profile based on the normalized rating

In [82]:
# Weight each viewed listing's standardised features by that listing's rating.
# The rating column must come from `history_realvalue`; the previous reference to
# `history` named a variable that is never defined in this notebook and fails with
# a NameError on a fresh kernel.
feature_columns = ["property_type", "tenure", "built_year", "num_beds", "num_baths", "size_sqft", "lat", "lng", "price"]
# axis="index" aligns the rating Series with the rows, so every feature in a row
# is scaled by that row's rating.
history_normalize = history_realvalue[feature_columns].multiply(history_realvalue["rating"], axis="index")

In [83]:
# Sum the rating-weighted features over all viewed listings and wrap the totals
# in a one-row DataFrame so the profile can be passed to `similarity`.
user_profile = pd.DataFrame([history_normalize.sum(axis=0)])
user_profile

Unnamed: 0,property_type,tenure,built_year,num_beds,num_baths,size_sqft,lat,lng,price
0,16.726666,28.305198,19.708695,-17.81722,-23.01488,-1.348344,-0.556777,-0.164371,-0.106868


### Get top k recommendations

In [85]:
# Number of recommendations requested for the rating-weighted user profile.
k = 3
result_real_value = similarity(
    user_profile,
    df_sample,
    k,
)
result_real_value

Unnamed: 0,property_type,tenure,built_year,num_beds,num_baths,size_sqft,lat,lng,price
15681,2,2,2024.0,2.0,1.0,721,1.339338,103.763893,1746200.0
13626,2,2,2024.0,2.0,1.0,603,1.339338,103.763893,1702000.0
11481,2,2,2024.0,2.0,1.0,603,1.339338,103.763893,1344000.0


### Evaluate the performance by calculating intra-list similarity

In [87]:
# Pairwise cosine distances between the recommended rows; the diagonal
# (each row against itself) contributes zero to the sum.
result_real_value_intra = pairwise.pairwise_distances(result_real_value, metric='cosine')
# Average over the ordered off-diagonal pairs. The pair count is taken from the
# result itself rather than from the global `k`, which is reused with other values
# elsewhere in the notebook and would give a wrong denominator if this cell ran
# out of order or if fewer rows were returned than requested.
# The value is a mean *distance*: numbers close to 0 mean the recommended rows
# point in almost the same direction, i.e. the list is highly self-similar.
# NOTE(review): the distance is computed on the raw, unstandardised columns, so
# large-magnitude columns such as price likely dominate it — confirm this is intended.
n_recommended = len(result_real_value)
result_real_value_intra.sum() / (n_recommended * (n_recommended - 1))

3.92936321628549e-08