In [553]:
import pandas as pd
from scipy.spatial.distance import pdist,squareform
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer #converting the text into feature vectors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

## Prepare the dataset

In [554]:
df=pd.read_csv('Properties_with_Descriptions.csv')
df.head()

Unnamed: 0,Property_ID,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,Garage,Jardin,Balcon,Meublé,Piscine,Etages,prix,Latitude,Longitude,Type_Land,Type_Other,Type_Residential,description
0,10,75.0,180.0,75.0,3.0,1.0,2.0,0,0,0,0,0,4.0,144278.0,36.800207,10.185776,0,0,1,This property offers 75 m² of living space on ...
1,11,85.0,180.0,192.0,3.0,1.0,3.0,0,0,0,0,0,2.0,278606.0,35.828828,10.640525,0,0,1,This property offers 85 m² of living space on ...
2,12,92.0,180.0,92.0,2.0,1.0,3.0,0,0,0,0,0,2.0,353233.0,36.401266,10.557283,0,0,1,This property offers 92 m² of living space on ...
3,13,100.0,180.0,100.0,3.0,1.0,2.0,0,0,1,0,0,2.0,248757.0,36.762436,9.833619,0,0,1,This property offers 100 m² of living space on...
4,14,152.0,152.0,192.0,4.0,2.0,3.0,1,0,0,0,0,1.0,520000.0,36.385974,10.458203,0,0,1,This property offers 152 m² of living space on...


In [555]:
df.columns

Index(['Property_ID', 'Habitable', 'Terrain', 'Total', 'Pièces',
       'Salles de bains', 'Chambres', 'Garage', 'Jardin', 'Balcon', 'Meublé',
       'Piscine', 'Etages', 'prix', 'Latitude', 'Longitude', 'Type_Land',
       'Type_Other', 'Type_Residential', 'description'],
      dtype='object')

In [556]:
df.set_index("Property_ID", inplace=True)
df.head()

Unnamed: 0_level_0,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,Garage,Jardin,Balcon,Meublé,Piscine,Etages,prix,Latitude,Longitude,Type_Land,Type_Other,Type_Residential,description
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10,75.0,180.0,75.0,3.0,1.0,2.0,0,0,0,0,0,4.0,144278.0,36.800207,10.185776,0,0,1,This property offers 75 m² of living space on ...
11,85.0,180.0,192.0,3.0,1.0,3.0,0,0,0,0,0,2.0,278606.0,35.828828,10.640525,0,0,1,This property offers 85 m² of living space on ...
12,92.0,180.0,92.0,2.0,1.0,3.0,0,0,0,0,0,2.0,353233.0,36.401266,10.557283,0,0,1,This property offers 92 m² of living space on ...
13,100.0,180.0,100.0,3.0,1.0,2.0,0,0,1,0,0,2.0,248757.0,36.762436,9.833619,0,0,1,This property offers 100 m² of living space on...
14,152.0,152.0,192.0,4.0,2.0,3.0,1,0,0,0,0,1.0,520000.0,36.385974,10.458203,0,0,1,This property offers 152 m² of living space on...


# Content based recommendation

## Calculate states similary score

### Jaccard similarity

In [557]:
#we use only categorical features
df_cat=df[['Garage', 'Jardin', 'Balcon', 'Meublé',
       'Piscine','Type_Land',
       'Type_Other', 'Type_Residential']]
df_cat.head(2)

Unnamed: 0_level_0,Garage,Jardin,Balcon,Meublé,Piscine,Type_Land,Type_Other,Type_Residential
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,0,0,0,0,0,0,0,1
11,0,0,0,0,0,0,0,1


In [558]:
jaccard_distance = pdist(df_cat.values,metric='jaccard')
print(jaccard_distance)

[0.  0.  0.5 ... 0.  0.5 0.5]


In [559]:
square_jaccard_distance=squareform(jaccard_distance)
print(square_jaccard_distance)

[[0.  0.  0.  ... 0.  0.  0.5]
 [0.  0.  0.  ... 0.  0.  0.5]
 [0.  0.  0.  ... 0.  0.  0.5]
 ...
 [0.  0.  0.  ... 0.  0.  0.5]
 [0.  0.  0.  ... 0.  0.  0.5]
 [0.5 0.5 0.5 ... 0.5 0.5 0. ]]


In [560]:
jaccard_similarity_array=1-square_jaccard_distance
print(jaccard_similarity_array)

[[1.  1.  1.  ... 1.  1.  0.5]
 [1.  1.  1.  ... 1.  1.  0.5]
 [1.  1.  1.  ... 1.  1.  0.5]
 ...
 [1.  1.  1.  ... 1.  1.  0.5]
 [1.  1.  1.  ... 1.  1.  0.5]
 [0.5 0.5 0.5 ... 0.5 0.5 1. ]]


In [561]:
jaccard_distance_df=pd.DataFrame(jaccard_similarity_array,
                                 index=df_jac.index,
                                 columns=df_jac.index)
jaccard_distance_df.head()

Property_ID,10,11,12,13,14,15,16,17,18,19,...,5000,5001,5002,5003,5004,5005,5006,5007,5008,5009
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.0,1.0,1.0,0.5,0.5,0.0,0.333333,1.0,0.5,1.0,...,0.5,0.5,1.0,0.5,1.0,0.5,0.333333,1.0,1.0,0.5
11,1.0,1.0,1.0,0.5,0.5,0.0,0.333333,1.0,0.5,1.0,...,0.5,0.5,1.0,0.5,1.0,0.5,0.333333,1.0,1.0,0.5
12,1.0,1.0,1.0,0.5,0.5,0.0,0.333333,1.0,0.5,1.0,...,0.5,0.5,1.0,0.5,1.0,0.5,0.333333,1.0,1.0,0.5
13,0.5,0.5,0.5,1.0,0.333333,0.0,0.666667,0.5,1.0,0.5,...,1.0,0.333333,0.5,0.333333,0.5,0.333333,0.666667,0.5,0.5,0.333333
14,0.5,0.5,0.5,0.333333,1.0,0.0,0.25,0.5,0.333333,0.5,...,0.333333,0.333333,0.5,0.333333,0.5,0.333333,0.25,0.5,0.5,0.333333


### Euclidien distnce (numerical values)

In [562]:
df_num=df[['Habitable', 'Terrain', 'Total', 'Pièces',
       'Salles de bains', 'Chambres','prix', 'Latitude', 'Longitude']]
df_num.head(3)

Unnamed: 0_level_0,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,prix,Latitude,Longitude
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10,75.0,180.0,75.0,3.0,1.0,2.0,144278.0,36.800207,10.185776
11,85.0,180.0,192.0,3.0,1.0,3.0,278606.0,35.828828,10.640525
12,92.0,180.0,92.0,2.0,1.0,3.0,353233.0,36.401266,10.557283


In [563]:
# Compute Euclidean distance
distance_matrix = euclidean_distances(df_num)
distance_matrix

array([[      0.        ,  134328.05133385,  208955.00138857, ...,
         283582.00592774, 1109455.06906665,  584722.04376524],
       [ 134328.05133385,       0.        ,   74627.06733709, ...,
         149254.02257611,  975127.0599153 ,  450394.03746974],
       [ 208955.00138857,   74627.06733709,       0.        , ...,
          74627.00773378,  900500.07540929,  375767.05489487],
       ...,
       [ 283582.00592774,  149254.02257611,   74627.00773378, ...,
              0.        ,  825873.06850611,  301140.04849074],
       [1109455.06906665,  975127.0599153 ,  900500.07540929, ...,
         825873.06850611,       0.        ,  524733.02730894],
       [ 584722.04376524,  450394.03746974,  375767.05489487, ...,
         301140.04849074,  524733.02730894,       0.        ]])

In [564]:
# Convert to similarity score
similarity_matrix = 1 / (1 + distance_matrix)
print(similarity_matrix)

[[1.00000000e+00 7.44440603e-06 4.78569648e-06 ... 3.52630439e-06
  9.01342584e-07 1.71021137e-06]
 [7.44440603e-06 1.00000000e+00 1.33997842e-05 ... 6.69994204e-06
  1.02550633e-06 2.22027313e-06]
 [4.78569648e-06 1.33997842e-05 1.00000000e+00 ... 1.33997949e-05
  1.11049284e-06 2.66121611e-06]
 ...
 [3.52630439e-06 6.69994204e-06 1.33997949e-05 ... 1.00000000e+00
  1.21083836e-06 3.32070306e-06]
 [9.01342584e-07 1.02550633e-06 1.11049284e-06 ... 1.21083836e-06
  1.00000000e+00 1.90572737e-06]
 [1.71021137e-06 2.22027313e-06 2.66121611e-06 ... 3.32070306e-06
  1.90572737e-06 1.00000000e+00]]


In [565]:
# Create a DataFrame
euclidean_distance_df = pd.DataFrame(similarity_matrix, 
                             index=df_num.index, 
                             columns=df_num.index)
euclidean_distance_df.head(3)

Property_ID,10,11,12,13,14,15,16,17,18,19,...,5000,5001,5002,5003,5004,5005,5006,5007,5008,5009
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.0,7e-06,5e-06,1e-05,3e-06,1e-06,1e-06,3e-06,5e-06,3e-06,...,5e-06,9e-06,2e-06,4e-06,1.2e-05,2.5e-05,6e-06,4e-06,9.013426e-07,2e-06
11,7e-06,1.0,1.3e-05,3.4e-05,4e-06,2e-06,1e-06,4e-06,1.2e-05,7e-06,...,1.9e-05,4e-05,3e-06,1e-05,2.1e-05,1.1e-05,3.5e-05,7e-06,1.025506e-06,2e-06
12,5e-06,1.3e-05,1.0,1e-05,6e-06,2e-06,2e-06,6e-06,0.000148,1.3e-05,...,4.3e-05,1e-05,4e-06,4e-05,8e-06,6e-06,2.2e-05,1.3e-05,1.110493e-06,3e-06


### Cosine Similarity (Textuel)

In [566]:
df_text=df[['description']]
df_text.head(3)

Unnamed: 0_level_0,description
Property_ID,Unnamed: 1_level_1
10,This property offers 75 m² of living space on ...
11,This property offers 85 m² of living space on ...
12,This property offers 92 m² of living space on ...


In [567]:
tfidvec  = TfidfVectorizer()
vectorized_data=tfidvec.fit_transform(df_text['description'])
print(tfidvec.get_feature_names_out())

['100' '100000' '101' '102' '103' '104' '105' '106' '1069123' '107' '108'
 '110' '111' '112' '1125554' '113' '114' '1144282' '115' '116' '117' '118'
 '119' '1194032' '120' '120000' '1200000' '122' '122985' '123' '124' '125'
 '1253733' '127' '129' '1293536' '13' '130' '131' '132' '133' '134' '135'
 '136' '137' '138' '139' '139303' '140' '1400000' '141' '1432525' '144'
 '144278' '1444010' '145' '145273' '146' '147' '148' '149' '15' '150'
 '1500000' '152' '153234' '154' '155' '1550' '156' '157' '157214' '158'
 '1582857' '159' '159204' '160' '161' '163' '164' '164180' '1649636' '165'
 '166' '167' '168' '169' '170' '173' '174' '174130' '175' '177' '179'
 '179106' '180' '180000' '181' '182' '183' '184' '184081' '188' '189056'
 '190' '193' '194028' '195' '196' '198' '199' '199004' '200' '201'
 '201493' '202' '204' '204974' '205' '207920' '208955' '210' '213'
 '213930' '220' '220000' '221' '223881' '228856' '230' '230598' '233'
 '233831' '236' '238807' '240' '240000' '240456' '243584' '243782'

In [568]:
tfidvec_df=pd.DataFrame(vectorized_data.toarray(),
                        index=df.index,
                        columns=tfidvec.get_feature_names_out())
tfidvec_df.head()

Unnamed: 0_level_0,100,100000,101,102,103,104,105,106,1069123,107,...,rooms,serves,space,spans,suitable,swimming,the,this,tnd,use
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.089263,0.0,0.089263,0.1068,0.0,0.0,0.178527,0.089263,0.089263,0.102454
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.098033,0.0,0.098033,0.117292,0.0,0.0,0.196065,0.098033,0.098033,0.112519
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.080459,0.0,0.080459,0.096266,0.0,0.0,0.160918,0.080459,0.080459,0.092348
13,0.502429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.096123,0.0,0.096123,0.115007,0.0,0.0,0.192246,0.096123,0.096123,0.110327
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067953,0.0,0.067953,0.0,0.0,0.0,0.067953,0.067953,0.067953,0.077995


In [569]:
#find the similarity between all items

cosine_similarity_array= cosine_similarity(tfidvec_df)
cosine_similarity_df=pd.DataFrame(cosine_similarity_array,
                                  index=tfidvec_df.index,
                                  columns=tfidvec_df.index)
cosine_similarity_df.head()

Property_ID,10,11,12,13,14,15,16,17,18,19,...,5000,5001,5002,5003,5004,5005,5006,5007,5008,5009
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.0,0.348442,0.285978,0.350235,0.194636,0.292414,0.280725,0.34923,0.322223,0.23001,...,0.220944,0.294517,0.366068,0.349101,0.37547,0.296712,0.303918,0.328267,0.319687,0.273623
11,0.348442,1.0,0.314072,0.384642,0.213757,0.321141,0.308303,0.383539,0.353878,0.252607,...,0.242649,0.32345,0.40203,0.383396,0.412356,0.325861,0.333775,0.360516,0.351092,0.300504
12,0.285978,0.314072,1.0,0.315689,0.175438,0.263571,0.253035,0.314783,0.290439,0.207323,...,0.19915,0.265467,0.32996,0.314666,0.338434,0.267445,0.273941,0.295887,0.288154,0.246634
13,0.350235,0.384642,0.315689,1.0,0.263396,0.323952,0.408761,0.385512,0.469187,0.254212,...,0.339636,0.381149,0.405519,0.449973,0.414477,0.382447,0.43463,0.362371,0.352899,0.35696
14,0.194636,0.213757,0.175438,0.263396,1.0,0.173055,0.221243,0.214241,0.253949,0.148048,...,0.200889,0.221493,0.240779,0.262619,0.230338,0.223209,0.22863,0.20138,0.196117,0.215647


### Combine all metrics

In [570]:
print(cosine_similarity_df.shape)
print(jaccard_distance_df.shape)
print(euclidean_distance_df.shape)

(5000, 5000)
(5000, 5000)
(5000, 5000)


In [571]:
# Compute the average similarity score
average_similarity_df = (cosine_similarity_df + jaccard_distance_df + euclidean_distance_df) / 3
average_similarity_df.head()

Property_ID,10,11,12,13,14,15,16,17,18,19,...,5000,5001,5002,5003,5004,5005,5006,5007,5008,5009
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.0,0.449483,0.428661,0.283415,0.231546,0.097472,0.204686,0.449744,0.274076,0.410005,...,0.240316,0.264842,0.455357,0.283035,0.458494,0.265579,0.212419,0.442757,0.439896,0.257875
11,0.449483,1.0,0.438029,0.294892,0.23792,0.107048,0.213879,0.461181,0.28463,0.417538,...,0.247556,0.274497,0.467344,0.294469,0.470792,0.27529,0.222381,0.453507,0.450364,0.266835
12,0.428661,0.438029,1.0,0.271899,0.225148,0.087858,0.195457,0.438263,0.263529,0.402445,...,0.233064,0.255159,0.443321,0.271569,0.446147,0.255817,0.202432,0.431967,0.429385,0.248879
13,0.283415,0.294892,0.271899,1.0,0.198911,0.107984,0.358476,0.295172,0.489732,0.251406,...,0.44655,0.238228,0.301841,0.261105,0.304844,0.238599,0.367105,0.287459,0.2843,0.230098
14,0.231546,0.23792,0.225148,0.198911,1.0,0.057686,0.157082,0.238125,0.195763,0.21602,...,0.178076,0.184943,0.246931,0.198653,0.243447,0.185515,0.159545,0.233797,0.232039,0.182995


## Collaborative recommendation

### Preparing items vector represenation

In [572]:
#textuel features
tfidvec_df.head(2)

Unnamed: 0_level_0,100,100000,101,102,103,104,105,106,1069123,107,...,rooms,serves,space,spans,suitable,swimming,the,this,tnd,use
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.089263,0.0,0.089263,0.1068,0.0,0.0,0.178527,0.089263,0.089263,0.102454
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.098033,0.0,0.098033,0.117292,0.0,0.0,0.196065,0.098033,0.098033,0.112519


In [573]:
#categorical features
df_cat.head(2)

Unnamed: 0_level_0,Garage,Jardin,Balcon,Meublé,Piscine,Type_Land,Type_Other,Type_Residential
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,0,0,0,0,0,0,0,1
11,0,0,0,0,0,0,0,1


In [574]:
#numerical features
scaler = MinMaxScaler()

df_num = pd.DataFrame(scaler.fit_transform(df_num), columns=df_num.columns, index=df_num.index)
print(df_num.head())  # This is the normalized numeric data


             Habitable   Terrain     Total    Pièces  Salles de bains  \
Property_ID                                                             
10            0.117462  0.644884  0.213699  0.333333         0.000000   
11            0.133363  0.644884  0.622576  0.333333         0.000000   
12            0.144495  0.644884  0.273109  0.000000         0.000000   
13            0.157216  0.644884  0.301066  0.333333         0.000000   
14            0.239904  0.404127  0.622576  0.666667         0.333333   

             Chambres      prix  Latitude  Longitude  
Property_ID                                           
10           0.333333  0.087267  0.600151   0.176639  
11           0.666667  0.168713  0.559328   0.181807  
12           0.666667  0.213961  0.583385   0.180861  
13           0.333333  0.150615  0.598564   0.172636  
14           0.666667  0.315076  0.582742   0.179735  


In [575]:
property_vectors = pd.concat([tfidvec_df, df_cat, df_num], axis=1)
property_vectors.head()

Unnamed: 0_level_0,100,100000,101,102,103,104,105,106,1069123,107,...,Type_Residential,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,prix,Latitude,Longitude
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.117462,0.644884,0.213699,0.333333,0.0,0.333333,0.087267,0.600151,0.176639
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.133363,0.644884,0.622576,0.333333,0.0,0.666667,0.168713,0.559328,0.181807
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.144495,0.644884,0.273109,0.0,0.0,0.666667,0.213961,0.583385,0.180861
13,0.502429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.157216,0.644884,0.301066,0.333333,0.0,0.333333,0.150615,0.598564,0.172636
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.239904,0.404127,0.622576,0.666667,0.333333,0.666667,0.315076,0.582742,0.179735


### user dataset

In [576]:
df_users=pd.read_csv('User_Interactions_Dataset.csv')
df_users.head()

Unnamed: 0,user_id,property_id,interaction_type,timestamp
0,46,34,click,2024-01-01 00:00:00
1,42,366,click,2024-01-01 01:00:00
2,59,256,save,2024-01-01 02:00:00
3,1,215,click,2024-01-01 03:00:00
4,88,284,click,2024-01-01 04:00:00


In [577]:
# Define interaction weights
interaction_weights = {
    "click": 1,
    "save": 3,
    "invest": 5
}

# Convert user interactions into weighted property vectors
df_users['weight'] = df_users['interaction_type'].map(interaction_weights)
df_users.head()

Unnamed: 0,user_id,property_id,interaction_type,timestamp,weight
0,46,34,click,2024-01-01 00:00:00,1
1,42,366,click,2024-01-01 01:00:00,1
2,59,256,save,2024-01-01 02:00:00,3
3,1,215,click,2024-01-01 03:00:00,1
4,88,284,click,2024-01-01 04:00:00,1


In [578]:
merged_df = df_users.merge(property_vectors, left_on="property_id", right_index=True, how="inner")
print(df_users.shape)
merged_df.head()

(1000, 5)


Unnamed: 0,user_id,property_id,interaction_type,timestamp,weight,100,100000,101,102,103,...,Type_Residential,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,prix,Latitude,Longitude
0,46,34,click,2024-01-01 00:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.317822,0.825451,0.622576,0.333333,0.0,0.333333,0.909272,0.604561,0.177917
1,42,366,click,2024-01-01 01:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.009487,0.38693,0.622576,0.333333,0.0,0.666667,0.333265,0.603293,0.177317
2,59,256,save,2024-01-01 02:00:00,3,0.0,0.0,0.0,0.0,0.0,...,1,0.25385,0.479536,0.622576,0.666667,0.333333,0.666667,0.321139,0.602447,0.177764
3,1,215,click,2024-01-01 03:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.20333,0.206363,0.622576,0.333333,0.0,0.333333,0.287791,0.583551,0.180984
4,88,284,click,2024-01-01 04:00:00,1,0.0,0.0,0.0,0.0,0.0,...,0,0.526133,0.644884,0.622576,0.666667,0.0,0.666667,0.160463,0.582742,0.179735


In [579]:
# Identify the feature columns (from column 5 onwards)
feature_columns = merged_df.columns[5:]  # From column index 5 (0-based index)
# Multiply each feature column by the weight column
merged_df[feature_columns] = merged_df[feature_columns].multiply(merged_df["weight"], axis=0)
merged_df.head()


Unnamed: 0,user_id,property_id,interaction_type,timestamp,weight,100,100000,101,102,103,...,Type_Residential,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,prix,Latitude,Longitude
0,46,34,click,2024-01-01 00:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.317822,0.825451,0.622576,0.333333,0.0,0.333333,0.909272,0.604561,0.177917
1,42,366,click,2024-01-01 01:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.009487,0.38693,0.622576,0.333333,0.0,0.666667,0.333265,0.603293,0.177317
2,59,256,save,2024-01-01 02:00:00,3,0.0,0.0,0.0,0.0,0.0,...,3,0.761549,1.438607,1.867727,2.0,1.0,2.0,0.963417,1.80734,0.533291
3,1,215,click,2024-01-01 03:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.20333,0.206363,0.622576,0.333333,0.0,0.333333,0.287791,0.583551,0.180984
4,88,284,click,2024-01-01 04:00:00,1,0.0,0.0,0.0,0.0,0.0,...,0,0.526133,0.644884,0.622576,0.666667,0.0,0.666667,0.160463,0.582742,0.179735


In [580]:
#let's take user 65 for exemple
user_id=65
df_user=merged_df[merged_df['user_id']==user_id]
print(df_user.shape)
df_user.head()

(15, 453)


Unnamed: 0,user_id,property_id,interaction_type,timestamp,weight,100,100000,101,102,103,...,Type_Residential,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,prix,Latitude,Longitude
159,65,277,click,2024-01-07 15:00:00,1,0.0,0.0,0.0,0.0,0.0,...,0,0.513411,0.644884,0.622576,0.666667,0.0,0.666667,0.133179,0.602691,0.176899
239,65,122,invest,2024-01-10 23:00:00,5,0.0,0.0,0.0,0.0,0.0,...,5,1.286979,3.22442,1.76743,1.666667,0.0,3.333333,0.964232,2.796638,0.909033
268,65,234,click,2024-01-12 04:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.314642,0.644884,0.647038,1.0,0.333333,1.0,0.259209,0.600151,0.176639
338,65,243,save,2024-01-15 02:00:00,3,0.0,0.0,0.0,0.0,0.0,...,3,0.60045,1.934652,1.186266,2.0,0.0,2.0,0.59664,1.806623,0.532581
355,65,351,click,2024-01-15 19:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.173117,0.644884,0.615586,0.666667,0.0,0.666667,0.054086,0.583385,0.180861


### Content-Based Recommendations

In [581]:
# Compute user vector as the mean of features (from column 5 onward)
user_vector = df_user[feature_columns].mean().values.reshape(1, -1)
print(user_vector)

[[0.10048579 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.03957972
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.01696277 0.         0.         0.         0.04665312 0.
  0.         0.         0.         0.10032412 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.03350219 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.04725862 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.26375724 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.20310818
  0.         0.         

In [582]:
# Compute cosine similarity between user vector and all property vectors
similarity_scores = cosine_similarity(user_vector, property_vectors)
print(similarity_scores)


[[0.81488592 0.87655273 0.79892753 ... 0.83923056 0.84767605 0.77731559]]


In [583]:
# Convert similarity scores to DataFrame
similarity_df = pd.DataFrame(similarity_scores.flatten(), index=property_vectors.index, columns=['similarity'])
similarity_df.head()

Unnamed: 0_level_0,similarity
Property_ID,Unnamed: 1_level_1
10,0.814886
11,0.876553
12,0.798928
13,0.859094
14,0.751987


In [584]:
# Get properties that the user has saved or invested in
exclude_properties = merged_df.loc[merged_df['interaction_type'].isin(['save', 'invest']), 'property_id'].values
# Filter out those properties from recommendations
filtered_similarity_df = similarity_df.drop(index=exclude_properties)
filtered_similarity_df.head()

Unnamed: 0_level_0,similarity
Property_ID,Unnamed: 1_level_1
10,0.814886
11,0.876553
23,0.905133
24,0.867774
25,0.867774


Normalization ensures that different similarity scores (content-based & collaborative filtering) contribute fairly to the final hybrid score. Without normalization, one score type might dominate the other, making the hybrid model unbalanced.

In [585]:
# Normalize similarity scores using Min-Max Scaling
filtered_similarity_df['similarity'] = (filtered_similarity_df['similarity'] - filtered_similarity_df['similarity'].min()) / \
                                       (filtered_similarity_df['similarity'].max() - filtered_similarity_df['similarity'].min())


In [586]:
# Assign content-based weight (e.g., 70%)
filtered_similarity_df['final_score'] = filtered_similarity_df['similarity'] * 0.7
filtered_similarity_df.head()

Unnamed: 0_level_0,similarity,final_score
Property_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
10,0.794719,0.556303
11,0.914332,0.640032
23,0.969768,0.678837
24,0.897303,0.628112
25,0.897303,0.628112


### Collaborative

In [587]:
merged_df.head()

Unnamed: 0,user_id,property_id,interaction_type,timestamp,weight,100,100000,101,102,103,...,Type_Residential,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,prix,Latitude,Longitude
0,46,34,click,2024-01-01 00:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.317822,0.825451,0.622576,0.333333,0.0,0.333333,0.909272,0.604561,0.177917
1,42,366,click,2024-01-01 01:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.009487,0.38693,0.622576,0.333333,0.0,0.666667,0.333265,0.603293,0.177317
2,59,256,save,2024-01-01 02:00:00,3,0.0,0.0,0.0,0.0,0.0,...,3,0.761549,1.438607,1.867727,2.0,1.0,2.0,0.963417,1.80734,0.533291
3,1,215,click,2024-01-01 03:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.20333,0.206363,0.622576,0.333333,0.0,0.333333,0.287791,0.583551,0.180984
4,88,284,click,2024-01-01 04:00:00,1,0.0,0.0,0.0,0.0,0.0,...,0,0.526133,0.644884,0.622576,0.666667,0.0,0.666667,0.160463,0.582742,0.179735


In [589]:
user_vectors = merged_df.groupby('user_id')[feature_columns].mean()
user_vectors

Unnamed: 0_level_0,100,100000,101,102,103,104,105,106,1069123,107,...,Type_Residential,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,prix,Latitude,Longitude
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.900000,0.342904,0.607051,0.743247,0.733333,0.133333,0.766667,0.378646,0.698106,0.215444
2,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.057690,0.000000,...,0.875000,0.350964,0.641660,0.651974,0.541667,0.041667,0.708333,0.277585,0.729677,0.263804
3,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,2.000000,0.532877,1.036973,1.213454,1.333333,0.000000,1.166667,0.626770,1.347782,0.387366
4,0.038355,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.199275,...,2.000000,0.401436,0.963027,1.042211,1.166667,0.047619,1.285714,0.461780,1.194641,0.355711
5,0.124087,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,2.000000,0.640930,1.326455,0.973534,1.044444,0.311111,1.155556,0.738102,1.176163,0.437202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.135887,0.000000,0.000000,0.000000,0.0,0.045616,0.0,0.0,0.044394,0.000000,...,2.400000,0.497693,1.152193,1.207269,1.333333,0.166667,1.566667,0.616637,1.413661,0.428121
97,0.000000,0.474065,0.000000,0.437331,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,2.333333,0.543665,2.149613,1.645990,2.222222,0.000000,2.222222,0.543910,1.818933,0.605810
98,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,2.000000,0.499580,1.215606,1.271186,1.375000,0.208333,1.500000,0.692098,1.435941,0.427648
99,0.000000,0.000000,0.047208,0.000000,0.0,0.036928,0.0,0.0,0.000000,0.000000,...,1.750000,0.364027,1.051877,0.937242,0.888889,0.000000,1.194444,0.345076,1.066747,0.357497


In [590]:
# Compute cosine similarity between all users
cosine_sim_matrix = cosine_similarity(user_vectors)

# Convert to a DataFrame for easy lookup
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, 
                             index=user_vectors.index, 
                             columns=user_vectors.index)
cosine_sim_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.914131,0.939224,0.958397,0.934578,0.938134,0.939034,0.919806,0.903692,0.953604,...,0.927172,0.938837,0.960573,0.94179,0.945616,0.9404,0.918669,0.927929,0.936385,0.91428
2,0.914131,1.0,0.913032,0.907712,0.925186,0.911195,0.946467,0.915809,0.913096,0.916217,...,0.947917,0.926276,0.93179,0.918265,0.928436,0.923642,0.920664,0.910383,0.926341,0.936061
3,0.939224,0.913032,1.0,0.940082,0.932751,0.908182,0.916963,0.906512,0.879856,0.934116,...,0.931805,0.894038,0.941861,0.917608,0.918583,0.949642,0.879739,0.936067,0.943475,0.899474
4,0.958397,0.907712,0.940082,1.0,0.935904,0.954353,0.929725,0.903053,0.889788,0.963541,...,0.929854,0.929957,0.961276,0.963228,0.956735,0.953735,0.923291,0.913075,0.956952,0.922797
5,0.934578,0.925186,0.932751,0.935904,1.0,0.935152,0.919834,0.909784,0.877959,0.94169,...,0.968051,0.892799,0.96312,0.926643,0.923092,0.96212,0.877521,0.928757,0.93151,0.906332


In [603]:
similar_users = cosine_sim_df[user_id].drop(user_id).sort_values(ascending=False).head(5).index.to_numpy()
similar_users

array([66, 32, 57, 69, 10], dtype=int64)

In [604]:
# Get property interactions of similar users
similar_users_data = merged_df[merged_df['user_id'].isin(similar_users)]
similar_users_data.head()

Unnamed: 0,user_id,property_id,interaction_type,timestamp,weight,100,100000,101,102,103,...,Type_Residential,Habitable,Terrain,Total,Pièces,Salles de bains,Chambres,prix,Latitude,Longitude
10,10,255,click,2024-01-01 10:00:00,1,0.0,0.0,0.0,0.0,0.0,...,0,0.257396,0.644884,0.622576,0.666667,0.0,0.666667,0.84864,0.602759,0.176456
24,57,35,click,2024-01-02 00:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.209691,0.240757,0.622576,0.666667,0.0,0.666667,0.199874,0.602447,0.177764
50,32,425,click,2024-01-03 02:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.20015,0.644884,0.622576,0.666667,0.0,0.666667,0.221985,0.583385,0.180861
57,66,263,click,2024-01-03 09:00:00,1,0.0,0.0,0.0,0.0,0.0,...,1,0.257396,0.644884,0.353486,0.333333,0.0,0.666667,0.192846,0.559328,0.181807
76,32,54,save,2024-01-04 04:00:00,3,1.610925,0.0,0.0,0.0,0.0,...,3,0.471647,1.934652,1.867727,1.0,0.0,2.0,0.813829,1.750155,0.542582


In [609]:
collaborative_scores = similar_users_data.groupby('property_id')['weight'].sum().to_frame()
collaborative_scores.columns = ['collaborative_score']
collaborative_scores.head(3)

Unnamed: 0_level_0,collaborative_score
property_id,Unnamed: 1_level_1
13,3
17,3
25,1


In [611]:
# Normalize collaborative scores (0 to 1 scale)
scaler = MinMaxScaler()
collaborative_scores['collaborative_score'] = scaler.fit_transform(collaborative_scores)
collaborative_scores['collaborative_score']=collaborative_scores['collaborative_score']
collaborative_scores['collaborative_score'] *= 0.3
collaborative_scores.head(3)


Unnamed: 0_level_0,collaborative_score
property_id,Unnamed: 1_level_1
13,0.15
17,0.15
25,0.0


In [612]:
# Ensure the property_id index is set correctly for merging
filtered_similarity_df = filtered_similarity_df.rename(columns={'final_score': 'content_score'})
# Merge both recommendation datasets on property_id
hybrid_recommendations = collaborative_scores.merge(
    filtered_similarity_df[['content_score']], 
    left_index=True, 
    right_index=True, 
    how='outer'
).fillna(0)  # Fill missing values with 0


In [614]:
# Compute final hybrid recommendation score
hybrid_recommendations['final_score'] = (
    hybrid_recommendations['content_score'] + 
    hybrid_recommendations['collaborative_score']
)  # Since each is already weighted

# Sort recommendations by highest final score
hybrid_recommendations = hybrid_recommendations.sort_values(by='final_score', ascending=False)
hybrid_recommendations.head(10)

Unnamed: 0,collaborative_score,content_score,final_score
425,0.075,0.663607,0.738607
527,0.0,0.7,0.7
1186,0.0,0.7,0.7
718,0.0,0.7,0.7
2539,0.0,0.7,0.7
4530,0.0,0.7,0.7
968,0.0,0.7,0.7
2971,0.0,0.7,0.7
1928,0.0,0.7,0.7
3234,0.0,0.7,0.7
