In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
df = pd.read_csv("melb_data.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [4]:
df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [5]:
df = df.drop(['Suburb', 'Type', 'Method', 'SellerG', 'Date', 'Distance', 'Postcode', 'BuildingArea', 'YearBuilt'], axis=1)

In [6]:
cols = ['Rooms', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'Propertycount']
df['description'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [7]:
df.head()

Unnamed: 0,Address,Rooms,Price,Bedroom2,Bathroom,Car,Landsize,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,description
0,85 Turner St,2,1480000.0,2.0,1.0,1.0,202.0,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0,2.0 2.0 1.0 1.0 202.0 4019.0
1,25 Bloomburg St,2,1035000.0,2.0,1.0,0.0,156.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0,2.0 2.0 1.0 0.0 156.0 4019.0
2,5 Charles St,3,1465000.0,3.0,2.0,0.0,134.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0,3.0 3.0 2.0 0.0 134.0 4019.0
3,40 Federation La,3,850000.0,3.0,2.0,1.0,94.0,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0,3.0 3.0 2.0 1.0 94.0 4019.0
4,55a Park St,4,1600000.0,3.0,1.0,2.0,120.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0,4.0 3.0 1.0 2.0 120.0 4019.0


In [8]:
df['description'] = df['description'].fillna('')

In [9]:
tf = TfidfVectorizer(ngram_range=(1, 2),min_df=0)
tfidf_matrix = tf.fit_transform(df['description'])

In [10]:
tfidf_matrix.shape

(13580, 12164)

In [11]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [12]:
cosine_sim[0]

array([1.        , 0.21319185, 0.21168196, ..., 0.        , 0.        ,
       0.        ])

In [13]:
# df = df.reset_index()
Address = df['Address']
indices = pd.Series(df.index, index=df['Address'])

In [14]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    house_indices = [i[0] for i in sim_scores]
    return df.iloc[house_indices]

In [18]:
get_recommendations('85 Turner St').head(5)

Unnamed: 0,Address,Rooms,Price,Bedroom2,Bathroom,Car,Landsize,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,description
8,6/241 Nicholson St,1,300000.0,1.0,1.0,1.0,0.0,Yarra,-37.8008,144.9973,Northern Metropolitan,4019.0,1.0 1.0 1.0 1.0 0.0 4019.0
10,411/8 Grosvenor St,2,700000.0,2.0,2.0,1.0,0.0,Yarra,-37.811,145.0067,Northern Metropolitan,4019.0,2.0 2.0 2.0 1.0 0.0 4019.0
12,123/56 Nicholson St,2,750000.0,2.0,2.0,1.0,0.0,Yarra,-37.8078,144.9965,Northern Metropolitan,4019.0,2.0 2.0 2.0 1.0 0.0 4019.0
14,7/20 Abbotsford St,1,441000.0,1.0,1.0,1.0,0.0,Yarra,-37.8016,144.9988,Northern Metropolitan,4019.0,1.0 1.0 1.0 1.0 0.0 4019.0
21,13/11 Nicholson St,3,900000.0,3.0,2.0,2.0,0.0,Yarra,-37.8093,144.9959,Northern Metropolitan,4019.0,3.0 3.0 2.0 2.0 0.0 4019.0


In [17]:
df.head(6)

Unnamed: 0,Address,Rooms,Price,Bedroom2,Bathroom,Car,Landsize,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,description
0,85 Turner St,2,1480000.0,2.0,1.0,1.0,202.0,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0,2.0 2.0 1.0 1.0 202.0 4019.0
1,25 Bloomburg St,2,1035000.0,2.0,1.0,0.0,156.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0,2.0 2.0 1.0 0.0 156.0 4019.0
2,5 Charles St,3,1465000.0,3.0,2.0,0.0,134.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0,3.0 3.0 2.0 0.0 134.0 4019.0
3,40 Federation La,3,850000.0,3.0,2.0,1.0,94.0,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0,3.0 3.0 2.0 1.0 94.0 4019.0
4,55a Park St,4,1600000.0,3.0,1.0,2.0,120.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0,4.0 3.0 1.0 2.0 120.0 4019.0
5,129 Charles St,2,941000.0,2.0,1.0,0.0,181.0,Yarra,-37.8041,144.9953,Northern Metropolitan,4019.0,2.0 2.0 1.0 0.0 181.0 4019.0
