In [15]:
import time

import requests

from bs4 import BeautifulSoup

from os.path import join as pjoin
import os
from pathlib import Path

import pandas as pd
import numpy as np

import html

from lxml import html

# For persisting indexes in an external file
import pickle

import math

import nltk

# For word tokenization
from nltk.tokenize import RegexpTokenizer
# For stop words list
from nltk.corpus import stopwords
# For word stemming
from nltk.stem.snowball import SnowballStemmer



from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [16]:
# Common utility functions

def extract_number(n):
    n = n.replace('.', '')
    n = n.replace('+', '')
    n_list = [str(s) for s in n.split() if s.isdigit()]

    s = ''.join(n_list).strip()
    
    if not s:
        s = 0

    return int(s)

# Utility functions for reading and writing files using pickel python package
def read_file_from_pickle(file):
    file_content = {}
    
    if file.is_file():
        with open(file, "rb") as f:
            file_content = pickle.load(f)
            f.close()
    
    return file_content

def write_file_to_pickle(file, content):
    with open(file, "wb") as f:
        pickle.dump(content, f)
        f.close()

# Apply Jaccard similarity to find out 3 most similar clusters
def get_jaccard(a, b):
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
        


In [17]:
# Import all the persisted data at once
# Data to import: 
# Listing data, Individual listing links, Listing Index 
# Words, Vocabulary, listing_content, iindex_tf_idf
# information_dataset, description_dataset

# Path to the current working directory to refer to all the files relatively
my_path = os.path.dirname(os.path.realpath('__file__'))

# Datastructures for holding the listings and other metadata
# Please create a directory(in your current working directory) with name 'indexes'  

#Holds individual listings data for the listing pages downloaded
listings_file = Path(os.path.join(my_path, "indexes/listings.pkl"))
listings_persist = {}

if listings_file.is_file():
    with open(listings_file, "rb") as listings:
        listings_persist = pickle.load(listings)
        listings.close()

#Holds the URLs of individual listings for extracting complete description of a particular listing
listing_links_file = Path(os.path.join(my_path, "indexes/listing_links.pkl"))
listing_links_persist = read_file_from_pickle(listing_links_file)


#Holds the order of individual listing
listing_index_file = Path(os.path.join(my_path, "indexes/listing_index.pkl"))
listing_index_persist = read_file_from_pickle(listing_index_file)
     

# Retrieving persisted information for listing content and word map (words and vocabulary)
content_file = Path(os.path.join(my_path, "indexes/listing_content.pkl"))
listing_content_persist = read_file_from_pickle(content_file)



vocabulary_file = Path(os.path.join(my_path, "indexes/vocabulary.pkl"))
vocabulary_persist = read_file_from_pickle(vocabulary_file)

words_file = Path(os.path.join(my_path, "indexes/words.pkl"))
words_persist = read_file_from_pickle(words_file)
        
index_file = Path(os.path.join(my_path, "indexes/iindex_tf_idf.pkl"))
iindex_tf_idf_persist = read_file_from_pickle(index_file)


# Information data set
information_ds_file = Path(os.path.join(my_path, "indexes/information_dataset.pkl"))
information_ds_persist = read_file_from_pickle(information_ds_file)

# Description data set - containing tf-idf values
description_ds_file = Path(os.path.join(my_path, "indexes/description_dataset.pkl"))
description_ds_persist = read_file_from_pickle(description_ds_file)
        

# page_download

We decide to go through all html pages and save all in a data directory

In [18]:
# Please create a directory data 
# Checking if the pages are already downloaded
# Check the first file 
listing_page_file = Path(os.path.join(my_path, "data/listing_0.html"))

if listing_page_file.is_file() == False:

    # If there are no files then start downloading each html file with a delay of 3 seconds
    print('Downloading group listings pages...')
    
    url_root = 'https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag='


    for i in range(1000):

        cur_url = url_root + str(i)

        cur_content = requests.get(cur_url)

        res_text = BeautifulSoup(cur_content.text, "lxml")

        cur_html_file= open("data/listing_" + str(i) + ".html", "w")
        cur_html_file.write(str(res_text))
        cur_html_file.close()
        
        # 3 seconds delay for the next page download attempt
        time.sleep(3)
else:
    print('Group listings pages already downloaded')

Group listings pages already downloaded


In [19]:
# Downloading individual listings pages from the listing links persisted
# Please create a directory data_detail
# Checking if the pages are already downloaded
# Check the first file 
listing_detail_file = Path(os.path.join(my_path, "data_detail/listing_detail_0.html"))


if listing_detail_file.is_file() == False:
    
    print('Downloading individual listing pages...')

    links_list = []

    if(len(listing_links_persist.keys()) != 0):

        # Getting listing page links
        for key in  listing_links_persist:
            cur_link = listing_links_persist[key]

            # Check if the link is relative
            # If yes make it absolute link
            # Need to add better checks here
            if(cur_link[0] == "/"):
                cur_link = "https://www.immobiliare.it" + cur_link

            links_list.append(cur_link)

        # Downloading the pages
        for i in range(690, len(links_list)):

            cur_url = links_list[i]

            cur_content = requests.get(cur_url)

            res_text = BeautifulSoup(cur_content.text, "lxml")

            cur_detail_file = os.path.join(my_path, "data_detail/listing_detail_" + str(i) + ".html")

            cur_html_file= open(cur_detail_file, "w")
            cur_html_file.write(str(res_text))
            cur_html_file.close()

            # Wait for 3 seconds before downloading the next page
            time.sleep(3)
else:
    print('Individual listing pages already downloaded')
    

Individual listing pages already downloaded


# Scrapping the information in Group listing pages



In [20]:
# If this is True then the listing description is updated from 
# the individual pages
desc_flag = True

if(len(listing_index_persist.keys()) == 0):

    print("Indexes are being created")

    l_index = 0
    listing_index_persist['listing_ids'] = []

    # Every page has 25 listings so
    # 410*25 will be more than 10000 listings
    for i in range(1, 410):

        cur_listing_page = BeautifulSoup(open(os.path.join(my_path, 'data/listing_' + str(i) + '.html')), 'html.parser')

        listing_container = cur_listing_page.find(class_="annunci-list")
        
        # Need to improve exception handling in this loop
        for cur_listing in listing_container.find_all(class_=["listing-item", "js-row-detail"], recursive=False):

            listing_dict = {
                "id": "",
                "listing_id": "",
                "title": "",
                "price": 0,
                "locali": 0,
                "superficie": 0,
                "bagni": 0,
                "piano": 0,
                "immobile": "",
                "listing_link": "",
                "description": ""
            }

            listing_body = cur_listing.find(class_="listing-item_body")

            if(listing_body):

                listing_dict['id'] = l_index
                listing_dict['listing_id'] = cur_listing.get("data-id")

                listing_dict['title'] = listing_body.find(class_="titolo").text.strip()

                listing_dict["listing_link"] = listing_body.find("a", {"id": "link_ad_" + listing_dict['listing_id']}).get("href")

                listing_dict['description'] = listing_body.find(class_="descrizione").text.strip()

                # Extracting the listing features 
                listing_features = listing_body.find(class_=["listing-features", "list-piped"])

                listing_links_persist[listing_dict['listing_id']] = listing_dict["listing_link"]

                for cur_feature in listing_features.find_all(class_="lif__item", recursive=False):

                    feature_cls_list = cur_feature.get("class")

                    # Extract listing price
                    if 'lif__pricing' in feature_cls_list:
                        listing_dict['price'] = extract_number(cur_feature.text.strip())
                    else:
                        # Extract other features information
                        # @TODO: Need to refine locali to contain a list: example: 1-5 should be [1,2,3,4,5]
                        feature_name = cur_feature.find(class_="lif--muted")

                        # @TODO: Need to do this more efficiently
                        if(feature_name):
                            feature_name = feature_name.text.strip()

                            if feature_name in listing_dict:
                                feature_value = cur_feature.find(class_="text-bold").text.strip()
                                listing_dict[feature_name] = extract_number(feature_value)


                listing_index_persist['listing_ids'].append(listing_dict['listing_id'])

                l_index += 1
                listings_persist[listing_dict['listing_id']] = listing_dict


    # Remove duplicate listing entries
    listing_index_persist['listing_ids'] = list(set(listing_index_persist['listing_ids']))

    # Persist the listings object and dictionary using pickel library
    
    #Save listings data
    write_file_to_pickle(listings_file, listings_persist)

    #Save individual listings links data
    write_file_to_pickle(listing_links_file, listing_links_persist)

    #Save index of listings
    write_file_to_pickle(listing_index_file, listing_index_persist)

else:
    print("Indexes are already created")


print("No of links:")
print(len(listing_links_persist.keys()))

print("No of listings:")
print(len(listings_persist.keys()))

print("No of listings in the listing index file:")
print(len(listing_index_persist['listing_ids']))


if desc_flag:

    # Parse the detail pages 
    # And update the description of individual listings
    for i in range(len(listing_links_persist.keys())):
        cur_detail_page = BeautifulSoup(open(os.path.join(my_path, 'data_detail/listing_detail_' + str(i) + '.html')), 'html.parser')

        cur_page_contact = cur_detail_page.find('div',{"id":"up-contact-box"})
        if cur_page_contact:
            cur_page_elem = cur_page_contact.find(class_="info-agenzia")

            if cur_page_elem:
                cur_page_id = cur_page_elem.get("data-annuncio")

                cur_page_description =  cur_detail_page.find(class_="description-text")

                if cur_page_description:
                    cur_page_description = cur_page_description.text.strip()

                    cur_page_description = "".join(cur_page_description.splitlines())

                    if cur_page_id in listings_persist:
                        listings_persist[cur_page_id]['description'] = cur_page_description
                    else:
                        pass
                        #print("Page key not found in the persisted data")
                else:
                    pass
                    #print("Page Description not found")
            else:
                pass
                #print("Page ID not found")
        else:
            pass
            #print("Contact not found")

    #Save listings data with new content (complete listing description)
    write_file_to_pickle(listings_file, listings_persist)


Indexes are already created
No of links:
9987
No of listings:
9987
No of listings in the listing index file:
9987


# create_information_ds

In [21]:
#Preparing information data set
if(len(information_ds_persist.keys()) == 0):

    information_ds_persist['dataset'] = []

    # Get the persisted listings data
    for listing_id in listing_index_persist['listing_ids']:
        cur_listing = listings_persist[listing_id]

        listing_info = [cur_listing['price'], cur_listing['locali'], cur_listing['superficie'], cur_listing['bagni'], cur_listing['piano']]

        information_ds_persist['dataset'].append(listing_info)
    
    #Save information data set
    write_file_to_pickle(information_ds_file, information_ds_persist)

else:
    print("Information data set already present")


print(len(information_ds_persist['dataset']))

Information data set already present
9987


# Description dataset
## Create vocabulary

In [23]:
#First we import stopwords from nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('italian'))
#To remove punctuation we use regexptokenizer, but we leave dollar symbol $ because maybe is used in some queries
tokenizer = RegexpTokenizer(r'\w+|\$')
#we create the stemmer
ps = SnowballStemmer('italian')

list_len = len(listing_index_persist['listing_ids'])

if(len(listing_content_persist.keys()) == 0):
    
    listing_word_map = {}
    
    # We reach here if we don't have indexes already present
    print("Vocabulary is being created...")
 
    for i in range(list_len):
        
        cur_list_id = listing_index_persist['listing_ids'][i]
        
        cur_list_obj = listings_persist[cur_list_id]

        # Extract all the text in the individual listing
        # For listing title
        t1 = cur_list_obj['title']
        
        # For listing content
        t2 = cur_list_obj['description']
        
        t = t1+ ' ' +t2
        t = t.lower()
        t = tokenizer.tokenize(t)
        
        # This array will contain all the valid words in a given review after removing 
        # all the stop words, punctuations, stemming etc..,, we will use this information
        # to find out the term frequency there by tf-idf values
        listing_words = []
        
        for r in t :
            if not r in stop_words:
                sr = r #ps.stem(r) - avoid stemming for now for the wordcloud
                
                listing_words.append(sr)
                
                if not  sr in listing_word_map:
                    listing_word_map[sr] = [i]
                else:
                    listing_word_map[sr]+=[i]
                    
                    
        listing_content_persist[i] = ' '.join(listing_words)
    
    # Saving the content and indexes for the first time
    # We made use of pickel python module
    #Saving content dictionary
    write_file_to_pickle(content_file, listing_content_persist)
    
    # Word and Vocabulary indexes based on word map
    c = 0
    for key in listing_word_map:
        words_persist[key] = c
        vocabulary_persist[c] = listing_word_map[key]
        c += 1
    
    #Save vocabulary and words
    write_file_to_pickle(vocabulary_file, vocabulary_persist)
    write_file_to_pickle(words_file, words_persist)
else:
    print("Vocabulary data set already present")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vamsigunturi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Vocabulary is being created...


# Calculate TF-IDF values for the words in the vocabulary

In [24]:
if(len(iindex_tf_idf_persist.keys()) == 0):
    
    print("Inverted Indexes are being calculated")

    word_iindex = {}

    #Creating inverted index using tf-idf and consine similarity
    for word in words_persist:
        word_doc_list = vocabulary_persist[words_persist[word]]
        word_iindex[word] = []

        # Store indexes based on number of times a particular word is present in a given document
        for doc in word_doc_list:
            doc_content = listing_content_persist[doc]
            # Pushing the term frequency with document id
            word_iindex[word].append([doc, doc_content.split().count(word)])

    # Store indexes based on tf-idf
    docs_length = len(listing_content_persist.keys())
    iindex_tf_idf_persist = word_iindex

    for key, word in iindex_tf_idf_persist.items():
        # find out the relative importance of a particular terms relating it to document count
        idf= math.log10( docs_length / len(word) )

        for elem in word:
            # Add the document score corresponding to a particular term which we then use in the 
            # search results ranking of documents
            elem[1] = idf * elem[1]
    

    # Persisting the indexes calculated 
    write_file_to_pickle(index_file, iindex_tf_idf_persist)
else:
    print("Inverted Indexes already present")
    

Inverted Indexes are being calculated


# create_description_ds

In [25]:
print(len(description_ds_persist.keys()))

# Preparing Description data set
# Create description data set
# Extract the words in individual listings
# Create a matrix with rows as listings and columns as words
# Combile listing title and description
# Remove stop words
# Calculate the term frequency
# Calculate the td*idf score for that word in that document
# Which gives the description data set for the 10000 listings saved
if(len(description_ds_persist.keys()) == 0):

    print("Description data set is being created...")

    list_len = len(listing_index_persist['listing_ids'])
    description_ds = []
    
    #Build the description data set
    for i in range(list_len):
        
        cur_list_id = listing_index_persist['listing_ids'][i]
        
        cur_list_obj = listings_persist[cur_list_id]

        cur_word_list = []
        
        #Initialize each word tf-idf with 0's
        for word in words_persist:
            cur_word_list.append(0)

        # @TODO: Need to optimize the number of verfications done here
        for key, word in iindex_tf_idf_persist.items():
            # elem[0] - list_id
            # elem[1] - tf-idf
            for elem in word:
                # Update tf-idf of that word for that listing 
                if(elem[0] == i):
                    cur_word_list[words_persist[key]] = elem[1]
        
        description_ds.append(cur_word_list)

    description_ds_persist['dataset'] = description_ds


    # Persisting the indexes calculated 
    write_file_to_pickle(description_ds_file, description_ds_persist)
    
else:
    print("Description data set already present")
    
#print(description_ds_persist['dataset'][789])


0
Description data set is being created...
[0, 0, 0, 0, 0, 0, 0, -0.16226229223178787, 0, 0, 0, 0, 0, 0, 0, 0, 0.23788306131214793, 0.85268703624569, 0, -0.07387997068429795, 0, 0.2018213197232541, 0.14059719844774432, 0, 0, 0, 0.05678223818311766, 0.2971768867142356, 0, 0, 0, 0, 0, 0, 0, -0.18706217543606954, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5980937252743059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6585945000639983, 0.06272029166512609, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.13891737241458352, 0, 0.31611778795444717, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5561720624176348, 0, 0, 0, 0, 0, 0, 0, 0, 1.546116709829292, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.026307196276631126, 0, 0, 0, 0, 0, 0, 0, 0.9372292410566172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.3310491331863297, 

# Apply K-means clustering to listings data in information and description dataset

In [26]:
# Wrapper functions for clustering and wordcloud
def get_listing_content(i, sflag):
    listing_words = ''
    if sflag:
        listing_words = listing_content_persist[i]
    else:
        listing_id = listing_index_persist['listing_ids'][i]
        listing_data = listings_persist[listing_id]
        listing_words = listing_data['description']

    return listing_words + ' '

def get_wc_save_path(i, sflag):

    f_name_prefix = ''

    if(sflag):
        f_name_prefix = "wordcloud"
    else:
        f_name_prefix = "wordcloud_all"

    return f_name_prefix + "/cluster_" + str(i)


# data - is the dataset
# k  - number of clusters
# @TODO: Need to use Elbow method to decide on
# Optimal number of clusters

def cluster_documents(data, k):   
    
    #use k-means to clusterize the songs
    kmeans = KMeans(n_clusters=k, init='random') # initialization
    kmeans.fit(data) # actual execution
    c = kmeans.predict(data)
    c_list = list(c)

    clustered_list = []

    # Creating a multi dimentional array based on k
    for c in range(k):
        clustered_list.append([])

    # Extract the listing ids from indexes
    index = 0
    for i in c_list:
        clustered_list[i].append(index)
        index += 1
    
    return clustered_list

    '''
    # Applying elbow method
    # To find optimal clusters
    # Increasing the clusters by a fraction of 100
    Ks = range(1, 51)
    km = [KMeans(n_clusters=i*100) for i in Ks]
    score = [km[i].fit(data).score(data) for i in range(len(km))]

    return score
    '''

def create_wordcloud(clist, stopwords_flag):

    c_index = 0   
    
    for cluster in clist:
        
        cur_cluster_words = " "
        
        # Extracting all the words of the listings in current cluster
        for list_id in cluster:
            cur_cluster_words +=  get_listing_content(list_id, stopwords_flag)
        
        #strg_cloud = ' '.join(strg_cloud.split())
        
        wordcloud = WordCloud(width = 300, height = 300, margin = 0, collocations=False).generate(cur_cluster_words)
        
        plt.imshow(wordcloud, interpolation = "bilinear")
        plt.axis("off")
        plt.margins(x=0,y=0)
        plt.savefig(get_wc_save_path(c_index, stopwords_flag))
        #plt.show()

        c_index += 1  

def compare_clusters(c1, c2):

    jac_score_list = []
    comb_list = []
    cmp_output = {}

    for i in range(len(c1)):
        for j in range(len(c2)):
            # Adding the score of each cluster combination to jac_score_list
            jac_score_list.append(get_jaccard(set(c1[i]), set(c2[j])))
            comb_list.append([i,j])

    print(jac_score_list)
    print(comb_list)

    cmp_output['score_list'] = jac_score_list
    cmp_output['comb_list'] = comb_list
    
    return cmp_output


def get_similar_clusters(top_3_list, c1, c2):
    similar_clusters = []

    for t in top_3_list:
        t_list = list(t)
        c_list_1 = c1[t_list[1][0]]
        c_list_2 = c2[t_list[1][1]]

        s_list = list(set(c_list_1 + c_list_2))

        similar_clusters.append(s_list)

    return similar_clusters

'''
print('Applying Elbow method to get the count of Optimal clusters: ')
# Invocation for Elbow method
score_list = cluster_documents(information_ds_persist['dataset'], 10)

print(score_list)
print(max(score_list))
'''

# @TODO: Based on the optimial cluster count from Elbow method
print('Clustering for Information Dataset: ')
ids_c_list = cluster_documents(information_ds_persist['dataset'], 10)

print('Clustering for Description Dataset: ')
dds_c_list = cluster_documents(description_ds_persist['dataset'], 10)

# Jaccard similarity
print('Applying Jacard similarity for the clusters: ')
cmp_output  = compare_clusters(ids_c_list, dds_c_list)

top_3_tuple = sorted(zip(cmp_output['score_list'], cmp_output['comb_list']), reverse=True)[:3]
top_3_list = list(top_3_tuple)

top_c_list = get_similar_clusters(top_3_list, ids_c_list, dds_c_list)

#Creating wordcloud with top 3 similar clusters
# Wordcloud with all the words
create_wordcloud(top_c_list, False)

# Wordcloud without stopwords
create_wordcloud(top_c_list, True)

Clustering for Information Dataset: 
Clustering for Description Dataset: 
Applying Jacard similarity for the clusters: 
[0.0, 0.0, 0.0, 0.0, 0.0, 0.004461506536625856, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003484320557491289, 0.0062402496099844, 0.0, 0.0, 0.056752278376139186, 0.0, 0.010526315789473684, 0.008368200836820083, 0.0, 0.0, 0.002717391304347826, 0.002288329519450801, 0.0, 0.0, 0.0368180875337067, 0.0, 0.005449591280653951, 0.0, 0.0, 0.0, 0.0, 0.005076142131979695, 0.0, 0.0, 0.012031946893475781, 0.0, 0.0078125, 0.0036363636363636364, 0.0, 0.001256281407035176, 0.0, 0.0057670126874279125, 0.006211180124223602, 0.01053864168618267, 0.07903175752560257, 0.0, 0.0012468827930174563, 0.008492569002123142, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00031126789790412947, 0.0, 0.0, 0.0, 0.0, 0.0005561735261401557, 0.002777777777777778, 0.010242587601078167, 0.0005521811154058532, 0.004849137931034483, 0.17829537256518602, 0.0, 0.0, 0.015608740894901144, 0.0, 0.0007692307692307692, 0.0, 0.00601277715144682

# Clustering

In [27]:
pd_information_ds_persist = pd.DataFrame(information_ds_persist["listings"])
aaa = pd_information_ds_persist.replace('', float('nan'))
aaa.head()
    

KeyError: 'listings'

In [None]:
aaa.iloc[3]

In [None]:
for i in range(len(aaa)):
    print(aaa.iloc[i])
    numbers = []
    if float('nan') in aaa.iloc[i]:
        print('yes')
        #numbers.append(i)

In [None]:
numbers

## Information

In [None]:
pd_information_ds_persist = pd.DataFrame(information_ds_persist["listings"])
pd_information_ds_persist = pd_information_ds_persist.replace('', float('nan'))
pd_information_ds_persist = pd_information_ds_persist.dropna(how='any')
print(len(pd_information_ds_persist))

In [None]:
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++')
    kmeans.fit(pd_information_ds_persist)
    wcss.append(kmeans.inertia_) 
plt.plot(range(1,11), wcss)
plt.plot([1, 10],[wcss[0], wcss[9]] )

In [None]:
plt.plot(range(2,11), wcss[1:])
plt.plot([2, 10],[wcss[1], wcss[9]] )

In [None]:
distances = []

# from 2 clusters 
p1=np.array([2,wcss[1]])
p2=np.array([10,wcss[9]])

for i in range(1,10):
    p3 = np.array([i+1,wcss[i]])
    d = abs(np.cross(p2-p1,p3-p1)/np.linalg.norm(p2-p1))
    distances.append(d)
distances

## Description

In [None]:
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++')
    kmeans.fit(description_ds_persist['dataset'])
    wcss.append(kmeans.inertia_) 
plt.plot(range(1,11), wcss)

# Wordcloud

In [None]:
# Code dump
"""
# Extract only the number in the price
def extract_price(price_str):
    price_str = price_str.replace('.', '')
    str_list = [str(s) for s in price_str.split() if s.isdigit()]

    return ''.join(str_list)
"""
