#### Import library

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, regexp_tokenize
import numpy as np
import time
from sklearn.cluster import KMeans
from tqdm import tqdm

## Parsing page with BeautifulSoup

Get data from website and create a BeautifulSoup class to parse document

In [48]:
#create lists where store the data
title=[]
price=[]
n_room=[]
mq=[]
n_bathroom=[]
plan=[]
desc=[]
#iterate to take data from different page
count=1
for num in tqdm(range(1,40)):
    url = str('www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag='+str(num))
    r = requests.get('https://' +url,verify=False)
    data = r.text
    soup = BeautifulSoup(data,'html.parser')
    
    #iterate on the main class which contain the data that we want
    for announcement in soup.find_all('div', class_='listing-item_body--content'):
        #find all the features
        total_features = announcement.find('ul', class_='listing-features list-piped')
        if len(total_features)==7:
            #announcement_title= announcement.find('p', class_='titolo text-primary').text.strip()
            title.append('announcement_'+str(count))
            feature = announcement.find_all('li')
            feature_price = feature[0].text.strip()
            if ('%' in feature_price or 'EZZO SU RICHIESTA' in feature_price )== True:
                title.pop()
            else:
                price.append(feature_price[2:].replace('.',''))
                feature_n_room = feature[1].text[0]
                n_room.append(feature_n_room)
                feature_mq = feature[2].text[0:-12]
                mq.append(feature_mq)
                feature_n_bathroom = feature[3].text[0:-5]
                n_bathroom.append(feature_n_bathroom)
                feature_plan = feature[4].text[0:-7].strip()
                #check if there is a character inside plan data, if yes pop last element in the others lists to mantain dimension
                if feature_plan.isalpha()==True:
                    title.pop()
                    price.pop()
                    n_room.pop()
                    mq.pop()
                    n_bathroom.pop()
                    count+=0
                else:    
                    plan.append(feature_plan)
                    count+=1
                    #find <a element where <href is present
                    link=announcement.find('a', href=True)
                    #check to solve the problem where the <href didn't have the complete link
                    if ('https://' in link['href'])==True:
                        ann=requests.get(link['href'])
                    else:
                        ann=requests.get('https://www.immobiliare.it'+link['href'])
                    data1 = ann.text
                    #create another bs4 element to access in the specific link and take commplete text from announcement
                    soup1 = BeautifulSoup(data1,'html.parser')
                    #access to link and take complete text
                    a=soup1.find('div', attrs={'role':'contentinfo'}).text.strip()
                    desc.append(a)
        

    time.sleep(1)

            


  0%|                                                                                           | 0/39 [00:00<?, ?it/s]
  3%|██                                                                               | 1/39 [01:35<1:00:44, 95.92s/it]
  5%|████▎                                                                              | 2/39 [02:14<48:36, 78.83s/it]
  8%|██████▍                                                                            | 3/39 [02:51<39:46, 66.28s/it]
 10%|████████▌                                                                          | 4/39 [03:32<34:14, 58.70s/it]
 13%|██████████▋                                                                        | 5/39 [04:12<30:04, 53.09s/it]
 15%|████████████▊                                                                      | 6/39 [04:50<26:38, 48.45s/it]
 18%|██████████████▉                                                                    | 7/39 [05:25<23:38, 44.32s/it]
 21%|█████████████████                 

Now we store the data in a pandas DataFrame

In [104]:
table=pd.DataFrame({'Annuncio':title,
                    'Prezzo':price,
                    'Camere':n_room,
                    'Superficie':mq,
                    'Bagni':n_bathroom,
                    'Piano':plan})
#used to remove a \n inside "Piano" coloumn
table =table.set_index('Annuncio')
table=table.replace({r'\n': '',r'\+' : '',r'\xa0': ''}, regex=True)
table.head(5)

Unnamed: 0_level_0,Prezzo,Camere,Superficie,Bagni,Piano
Annuncio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
announcement_1,225000,2,50,1,1
announcement_2,1100000,5,225,3,3
announcement_3,149000,3,75,1,1
announcement_4,1300000,4,115,2,9
announcement_5,115000,2,58,1,3


Take all the announcement from page

In [19]:
#only see if the dataframe is the data are correct, then i remove this
annuncio=pd.DataFrame({'Annuncio':desc})
annuncio

Unnamed: 0,Annuncio
0,Gregorio VII / San Pietro a 200 mt da Piazza S...
1,Camilluccia - Nelle immediate vicinanze di Via...
2,Roma - zona Tor Pignattara - via Amedeo Cencel...
3,TORRE SPACCATA/TORRE MAURA A pochi passi dalla...
4,Monteverde Vecchio e più precisamente in via V...
5,CASSIA VILLA SAN PIETRO VIA VIBIO MARIANO TRIP...
6,Quartiere Centocelle a pochi passi dalla moder...
7,Rif: strindberg219 - RINNOVAMENTO - 75 MQ - TE...
8,NUOVO SALARIO- VIA FOSDINOVO PIAZZA MINUCCIANO...


Function to preprocess all text in the announcement

In [51]:
def preprocess(text):
    text = text.lower()
    # removing '\n'
    text = text.replace('\\n', ' ')
    # removing punctuation
    tokenizer = regexp_tokenize(text, "[\w\$]+")
    # filter the non stopwords
    filtered = [w for w in tokenizer if not w in stopwords.words('italian')]
    ps = PorterStemmer()
    # removing the stem
    filtered = [ps.stem(word) for word in filtered]
    return filtered

Pre process all the announcement and put in a list

In [52]:
processed_list=[]
for i in desc:
    processed_list.append(preprocess(i))    

we have a list of word processed, now i convert it to a list of string

In [53]:
b=[' '.join(i) for i in processed_list]

In [54]:
ren=[int(i) for i in range(1,len(b)+1)]

In [55]:
#N.B.need to find how to increse the docId in relation of document!!!!!!!!!
#creating dataframe with wordId and all text from the announcement processed
df1 = pd.DataFrame({'wordId': ren, 
               'parole': b})

we have used pandas and numpy to compute Tf-IDF in all dataframe

In [56]:
# Tokenize and generate count vectors
word_vec = df1.parole.apply(str.split).apply(pd.value_counts).fillna(0)
# Compute term frequencies
tf = word_vec.divide(np.sum(word_vec, axis=1), axis=0)
# Compute inverse document frequencies
idf = np.log10(len(tf) / word_vec[word_vec > 0].count())
# Compute TF-IDF vectors
tfidf = np.multiply(tf, idf.to_frame().T)
#L2 (Euclidean) normalization
l2_norm = np.sum(np.sqrt(tfidf), axis=1)
#Normalized TF-IDF vectors
tfidf_norm = (tfidf.T / l2_norm).T
#put in a dataframe
second_mat=pd.DataFrame(tfidf_norm)
word_column=['word_'+str(i) for i in range(1,len(second_mat.columns)+1) ]
second_mat.columns=word_column
second_mat=second_mat.set_index(table.index)
second_mat.head(10)


Unnamed: 0_level_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,word_10,...,word_6054,word_6055,word_6056,word_6057,word_6058,word_6059,word_6060,word_6061,word_6062,word_6063
Annuncio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
announcement_1,0.012263,0.006953,0.009357,0.004356,0.004766,0.004766,0.003655,0.00117,0.000373,0.000319,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001261,0.001078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000306,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000343,0.000293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000304,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000247,0.000212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000937,0.000801,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Saving data to csv

In [57]:
second_mat.to_csv(r'C:\Users\Daniele\Desktop\Matrix_TfIdf.csv')
table.to_csv(r'C:\Users\Daniele\Desktop\Matrix.csv')

# Clustering

In [4]:
b=pd.read_csv(r'C:\Users\Daniele\Desktop\Matrix.csv')
table=b
table=table.set_index(['Annuncio'])
table.head(10)

Unnamed: 0_level_0,Prezzo,Camere,Superficie,Bagni,Piano
Annuncio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
announcement_1,225000,2,50,1,1
announcement_2,1100000,5,225,3,3
announcement_3,149000,3,75,1,1
announcement_4,1300000,4,115,2,9
announcement_5,115000,2,58,1,3
announcement_6,695000,5,250,3,3
announcement_7,1380000,5,140,2,2
announcement_8,370000,4,139,2,5
announcement_9,650000,5,170,3,1
announcement_10,800000,5,220,3,4


In [5]:
a=pd.read_csv(r'C:\Users\Daniele\Desktop\Matrix_TfIdf.csv')
second_mat=a
second_mat=second_mat.set_index(['Annuncio'])
second_mat.head(10)

Unnamed: 0_level_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,word_10,...,word_6054,word_6055,word_6056,word_6057,word_6058,word_6059,word_6060,word_6061,word_6062,word_6063
Annuncio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
announcement_1,0.012263,0.006953,0.009357,0.004356,0.004766,0.004766,0.003655,0.00117,0.000373,0.000319,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001261,0.001078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000306,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000343,0.000293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000304,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000247,0.000212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
announcement_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000937,0.000801,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Convert DataFrame to matrix
dataset_array =second_mat.values
dataset_array1=table.values
# Using sklearn
km =KMeans(n_clusters=8, init='k-means++', tol=0.0001).fit(dataset_array)
km1=KMeans(n_clusters=8, init='k-means++', tol=0.0001).fit(dataset_array1)
# Get cluster assignment labels
labels = km.labels_
labels1 = km1.labels_
#put the result of cluster in a dataframe
results = pd.DataFrame([second_mat.index,labels]).T
results1 = pd.DataFrame([table.index,labels]).T

In [7]:
final_result=pd.merge(results, results1, on=0)
final_result

Unnamed: 0,0,1_x,1_y
0,announcement_1,4,4
1,announcement_2,7,7
2,announcement_3,7,7
3,announcement_4,7,7
4,announcement_5,7,7
5,announcement_6,7,7
6,announcement_7,7,7
7,announcement_8,7,7
8,announcement_9,7,7
9,announcement_10,7,7


In [16]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection / union)

In [25]:
cluster1=km.cluster_centers_
cluster2=km1.cluster_centers_    


In [34]:
for i in range(len(cluster1)):
    print(jaccard_similarity(cluster1[i],cluster2[i]))


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


# Find the duplicates!