In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import time

from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
import string
import nltk
from nltk.stem.snowball import SnowballStemmer

import json
import math

from datetime import datetime
import numpy as np

In [2]:
df_1= pd.read_pickle("df_1.pkl")
op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\inverted_index.txt', 'r', encoding="utf-8")
inverted_index = json.loads(op.read())
op.close()

# Homework 4

## 1) Does basic house information reflect house's description?

In this assignment we will perform a clustering analysis of house announcements in Rome from Immobiliare.it. Be careful you may notice that the announcement is written in Italian. Don't worry about it, you don't need to understand what's in it.

![alt text](https://images.adsttc.com/media/images/5b76/1d5f/f197/cc80/ea00/01b2/slideshow/T16_1158b.jpg?1534467412)

### Scraping
The first thing that we have to do is to create our dataset. The website that we will scrape is: [immobiliare.it](https://www.immobiliare.it). In particular, we retrieve announcements starting from this [link](https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag=1).


#### 1) Information
The first matrix will have this format: <img src="https://latex.codecogs.com/gif.latex?$m_{ij}&space;=&space;value$" title="$m_{ij} = value$" /> where <img src="https://latex.codecogs.com/gif.latex?$i&space;\in&space;\{announcement_1,&space;...,&space;announcement_n\}$" title="$i \in \{announcement_1, ..., announcement_n\}$" /> and <img src="https://latex.codecogs.com/gif.latex?$j&space;\in&space;\{price,&space;locali,&space;superficie,&space;bagni,&space;piano&space;\}$" title="$j \in \{price, locali, superficie, bagni, piano \}$" />
*n* is the number of the announcements. 

It's possible that not all the announcements will have all the fields mentioned above, if it's the case we don't take it into account. 

We create an empty dataframe where we will store all the informations.

In [2]:
df= pd.DataFrame(index=['price', 'locali', 'superficie', 'bagni', 'piano'] )
df

price
locali
superficie
bagni
piano


We take informations of all the house in the first $700$ pages (every page contains $25$ announcements) and store them in our dataframe.

In [3]:
count=1
for i in range (1,700):
    # requests.get helps us to enter the page. 
    # Then BeatifulSoup gives us the xml format.
    content= requests.get("https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="+str(i))
    soup = BeautifulSoup(content.text, "lxml")
    #we take separatly every announce and take all infos that we need.
    for j in soup.findAll("div", class_="listing-item_body"):
        annuncio='annuncio_'+str(count)
        locali= None
        superficie= None
        bagni = None
        piano = None
        try:
            price=j.find("li", class_="lif__item lif__pricing").contents[-1]
            price= int(price.replace('\n', ' ').replace('€', ' ').replace('.', '').replace(' ', ''))
        except:
            price=None
        resto=j.findAll("li", class_="lif__item")
        for k in range (len(resto)):
            if (len((re.findall(string=str(resto[k]), pattern=r'locali'))))==1:
                locali=resto[k].span.contents[0].replace('\xa0', ' ')
            if (len((re.findall(string=str(resto[k]), pattern=r'superficie'))))==1:
                superficie=int(resto[k].span.contents[0].replace('\xa0', ' ').replace('.', ''))
            if (len((re.findall(string=str(resto[k]), pattern=r'bagni'))))==1:
                bagni=resto[k].span.contents[0].replace('\xa0', ' ')
            if (len((re.findall(string=str(resto[k]), pattern=r'piano'))))==1:
                piano=resto[k].abbr.contents[0].replace('\xa0', ' ').replace('\n', ' ') 
    
        df[annuncio]=[price, locali, superficie, bagni, piano]
        count+=1

In [4]:
df

Unnamed: 0,annuncio_1,annuncio_2,annuncio_3,annuncio_4,annuncio_5,annuncio_6,annuncio_7,annuncio_8,annuncio_9,annuncio_10,...,annuncio_17463,annuncio_17464,annuncio_17465,annuncio_17466,annuncio_17467,annuncio_17468,annuncio_17469,annuncio_17470,annuncio_17471,annuncio_17472
price,192000,225000,1350000,135000,229000,249000,1399000,279000,339000,699000,...,,269000.0,215000,246600,235000,254000.0,359000,529000,760000.0,220000
locali,1-5,2,4,2,5,2,5+,3,2,5+,...,5+,4.0,2,4,4,3.0,4,3,4.0,2
superficie,46,50,200,60,169,75,500,110,70,174,...,620,160.0,70,112,160,90.0,109,108,160.0,52
bagni,,1,2,1,3+,1,3+,1,1,3,...,3+,1.0,1,2,2,1.0,2,2,2.0,1
piano,,1,2,5,,1,,T,3,A,...,,,7,2,T,,1,T,,4


Now we drop all the columns that have `None` values:

In [7]:
df_1=df.dropna(axis='columns')

In [58]:
df_1=df_1.T

It's very useful to save our final dataframe in an external `pickle` file, so we can easly load it when in need.

In [59]:
df_1.to_pickle("df_1.pkl")

In [61]:
df_1= pd.read_pickle("df_1.pkl")
df_1.head()

Unnamed: 0,price,locali,superficie,bagni,piano
annuncio_2,225000,2,50,1,1
annuncio_3,1350000,4,200,2,2
annuncio_4,135000,2,60,1,5
annuncio_6,249000,2,75,1,1
annuncio_8,279000,3,110,1,T


In [63]:
df_1.describe()

Unnamed: 0,price,locali,superficie,bagni,piano
count,12942,12942,12942,12942,12942
unique,799,6,348,4,15
top,199000,3,90,1,1
freq,226,4777,634,6766,2905


#### 2) Description
The second matrix will have this format: <img src="https://latex.codecogs.com/gif.latex?$m_{ij}&space;=&space;tfIdf_{ij}$" title="$m_{ij} = tfIdf_{ij}$" /> where <img src="https://latex.codecogs.com/gif.latex?$i&space;\in&space;\{announcement_1,&space;...,&space;announcement_n\}$" title="$i \in \{announcement_1, ..., announcement_n\}$" /> and <img src="https://latex.codecogs.com/gif.latex?$j&space;\in&space;\{word_1,&space;...,word_m\}$" title="$j \in \{word_1, ...,word_m\}$" />

*n* is the number of the announcements and *m* is the cardinality of the vocabulary. 


The first thing that we hve to do is to retrive the infos: in particular we want to get **full** description of every announce (so we have to go *inside* the url of every house).

In [3]:
count=1
for i in range (1,700):        
    # requests.get helps us to enter the page. 
    # Then BeatifulSoup gives us the xml format.
    try:
        content= requests.get("https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="+str(i))
    except:
        time.sleep(100)
        content= requests.get("https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="+str(i))      
    soup = BeautifulSoup(content.text, "lxml")
    for j in soup.findAll("div", class_="listing-item_body"):
        for link in j.find_all('a'):
            if link.get('href') is not None and (link.get('href').startswith('https')):
                url=(link.get('href'))
                content=requests.get(url)
                soup = BeautifulSoup(content.text, "lxml")
                desc=soup.find("div", class_="col-xs-12 description-text text-compressed")
                try:
                    descrizione=str(desc.contents[1])
                    op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\doc\annuncio_' + str(count) + '.tsv', 'w', encoding="utf-8")
                    op.write(descrizione)
                    op.close()
                except: pass
        count+=1

Now we do some preprocessing: 
    
    creating functions:

In [2]:

# list of stopwords
stopWords = set(stopwords.words('italian'))
# the stemmer used
ps = SnowballStemmer("italian")

# list of punctuation used
string.punctuation = string.punctuation + '–“”’'


# preprocess takes a list of words as input and RETURNS a list of stemmed words without stopwords and punctuation
def preprocess(l):
    final = []
    for i in l:
        if not((ps.stem(i) in stopWords) or (ps.stem(i) in (string.punctuation) )):
            final.append(ps.stem(i))
    return (final)

    trying our functions:

In [33]:
op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\doc\annuncio_3.tsv', 'r', encoding="utf-8")
for line in op:
    ou = line.strip().split('\t')
    sentence = ou[0].replace('div', ' ').replace('/', ' ').replace('br', ' ')#.replace('\\r', ' ').replace('\\t', ' ')
    print(preprocess(word_tokenize(sentence)))
op.close()

[]
['fleming', 'strad', 'priv', 'via', 'alessandr', 'fleming', 'siam', 'liet', 'proporr', 'vend', 'via', 'esclus', 'quart', 'residenzial', 'fleming', 'immobil', 'rappresent', '200mq', 'oltre', 'spaz', 'estern', 'pertint', 'affacc', 'riserv', 'verd', 'panoram', 'vist', 'san', 'pietr', 'post', 'second', 'pian', 'alto', 'tre', "un'eleg", 'palazzin', 'cortin', 'perfett', 'stat', 'manutenzion', 'serviz', 'portiner', 'cas', 'recent', 'restaur', 'import', 'stud', 'architettur', 'present', 'rifin', 'material', 'preg', 'ogni', 'particol', 'parquet', 'ogni', 'ambient', 'marm', 'bagn', 'armad', 'soppalc', 'finitur', 'falegnamer', 'misur', 'climatizz', 'canalizz', 'infiss', 'tagl', 'termic', 'serrand', 'elettr', 'port', 'blind', 'impiant', 'allarm', 'sorvegl', 'intern', 'compost', 'ingress', 'guardarob', 'ostip', 'salon', 'dopp', 'lumin', 'vetr', 'access', 'terrazzin', 'ampi', 'cucin', 'abit', 'isol', 'central', 'balcon', 'zon', 'nott', 'compost', 'due', 'cam', 'matrimonial', 'camer', 'padronal', 

Now we have to create a vocabulary that contains all the words of our corpus (we store it in an external `json` file):

In [34]:
def vocabularization(vocabulary, final, index):
    for word in final:
        if not(word in vocabulary):
            vocabulary[word] = str(index)
            index = index + 1
    return(vocabulary, index)

In [68]:
vocabulary= {}
index = 0

for i in range(17472):
    try:
        op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\doc\annuncio_' + str(i) + '.tsv', 'r', encoding="utf-8")
        for line in op:
            ou = line.strip().split('\t')
            sentence = ou[0].replace('div', ' ').replace('/', ' ').replace('br', ' ')        
            #preprocessing data deleting stop words, punctuations, ecc. 
            final = preprocess(word_tokenize(sentence))
            # IF  word not in vocabulary -> add the word
            vocabulary, index = vocabularization(vocabulary, final, index)            
        op.close()
    except: pass
op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\vocabulary.txt', 'w', encoding="utf-8")
op.write(json.dumps(vocabulary))
op.close()

In [2]:
# opening vocabulary.txt
op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\vocabulary.txt', 'r', encoding="utf-8")
vocabulary = json.loads(op.read())

We create an inverted index that associate words and announcement that contains that word:
It will be a dictionary of this format:

```
{
term_id_1:[announcement_1, announcement_2, announcement_4],
term_id_2:[announcement_1, announcement_3, announcement_5, announcement_6],
...}
```



We also want to store it in a separate file, `inverted_index_0.txt` , and load it in memory with json when needed.

In [129]:
# from the vocabulary, the inverted index is created
inverted_index_0 = {}

for file in range(17472):
    try:
        op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\doc\annuncio_' + str(file) + '.tsv', 'r', encoding="utf-8")
        for line in op:
            ou = line.strip().split('\t')
            sentence = ou[0].replace('div', ' ').replace('/', ' ').replace('br', ' ')        
            #preprocessing data deleting stop words, punctuations, ecc. 
            final = preprocess(word_tokenize(sentence))
            for word in final:
                index = vocabulary[word]
                if not (index in inverted_index_0):
                    inverted_index_0[index] = ['annuncio_' + str(file)]
                elif not('annuncio_' + str(file) in inverted_index_0[index]):
                    inverted_index_0[index] = inverted_index_0[index] + ['annuncio_' + str(file)]
        op.close()
    except: pass

op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\inverted_index_0.txt', 'w', encoding="utf-8")
op.write(json.dumps(inverted_index_0))
op.close()

In [3]:
op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\inverted_index_0.txt', 'r', encoding="utf-8")
inverted_index_0 = json.loads(op.read())


In [72]:
# from the vocabulary the inverted index is created. In this case the index are mapped to a tuple (doc, tf-idf)

inverted_index = {}
for file in range(17472):
    try:        
        op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\doc\annuncio_' + str(file) + '.tsv', 'r', encoding="utf-8")
        desc=[]
        for line in op:
            ou = line.strip().split('\t')
            sentence = ou[0].replace('div', ' ').replace('/', ' ').replace('br', ' ')        
            #preprocessing data deleting stop words, punctuations, ecc. 
            final = preprocess(word_tokenize(sentence))
            # CREATING INVERTED INDEX with tf-idf values
            desc.extend(final)
        for word in desc:
            index = vocabulary[word]               
            tf = desc.count(word) / len(desc)               
            idf = math.log( 17420 / len(inverted_index_0[vocabulary[word]]))
            tf_idf = tf*idf
            if not (index in inverted_index):
                inverted_index[index] = [('annuncio_' + str(file), tf_idf )]
            elif not(('annuncio_' + str(file), tf_idf)  in inverted_index[index]):
                inverted_index[index] = inverted_index[index] + [('annuncio_' + str(file), tf_idf)]      
        op.close()
    except:pass
op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\inverted_index.txt', 'w', encoding="utf-8")
op.write(json.dumps(inverted_index))
op.close()

In [4]:
op = open(r'C:\Users\Egon\Desktop\Universita\ADM\HW4\inverted_index.txt', 'r', encoding="utf-8")
inverted_index = json.loads(op.read())


In [30]:
righe=[]
for i in range (17473):
    righe.append('annuncio_'+str(i))
df_2=pd.DataFrame(0,columns=[], index=righe)
df_2.head()

annuncio_0
annuncio_1
annuncio_2
annuncio_3
annuncio_4


In [31]:
for word in range(17473):
    #print(word, str(datetime.now()))
    array=np.zeros(17473)
    for ann in inverted_index[str(word)]:
        #print(ann, str(datetime.now()))
        annuncio=(ann[0])
        tf_idf=(ann[1])
        array[int(annuncio.replace('annuncio_', ''))]=tf_idf
        df_2[word]=array
df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17463,17464,17465,17466,17467,17468,17469,17470,17471,17472
annuncio_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_2,0.068564,0.022727,0.029379,0.025256,0.023229,0.055761,0.032966,0.004129,0.011766,0.018419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_3,0.0,0.0,0.0,0.0,0.011615,0.0,0.0,0.004129,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17463,17464,17465,17466,17467,17468,17469,17470,17471,17472
annuncio_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_2,0.068564,0.022727,0.029379,0.025256,0.023229,0.055761,0.032966,0.004129,0.011766,0.018419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_3,0.0,0.0,0.0,0.0,0.011615,0.0,0.0,0.004129,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17463,17464,17465,17466,17467,17468,17469,17470,17471,17472
annuncio_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_2,0.068564,0.022727,0.029379,0.025256,0.023229,0.055761,0.032966,0.004129,0.011766,0.018419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_3,0.0,0.0,0.0,0.0,0.011615,0.0,0.0,0.004129,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
coldf1=df_1.columns

In [53]:
df_2=df_2.T.filter(coldf1).T
df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17463,17464,17465,17466,17467,17468,17469,17470,17471,17472
annuncio_2,0.068564,0.022727,0.029379,0.025256,0.023229,0.055761,0.032966,0.004129,0.011766,0.018419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_3,0.0,0.0,0.0,0.0,0.011615,0.0,0.0,0.004129,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
annuncio_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003804,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
df_2.to_pickle("df_2.pkl")



In [55]:
df_2= pd.read_pickle("df_2.pkl")
