In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import time

# Homework 4

## 1) Does basic house information reflect the house's description?

In this assignment we will perform a clustering analysis of house announcements in Rome from Immobiliare.it. Note that the announcements are written in Italian, but don't worry: you don't need to understand their content.

![alt text](https://images.adsttc.com/media/images/5b76/1d5f/f197/cc80/ea00/01b2/slideshow/T16_1158b.jpg?1534467412)

### Scraping
The first thing that we have to do is to create our dataset. The website that we will scrape is: [immobiliare.it](https://www.immobiliare.it). In particular, we retrieve announcements starting from this [link](https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag=1).


#### 1) Information
The first matrix will have this format: <img src="https://latex.codecogs.com/gif.latex?$m_{ij}&space;=&space;value$" title="$m_{ij} = value$" /> where <img src="https://latex.codecogs.com/gif.latex?$i&space;\in&space;\{announcement_1,&space;...,&space;announcement_n\}$" title="$i \in \{announcement_1, ..., announcement_n\}$" /> and <img src="https://latex.codecogs.com/gif.latex?$j&space;\in&space;\{price,&space;locali,&space;superficie,&space;bagni,&space;piano&space;\}$" title="$j \in \{price, locali, superficie, bagni, piano \}$" />. *n* is the number of announcements. It's possible that not all the announcements will have all the fields mentioned above; if that is the case, don't take that announcement into account.

We create an empty dataframe where we will store all the information.

In [2]:
# Rows are the scraped fields; the frame starts with no columns.
field_names = ['price', 'locali', 'superficie', 'bagni', 'piano']
df = pd.DataFrame(index=field_names)
df

price
locali
superficie
bagni
piano


We collect the information for all the houses listed on the first $699$ result pages (the loop below uses `range(1, 700)`, which stops at page $699$; every page contains $25$ announcements).

In [3]:
# Scrape the five basic fields of every listing on result pages 1-699
# (range(1, 700) excludes its upper bound) and collect them in `df`,
# one column per announcement.
FIELDS = ['price', 'locali', 'superficie', 'bagni', 'piano']


def _field_text(item, tag_name):
    """Return the first text child of item's <tag_name> element, or None.

    Non-breaking spaces are replaced by plain spaces. None is returned when
    the element is missing, empty, or does not start with a text node, so a
    malformed listing yields a missing value instead of aborting the scrape.
    """
    tag = item.find(tag_name)
    if tag is None or not tag.contents:
        return None
    first = tag.contents[0]
    if not isinstance(first, str):
        return None
    return first.replace('\xa0', ' ')


# annuncio_<n> -> [price, locali, superficie, bagni, piano]; the frame is built
# once at the end instead of inserting thousands of columns one at a time.
records = {}
count = 1
for i in range(1, 700):
    # requests.get downloads the result page; BeautifulSoup parses its markup.
    content = requests.get("https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag=" + str(i))
    soup = BeautifulSoup(content.text, "lxml")
    for j in soup.find_all("div", class_="listing-item_body"):
        annuncio = 'annuncio_' + str(count)
        locali = superficie = bagni = piano = None

        # The price is the last child of the pricing item, e.g. "€ 192.000".
        try:
            price = j.find("li", class_="lif__item lif__pricing").contents[-1]
            price = int(price.replace('\n', ' ').replace('€', ' ').replace('.', '').replace(' ', ''))
        except (AttributeError, IndexError, TypeError, ValueError):
            # No pricing item, or a price that is not a plain number.
            price = None

        for item in j.find_all("li", class_="lif__item"):
            markup = str(item)
            # A label must occur exactly once in the item's markup to be accepted.
            if markup.count('locali') == 1:
                locali = _field_text(item, 'span')
            if markup.count('superficie') == 1:
                text = _field_text(item, 'span')
                try:
                    # '.' is the thousands separator (e.g. "1.200").
                    superficie = int(text.replace('.', ''))
                except (AttributeError, ValueError):
                    superficie = None
            if markup.count('bagni') == 1:
                bagni = _field_text(item, 'span')
            if markup.count('piano') == 1:
                text = _field_text(item, 'abbr')
                piano = None if text is None else text.replace('\n', ' ')

        records[annuncio] = [price, locali, superficie, bagni, piano]
        count += 1

df = pd.DataFrame(records, index=FIELDS)

In [4]:
# Show the scraped frame: fields as rows, one column per announcement.
df

Unnamed: 0,annuncio_1,annuncio_2,annuncio_3,annuncio_4,annuncio_5,annuncio_6,annuncio_7,annuncio_8,annuncio_9,annuncio_10,...,annuncio_17463,annuncio_17464,annuncio_17465,annuncio_17466,annuncio_17467,annuncio_17468,annuncio_17469,annuncio_17470,annuncio_17471,annuncio_17472
price,192000,225000,1350000,135000,229000,249000,1399000,279000,339000,699000,...,,269000.0,215000,246600,235000,254000.0,359000,529000,760000.0,220000
locali,1-5,2,4,2,5,2,5+,3,2,5+,...,5+,4.0,2,4,4,3.0,4,3,4.0,2
superficie,46,50,200,60,169,75,500,110,70,174,...,620,160.0,70,112,160,90.0,109,108,160.0,52
bagni,,1,2,1,3+,1,3+,1,1,3,...,3+,1.0,1,2,2,1.0,2,2,2.0,1
piano,,1,2,5,,1,,T,3,A,...,,,7,2,T,,1,T,,4


In [7]:
# Keep only the announcements (columns) that have a value for every field.
df_1 = df.dropna(axis=1, how='any')

In [9]:
# Show the frame after dropping the incomplete announcements.
df_1

Unnamed: 0,annuncio_2,annuncio_3,annuncio_4,annuncio_6,annuncio_8,annuncio_9,annuncio_10,annuncio_12,annuncio_13,annuncio_15,...,annuncio_17457,annuncio_17458,annuncio_17459,annuncio_17461,annuncio_17465,annuncio_17466,annuncio_17467,annuncio_17469,annuncio_17470,annuncio_17472
price,225000,1350000,135000,249000,279000,339000,699000,189000,570000,215000,...,199000,750000,265000,139000,215000,246600,235000,359000,529000,220000
locali,2,4,2,2,3,2,5+,4,5,3,...,3,4,3,2,2,4,4,4,3,2
superficie,50,200,60,75,110,70,174,168,160,67,...,110,119,90,40,70,112,160,109,108,52
bagni,1,2,1,1,1,1,3,3+,2,1,...,2,1,1,1,1,2,2,2,2,1
piano,1,2,5,1,T,3,A,T,2,1,...,2,2,3,4,7,2,T,1,T,4


In [10]:
# Save the cleaned frame to disk.
df_1.to_pickle(path="df_1.pkl")

In [14]:
# Reload the cleaned frame from disk and show it.
pickle_file = "df_1.pkl"
df_1 = pd.read_pickle(pickle_file)
df_1

Unnamed: 0,annuncio_2,annuncio_3,annuncio_4,annuncio_6,annuncio_8,annuncio_9,annuncio_10,annuncio_12,annuncio_13,annuncio_15,...,annuncio_17457,annuncio_17458,annuncio_17459,annuncio_17461,annuncio_17465,annuncio_17466,annuncio_17467,annuncio_17469,annuncio_17470,annuncio_17472
price,225000,1350000,135000,249000,279000,339000,699000,189000,570000,215000,...,199000,750000,265000,139000,215000,246600,235000,359000,529000,220000
locali,2,4,2,2,3,2,5+,4,5,3,...,3,4,3,2,2,4,4,4,3,2
superficie,50,200,60,75,110,70,174,168,160,67,...,110,119,90,40,70,112,160,109,108,52
bagni,1,2,1,1,1,1,3,3+,2,1,...,2,1,1,1,1,2,2,2,2,1
piano,1,2,5,1,T,3,A,T,2,1,...,2,2,3,4,7,2,T,1,T,4


In [3]:
# Download the description of every listing on result pages 1-699
# (range(1, 700) excludes its upper bound) and write each one to its own file.
# `count` advances once per listing, so file numbers follow listing order.
LISTING_URL = "https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="
# NOTE(review): machine-specific absolute path - point it at your own folder.
# The folder must already exist; write failures are reported below.
DOC_PREFIX = r'C:\Users\Egon\Desktop\Universita\ADM\HW4\doc\annuncio_'
RETRY_WAIT_SECONDS = 100


def _get_with_retry(url):
    """Fetch url; on a network error wait RETRY_WAIT_SECONDS and try once more.

    A failure of the second attempt propagates to the caller.
    """
    try:
        return requests.get(url)
    except requests.exceptions.RequestException:
        time.sleep(RETRY_WAIT_SECONDS)
        return requests.get(url)


count = 1
for i in range(1, 700):
    # Download the result page and parse its markup.
    content = _get_with_retry(LISTING_URL + str(i))
    soup = BeautifulSoup(content.text, "lxml")
    for j in soup.find_all("div", class_="listing-item_body"):
        for link in j.find_all('a'):
            url = link.get('href')
            if url is None or not url.startswith('https'):
                continue
            # Separate names for the detail page keep the page-level
            # `content` and `soup` intact.
            detail = _get_with_retry(url)
            detail_soup = BeautifulSoup(detail.text, "lxml")
            desc = detail_soup.find("div", class_="col-xs-12 description-text text-compressed")
            if desc is None or len(desc.contents) < 2:
                # No description block on this page: nothing to save.
                continue
            descrizione = str(desc.contents[1])
            try:
                # `with` closes the file even if the write fails.
                with open(DOC_PREFIX + str(count) + '.tsv', 'w', encoding="utf-8") as op:
                    op.write(descrizione)
            except OSError as err:
                # Keep scraping, but do not hide the failure.
                print('Could not write description for annuncio_' + str(count) + ': ' + str(err))
        count += 1