# Dataset 1 

In [14]:
from pymongo import MongoClient, collection
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen as ureq
import re
import logging

### Setting MongoDB parameters

In [4]:
import os

client = MongoClient()
# NOTE(review): this API key is committed in plain text. It is kept only as a
# fallback so existing runs keep working; prefer exporting MLAB_API_KEY in the
# environment and rotating the committed key.
my_key = os.environ.get("MLAB_API_KEY", "AnqLqo5TT9JDfzfqKrDUEYUqJ6HWZJy8")
base = "https://api.mlab.com/api/1"
url_db = base + "/databases?apiKey=" + my_key
# A timeout keeps the notebook from hanging forever on an unresponsive server.
response = requests.get(url_db, timeout=30)
# Fail here with a clear HTTP error instead of a confusing indexing error below
# when the API answers with an error document.
response.raise_for_status()
databases = json.loads(response.text)
# The first database listed for this key is the one used by the rest of the notebook.
adm_database = databases[0]
adm_database

'adm_hm4'

In [5]:
# HTTP request headers declaring that the request body is JSON; passed to
# requests.post when pushing data to the database REST API.
headers = {'content-type':'application/json'}

We manually created the collection `flats` in the `adm_hm4` database (MongoDB).

In [6]:
# Name of the target collection and the REST endpoint used to reach it.
collection = 'flats'
col_url = "{}/databases/{}/collections/{}/?apiKey={}".format(
    base, adm_database, collection, my_key
)

# Example of pushing one record into the database (kept commented out):
# payload = json.dumps(flat_data)
# response = requests.post(col_url, data=payload, headers=headers)

Now, we extract the interesting data from the websites and push it into our database.

In [8]:
def scrap_data(soup):
    """
    Retrieve data about: price, locali, superficie, bagni, piano.

    Input: The Beautiful Soup object of a single listing page.
    Output: A one-element list holding a dictionary with integer values
            (keys: 'price', 'locali', 'superficie', 'bagni', 'piano'),
            or None when the feature list does not contain exactly four numbers.
    Raises: IndexError when the price tag or the feature list is missing,
            ValueError when the price text is not a plain number.
    """
    # find the html tag with price
    price = soup.find_all('li', class_='features__price')[0].get_text()
    # strip currency sign, spaces and thousands separators: "€ 250.000" -> "250000"
    price = price.replace("€", "").replace(" ", "").replace(".", "")
    # find the html tag with the info about: locali, superficie, bagni, piano
    data = soup.find_all('ul', 'list-inline list-piped features__list')[0].get_text()
    # drop the unit so that the "2" of "m2" is not picked up as a number
    data = data.replace('m2', '')
    # raw string: "\d" in a plain string literal is an invalid escape sequence
    numbers = re.findall(r'\d+(?:\.\d+)?', data)
    # only pages exposing exactly the four expected figures are usable
    if len(numbers) != 4:
        return None
    # A dot inside a figure is treated as a thousands separator, exactly as for
    # the price above; int("1.200") would otherwise raise ValueError.
    locali, superficie, bagni, piano = (int(n.replace('.', '')) for n in numbers)
    return [{'price': int(price), 'locali': locali, 'superficie': superficie,
             "bagni": bagni, "piano": piano}]

In [9]:
def threadExtract(url):
    """
    Get the html content of a page, scrape its data and save it into our database.

    Input: String: "url" of the page to process.
    Output: None. Nothing is pushed when scrap_data returns None.
    Raises: requests.HTTPError when the page download or the database push
            answers with an HTTP error status; any exception raised by
            scrap_data is propagated as well.
    """
    logging.info('extracting %s', url)
    # timeout so that one unresponsive server cannot block the whole crawl
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    data_dict = scrap_data(soup)  # one-element list with the record, or None
    if data_dict is None:
        # json.dumps(None) is "null": do not push an empty document
        logging.info('no complete data found for %s', url)
        return

    collection = 'flats'
    col_url = base + '/databases/' + adm_database + "/collections/" + collection +'/?apiKey=' + my_key

    # Pushing data into database
    payload = json.dumps(data_dict)
    response = requests.post(col_url, data=payload, headers=headers, timeout=30)
    # surface a rejected insert instead of silently losing the record
    response.raise_for_status()

    logging.info("Done extracting")

#### Thread-based parallelism :

In [None]:
links=[] # list with links used while retrieving data

stop = False
i = 1
while not stop:
    # Loading the i-th page of search results
    content = requests.get("https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="+str(i))
    soup = BeautifulSoup(content.text, 'html.parser')

    # Scraping data from each listing link of the page until we get 10000 links
    found_listing = False
    for link in soup.find_all('a', href=True):
        url = link['href']
        if not (url.startswith('https://www.immobiliare.it/') and url.endswith('.html')):
            continue
        found_listing = True
        try:
            threadExtract(url)
        except Exception:
            # Best effort: log the failing page and go on with the next link.
            # `except Exception` (not a bare except) keeps Ctrl-C working.
            logging.exception('failed to extract %s', url)
            continue
        links.append(url)
        # The original `len(links >= 10000)` raised TypeError, which the bare
        # except swallowed, so the limit was never reached.
        if len(links) >= 10000:
            stop = True
            break
    # A results page without any listing link means there is nothing left to
    # crawl; without this check the loop would request pages forever.
    if not found_listing:
        stop = True
    i += 1

In [28]:
# display what is inside the database: request the collection through its REST
# URL and decode the JSON response body
response = requests.get(col_url)
result = json.loads(response.text)
# bare expression so that the notebook shows the decoded content as cell output
result

In [30]:
import numpy as np

# Build a matrix with one row per stored record. The first value of every
# record is dropped (presumably the database-generated id -- TODO confirm).
dataset1 = np.matrix([list(record.values())[1:] for record in result])

In [38]:
dataset1.shape[0]  # number of rows

364