Get Flat Data: Functions
==========

In [1]:
import pandas as pd
import numpy as np
import datetime
import re

import requests
import warnings

from bs4 import BeautifulSoup

from stem import Signal
from stem.control import Controller

warnings.filterwarnings('ignore')

### Request functions

In [2]:
def set_new_ip(port=9051, password='ju4n4n4290'):
    """Ask the local Tor daemon for a new identity through its control port.

    Args:
        port: Tor control port to connect to.
        password: password used to authenticate against the control port.
    """
    # NOTE(review): the default password is a credential committed to the
    # source; prefer passing it in explicitly (e.g. read from the environment).
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=password)
        # NEWNYM requests fresh circuits, which is what changes the exit IP.
        controller.signal(Signal.NEWNYM)

In [3]:
def get_current_ip():
    """Return the IP address reported by icanhazip.com when queried through the local SOCKS5 proxy."""

    proxy_url = 'socks5://localhost:9050'

    # Both plain and TLS traffic are routed through the same proxy.
    response = requests.get(
        url='http://icanhazip.com/',
        proxies={'http': proxy_url, 'https': proxy_url},
        verify=False,
    )

    # The service answers with the address followed by a newline.
    return response.text.strip()

In [4]:
def test_change_ip():
    """Rotate the IP once and check that the visible address changed.

    Returns:
        The string "Error: IP has not changed" when the address is the same
        before and after calling set_new_ip(); None otherwise.
    """

    ip_before = get_current_ip()
    set_new_ip()
    ip_after = get_current_ip()

    if ip_before != ip_after:
        return None
    return "Error: IP has not changed"

In [5]:
def get_soup(url):
    """Download url through the local SOCKS5 proxy and parse it with BeautifulSoup (html5lib parser)."""

    proxy_url = 'socks5://localhost:9050'
    proxies = {'http': proxy_url, 'https': proxy_url}

    response = requests.get(url, proxies=proxies, verify=False)

    # Parse the raw bytes so BeautifulSoup handles the encoding detection.
    return BeautifulSoup(response.content, 'html5lib')

### Get Flat ID and Prices Data

In [6]:
def get_number_of_properties_for_sale(city):
    """
    Return how many properties are for sale in city at the moment of the request.

    The IP is rotated before requesting the search page. The count is printed
    and returned as an int.
    This function has only been tested for the cities of Mostoles, Leganes,
    Fuenlabrada and Getafe.
    """

    set_new_ip()
    test_change_ip()

    search_url = "https://www.idealista.com/venta-viviendas/" + city + "-madrid/con-pisos/"
    search_page = get_soup(search_url)

    # The third "breadcrumb-info" span carries the count; dots are stripped
    # before converting it to int.
    breadcrumbs = search_page.find_all("span", class_="breadcrumb-info")
    count_text = breadcrumbs[2].get_text()
    properties = int(count_text.replace(".", ""))

    print("There are {} properties in {}".format(properties, city))

    return properties

In [7]:
def number_of_pages(number_of_properties, properties_per_page=30):
    """Return the number of search pages needed to list all the properties.

    Args:
        number_of_properties: total number of properties in the search.
        properties_per_page: how many properties each search page shows.

    Returns:
        The page count as an int (0 when there are no properties).
    """

    # Ceiling division: an exact multiple of the page size must not produce
    # an extra, empty page (60 properties -> 2 pages, not 3).
    pages = int(-(-number_of_properties // properties_per_page))
    print("There are {} pages in the first search page".format(pages))

    return pages

In [8]:
def get_search_links(number_of_pages, city):
    """Build the search-page URLs (pages 1..number_of_pages) to scrape for city."""

    template = "https://www.idealista.com/venta-viviendas/{}-madrid/con-pisos/pagina-{}.htm"

    # Page numbers in the URL are 1-based.
    return [template.format(city, page_number)
            for page_number in range(1, number_of_pages + 1)]

In [9]:
def process_properties(ids_properties, prices_properties):
    """
    Build a dataframe with one row per flat from the scraped ids and prices.

    Ids are cast to int, prices are converted to int after removing the euro
    sign and the dots, and a 'link' column with the flat URL is derived from
    the id.
    """

    def _price_to_int(raw_price):
        cleaned = raw_price.replace("€", "").replace(".", "")
        return int(cleaned)

    def _flat_link(flat_id):
        return "https://www.idealista.com/inmueble/{}/".format(flat_id)

    frame = pd.DataFrame({"price": prices_properties, "id": ids_properties})

    frame['id'] = frame['id'].astype(int)
    frame['price'] = frame['price'].map(_price_to_int)
    # The link is built from the already-converted integer id.
    frame['link'] = frame['id'].map(_flat_link)

    return frame

In [10]:
def get_id_and_price(search_links):
    """
    Scrape every search page in search_links and collect flat ids and prices.

    The IP is rotated once before scraping. Progress, ids and prices are
    printed as they are found. Returns the dataframe built by
    process_properties from the collected values.
    """

    collected_ids = []
    collected_prices = []

    set_new_ip()
    test_change_ip()

    for position, link in enumerate(search_links, start=1):

        print("{} / {}".format(position, len(search_links)))

        search_page = get_soup(link)

        anchors = search_page.find_all("a", class_="item-link")
        price_tags = search_page.find_all("span", class_="item-price h2-simulated")

        for anchor in anchors:
            # The id is the third "/"-separated piece of the href.
            flat_id = anchor.get("href").split("/")[2]
            print(flat_id)
            collected_ids.append(flat_id)

        for price_tag in price_tags:
            price_text = price_tag.get_text()
            print(price_text)
            collected_prices.append(price_text)

    # process ids and prices
    return process_properties(collected_ids, collected_prices)

In [1]:
def merge_old_and_new_properties(city):
    """Merge today's scraped properties into the accumulated properties of city.

    Reads ./data/clean/<city>/<city>_properties.csv (old) and
    ./data/raw/<city>/properties_<today>.csv (new), both "^"-separated, and
    outer-joins them on "id".

    Returns:
        The merged dataframe without any "link" column, and with the new
        "price" column stored as "price_<today>".
    """

    date = str(datetime.datetime.now())[:10]
    path_old = "./data/clean/{}/{}_properties.csv".format(city, city)
    path_new = "./data/raw/{}/properties_{}.csv".format(city, date)

    properties_old = pd.read_csv(path_old, sep="^")
    properties_new = pd.read_csv(path_new, sep="^")

    print("Merging old and new properties...")
    # Outer join keeps flats that disappeared as well as flats that are new.
    properties_merged = properties_old.merge(properties_new, on="id", how="outer")

    # drop columns with links (.ix was removed from pandas; .loc accepts the
    # boolean column mask directly)
    cols_with_links = properties_merged.columns.str.contains("link")
    properties_merged = properties_merged.loc[:, ~cols_with_links].copy()

    # update price: store today's price under a dated column name
    price_update = "price_{}".format(date)
    properties_merged[price_update] = properties_merged['price']
    properties_merged = properties_merged.drop("price", axis=1)

    return properties_merged

In [12]:
def get_properties(city):
    """
    Scrape the properties (ID, price and link to the idealista web page) of the input city.

    The scraped dataframe is written to the raw data folder, then merged with
    the previously stored properties and written to the clean data folder
    with the following structure:

    |  id  |  price_date1  |  price_dateX  |  ...  |

    Returns the merged dataframe.
    """

    # how many properties idealista lists for the city, and how many search
    # pages that means
    total_properties = get_number_of_properties_for_sale(city)
    total_pages = number_of_pages(total_properties)

    # search-page URLs, then id / price / link of every flat found in them
    search_links = get_search_links(total_pages, city)
    scraped = get_id_and_price(search_links)

    # today's date is part of the raw file name
    today = str(datetime.datetime.now())[:10]
    raw_path = "./data/raw/{}/properties_{}.csv".format(city, today)
    scraped.to_csv(raw_path, sep="^", index=False)

    # the merge reads the raw file written just above
    merged = merge_old_and_new_properties(city)
    clean_path = "./data/clean/{}/{}_properties.csv".format(city, city)
    merged.to_csv(clean_path, sep="^", index=False)

    return merged

### Get Attributes Data from Flats

In [14]:
def try_set_new_ip(n_times, link):
    """
    Request a property page until its price is present, rotating the IP on failure.

    This function is specific to property pages (link): a page without the
    price tag is treated as a failed request, so the IP is changed and the
    request is repeated, up to n_times attempts.

    Args:
        n_times: maximum number of attempts.
        link: URL of the property page.

    Returns:
        The parsed page (soup) of the first attempt that contains the price,
        or None when every attempt failed.
    """
    for _ in range(n_times):
        try:
            soup = get_soup(link)
            # Indexing raises IndexError ("list index out of range") when the
            # price tag is missing from the page.
            soup.find_all('span', class_='h3-simulated txt-bold')[0].get_text()
        except Exception:
            # Best effort: any request/parsing failure triggers an IP change,
            # but KeyboardInterrupt / SystemExit are no longer swallowed.
            print("Trying to set another IP")
            set_new_ip()
            test_change_ip()
        else:
            return soup

    return None

In [16]:
def get_new_links(city):
    """
    Get the links of the new flats published in a city.

    A flat is considered new when the second-to-last column of the clean
    properties file is empty (NaN) for it.
    IMPORTANT!!!: get_properties(city) has to be run beforehand so that the
    clean file contains the latest flats.

    Returns:
        A pandas Series with the idealista URL of each new flat.
    """
    path = "./data/clean/{}/{}_properties.csv".format(city, city)
    properties_new = pd.read_csv(path, sep="^")

    # .ix was removed from pandas; the column is selected by position.
    previous_column = properties_new.iloc[:, -2]
    ids_new = properties_new.loc[previous_column.isna(), 'id']

    links_new = ids_new.map(lambda x: "https://www.idealista.com/inmueble/{}/".format(x))

    return links_new

In [17]:
def _list_item_texts(sections, position):
    """Return the text of every <li> in sections[position], or [None] if the section is missing or empty."""
    try:
        items = sections[position].find_all("li")
    except IndexError:
        # "list index out of range": the page does not have that section.
        return [None]

    texts = [item.get_text() for item in items]
    # An empty section still gets a placeholder so every flat has a row (and
    # at least one column) in every attribute group.
    return texts if texts else [None]


def get_attributes(property_links):
    """
    Go over the property links and scrape the attributes of each flat.

    The IP is changed before starting and again every 100 requests to avoid
    being banned. Progress information is printed for every link.

    Args:
        property_links: sized iterable with the URL of each property page.

    Returns:
        A dataframe indexed by flat id. Its columns are the positional
        columns of the main, build, equipment and location attribute groups
        (in that order) followed by a 'price' column.
    """

    # Attributes to scrape, keyed by flat id
    att_price = {}
    att_main = {}
    att_build = {}
    att_equipment = {}
    att_location = {}

    set_new_ip()
    test_change_ip()

    print("There are {} new properties".format(len(property_links)))

    for counter, link in enumerate(property_links, start=1):

        print(link)
        print("{} / {}".format(counter, len(property_links)))

        print(get_current_ip())
        if counter % 100 == 0:
            set_new_ip()

        # Use the page validated by the retry helper when it hands one back;
        # otherwise request it once the helper has had the chance to rotate
        # the IP.
        html_flat = try_set_new_ip(10, link)
        if html_flat is None:
            html_flat = get_soup(link)

        # id
        id_number = link.split("/")[4]
        print(id_number)

        # price
        try:
            price = html_flat.find_all('span', class_='h3-simulated txt-bold')[0].get_text()
        except IndexError:
            # NOTE(review): a one-element list (not None) is stored when the
            # price is missing; kept so the written files do not change.
            price = [None]

        att_price[id_number] = price
        print(price)

        # main / build / equipment attributes are the first three feature blocks
        flat_attributes = html_flat.find_all('div', class_='details-property_features')
        att_main[id_number] = _list_item_texts(flat_attributes, 0)
        att_build[id_number] = _list_item_texts(flat_attributes, 1)
        att_equipment[id_number] = _list_item_texts(flat_attributes, 2)

        # location is read from the third overlay box
        location_boxes = html_flat.find_all('div', class_='ide-box-detail overlay-box')
        att_location[id_number] = _list_item_texts(location_boxes, 2)

    # processing attributes: one frame per group, one row per flat id
    frames = [pd.DataFrame.from_dict(group, orient='index')
              for group in (att_main, att_build, att_equipment, att_location)]

    attributes = pd.concat(frames, axis=1)

    # Look the price up by id so it stays aligned with the rows even when a
    # link is repeated.
    attributes['price'] = [att_price[flat_id] for flat_id in attributes.index]

    return attributes

### Writing Files

In [18]:
def parse_columns_names(attributes_columns):
    """
    Rename positional columns to att_main_X, att_build_X, att_equipment_X, att_location_X and price.

    The input is expected to be the columns of four concatenated frames, each
    numbered from 0, followed by one last column (the price). A new group
    starts whenever a column number is not smaller than the next one.

    Args:
        attributes_columns: sequence of column labels.

    Returns:
        A list with the new column names; the last one is always "price".
        An empty input returns an empty list.

    Raises:
        ValueError: if more than four groups of columns are found.
    """

    names = ["att_main", "att_build", "att_equipment", "att_location"]

    columns = list(attributes_columns)
    if not columns:
        return []

    # Every column but the last one belongs to an attribute group.
    positional = columns[:-1]

    new_columns = []
    group = 0

    for position, column_name in enumerate(positional):
        if group >= len(names):
            raise ValueError("more column groups than known attribute names")

        new_columns.append(names[group] + "_" + str(column_name))

        # The numbering restarting (or not increasing) marks the next group.
        # The last positional column has no successor to compare with.
        has_next = position + 1 < len(positional)
        if has_next and column_name >= positional[position + 1]:
            group += 1

    new_columns.append("price")

    return new_columns

In [19]:
def process_attributes(attributes):
    """
    Return attributes with a fixed set and order of columns.

    Columns of the fixed structure that are missing are added filled with
    NaN, and columns outside the structure are dropped, so every attributes
    dataframe ends up with the same layout. The input dataframe is left
    untouched.
    """

    structure = (["att_main_{}".format(i) for i in range(16)]
                 + ["att_build_{}".format(i) for i in range(4)]
                 + ["att_equipment_{}".format(i) for i in range(5)]
                 + ["att_location_{}".format(i) for i in range(9)]
                 + ["price"])

    # Work on a copy so the caller's dataframe does not gain columns.
    result = attributes.copy()

    for column_name in structure:
        if column_name not in result.columns:
            result[column_name] = np.nan

    return result[structure]

In [20]:
def write_new_attributes(city):
    """
    Scrape the attributes of the new flats of city and append them to the stored ones.

    The new attributes are written to data/raw/<city>/attributes_<today>.csv
    and the concatenation of old and new attributes overwrites
    data/clean/<city>/<city>_attributes.csv.
    """

    # get new flats
    attributes_new = get_attributes(get_new_links(city))

    # parse column names
    attributes_new.columns = parse_columns_names(attributes_new.columns)

    # process attributes in the proper structure
    print("New flats processing...")
    attributes_new = process_attributes(attributes_new)
    print(attributes_new.head())

    # Writing new flats in the raw folder
    # NOTE(review): this file uses the default separator while the clean file
    # uses "^" - confirm that is intended.
    date = str(datetime.datetime.now())[:10]
    attributes_new.to_csv("data/raw/{}/attributes_{}.csv".format(city, date))

    # read old flats; the flat id was saved as the unnamed index column
    print("Reading old flats")
    attributes_old = pd.read_csv("data/clean/{}/{}_attributes.csv".format(city, city), sep="^").set_index("Unnamed: 0")
    print(attributes_old.head())

    # concat new flats to the old ones
    attributes = pd.concat([attributes_old, attributes_new], axis=0)
    # show the concatenated frame (the new flats were already printed above)
    print(attributes.head())

    # write attributes
    print("Updating new flats")
    attributes.to_csv("data/clean/{}/{}_attributes.csv".format(city, city), sep="^")
    print("Flats updated")