## Scrapping urbania

The goal of this notebook is to scrape data from https://urbania.pe/ and store it on a mongo databse

### Libraries import

In [4]:
import math
import numpy as np
import pymongo
import re 
import urllib.request as urllib2
import datetime

from time import sleep
from bs4 import BeautifulSoup

from constants import *

import pandas as pd

In [5]:
# bsae urls to use
base_url = 'https://urbania.pe{}'
apartments_url = 'https://urbania.pe/buscar/venta-de-departamentos'


### Define database connections

In [6]:
uri = "mongodb://{}:{}@{}/{}".format(username, password, hostlist, database)
client = pymongo.MongoClient(uri)
db = client['urbania']

In [8]:
list(db['admin'].find())

[{'_id': ObjectId('5deb3aba3dadd940d2148a36'), 'name': 'gonzalo'},
 {'_id': ObjectId('5dec891299df02824e747081'), 'name': 'jay'}]

In [9]:
apartmentsdb = db['apartments']

### Here we put the logic to get the data

In [10]:
def getSoupObj(url):
    sleep(1.5)
    try:
        r = urllib2.urlopen(url)
    except UnicodeEncodeError as e:
        return None
    html = r.read()
    htmltext = html.decode('ISO-8859-2')
    return BeautifulSoup(htmltext, 'lxml')

In [11]:
dictionary = {
    'Tipos Dptos.': 'apartment_type',
    'Dpto.': 'department',
    'Provincia': 'province',
    'Distrito': 'district',
    'Urb.': 'urbanization',
    'Torres': 'towers',
    'Inmuebles': 'properties',
    'Ascensores': 'elevators',
    'NÂ°pisos': 'n_floors',
    'Cocheras': 'garages',
    'Referencia': 'reference',
    'Construye': 'builds',
    'Comercializa': 'sells',
    'Financia': 'finances'
}

In [45]:
def get_apartment(apartment):
    
    apartment_url = base_url.format(apartment['url'])
    apartment['url'] = apartment_url
    req = urllib2.Request(apartment_url)
    apartmentObj = getSoupObj(req)
    
    if apartmentObj == -1:
        return
    
#     proyect info

    get_text(apartment, apartmentObj, 'title', 'info-title', 'h2')
    get_text(apartment, apartmentObj, 'address', 'info-location', 'h2')
    get_text(apartment, apartmentObj, 'description', 'section-description', 'div')

    status_info = apartmentObj.find('div', {'class': 'status done current'})
    status_info = status_info.find('span', {'class': 'label'})
    
    apartment['status'] = status_info.text
    
    get_status(apartment, apartmentObj)

    price_info = apartmentObj.find('div', {'class': 'status-columns status-columns-price'})
    get_rows_info(price_info, apartment, 'data-price')
    get_text(apartment, apartmentObj, 'publisher', 'publisher-subtitle', 'h3')
#   detailed info (units)
    print('detailed info(units)')
    get_details(apartment, apartmentObj)
    


In [14]:
#get the status of the project(apartment) in general information like the area, the phase, number of rooms, etc
def get_status(obj, objBs):

    status_columns = objBs.find('div', {'class': 'status-columns'})
    columns=status_columns.findAll('div', {'class': 'column'})

    pd.Series(columns).apply(get_columns_info, args= (obj,))

In [15]:
#get columns info from the status table
def get_columns_info(column, obj):

    rows=column.findAll('div', {'class': 'row'})
    pd.Series(rows).apply(get_rows_info, args=(obj, 'data'))

In [16]:
#get rows info from columns table
def get_rows_info(row, obj, data_class):

    label = row.find('span', {'class': 'label'})
    data = row.find('span', {'class': data_class})
    obj[label.text] = data.text

In [19]:
#general function to get the text from html class
def get_text(obj, objBs, feature, tag, marker, inplace=True):
    new_feature = objBs.find(marker, {'class': tag})
#     print(new_feature)
    if inplace:
        obj[feature] = new_feature.text
    else:
        return new_feature.text

In [54]:
# get the detailed information from the project like the services, ammenities, etc
def get_details(obj, ObjBs):
    
    sections = ObjBs.findAll('section', {'class': 'general-section article-section'})
    for section in sections:
        title = section.find('div', {'class': 'section-title'})
        print(title.text)
        try:
            bullets = section.find('ul', {'class': 'section-bullets'})
            print(bullets.text)
            new_section = {}
            
        except:
            print('this one has units')
    

In [21]:
#because of the dirty data we need to remove some characters that make noise to the data
def remove_blanks(current_list, pattern):
    final_list = [item for item in current_list if elem != pattern]
    return final_list

Features we are going to collect 
- 'Dormitorios',
- 'Entrega',
- 'Estacionamientos',
- 'Metros techados',
- 'Metros totales',
- 'Precio desde', 
- 'Unidades',
- '_id', 
- 'address', 
- 'description', 
- 'status', 
- 'title', 
- 'url'


In [55]:
tihs = 10000
count = 1
apartments = []
fails = 0
threshold = 100000000
counter = 0
# count is for page counter
while count < 5:
    hdr = {'User-Agent': 'Mozilla/5.0'}
    print(apartments_url)
    req = urllib2.Request(apartments_url, headers=hdr)
    apartmentsBsObj = getSoupObj(req)
    print(type(apartmentsBsObj))
#     objs = apartmentsBsObj.findAll('article', {'class': 'b-card-item js-card-item b-card-item--proyecto-destacado'})
     
    objs = apartmentsBsObj.findAll('div', {'class': 'posting-card highlighted'})
#     objsSeen = apartmentsBsObj
    print('In page {} we found {} apartments'.format(count, len(objs)))
    for obj in objs:
#         the objA is the object with <a> label in obj
        counter +=1
        apartment = {}
        apartment['url'] = obj['data-to-posting']
        apartment['_id'] = int(apartment['url'].split('-')[-1])
#         if apartment['_id'] < threshold:
        try:
            get_apartment(apartment)
        except:

            threshold = apartment['_id'] if (apartment['_id'] < threshold) else threshold
            fails+=1
            print(apartment['url'])
            print('damn so')
#         else:
#             print('xd')
#         print(apartment)
        apartments.append(apartment)
        
        if counter == 10:
            break
#         break

        
    count +=1
    break

pd.DataFrame(apartments)

https://urbania.pe/buscar/venta-de-departamentos
<class 'bs4.BeautifulSoup'>
In page 1 we found 18 apartments
detailed info(units)

Unidades

this one has units

CaracterĂ­sticas generales


Acceso discapacitados 
Cerca a colegios 
Intercomunicador 
Pisos Totales: 13
Cerca al Mar (a menos de 5 cdras) 
Antisismico 
Parques Cercanos 
Frente al mar (primera fila) 
Seguridad 
Av. acceso asfaltada 


Servicios


Juegos infantiles 
Agua potable 
Servicios bĂĄsicos (agua/luz) 


Ambientes


Desague 
Hall de ingreso 
Club House 
SalĂłn de usos mĂşltiples 
Cerco  Electrico 
Parque Interno 


Exteriores


Area de BBQ 
JardĂ­n Privado 

detailed info(units)

Unidades

this one has units

CaracterĂ­sticas generales


Piscina 
Ascensor(es): 4
Pisos Totales: 17


Servicios


Juegos infantiles 


Ambientes


Sala de mĂşsica y cine 
Hall de ingreso 
Parque Interno 


Exteriores


Area de BBQ 

detailed info(units)

Unidades

this one has units

CaracterĂ­sticas generales


Acceso discapacitados 
En co

Unnamed: 0,Dormitorios,Entrega,Estacionamientos,Metros techados,Metros totales,Precio desde,Unidades,_id,address,description,publisher,status,title,url
0,\n2 a 3\n,\nMarzo 2020\n,\n - \n,\n50 a 105 mÂ˛\n,\n50 a 105 mÂ˛\n,\nConsultar precio\n,\n54\n,8003929,"Jr. Hermilio Valdizan NÂ°150 Magdalena, Magdal...",\n\nALTEA es un exclusivo proyecto situado en ...,INMOBILIARIA EUREKA SAC [Empresa],\n\t\t\t\t\tPre-venta en construcciĂłn\n\t\t\t\t,Altea Condominio,https://urbania.pe/inmueble/proyecto-altea-con...
1,\n1 a 3\n,\nDiciembre 2020\n,\n - \n,\n39 a 75 mÂ˛\n,\n39 a 97 mÂ˛\n,"\nS/ 226,700\n",\n70\n,8004141,"Avenida Costanera 2982, San Miguel, Lima",\n\nĂvida es un proyecto que contarĂĄ con 2 t...,EDIFICACIONES INMOBILIARIAS S.A.C. [Empresa],\n\t\t\t\t\tPre-venta en construcciĂłn\n\t\t\t\t,Ăvida,https://urbania.pe/inmueble/proyecto-avida-lim...
2,\n1 a 3\n,\nDiciembre 2020\n,\n - \n,\n63 a 180 mÂ˛\n,\n63 a 233 mÂ˛\n,"\nS/ 570,000\n",\n14\n,8002712,"Calle Durero 490, San Borja, Lima",\n\nUn condominio que busca recuperar el estil...,COMPASS FONDO DE INVERSION INMOBILIARIO I [Em...,\n\t\t\t\t\tPre-venta en construcciĂłn\n\t\t\t\t,"Condominio ""Hometown"" en San Borja",https://urbania.pe/inmueble/proyecto-condomini...
3,\n1 a 3\n,\nInmediata\n,\n - \n,\n48 a 82 mÂ˛\n,\n48 a 82 mÂ˛\n,"\nS/ 279,200\n",\n3\n,8003865,"Calle Fortunato Quezada 455, MARANGA, San Miguel",\n\nResidencial El Prado es un proyecto que co...,EDIFICACIONES INMOBILIARIAS S.A.C. [Empresa],\n\t\t\t\t\tVenta en estreno\n\t\t\t\t,Residencial El Prado,https://urbania.pe/inmueble/proyecto-residenci...
4,\n1 a 3\n,\nJunio 2021\n,\n - \n,\n42 a 75 mÂ˛\n,\n42 a 75 mÂ˛\n,"\nS/ 218,708\n",\n16\n,8004784,"Natalio Sanchez 287, JesĂşs MarĂ­a, Lima",\n\nNuestra misiĂłn: alinear en una misma fĂłr...,Al punto [Empresa],\n\t\t\t\t\tPre-venta en planos\n\t\t\t\t,"Suburbia, Toma JesĂşs Maria",https://urbania.pe/inmueble/proyecto-suburbia-...
5,\n1 a 3\n,\nDiciembre 2020\n,\n - \n,\n46 a 122 mÂ˛\n,\n46 a 158 mÂ˛\n,"\nS/ 336,382\n",\n51\n,8004528,"AV. GENERAL FELIPE SALAVERRY NÂ° 2146 , JesĂşs...",\n\nES UN MODERNO EDIFICIO DE MĂS ALTO NIVEL ...,San Martin [Empresa],\n\t\t\t\t\tPre-venta en construcciĂłn\n\t\t\t\t,Edificio Altovento,https://urbania.pe/inmueble/proyecto-edificio-...
6,\n1 a 2\n,\nMayo 2020\n,\n1 a 2\n,\n91 a 179 mÂ˛\n,\n91 a 248 mÂ˛\n,"\nUSD 218,368\n\t\t\t\t\t (USD 232,185)\n\t\t\t\t",\n17\n,8002847,"Calle Chiclayo 175, Miraflores, Lima",\n\nGardino se ubica en una calle tranquila de...,VALICO VALICO [Empresa],\n\t\t\t\t\tPre-venta en construcciĂłn\n\t\t\t\t,Edificio Gardino - ÂĄVisita Piloto Amoblado!,https://urbania.pe/inmueble/proyecto-edificio-...
7,\n1 a 3\n,\nDiciembre 2021\n,\n - \n,\n37 a 67 mÂ˛\n,\n37 a 67 mÂ˛\n,"\nS/ 160,000\n",\n13\n,8004797,"Calle faisanes 337, MARISCAL ANDRES AVELINO CA...",\n\nEl proyecto Faisanes se encuentra ubicado ...,Actual Inmobiliaria [Empresa],\n\t\t\t\t\tPre-venta en planos\n\t\t\t\t,Faisanes,https://urbania.pe/inmueble/proyecto-faisanes-...
8,\n1 a 2\n,\nDiciembre 2020\n,\n - \n,\n90 a 173 mÂ˛\n,\n90 a 245 mÂ˛\n,"\nUSD 213,650\n",\n9\n,8004247,"Av. Paseo de la RepĂşblica 5817 Miraflores, Mi...","\n\nRepĂşblica es un edificio moderno, ubicado...",Actual Inmobiliaria [Empresa],\n\t\t\t\t\tPre-venta en planos\n\t\t\t\t,RepĂşblica,https://urbania.pe/inmueble/proyecto-republica...
9,\n1 a 3\n,\nEnero 2021\n,\n - \n,\n53 a 88 mÂ˛\n,\n78 a 127 mÂ˛\n,"\nS/ 533,155\n",\n14\n,8004554,"Jr. Costa Rica 116 , JesĂşs MarĂ­a, Lima",\n\nBenesse es un proyecto de 21 departamentos...,EDIFICACIONES INMOBILIARIAS S.A.C. [Empresa],\n\t\t\t\t\tPre-venta en planos\n\t\t\t\t,Benesse,https://urbania.pe/inmueble/proyecto-benesse-l...


In [39]:
fails

10