In [1]:
from bs4 import BeautifulSoup

In [2]:
# common util function
import re
def prettify_text(data):
    """Given a string, replace unicode chars and make it prettier"""

    # format it nicely: replace multiple spaces with just one
    data = re.sub(' +', ' ', data)
    # format it nicely: replace multiple new lines with just one
    data = re.sub('(\r?\n *)+', '\n', data)
    # format it nicely: replace bullet with *
    data = re.sub(u'\u2022', '* ', data)
    # format it nicely: replace registered symbol with (R)
    data = re.sub(u'\xae', ' (R) ', data)
    # format it nicely: remove trailing spaces
    data = data.strip('\'')
    # format it nicely: encode it, removing special symbols
    # data = data.encode('utf8', 'ignore') # Why is everything encoded???

    return str(data)

In [142]:
import requests
test_url = "https://www.apartments.com/la-jolla-international-gardens-san-diego-ca/jfps2s6/" # From page 2

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
page = requests.get(test_url, headers=headers)

In [143]:
soup = BeautifulSoup(page.content, 'html.parser')
soupp = soup.prettify()

In [144]:
soup = soup.find('div', class_='sectionContainer')

In [148]:
def extract_availability(target):
    """Return the processed text of the listings"""
    content = []

    content.append(target.find('span', {'class': 'modelName'}).text)
    content.append(target.find('span', {'class': "rentLabel"}).text)
    content.append(target.find('span', {'class': "detailsTextWrapper"}).text)
    content.append(target.find('span', {'class': "detailsTextWrapper leaseDepositLabel"}).text)

    return str(content)

def get_availability(soup, fields):
    """Get all the listings from availability section"""
    fields["availability"] = []

    if soup is None: return

    temp_obj = soup.find('section', {'id': 'availabilitySection'}).find('div', {'id': 'pricingView'})
    obj = temp_obj.find('div', {'class': "tab-section active"})



    if obj is not None:
        for listing in soup.findAll('div', {'class': "row"}):
            target = listing.find('div', {'class': 'column1'})
            fields['availability'].append(target.get_text(strip='\n', separator=', '))


In [149]:
field = {}
get_availability(soup, field)

In [150]:
field

{'availability': ['Florentine, $2,395 – $2,545, 1 bed, ,, 1 bath, ,, 652 sq ft, 6 – 12 Month Lease, ,, Not Available, Tour This Floor Plan, View Florentine, Photos, View Florentine, Floor Plan, View Florentine, Virtual Tours',
  'Babilonia, $3,065 – $3,205, 2 beds, ,, 2 baths, ,, 1,004 sq ft, 6 – 12 Month Lease, ,, Not Available, Tour This Floor Plan, View Babilonia, Photos, View Babilonia, Floor Plan, View Babilonia, Virtual Tours',
  'Versailles, $3,260 – $3,455, 2 beds, ,, 2 baths, ,, 1,215 sq ft, 6 – 12 Month Lease, ,, Not Available, Tour This Floor Plan, View Versailles, Photos, View Versailles, Floor Plan, View Versailles, Virtual Tours',
  'Florentine, $2,395 – $2,545, 1 bed, ,, 1 bath, ,, 652 sq ft, 6 – 12 Month Lease, ,, Not Available, Tour This Floor Plan, View Florentine, Photos, View Florentine, Floor Plan, View Florentine, Virtual Tours',
  'Babilonia, $3,065 – $3,205, 2 beds, ,, 2 baths, ,, 1,004 sq ft, 6 – 12 Month Lease, ,, Not Available, Tour This Floor Plan, View Babi

In [60]:
def get_pet_policy(soup, fields):
    """Given a beautifulSoup parsed page, extract the pet policy details"""
    if soup is None:
        fields['petPolicy'] = ''
        return

    # the pet policy
    data = soup.find('section', {'id': 'feesSection'}).find('div', {'class': 'feespolicies'})
    
    
    if data is None:
        data = ''
    else:
        data = data.get_text(strip=True, separator=', ')
        data = prettify_text(data)

    # format it nicely: remove the trailing whitespace
    fields['petPolicy'] = data

In [61]:
fields = {}

get_pet_policy(soup, fields)

In [62]:
fields

{'petPolicy': 'Cats Allowed, Restrictions:, $250 deposit for one cat. $400 for two cats. Maximum of two cats., Pet Limit, 2, Pet interview, Not required, Spayed/Neutered, Required, Declawed, Not required, Pet deposit, $250'}

In [50]:
soup.find('section', {'id': 'feesSection'})\
    .find('div', {'class': 'feespolicies'}).text

'\n\n\n\n\nCats Allowed\n\n\n\n\nRestrictions: $250 deposit for one cat. $400 for two cats. Maximum of two cats.\r\n                                    \n\n\n\nPet Limit\n2\n\n\n\n\nPet interview\nNot required\n\n\n\n\nSpayed/Neutered\nRequired\n\n\n\n\nDeclawed\nNot required\n\n\n\n\nPet deposit\n$250\n\n\n\n\n\n'

In [168]:
def get_parking_info(soup, fields):
    """Given a beautifulSoup parsed page, extract the parking details"""

    fields['parking'] = ''

    if soup is None: return

    obj = soup.find('div', {"id": "profileV2FeesWrapper"}).findAll('div', {"class": "feespolicies"})

    if obj is not None:
        for i in obj:
            if i.find('h4', {'class': "header-column"}).text == "Parking":
                data = prettify_text(i.parent.parent.findNext('div', {'class':'component-body'}).get_text(strip='\n', separator=', '))
        # format it nicely: remove trailing spaces
        if data is None:
            fields['parking'] = ""
        else:
            fields['parking'] = data


In [169]:
fields = {}
get_parking_info(soup, fields)

In [170]:
fields

{'parking': 'Restrictions:, $250 deposit for 1 cat. $400 deposit for 2 cats.No monthly Pet rent., $250 deposit for 1 cat. $400 deposit for 2 cats., Pet Limit, 2, Pet interview, Not required, Spayed/Neutered, Not required, Declawed, Not required, Pet deposit, $250'}

In [84]:
soup.find('div', {"id": "profileV2FeesWrapper"}).findAll('div', {"class": "feespolicies"})[1].get_text(strip='\n', separator=', ')


'Parking, Garage, Assigned Parking, 1 spot for 1 Bedrooms &\n2 spots for 2 Bedroom.\nGuest parking on property.'

In [132]:
def get_transport_education(soup, fields):
    """Given a beautifulSoup parsed page, extract the transportation and education details"""

    fields['transportation'] = ''
    fields['education'] = ''

    if soup is None: return

    education = soup.find('div', {"id": "educationContainer"}).find('div', {"class": "spec"}).get_text(strip='\n', separator=', ')

    transport = soup.find('section', {"id": "transportationSection"}).find('div', {"class": "spec"}).get_text(strip='\n', separator=', ')

    if education is not None:
        ed_data = prettify_text(education)

    if transport is not None:
        tran_data = prettify_text(transport)

        # format it nicely: remove trailing spaces
        fields['transportation'] = tran_data
        fields['education'] = ed_data

In [133]:
fields = {}
get_transport_education(soup, fields)

Colleges, Walk, Distance, Colleges, Walk, Distance, University of California, San Diego, 31 min, 1.6 mi, University of California, San Diego, 47 min, 2.4 mi, San Diego Mesa College, 117 min, 6.0 mi


In [134]:
fields

{'transportation': 'Transit / Subway, Walk, Distance, Transit / Subway, Walk, Distance, Morena/Linda Vista, 176 min, 9.1 mi, Fashion Valley Transit Center, 179 min, 9.2 mi, Hazard Center, 181 min, 9.3 mi, Old Town Transit Center, 188 min, 9.7 mi, Mission Valley Center, 191 min, 9.9 mi, Commuter Rail, Walk, Distance, Commuter Rail, Walk, Distance, Sorrento Valley, 66 min, 3.4 mi, Old Town Transit Center, 190 min, 9.8 mi, Solana Beach, 214 min, 11.1 mi, San Diego (Santa Fe Depot), 254 min, 13.1 mi, Encinitas, 291 min, 15.0 mi, Airports, Walk, Distance, Airports, Walk, Distance, San Diego International, 259 min, 13.4 mi, McClellan-Palomar, 446 min, 23.0 mi',
 'education': 'Colleges, Walk, Distance, Colleges, Walk, Distance, University of California, San Diego, 31 min, 1.6 mi, University of California, San Diego, 47 min, 2.4 mi, San Diego Mesa College, 117 min, 6.0 mi'}