In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import os
import re

In [3]:
# Names of the regular files (sub-directories excluded) found in htmls/.
# Order is whatever os.listdir returns, so positional indexing into this list
# depends on the directory listing order.
html_list = [
    file_name
    for file_name in os.listdir('htmls/')
    if os.path.isfile(os.path.join('htmls', file_name))
]

In [30]:
# Read the saved listing page at index 10 of html_list. The context manager
# closes the file handle as soon as the contents are read, instead of leaving
# it open until the interpreter happens to collect it.
with open('htmls/' + html_list[10], 'r') as html_file:
    html_string = html_file.read()

In [31]:
# Parse the raw HTML with the lxml backend; every later cell queries `soup`.
soup = bs(markup=html_string, features='lxml')

In [32]:
# CSS selector for <meta> elements that are direct children of <head>.
# NOTE(review): no cell visible in this section uses `selector` - confirm it is still needed.
selector = 'head > meta'

Get meta title of webpage

In [33]:
# Page title: the content attribute of the first <meta name="title"> tag.
soup.find_all('meta', {'name': 'title'})[0].attrs['content']

'For Rent: 535 W 23rd St. in West Chelsea'

Get description

In [34]:
# Every Open Graph description tag; the description text is in each tag's
# content attribute.
soup.find_all('meta', {'property': 'og:description'})

[<meta content="535 W 23rd St #S6M is a rental unit in West Chelsea, Manhattan priced at $4,895." property="og:description"/>]

Get URL

In [35]:
# Every Open Graph URL tag; the listing URL is in each tag's content attribute.
soup.find_all('meta', {'property': 'og:url'})

[<meta content="https://streeteasy.com/building/the-tate/s06m" property="og:url"/>]

Get geographic metadata

In [36]:
# ICBM meta tags; content is a "latitude, longitude" pair.
soup.find_all('meta', {'name': 'ICBM'})

[<meta content="40.74850082, -74.00530243" name="ICBM"/>]

In [37]:
# geo.region meta tags; content is a region code such as "US-NY".
soup.find_all('meta', {'name': 'geo.region'})

[<meta content="US-NY" name="geo.region"/>]

Get Main Listing Details

In [38]:
# Main hyperlink: href of the first anchor matched under div.SiteBlock-menu.
soup.select('div.SiteBlock-menu > div > div > div > span > a')[0]['href']

'https://streeteasy.com/building/the-tate'

In [39]:
# Raw, unstripped text of the first summary block under div.SiteBlock-menu
# (address, price, rooms, beds and baths separated by newlines).
soup.select('div.SiteBlock-menu > div > div > div')[0].get_text()

'\n\n535 W 23rd Street #S6M\n\n\n  $4,895\n  \n    3 rooms\n  \n\n    1 bed\n  \n\n    1 bath\n  \n'

In [40]:
# Price: the first run of digits, dots and commas in the price block's text.
re.search(
    '[0-9.,]+',
    soup.select('div.details > div.details_info_price > div.price')[0].get_text(),
).group(0)

'4,895'

In [41]:
# Promotional pricing? Innermost blocks nested under div.concessions.
soup.select(
    'div.details > div.details_info_price > div.concessions'
    ' > div > div > div > div'
)

[]

In [42]:
# Summary spans under div.details_info: room/bed/bath counts plus the
# neighborhood link.
soup.select('div.details_info > span')

[<span class="detail_cell first_detail_cell">3 rooms</span>,
 <span class="detail_cell">1 bed</span>,
 <span class="detail_cell last_detail_cell">1 bath</span>,
 <span class="nobreak"></span>,
 <span class="nobreak">in <a href="https://streeteasy.com/for-rent/west-chelsea">West Chelsea</a></span>]

Get amenities in two parts: Highlights and Building Amenities

In [43]:
# Highlights: innermost <div> of each amenities list item. This broad selector
# also matches an ad container, not only the div.Text labels.
soup.select(
    'section.DetailsPage-contentBlock > div.AmenitiesBlock'
    ' > ul > li > div > div'
)

[<div class="Text">
                   Doorman
                 </div>,
 <div class="Text">
                   Elevator
                 </div>,
 <div class="Text">
                   Pets Allowed
                 </div>,
 <div id="google_ads_iframe_/7449/Streeteasy/property_details/rent_general/rtt_main_p1_0__container__" style="border: 0pt currentColor; border-image: none; width: 100%; height: auto; display: inline-block;"><iframe data-google-container-id="1" data-is-safeframe="true" data-load-complete="true" frameborder="0" height="0" id="google_ads_iframe_/7449/Streeteasy/property_details/rent_general/rtt_main_p1_0" loading="eager" marginheight="0" marginwidth="0" scrolling="no" src="https://tpc.googlesyndication.com/safeframe/1-0-37/html/container.html" style="border: 0px currentColor; border-image: none; vertical-align: bottom; min-width: 100%;" title="3rd party ad content" width="0"></iframe></div>]

In [44]:
# Building amenities: every li.AmenitiesBlock-item in the amenities lists.
soup.select(
    'section.DetailsPage-contentBlock > div.AmenitiesBlock'
    ' > ul > li.AmenitiesBlock-item'
)

[<li class="AmenitiesBlock-item">
                   Bike Room
               </li>,
 <li class="AmenitiesBlock-item">
                   Community Recreation Facilities
               </li>,
 <li class="AmenitiesBlock-item">
                   Concierge
               </li>,
 <li class="AmenitiesBlock-item">
                   Gym
               </li>,
 <li class="AmenitiesBlock-item">
                   Laundry in Building
               </li>,
 <li class="AmenitiesBlock-item">
                   Live-in Super
               </li>,
 <li class="AmenitiesBlock-item">
                   Smoke-free
               </li>,
 <li class="AmenitiesBlock-item">
 </li>,
 <li class="AmenitiesBlock-item">
 <div data-google-query-id="CM2QrpXKkugCFZMnhwod7eIPOw" id="rtt_main_p1" style="-ms-zoom: 1; opacity: 1;">
 <div id="google_ads_iframe_/7449/Streeteasy/property_details/rent_general/rtt_main_p1_0__container__" style="border: 0pt currentColor; border-image: none; width: 100%; height: auto; display:

In [45]:
# Listing address and unit, taken from the first <h1> link in the right column.
soup.select('article.right-two-fifths > section > h1 > a')[0].get_text()

'535 W 23rd Street #S6M'

In [46]:
# Re-inspect the div.details_info spans (same selector as an earlier cell)
# before they are parsed into a dict.
soup.select('div.details_info > span')

[<span class="detail_cell first_detail_cell">3 rooms</span>,
 <span class="detail_cell">1 bed</span>,
 <span class="detail_cell last_detail_cell">1 bath</span>,
 <span class="nobreak"></span>,
 <span class="nobreak">in <a href="https://streeteasy.com/for-rent/west-chelsea">West Chelsea</a></span>]

In [47]:
# Parse the div.details_info spans into a dict of listing details:
#   'rooms', 'beds', 'bath' -> the span text (e.g. '3 rooms')
#   'Neighborhood'          -> href of the neighborhood link
details = {}
for tag in soup.select('div.details_info > span'):
    # The neighborhood span reads "in <a ...>West Chelsea</a>", so its text is
    # 'in West Chelsea' and never exactly 'in'. Detect it by its leading word
    # and its link instead of comparing the whole text.
    neighborhood_link = tag.find('a')
    if neighborhood_link is not None and tag.text.split()[:1] == ['in']:
        details['Neighborhood'] = neighborhood_link.attrs['href']
        # Skip the keyword checks so a neighborhood name cannot overwrite them.
        continue
    if 'room' in tag.text:
        details['rooms'] = tag.text
    if 'bed' in tag.text:
        details['beds'] = tag.text
    if 'bath' in tag.text:
        details['bath'] = tag.text

In [48]:
# Highlight labels as clean strings; restricting the match to div.Text keeps
# the ad container out of the result.
[
    node.get_text().strip()
    for node in soup.select(
        'section.DetailsPage-contentBlock > div.AmenitiesBlock'
        ' > ul > li > div > div.Text'
    )
]

['Doorman', 'Elevator', 'Pets Allowed']

In [49]:
# Building amenity labels as stripped strings. Items that hold no label text
# (e.g. the ad slot) come through as empty strings.
[
    node.get_text().strip()
    for node in soup.select(
        'section.DetailsPage-contentBlock > div.AmenitiesBlock'
        ' > ul > li.AmenitiesBlock-item'
    )
]

['Bike Room',
 'Community Recreation Facilities',
 'Concierge',
 'Gym',
 'Laundry in Building',
 'Live-in Super',
 'Smoke-free',
 '',
 '',
 'Garden',
 'Patio',
 'Roof Deck']

In [50]:
# Neighborhood name: text of the first link inside a span.nobreak detail span.
soup.select('div.details_info > span.nobreak > a')[0].get_text()

'West Chelsea'

In [51]:
# Stripped text of the first nested block in the left column (the availability
# status for this listing).
soup.select('article.left-three-fifths > section > div > div > div')[0].get_text().strip()

'Available Now'

In [52]:
# All div.Vitals-data blocks in the left column (availability, days listed,
# last price change).
soup.select(
    'article.left-three-fifths > section > div > div'
    ' > div.Vitals-data'
)

[<div class="Vitals-data">
           Available Now
       </div>,
 <div class="Vitals-data">
           72 Days
         </div>,
 <div class="Vitals-data u-capitalize" title="03/09/20">
         ?
         $70
         (1.4%)
         2 days ago
     </div>]

In [65]:
# One <li> per nearby transportation entry; parsed into records in the next cell.
tags = soup.select(
    'div.full > section > div.Nearby > div.Nearby-half'
    ' > div.Nearby-transportation > ul > li.Nearby-transportationItem'
)

In [75]:
# Build one record per nearby transportation item with three keys:
#   'lines'    -> list of single-character line identifiers
#   'distance' -> text of the first <b> inside a <span>
#   'station'  -> first text line of the item's span.Text
subways_list = []

for tag in tags:
    lines_dict = {}
    # Line identifiers are the rows of the item's text that are exactly one
    # character long.
    lines_dict['lines'] = [letter for letter in tag.text.strip().split('\n') if len(letter) == 1]
    lines_dict['distance'] = tag.select('span > b')[0].text
    # Keep only the first line so 'station' is a string (e.g. 'at 23rd St  ')
    # rather than a list holding every line of the span.
    lines_dict['station'] = tag.select('span.Text')[0].text.strip().split('\n')[0]
    subways_list.append(lines_dict)

In [76]:
# Display the parsed subway records.
subways_list

[{'lines': ['C', 'E'], 'distance': '0.43 miles', 'station': 'at 23rd St  '},
 {'lines': ['A', 'C', 'E', 'L'],
  'distance': '0.55 miles',
  'station': 'at 14th St-8th Av  '},
 {'lines': ['1'], 'distance': '0.6 miles', 'station': 'at 28th St  '},
 {'lines': ['1'], 'distance': '0.6 miles', 'station': 'at 23rd St  '},
 {'lines': ['7'],
  'distance': '0.62 miles',
  'station': 'at 34th St - Hudson Yards  '}]