# Scraping Data 
#### renthop.com
https://www.renthop.com/nyc/apartments-for-rent

It is first setup of scraper that I will use for this website.

In [1]:
import numpy as np 
import pandas as pd 
import requests 
import matplotlib.pyplot as plt 
%matplotlib inline 

A quick retrieve of web page to verify that the link works

In [2]:
url = "https://www.renthop.com/nyc/apartments-for-rent"

In [3]:
req = requests.get(url) 
# req.content # Displays Web's Content

Installing libraries

In [4]:
# !pip install beautifulsoup4

In [5]:
# !pip install html5lib

Parsing HTML

In [6]:
import html5lib
from bs4 import BeautifulSoup 

soup = BeautifulSoup(req.content, "html5lib")
# soup # Display's Parsed HTML 

Retrieving relevant HTML tags Which Contains listing information

In [7]:
listing_divs = soup.select('div[class*=search-info]') # 'search-info' is related class for those divs
# listing_divs # Display's All retrieved divs 

In [8]:
len(listing_divs) # Total Number of divs

20

In [9]:
# listing_divs[0] # Looking at individual div

### Retrieving Individual Elements

1. Retrieving URL of the listing

In [10]:
# BeautifulSoup.select? # Perform a CSS selection operation on the current element

In [11]:
listing_divs[0].select('a[id*=title]') # Select all the id's with string 'title' within 'a' tag

[<a class="font-size-11 listing-title-link b" href="https://www.renthop.com/listings/43-10-crescent-street/1209/14481520" id="listing-14481520-title">43-10 Crescent Street, Apt 120...</a>]

In [12]:
# We are only interseted in only the href one
listing_link = listing_divs[0].select('a[id*=title]')[0]['href']
listing_link

'https://www.renthop.com/listings/43-10-crescent-street/1209/14481520'

2. Parsing Address 

In [13]:
address = listing_divs[0].select('a[id*=title]')[0].string  # The string content with in 'a' tag
address

'43-10 Crescent Street, Apt 120...'

3. Parsing neighborhood

In [14]:
listing_divs[0].select('div[id*=hood]') # Select all the id's with string 'hood' within 'a' tag

[<div class="font-size-9 overflow-ellipsis" id="listing-14481520-neighborhoods" style="margin-top: -1px;">
 Long Island City, Northwestern Queens, Queens
 </div>]

In [15]:
listing_divs[0].select('div[id*=hood]')[0].string

'\nLong Island City, Northwestern Queens, Queens\n'

In [16]:
neighborhood = listing_divs[0].select('div[id*=hood]')[0].string.replace('\n','')  # Removing '\n'
neighborhood

'Long Island City, Northwestern Queens, Queens'

In [17]:
print(listing_link) 
print(address) 
print(neighborhood)

https://www.renthop.com/listings/43-10-crescent-street/1209/14481520
43-10 Crescent Street, Apt 120...
Long Island City, Northwestern Queens, Queens


So Far Good.

4. Parsing rental, Number of Beds and Baths 

Table have 'info' string in 'id' tag. <br>
Table contains a 'tr' containing 3 'td's with rental, beds and baths data stored as a string within each 'td'

In [18]:
listing_divs[0].select('table[id*=info]') # Parsing table with 'id' containing 'info' string

[<table id="listing-14481520-info">
 <tbody><tr>
 <td class="font-size-11 b" id="listing-14481520-price" style="padding: 0px 10px 0px 0px; vertical-align: bottom;">
 $2,518
 </td>
 <td class="font-size-11 b" style="border-left: 1px solid #eeeeee; padding: 0px 10px 0px 10px; vertical-align: bottom;">
 <span style="color: #444444;">
 Studio
 </span>
 </td>
 <td class="font-size-11 b" style="border-left: 1px solid #eeeeee; padding: 0px 10px 0px 10px; vertical-align: bottom;">
 <span style="color: #444444;">1 Bath</span>
 </td>
 </tr>
 </tbody></table>]

In [19]:
listing_divs[0].select('table[id*=info] tr') # Parsing table with 'id' containing 'info; string only'tr'

[<tr>
 <td class="font-size-11 b" id="listing-14481520-price" style="padding: 0px 10px 0px 0px; vertical-align: bottom;">
 $2,518
 </td>
 <td class="font-size-11 b" style="border-left: 1px solid #eeeeee; padding: 0px 10px 0px 10px; vertical-align: bottom;">
 <span style="color: #444444;">
 Studio
 </span>
 </td>
 <td class="font-size-11 b" style="border-left: 1px solid #eeeeee; padding: 0px 10px 0px 10px; vertical-align: bottom;">
 <span style="color: #444444;">1 Bath</span>
 </td>
 </tr>]

In [20]:
# looping all 'td's 
# And Fetching each individual spec data

listing_specs = listing_divs[0].select('table[id*=info] tr') 
for spec in listing_specs: 
    spec_data = spec.text.strip().replace(' ', '_').split() 
    print(spec_data) 

['$2,518', 'Studio', '1_Bath']


### Putting All things together 

In [21]:
listing_list = [] 
for idx in range(len(listing_divs)): 
    indv_listing = [] 
    current_listing = listing_divs[idx] 
    link = current_listing.select('a[id*=title]')[0]['href'] 
    address = current_listing.select('a[id*=title]')[0].string 
    neighborhood = current_listing.select('div[id*=hood]')[0].string.replace('\n','') 

    indv_listing.append(link) 
    indv_listing.append(address) 
    indv_listing.append(neighborhood) 

    listing_specs = current_listing.select('table[id*=info] tr') 
    for spec in listing_specs: 
        try: 
            indv_listing.extend(spec.text.strip().replace(' ', '_').split()) 
        except: 
            indv_listing.extend(np.Unknownn) 
    listing_list.append(indv_listing)

In [22]:
# listing_list

In [23]:
# column_Unknownmes = ['link','address','neighborhood','price','beds','baths','Unknown']
# pd.DataFrame(listing_list, columns=column_Unknownmes).head() # Just to view in nice format

So far, this is just one web page with 20 listings.

### Parsing Multiple Pages

https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page=2&sort=hopscore&q=&search=0
<br>This how url looks like

Looping through pages based on page number 

In [24]:
url_prefix = "https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page=" 
page_no = 1 
url_suffix = "&sort=hopscore&q=&search=0" 
  
for i in range(3): 
    target_page = url_prefix + str(page_no) + url_suffix 
    print(target_page) 
    page_no += 1 

https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page=1&sort=hopscore&q=&search=0
https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page=2&sort=hopscore&q=&search=0
https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page=3&sort=hopscore&q=&search=0


In [25]:
def parse_data(listing_divs): 
    listing_list = [] 
    for idx in range(len(listing_divs)): 
        indv_listing = [] 
        current_listing = listing_divs[idx] 
        href = current_listing.select('a[id*=title]')[0]['href'] 
        addy = current_listing.select('a[id*=title]')[0].string 
        hood = current_listing.select('div[id*=hood]')[0].string.replace('\n','') 
 
        indv_listing.append(href) 
        indv_listing.append(addy) 
        indv_listing.append(hood) 
 
        listing_specs = current_listing.select('table[id*=info] tr') 
        for spec in listing_specs: 
            try: 
                values = spec.text.strip().replace(' ', '_').split() 
                clean_values = [x for x in values if x != '_'] # Not getting  '_' these values 
                indv_listing.extend(clean_values) 
            except: 
                indv_listing.extend(np.Unknownn) 
        listing_list.append(indv_listing) 
    return listing_list 

In [26]:
from IPython.display import clear_output


all_pages_parsed = []

for i in range(100):
    
    target_page = url_prefix + str(page_no) + url_suffix
    
    # Cleart the ouput and then print new one
    print(target_page)
    clear_output(wait=True)

    r = requests.get(target_page)
    
    soup = BeautifulSoup(r.content, 'html5lib')
    
    listing_divs = soup.select('div[class*=search-info]')
    
    one_page_parsed = parse_data(listing_divs)
    
    all_pages_parsed.extend(one_page_parsed)
    
    page_no += 1
    
print("Completed")

Completed


In [27]:
# Each page has 20 lisitings and we parsed 20 pages
100 * 20 # Total Listings

2000

In [29]:
listings  = pd.DataFrame(all_pages_parsed, columns=['url', 'address', 'neighborhood', 'rent', 'beds', 'baths', "Unknown"],) 
listings.replace('None', np.nan, inplace=True)
listings.head(3)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,Unknown
0,https://www.renthop.com/listings/gold-st/12c/1...,Gold St.,"Downtown Brooklyn, Northwestern Brooklyn, Broo...","$3,395",1_Bed,1_Bath,
1,https://www.renthop.com/listings/gold-st/9d/15...,Gold St.,"Downtown Brooklyn, Northwestern Brooklyn, Broo...","$3,071",1_Bed,1_Bath,
2,https://www.renthop.com/listings/141-east-56th...,"141 East 56th Street, Apt 11A","Midtown East, Midtown Manhattan, Manhattan","$4,925",2_Bed,1_Bath,


## Inspecting Data

We have few flex rooms values,  that why some of bath is skewed to next column

In [37]:
listings[listings['Unknown'].notnull()].head(2)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,Unknown
3,https://www.renthop.com/listings/west-street/2...,west street,"Financial District, Downtown Manhattan, Manhattan","$3,750",1_Bed,/_Flex_3_,1_Bath
13,https://www.renthop.com/listings/e-39th-st/232...,E 39th St,"Murray Hill, Midtown Manhattan, Manhattan","$3,350",1_Bed,/_Flex_2_,1_Bath


- Fixing Flex Rooms and Baths issue

In [38]:
listings["Unknown"], listings["baths"] = np.where(listings["Unknown"].notnull() , 
                                                  [listings["baths"], listings["Unknown"]], 
                                                  [listings["Unknown"], listings["baths"]])

In [53]:
# Renaming 
new_columns = listings.columns.values
new_columns[6] = 'flexs'
listings.columns = new_columns

# Checking Results
listings[listings['flexs'].notnull()].head(3)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
3,https://www.renthop.com/listings/west-street/2...,west street,"Financial District, Downtown Manhattan, Manhattan","$3,750",1_Bed,1_Bath,/_Flex_3_
13,https://www.renthop.com/listings/e-39th-st/232...,E 39th St,"Murray Hill, Midtown Manhattan, Manhattan","$3,350",1_Bed,1_Bath,/_Flex_2_
14,https://www.renthop.com/listings/west-43rd-str...,West 43rd Street,"Hell's Kitchen, Midtown Manhattan, Manhattan","$3,000",Studio,1_Bath,/_Flex_1_


Saving data as csv

In [74]:
import os

dir = 'Data'
if(os.path.isdir(dir) is False):
    os.makedirs("Data")
    
listings.to_csv("Data\listings.csv", index = False)