# Scraping Data 
#### renthop.com
https://www.renthop.com/nyc/apartments-for-rent


Scraped more pages and cleaned up the code so that it only does the scraping.

In [15]:
import numpy as np 
import pandas as pd 
import requests 
import matplotlib.pyplot as plt 
from IPython.display import clear_output
import html5lib
from bs4 import BeautifulSoup 


%matplotlib inline 

https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page=2&sort=hopscore&q=&search=0

In [16]:
def parse_data(listing_divs):
    """Turn search-result listing elements into flat rows of field values.

    Parameters
    ----------
    listing_divs : iterable
        Listing elements (e.g. the result of ``soup.select(...)``). Each one
        must support ``select(css_selector)`` and contain at least one match
        for ``a[id*=title]`` and for ``div[id*=hood]``.

    Returns
    -------
    list of list
        One inner list per listing, laid out as
        ``[href, address, neighborhood, *spec_values]`` where the spec values
        come from the rows of the listing's ``table[id*=info]`` in page order.
        Rows can therefore differ in length between listings.

    Raises
    ------
    IndexError
        If a listing has no element matching the title or neighborhood selector.
    KeyError
        If the title link has no ``href`` attribute.
    """
    listing_list = []
    for current_listing in listing_divs:
        # The title link carries both the URL and the address, so look it up once.
        title_link = current_listing.select('a[id*=title]')[0]
        hood_div = current_listing.select('div[id*=hood]')[0]

        href = title_link['href']

        # `.string` is None when the tag does not wrap exactly one string.
        # Converting to a plain str also keeps the row from holding a reference
        # to the page's whole parse tree through a NavigableString.
        addy = title_link.string
        if addy is not None:
            addy = str(addy)

        hood = hood_div.string
        if hood is not None:
            hood = str(hood).replace('\n', '')

        indv_listing = [href, addy, hood]

        for spec in current_listing.select('table[id*=info] tr'):
            try:
                # Spaces become underscores so a multi-word value survives the
                # whitespace split below as a single token.
                values = spec.text.strip().replace(' ', '_').split()
            except (AttributeError, TypeError):
                # Row without usable text: keep one missing-value placeholder.
                indv_listing.append(np.nan)
                continue
            # Tokens that are nothing but the '_' filler carry no value.
            indv_listing.extend(x for x in values if x != '_')

        listing_list.append(indv_listing)
    return listing_list

In [17]:
def scrap_pages(number_pages = 100, timeout = 30):
    """Download consecutive renthop search-result pages and parse their listings.

    Pages 1 through ``number_pages`` are requested in order. The URL being
    fetched is shown as progress output and "Completed" is printed at the end.

    Parameters
    ----------
    number_pages : int, default 100
        How many result pages to request, starting at page 1.
    timeout : float, default 30
        Seconds to wait on each HTTP request before giving up on that page.

    Returns
    -------
    list of list
        The rows returned by ``parse_data`` for every page that was fetched
        successfully, concatenated in page order.

    Notes
    -----
    A page whose request fails (connection error, timeout, or HTTP error
    status) is skipped rather than aborting the run, so the rows already
    collected are not lost. Skipped pages are listed after "Completed".
    """
    url_prefix = "https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page="
    url_suffix = "&sort=hopscore&q=&search=0"
    all_pages_parsed = []
    failed_pages = []

    # One Session for the whole run so the connection to the site is reused
    # instead of being re-opened for every page.
    with requests.Session() as session:
        for page_no in range(1, number_pages + 1):
            target_page = url_prefix + str(page_no) + url_suffix

            # Show only the current URL: wait=True delays the clear until the
            # next output arrives, so each URL stays visible until replaced.
            print(target_page)
            clear_output(wait=True)

            try:
                # Without a timeout a single stalled connection blocks forever.
                r = session.get(target_page, timeout=timeout)
                r.raise_for_status()
            except requests.RequestException as err:
                failed_pages.append((page_no, err))
                continue

            soup = BeautifulSoup(r.content, 'html5lib')
            listing_divs = soup.select('div[class*=search-info]')
            all_pages_parsed.extend(parse_data(listing_divs))

    print("Completed")
    for page_no, err in failed_pages:
        print("Skipped page {}: {}".format(page_no, err))
    return all_pages_parsed
        

In [18]:
# Scrape search-result pages 1 through 1000; this issues one HTTP request per page.
all_pages_parsed = scrap_pages(number_pages=1000)

Completed


In [None]:
# Data Frame
# Parsed rows differ in length (only some carry a seventh value). Pad every
# short row with None up to the full column count so the frame can still be
# built when none of the scraped rows happens to have all seven fields.
# A row with more than seven values is left as-is and still raises in pandas.
listing_columns = ['url', 'address', 'neighborhood', 'rent', 'beds', 'baths', "Unknown"]
padded_rows = [
    list(row) + [None] * (len(listing_columns) - len(row))
    for row in all_pages_parsed
]

# Treat the literal string 'None' as a missing value.
listings = pd.DataFrame(padded_rows, columns=listing_columns).replace('None', np.nan)

## Fixing Data

A few listings include a flex-room value, which pushes their bath value into the next column.

#### Fixing Flex Rooms and Baths issue

In [32]:
# Rows with a value in "Unknown" have their two values in the wrong columns:
# swap "baths" and "Unknown" for those rows and leave every other row alone.
# NOTE: running this cell a second time swaps the values back.
has_extra_value = listings["Unknown"].notnull()

# Compute both results before assigning, so each one reads the original values.
swapped_unknown = np.where(has_extra_value, listings["baths"], listings["Unknown"])
swapped_baths = np.where(has_extra_value, listings["Unknown"], listings["baths"])

listings["Unknown"] = swapped_unknown
listings["baths"] = swapped_baths

In [None]:
# Renaming
# Rename the seventh column ("Unknown") to 'flexs'. Renaming by label avoids
# writing into the array behind `listings.columns` (an Index is meant to be
# immutable) and does not depend on the column sitting at position 6.
listings = listings.rename(columns={"Unknown": "flexs"})


#### Saving data as csv

In [34]:
import os

# Write the listings to listings_2.csv inside the Data folder.
output_dir = 'Data'

# exist_ok=True creates the folder when missing and is a no-op otherwise,
# so the cell can be re-run safely.
os.makedirs(output_dir, exist_ok=True)

# Build the path with os.path.join: the old literal "Data\listings_2.csv"
# contained the invalid escape "\l" and only named a file inside Data on Windows.
listings.to_csv(os.path.join(output_dir, "listings_2.csv"), index = False)