# I web scraped the front pages of "Trulia" and "Apartments.com" to get a distribution of low-end and high-end housing costs

In [1]:
# Import Packages
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Browser-like User-Agent strings to send with the request
# (user agents are discussed later, so just run this cell for now)
user_agent_list = [
    # Edge on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    # Chrome on Windows 7
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36 ",
    # Safari on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9 ",
    # Firefox on Ubuntu
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    # Chrome on Chrome OS
    "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
]

# Identify as the Firefox-on-Ubuntu entry (index 3)
headers = {"User-Agent": user_agent_list[3]}

In [None]:
# Fetch the Trulia rentals page for New York, sending the browser-like headers
URL = "https://www.trulia.com/for_rent/New_York,NY/"
# Without a timeout, requests waits indefinitely if the server never answers
page = requests.get(URL, headers=headers, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down
page.raise_for_status()

In [4]:
# Show the response object returned by requests.get; its printed form
# includes the HTTP status code
print(page)

<Response [200]>


In [5]:
# Show which class the response object is an instance of
print(type(page))

<class 'requests.models.Response'>


In [6]:
# Parse the raw bytes of the response body into a BeautifulSoup object,
# using the "html.parser" parser
soup = BeautifulSoup(page.content, "html.parser")

In [7]:
# Show which class the parsed soup object is an instance of
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [8]:
# Pretty-print the HTML with one tag per line. Only the first 2,000 characters
# are shown: the full page is very long and would flood the notebook output.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Apartments For Rent in New York, NY - 13,362 Long Term Rentals | Trulia
  </title>
  <meta content="Search 13,362 Rental Properties in New York, New York. Explore rentals by neighborhoods, schools, local guides and more on Trulia!" name="description"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=5.0, viewport-fit=cover" name="viewport"/>
  <meta content="5953837487" property="fb:admins"/>
  <meta content="183577541666001" property="fb:app_id"/>
  <meta content="Trulia" property="application-name"/>
  <meta content="https://www.trulia.com" property="msapplication-starturl"/>
  <meta content="Trulia: Real Estate Search" property="msapplication-tooltip"/>
  <meta content="/browserconfig.xml" property="msapplication-config"/>
  <meta content="en_US" property="og:locale"/>
  <meta content="Trulia Real Estate Search" property="og:site_name"/>
  <meta content="website" property="og:type"/

In [9]:
# Find a single <span> tag in the page (find returns one match, not a collection)
soup.find("span")

<span class="Text__TextBase-sc-27a633b1-0 cVaxpe">New York</span>

In [10]:
# Collect every <span> tag on the page.
# find_all is the current Beautiful Soup 4 name; findAll is only a legacy alias.
spans = soup.find_all("span")

In [11]:
# The result is a bs4 ResultSet, which can be indexed and iterated like a list
type(spans)

bs4.element.ResultSet

In [12]:
# Index into the result set to get its first <span> tag
spans[0]

<span class="Text__TextBase-sc-27a633b1-0 cVaxpe">New York</span>

In [13]:
# Find every listing price: <div> elements carrying data-testid="property-price".
# find_all is the current Beautiful Soup 4 name; findAll is only a legacy alias.
prices = soup.find_all("div", attrs={"data-testid": "property-price"})


In [14]:
# Pull the text content out of each price tag, e.g. '$4,200/mo'
clean_p = [price_tag.get_text() for price_tag in prices]
clean_p

['$4,200/mo',
 '$3,445 - $7,445/mo',
 '$3,577 - $7,005/mo',
 '$3,112 - $4,451/mo',
 '$2,441 - $3,699/mo',
 '$4,304 - $7,894/mo',
 '$4,562/mo']

In [15]:
def _to_dollars(price_text):
    """Turn a price fragment such as '$3,445 ' or ' $7,445/mo' into a float."""
    return float(price_text.replace(",", "").replace("$", "").replace("/mo", ""))


# Split every listing price into its low end and its high end
low_prices, high_prices = [], []
for listing_price in clean_p:
    if "-" in listing_price:
        # A range such as '$3,445 - $7,445/mo': the two sides are the bounds
        low_text, high_text = listing_price.split("-")
    else:
        # A single price such as '$4,200/mo' counts as both the low and the high end
        low_text = high_text = listing_price
    # Convert both sides before appending so the two lists always grow together
    low_value, high_value = _to_dollars(low_text), _to_dollars(high_text)
    low_prices.append(low_value)
    high_prices.append(high_value)

# Print low and high prices
print("Low prices: ", low_prices)
print("High prices: ", high_prices)

Low prices:  [4200.0, 3445.0, 3577.0, 3112.0, 2441.0, 4304.0, 4562.0]
High prices:  [4200.0, 7445.0, 7005.0, 4451.0, 3699.0, 7894.0, 4562.0]


# I web scraped Apartments.com to find out the cost of renting apartments in New York

In [1]:
# Browser-like User-Agent strings to send with the request
# (user agents are discussed later, so just run this cell for now)
user_agent_list = [
    # Edge on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    # Chrome on Windows 7
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36 ",
    # Safari on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9 ",
    # Firefox on Ubuntu
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    # Chrome on Chrome OS
    "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
]

# Identify as the Firefox-on-Ubuntu entry (index 3)
headers = {"User-Agent": user_agent_list[3]}

In [74]:
# Fetch the Apartments.com listings page for New York, sending the browser-like headers
URL = "https://www.apartments.com/new-york-ny/?bb=uz_842s_vHkt_2lkC"
# Without a timeout, requests waits indefinitely if the server never answers
page = requests.get(URL, headers=headers, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down
page.raise_for_status()

In [75]:
# Show the response object returned by requests.get; its printed form
# includes the HTTP status code
print(page)

<Response [200]>


In [76]:
# Fetch the listings page again before parsing it. The timeout stops the cell
# from hanging, and raise_for_status() surfaces HTTP errors immediately.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

In [77]:
# Parse the raw bytes of the response body into a BeautifulSoup object,
# using the "html.parser" parser
soup = BeautifulSoup(page.content, "html.parser")

In [78]:
# Look up a single <p> element whose class attribute is "property-pricing"
pricing = soup.find("p", {"class": "property-pricing"})

In [79]:
# Extract the low and high prices of the first pricing element on the page.
# Both default to None so that a missing element or a "Call for Rent" listing
# leaves them empty instead of crashing the cell.
low_price = high_price = None
if pricing is not None:
    price_parts = pricing.text.split(" - ")
    if len(price_parts) == 2:
        # A range: the text before " - " is the low end, the text after is the high end
        low_price = float(price_parts[0].replace(",", "").replace("$", ""))
        high_price = float(price_parts[1].replace(",", "").replace("$", ""))
    elif len(price_parts) == 1 and price_parts[0] != "Call for Rent":
        # A single price is used as both the low and the high end
        low_price = high_price = float(price_parts[0].replace(",", "").replace("$", ""))

In [93]:
# Collect the pricing element of every listing on the page. This uses the same
# tag and class as the single-element lookup that produced `pricing`.
pricing_list = soup.find_all("p", {"class": "property-pricing"})

# initialize empty lists for low and high prices
low_prices = []
high_prices = []

# loop through each pricing element
for pricing in pricing_list:
    # extract the low and high prices
    price_parts = pricing.text.split(" - ")
    if len(price_parts) == 2:
        # a range: the text before " - " is the low end, the text after is the high end
        low_price = float(price_parts[0].replace(",", "").replace("$", ""))
        high_price = float(price_parts[1].replace(",", "").replace("$", ""))
    elif len(price_parts) == 1 and price_parts[0] != "Call for Rent":
        # if there is only one price and it's not "Call for Rent", use it as both the low and high prices
        low_price = high_price = float(price_parts[0].replace(",", "").replace("$", ""))
    else:
        # if the price is "Call for Rent" or empty, set the low and high prices to None
        low_price = high_price = None

    # append to both lists on every iteration so they stay aligned, one entry per listing
    low_prices.append(low_price)
    high_prices.append(high_price)

# print the list of low prices
print(low_prices)


[3375.0, 4070.0, 4675.0, 3995.0, 3750.0, 4295.0, 3000.0, 5923.0, 3580.0, 3330.0, 2955.0, 3900.0, 4885.0, 1283.0, 3869.0, 4950.0, 4029.0, 4600.0, 3675.0, 3800.0, None, 3460.0, 3733.0, 4500.0, 2275.0]


In [81]:
import pandas as pd


def _dollars(price_text):
    """Convert a price fragment to a float after removing ',' and '$'."""
    return float(price_text.replace(",", "").replace("$", ""))


# Rebuild low_prices / high_prices keeping only listings whose price is a range.
# Listings with a single price or no price are skipped, so these lists can be
# shorter than the number of listings on the page.
low_prices, high_prices = [], []
for pricing_tag in pricing_list:
    bounds = pricing_tag.text.split(" - ")
    if len(bounds) != 2:
        continue
    # Convert both ends before appending so the two lists always grow together
    low_value, high_value = _dollars(bounds[0]), _dollars(bounds[1])
    low_prices.append(low_value)
    high_prices.append(high_value)

In [67]:
# Show the current contents of low_prices
print( low_prices)

[3375.0, 4070.0, 4675.0, 3995.0, 3750.0, 4295.0, 3000.0, 3580.0, 3330.0, 2955.0, 3900.0, 4885.0, 1283.0, 3869.0, 4029.0, 3675.0, 3800.0, 3460.0, 3733.0, 4500.0]


In [68]:
# Count how many entries low_prices currently holds
len(low_prices)

20

In [53]:
# Listing names are in <span> elements whose class attribute is "js-placardTitle title"
title_list = soup.find_all("span", {"class": "js-placardTitle title"})

# Keep just the text content of each title element
titles = [title_tag.text for title_tag in title_list]

# Show the scraped titles
print(titles)

['Lyra', 'EOS', 'Park Towers South', 'VIA 57 WEST', 'The Helena', '70 W 45th St', 'The Smile Residential', 'FRANK 57 WEST', 'View 34', 'Lincoln at Bankside', 'Third at Bankside', 'Yorkshire Towers', '15 W 55th St', 'The Heritage by Common', '535 W 43rd Street', '510 West End Avenue', 'The Buchanan', '605 W 112th St', '792 Columbus Avenue', 'Manhattan Park', '120 West 97th  Street', 'Marquis Apartments', 'The Landon', 'Pod Pads', 'Riverton Square']


In [46]:
# Gather the address <div> elements (class attribute "property-address js-url")
address_tags = soup.find_all('div', {'class': 'property-address js-url'})

# Take each element's text with the surrounding whitespace removed
address_list = []
for address_tag in address_tags:
    address_list.append(address_tag.text.strip())

# Show the scraped addresses
print(address_list)

['555 W 38th St, New York, NY 10018', '100 W 31st St, New York, NY 10001', '315 W 57th St, New York, NY 10019', '625 W 57th St, New York, NY 10019', '601 W 57th St, New York, NY 10019', '70 W 45th St, New York, NY 10036', '158 E 126th St, New York, NY 10035', '600 W 58th St, New York, NY 10019', '401 E 34th St, New York, NY 10016', '101 Lincoln Ave, Bronx, NY 10454', '2401 3rd Ave, Bronx, NY 10451', '305-315 E 86th St, New York, NY 10028', '15 W 55th St, New York, NY 10019', '1295 5th Ave, New York, NY 10029', '546 W 44th St, New York, NY 10036', '510 West End Ave, New York, NY 10024', '160 E 48th St, New York, NY 10017', 'New York, NY 10025', '784-792 Columbus Ave, New York, NY 10025', '10-40 River Rd, New York, NY 10044', '755 Amsterdam Ave, New York, NY 10025', '150 E 34th St, New York, NY 10016', '520 W 43rd St, New York, NY 10036', '400 W 42nd St, New York, NY 10036', '2225-2265 5th Ave, New York, NY 10037']


In [48]:
# Gather the bedroom descriptions: <p> elements with class "property-beds",
# taking each element's text with the surrounding whitespace removed
beds_tags = soup.find_all('p', {'class': 'property-beds'})
beds_list = [beds_tag.text.strip() for beds_tag in beds_tags]

print(beds_list)

['Studio - 2 Beds', 'Studio - 2 Beds', 'Studio - 1 Bed', 'Studio - 3 Beds', 'Studio - 1 Bed', '1-3 Beds', 'Studio - 2 Beds', '3 Beds', 'Studio - 3 Beds', 'Studio - 3 Beds', 'Studio - 3 Beds', 'Studio - 3 Beds', '2-3 Beds', 'Studio', 'Studio - 2 Beds', '2 Beds', 'Studio - 3 Beds', '1 Bed', 'Studio - 2 Beds', '1-2 Beds', 'Studio - 3 Beds', 'Studio - 1 Bed', 'Studio - 2 Beds', '1-2 Beds', '1-3 Beds']


In [98]:
# Assemble the scraped lists into one table, one column per list.
# pandas requires all of these lists to have the same length.
listing_columns = {
    'Low Prices': low_prices,
    'High Prices': high_prices,
    "Titles": titles,
    'Address': address_list,
    'Beds': beds_list,
}
df = pd.DataFrame(listing_columns)

# Display the DataFrame
df



Unnamed: 0,Low Prices,High Prices,Titles,Address,Beds
0,3375.0,5250.0,Lyra,"555 W 38th St, New York, NY 10018",Studio - 2 Beds
1,4070.0,7921.0,EOS,"100 W 31st St, New York, NY 10001",Studio - 2 Beds
2,4675.0,8600.0,Park Towers South,"315 W 57th St, New York, NY 10019",Studio - 1 Bed
3,3995.0,11995.0,VIA 57 WEST,"625 W 57th St, New York, NY 10019",Studio - 3 Beds
4,3750.0,4250.0,The Helena,"601 W 57th St, New York, NY 10019",Studio - 1 Bed
5,4295.0,8553.0,70 W 45th St,"70 W 45th St, New York, NY 10036",1-3 Beds
6,3000.0,5500.0,The Smile Residential,"158 E 126th St, New York, NY 10035",Studio - 2 Beds
7,5923.0,5923.0,FRANK 57 WEST,"600 W 58th St, New York, NY 10019",3 Beds
8,3580.0,8830.0,View 34,"401 E 34th St, New York, NY 10016",Studio - 3 Beds
9,3330.0,6875.0,Lincoln at Bankside,"101 Lincoln Ave, Bronx, NY 10454",Studio - 3 Beds
