# Craigslist Apartment Scraper

The purpose of this script is to pull apartment listings, characteristics, prices, and reply emails from for-rent ads on Craigslist. We plan to use this information to run an experiment that tests the impact of including exclamation points on response rates to the inquiries we send.

In [26]:
#Import modules
import requests
from bs4 import BeautifulSoup as bs4
import pandas as pd
import re
import numpy as np
import os
import xlsxwriter
import time

## Function to query craigslist  

This function will allow us to specify a price range, the number of bedrooms, and what craigslist site to query (e.g. Denver, SF, NYC, etc.)  

Note that each query returns at most 100 results. We therefore want to use narrow price ranges and a specific number of bedrooms in each query, so that we maximize the number of listings we are able to capture.

In [3]:
#Define a function to fetch search results
def fetch_search_results(query=None, minAsk=None, maxAsk=None, bedrooms=None, base_URL = None):
    """Query a craigslist apartment search page and return the raw response.

    Parameters
    ----------
    query : optional
        Free-text search term, sent as the ``query`` URL parameter.
    minAsk, maxAsk : optional
        Lower / upper asking-price bounds, sent as URL parameters.
    bedrooms : optional
        Number of bedrooms, sent as the ``bedrooms`` URL parameter.
    base_URL : str
        Root of the craigslist site to query, e.g.
        'https://denver.craigslist.org'. '/search/apa' is appended to it.

    Returns
    -------
    tuple
        ``(resp.content, resp.encoding)`` of the search response.

    Raises
    ------
    ValueError
        If no argument at all is given, or if ``base_URL`` is missing.
    requests.exceptions.HTTPError
        If the server answers with an error status.
    """
    #only the search filters belong in the query string; base_URL is the host to query, not a filter
    candidate_filters = {'query': query, 'minAsk': minAsk, 'maxAsk': maxAsk, 'bedrooms': bedrooms}
    search_params = {key: val for key, val in candidate_filters.items() if val is not None}

    if not search_params and base_URL is None:
        raise ValueError("No valid keywords")
    if base_URL is None:
        #fail with a clear message instead of a TypeError from None + str
        raise ValueError("base_URL is required, e.g. 'https://denver.craigslist.org'")

    base = base_URL + '/search/apa'
    resp = requests.get(base, params=search_params, timeout=3)
    resp.raise_for_status()  # <- no-op if status==200
    return resp.content, resp.encoding

In [4]:
#smoke-test the query function against the Denver site: 1 bedroom, asking price 1000-4000
#test1 receives the response content and test2 the response encoding
test1, test2 = fetch_search_results(
    query = None,
    minAsk = 1000,
    maxAsk = 4000,
    bedrooms = 1,
    base_URL = 'https://denver.craigslist.org',
)

## Function to get full URLs and apartment characteristics from query function output  

This function will go through each of the listings found from our query and compile a dataset of URLs and apartment characteristics of all the results from the query. We will use the URLs to get the reply email addresses in a later step.

### Helper functions to get apartment characteristics from each query result  

price, bedrooms, square footage, listing title, posting date / time, and reply link

In [5]:
#get href - the relative link to the full apartment listing. These relative links are identified by <a> tags
#and have the class 'result-title hdrlnk'.
def get_href(result):
    """Return the relative link of a listing, or NaN when it is missing.

    ``result`` is one parsed search-result element (anything exposing
    ``find``). The link is the ``href`` attribute of the <a> tag with class
    'result-title hdrlnk'. NaN is returned when either the tag or the
    attribute is absent, so one malformed listing does not raise.
    """
    link_tag = result.find('a', {'class' : 'result-title hdrlnk'})

    #find returns None when no matching tag exists; subscripting it would raise
    if link_tag is None:
        return np.nan

    #.get returns None for a missing attribute where [] would raise KeyError
    href = link_tag.get('href')

    if href is None:
        return np.nan

    return href

In [6]:
#get posting ID - These IDs are the data-ID portion of  <a> tags with the class 'result-title hdrlnk'.
def get_posting_ID(result):
    """Return the posting ID of a listing, or NaN when it is missing.

    ``result`` is one parsed search-result element (anything exposing
    ``find``). The ID is the ``data-id`` attribute of the <a> tag with class
    'result-title hdrlnk'. NaN is returned when either the tag or the
    attribute is absent.
    """
    link_tag = result.find('a', {'class' : 'result-title hdrlnk'})

    #find returns None when no matching tag exists; subscripting it would raise
    if link_tag is None:
        return np.nan

    #.get returns None for a missing attribute where [] would raise KeyError
    posting_ID = link_tag.get('data-id')

    if posting_ID is None:
        return np.nan

    return posting_ID

In [7]:
#get price - price can be located by <span> tags of class 'result-price'
def get_price(result):
    """Return the asking price of a listing as a float, or NaN.

    ``result`` is one parsed search-result element (anything exposing
    ``find``). The price is the text of the <span> tag with class
    'result-price'. Surrounding whitespace, the leading '$' and thousands
    separators are removed before conversion. NaN is returned when the tag is
    absent or its text is not a number.
    """
    price_tag = result.find('span', {'class' : 'result-price'})

    if price_tag is None:
        return np.nan

    #normalise e.g. ' $1,200 ' to '1200' so float() accepts it
    price_text = price_tag.text.strip().lstrip('$').replace(',', '')

    try:
        return float(price_text)
    except ValueError:
        #unparseable price text is treated the same as a missing price
        return np.nan

In [8]:
#get listing title which is identified by the text in the <a> tag with class 'result-title hdrlnk'
def get_title(result):
    """Return the title of a listing, or NaN when it is missing.

    ``result`` is one parsed search-result element (anything exposing
    ``find``). The title is the text of the <a> tag with class
    'result-title hdrlnk'. NaN is returned when that tag is absent.
    """
    link_tag = result.find('a', {'class' : 'result-title hdrlnk'})

    #find returns None when no matching tag exists; reading .text on it would raise
    if link_tag is None:
        return np.nan

    title = link_tag.text

    if title is None:
        return np.nan

    return title


In [9]:
#get the time the listing was posted
def get_posting_date(result):
    """Return the posting timestamp string of a listing, or NaN.

    ``result`` is one parsed search-result element (anything exposing
    ``find``). The timestamp is the ``datetime`` attribute of the <time> tag
    with class 'result-date'. NaN is returned when either the tag or the
    attribute is absent.
    """
    time_tag = result.find('time', {'class' : 'result-date'})

    #find returns None when no matching tag exists; subscripting it would raise
    if time_tag is None:
        return np.nan

    #.get returns None for a missing attribute where [] would raise KeyError
    posting_date = time_tag.get('datetime')

    if posting_date is None:
        return np.nan

    return posting_date

In [10]:
#get bedrooms / sqft which is identified by the <span> tag of class 'housing'
def get_bedrooms_sqft_str(result):
    """Return the raw bedrooms / square-footage text of a listing, or NaN.

    ``result`` is one parsed search-result element (anything exposing
    ``find``). The text comes from the <span> tag with class 'housing', with
    leading and trailing newlines removed. NaN is returned when the listing
    has no such tag.
    """
    housing_tag = result.find('span', {'class' : 'housing'})

    #find returns None when no matching tag exists; reading .text on it would raise
    if housing_tag is None:
        return np.nan

    return housing_tag.text.strip('\n')

#dashes and any whitespace (including newlines) that separate the housing fields
_HOUSING_SEPARATORS = re.compile(r'[-\s]')

#optional '<digits>br' followed by optional '<digits>ft', anchored at the start of the string
_HOUSING_FIELDS = re.compile(r'(?:(\d+)br)?(?:(\d+)ft)?', re.IGNORECASE)


def get_bedrooms_sqft(bedrooms_sqft):
    """Parse a housing string such as '1br - 700ft2 -' into numbers.

    Parameters
    ----------
    bedrooms_sqft : str
        Raw housing text. Dashes and whitespace are ignored, and matching is
        case-insensitive.

    Returns
    -------
    tuple
        ``(n_bedrooms, sqft)`` as floats. Each element is NaN when the
        corresponding field is not present. Both are NaN when the input is
        not a string (e.g. NaN for a listing without housing information).
    """
    #*******
    #remove the dashes, new line characters and white space
    try:
        compact = _HOUSING_SEPARATORS.sub('', bedrooms_sqft)
    except TypeError:
        #non-string input, nothing to parse
        return np.nan, np.nan

    #*******
    #both groups are optional, so this always matches; absent fields come back as None
    bedroom_digits, sqft_digits = _HOUSING_FIELDS.match(compact).groups()

    #get bedrooms
    n_bedrooms = float(bedroom_digits) if bedroom_digits is not None else np.nan

    #get square footage
    sqft = float(sqft_digits) if sqft_digits is not None else np.nan

    return n_bedrooms, sqft


### Function to compile all apartment characteristics

In [11]:
def _join_or_nan(*parts):
    """Concatenate string parts, returning NaN if any part is not a string."""
    try:
        return ''.join(parts)
    except TypeError:
        return np.nan


def _drop_html_suffix(posting_ID):
    """Remove a trailing '.html' from a posting ID; other values pass through unchanged."""
    suffix = '.html'
    try:
        has_suffix = posting_ID.endswith(suffix)
    except AttributeError:
        #not a string (e.g. NaN for a missing ID)
        return posting_ID
    return posting_ID[:-len(suffix)] if has_suffix else posting_ID


def compile_listing_URLs(query_result, base_URL, reply_string = '/reply/den/apa/'):
    """Build a dataframe of listing characteristics and URLs from a search page.

    Parameters
    ----------
    query_result : str or bytes
        HTML of a search results page.
    base_URL : str
        Root of the craigslist site the page came from; prefixed to the
        relative listing links.
    reply_string : str, optional
        Path segment placed between ``base_URL`` and the posting ID to form
        the reply-contact link. Defaults to the Denver apartments path
        '/reply/den/apa/'.

    Returns
    -------
    pandas.DataFrame
        One row per listing with columns posting_ID, Listing_Title, Bedrooms,
        Sqft, Price, Posting_Date, full_URL and Reply_contact_info_link.
        Empty (with the same columns) when the page contains no listings.
    """
    #parse the results of the query
    html = bs4(query_result, 'html.parser')

    #get all individual apartments from the query
    apt_results = html.find_all('p', attrs={'class' : 'result-info'})

    columns = ['posting_ID', 'Listing_Title', 'Bedrooms', 'Sqft', 'Price', 'Posting_Date',
               'full_URL', 'Reply_contact_info_link']

    #collect plain rows first; growing a dataframe one .loc row at a time copies it on every insert
    rows = []

    #loop through all of the tags containing the apartments and get the characteristics of each result
    for listing in apt_results:
        #use helper functions to get characteristics
        href = get_href(listing)
        posting_ID = get_posting_ID(listing)
        title = get_title(listing)
        bedrooms_sqft_str = get_bedrooms_sqft_str(listing)
        bedrooms, sqft = get_bedrooms_sqft(bedrooms_sqft_str)
        price = get_price(listing)
        posting_date = get_posting_date(listing)

        #construct full URL for the listing
        full_URL = _join_or_nan(base_URL, href)

        #construct reply URL for the listing
        reply_URL = _join_or_nan(base_URL, reply_string, _drop_html_suffix(posting_ID))

        rows.append([posting_ID, title, bedrooms, sqft, price, posting_date, full_URL, reply_URL])

    return pd.DataFrame(rows, columns = columns)

In [12]:
#test the compiler function
test_compiled_URLs = compile_listing_URLs(query_result = test1, base_URL = 'https://denver.craigslist.org')

#bare expressions: these only display something when run as a notebook cell
test_compiled_URLs.head()

#widen columns so the full URLs are not truncated when displayed
pd.set_option('display.max_colwidth',1000)

test_compiled_URLs.head()

#NOTE(review): machine-specific absolute path; this raises OSError on any other machine
os.chdir('/Users/nwchen24/Desktop/UC_Berkeley/Experiments_and_causality/final_project_github_repo/mids-w241-final/CL_scraper/output')

#write the output file
#the context manager saves and closes the workbook on exit; ExcelWriter.save() no longer exists in pandas 2.x
with pd.ExcelWriter('Example Listing Pull for Denver 2-20-17.xlsx', engine = 'xlsxwriter') as writer:
    test_compiled_URLs.to_excel(writer, sheet_name = 'Example Listings')

# Operationalizing Phase  

This phase adds the ability to run the scraper across a selection of cities, bedroom counts, and price ranges.

## City to Craigslist URLs Dictionary

In [33]:
#create city list; the three lists below are parallel to it (same order, same length)
cities = ['denver', 'newyork', 'cleveland', 'sanfrancisco']

#set base craigslist URLs
base_URLs = ['https://denver.craigslist.org', 'https://newyork.craigslist.org', 'https://cleveland.craigslist.org',
             'https://sfbay.craigslist.org']

#set search URLS to feed to query function
search_URLs = ['https://denver.craigslist.org/search/apa', 'https://newyork.craigslist.org/search/abo',
               'https://cleveland.craigslist.org/search/apa', 'https://sfbay.craigslist.org/search/sfc/apa']

#set reply strings which are intermediate strings between the base URL and the posting ID to access the page where
#reply emails are found
reply_strings = ['/reply/den/apa/', '/reply/nyc/abo/', '/reply/cle/apa/', '/reply/sfo/apa/']

#create dataframe with all of these pieces of information, one row per city
city_to_URL_dict = {'base_URL' : base_URLs, 'search_URL' : search_URLs, 'reply_string' : reply_strings}

city_to_URL_df = pd.DataFrame(city_to_URL_dict, index = cities)

#sanity check: show the base URL stored for each city
#print(...) with a single argument behaves the same on Python 2 and Python 3
for city in city_to_URL_df.index:
    print(city_to_URL_df.loc[city, 'base_URL'])

https://denver.craigslist.org
https://newyork.craigslist.org
https://cleveland.craigslist.org
https://sfbay.craigslist.org


# NOTE LEFT OFF HERE

## Outline function to pull data

In [None]:
#Look at $500 price bucket increments for each number of bedrooms
#NOTE(review): each bucket's maxAsk equals the next bucket's minAsk, so a listing priced exactly on a
#boundary may be returned by two queries - confirm whether duplicates are removed downstream
min_prices = np.arange(500, 5000, 500).tolist()

#Look at 1, 2, 3 bedroom apartments
bedrooms = [1,2,3]

#initialize empty dataframe to hold query results
query_results_df = pd.DataFrame(columns = ('city', 'min_price', 'max_price', 'bedrooms', 'query_html'))

#initialize counter which we will use to populate the query result dataframe
counter = 0

#loop over cities
for city in city_to_URL_df.index:
    #loop over number of bedrooms
    for bedroom in bedrooms:
        #loop over the min prices
        for price in min_prices:
            #set start time
            start_time = time.time()

            #description of this query, shared by the status messages below
            query_label = "City: " + city + ", Bedrooms: " + str(bedroom) + ", Price Range: " + str(price) + \
                "-" + str(price + 500)

            #ping CL server to get query results
            #a single timeout or HTTP error must not abort the whole scrape, so record it as a missing result
            try:
                query_results, query_encoding = fetch_search_results(query = None, minAsk = price, maxAsk = price + 500,
                                                                     bedrooms = bedroom,
                                                                     base_URL = city_to_URL_df.loc[city, 'base_URL'])
            except requests.exceptions.RequestException as err:
                query_results, query_encoding = None, None
                print("Query Failed: " + str(err))

            if query_results is not None:
                print("Query Success. " + query_label)

            else:
                print("No Results Found. " + query_label)

            #append result to the query results dataframe; filling it row by row keeps partial results
            #available if the run is interrupted
            query_results_df.loc[counter] = [city, price, price + 500, bedroom, query_results]

            #increment counter
            counter += 1

            #incorporate delay drawn from normal distribution centered around one minute to hopefully make
            #queries appear more human like
            delay = abs(np.random.normal(60, 15))
            time.sleep(delay)
            print("Delay: " + str(time.time() - start_time))
            



Query Success. City: denver Bedrooms: 1 Price Range: 500-1000


In [29]:
#outline for including a delay in queries
#each iteration sleeps for a random time drawn from a normal distribution (mean 5 s, sd 1 s) and
#reports how long the iteration actually took
#print(...) with a single argument behaves the same on Python 2 and Python 3
for i in [1, 2, 3, 4, 5]:
    start_time = time.time()
    print(i)
    #abs() guards against a negative draw, which time.sleep would reject
    delay = abs(np.random.normal(5, 1))
    time.sleep(delay)
    print("Delay: " + str(time.time() - start_time))
    

1
Delay: 4.34614300728
2
Delay: 2.38538599014
3
Delay: 5.01460599899
4
Delay: 3.70861887932
5
Delay: 3.37760400772


[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500]