# Craigslist Apartment Scraper

The purpose of this script is to pull apartment listings, characteristics, prices, and reply emails from for rent ads on craigslist. We plan to use this information in order to run an experiment to test the impact of including exclamation points on response rates to inquiries sent.

In [14]:
#Import modules
import requests
from bs4 import BeautifulSoup as bs4
import pandas as pd
import re
import numpy as np

## Function to query craigslist  

This function will allow us to specify a price range, the number of bedrooms, and what craigslist site to query (e.g. Denver, SF, NYC, etc.)  

Note that these queries only return a max of 100 results each. Thus, we will want to be specific about the price ranges and bedrooms that we specify so we can maximize the number of listings we are able to capture.

In [2]:
#Define a function to fetch search results
def fetch_search_results(query=None, minAsk=None, maxAsk=None, bedrooms=None, base_URL = None):
    search_params = {key: val for key, val in locals().items() if val is not None}
    if not search_params:
        raise ValueError("No valid keywords")
    base = base_URL + '/search/apa'
    resp = requests.get(base, params=search_params, timeout=3)
    resp.raise_for_status()  # <- no-op if status==200
    return resp.content, resp.encoding

In [3]:
#test the query function.
test1, test2 = fetch_search_results(query = None, minAsk = 1000, maxAsk = 4000, bedrooms = 1, base_URL = 'https://denver.craigslist.org')

## Function to get full URLs and apartment characteristics from query function output  

This function will go through each of the listings found from our query and compile a dataset of URLs and apartment characteristics of all the results from the query. We will use the URLs to get the reply email addresses in a later step.

### Helper functions to get apartment characteristics from each query result

In [4]:
#get href - the relative link to the full apartment listing. These relative links are identified by <a> tags
#and have the class 'result-title hdrlnk'.
def get_href(result):
    href = result.find('a', {'class' : 'result-title hdrlnk'})['href']
    
    if href is None:
        href = np.nan
    
    return href

In [5]:
#get price - price can be located by <span> tags of class 'result-price'
def get_price(result):
    price = result.find('span', {'class' : 'result-price'})
    
    #convert price to float
    if price is not None:
        price = float(price.text.strip('$'))
        
    else:
        price = np.nan
    
    return price

In [6]:
#get listing title which is identified by the text in the <a> tag with class 'result-title hdrlnk'
def get_title(result):
    title = result.find('a', {'class' : 'result-title hdrlnk'}).text
    
    if title is None:
        title = np.nan
        
    return title


In [26]:
#get the time the listing was posted
def get_posting_date(result):
    posting_date = result.find('time', {'class' : 'result-date'})['datetime']
    
    if posting_date is None:
        posting_date = np.nan
        
    return posting_date

In [27]:
#get bedrooms / sqft which is identified by the <span> tag of class 'housing'
def get_bedrooms_sqft(result):
    bedrooms_sqft = result.find('span', {'class' : 'housing'}).text.strip('\n')
    
    if bedrooms_sqft is None:
        price = np.nan
    
    return bedrooms_sqft

#get bedrooms and square feet from the combined bedrooms / sqft tag
def get_bedrooms_only(bdrms_sqft):
    #compile regular expression matching the pattern that bedrooms follows
    p = re.compile('\d+br', re.IGNORECASE)
    #get start and end of the number of bedrooms in the bedrooms / sqft combined string
    m = p.match(bdrms_sqft)
    start = m.start()
    end = m.end()
    #get the number of bedrooms
    bedrooms = float(bdrms_sqft[start:end].strip('br'))
    
    return bedrooms

#get the square feet from the bedrooms / sqft tag
def get_sqft_only(bdrms_sqft):
    #compile regular expression matching the pattern that bedrooms follows
    p = re.compile('\d+ft', re.IGNORECASE)
    #get start and end of the number of bedrooms in the bedrooms / sqft combined string
    m = p.match(bdrms_sqft)
    start = m.start()
    end = m.end()
    #get the number of bedrooms
    sqft = float(bdrms_sqft[start:end].strip('ft'))
    
    return sqft

# LEFT OFF HERE. Square footage doesn't seem to work because of the \n in the string

In [45]:
test = '1br -\n 650ft2 -\n'

#get the number of bedrooms in the bedrooms / sqft combined string
#compile the regex
bedroom_p = re.compile(r'\d+(?=br)', re.IGNORECASE)

#get match in the bedroom / sqft string
bedroom_m = bedroom_p.match(test)

#get bedrooms
n_bedrooms = test[bedroom_m.start(): bedroom_m.end()]




In [44]:
test = '1br -\n 650ft2 -\n'

#get the square footage
#compile the regex
sqft_p = re.compile(r'\d+(?=ft)', re.IGNORECASE)

#get match in the bedroom / sqft string
sqft_m = sqft_p.match(test)

#get bedrooms
sqft = test[sqft_m.start(): sqft_m.end()]

print sqft



AttributeError: 'NoneType' object has no attribute 'start'

### Function to compile all apartment characteristics

In [29]:
def compile_listing_URLs(query_result, base_URL):
    #parse the results of the query
    html = bs4(query_result, 'html.parser')

    #get all individual apartments from the query
    apt_results = html.find_all('p', attrs={'class' : 'result-info'})

    #initialize a list to contain all of the URLs that resulted from the query
    apts_results_df = pd.DataFrame(columns = ('base_URL', 'href', 'Listing_Title', 'Bedrooms_sqft', 'Price', 'Posting_Date'))
   
    #Looop through all of the tags containing the apartments and get the addresses of those individual results.
    for apt in range(len(apt_results)):
        #use helper functions to get characteristics
        href = get_href(apt_results[apt])
        title = get_title(apt_results[apt])
        bedrooms_sqft = get_bedrooms_sqft(apt_results[apt])
        price = get_price(apt_results[apt])
        posting_date = get_posting_date(apt_results[apt])
        #populate the result dataframe with the characteristics
        apts_results_df.loc[apt] = [base_URL, href, title, bedrooms_sqft, price, posting_date]

    #construct full URL for the listing
    apts_results_df['full_URL'] = apts_results_df.apply(lambda row: row['base_URL'] + row['href'], axis = 1)
    
    #construct reply URL for the listing
    apts_results_df['Reply_contact_info_link'] = apts_results_df.apply(lambda row: row['base_URL'] + '/reply/den' + row['href'].strip('.html'), axis = 1)
    
    #delete base URL and href columns
    del apts_results_df['base_URL']
    del apts_results_df['href']
    
    return apts_results_df

In [30]:
#test the compiler function
test_compiled_URLs = compile_listing_URLs(query_result = test1, base_URL = 'https://denver.craigslist.org')

test_compiled_URLs.head()

pd.set_option('display.max_colwidth',1000)

test_compiled_URLs.head()

Unnamed: 0,Listing_Title,Bedrooms_sqft,Price,Posting_Date,full_URL,Reply_contact_info_link
0,"Business Center with WiFi, Walk-In Closets, Updater Moving Partner",1br -\n,1110.0,2017-02-19 19:43,https://denver.craigslist.org/apa/6008376643.html,https://denver.craigslist.org/reply/den/apa/6008376643
1,FREE RENT ON GREAT APT. NEAR DTC! SECTION 8 HOUSING ACCEPTED!,1br -\n 815ft2 -\n,1295.0,2017-02-19 19:41,https://denver.craigslist.org/apa/5988727059.html,https://denver.craigslist.org/reply/den/apa/5988727059
2,"Vaulted Ceilings, Garden Tub, Oversized Windows",1br -\n 616ft2 -\n,1096.0,2017-02-19 19:41,https://denver.craigslist.org/apa/6004776726.html,https://denver.craigslist.org/reply/den/apa/6004776726
3,"Range, Cable/Satallite, Window Covering",1br -\n 680ft2 -\n,1095.0,2017-02-19 19:37,https://denver.craigslist.org/apa/5984825290.html,https://denver.craigslist.org/reply/den/apa/5984825290
4,1br/1ba Apartment in Cheesman Park,1br -\n 1000ft2 -\n,1370.0,2017-02-19 19:35,https://denver.craigslist.org/apa/6003981553.html,https://denver.craigslist.org/reply/den/apa/6003981553


# Notes  

Craigslist started giving me captcha dialog boxes to view the reply email addresses, so we may have to pull reply email addresses by hand.