In [1]:
import requests # Helps construct the request to send to the API
import json # JSON helper functions
from bs4 import BeautifulSoup
import pandas as pd
import time

In [38]:
#specify markets to search for housing ads - each market has two levels (e.g washington, DC, nva for listings in ) 
def define_markets_to_search():
    markets = [['washingtondc','nva'],['washingtondc','mld'],['washingtondc','doc']]
    return markets

In [3]:
# Determine how many pages of listings exist in each market - listings are split into pages with 100 listings per page
def count_number_of_listings(market):

        #create url for each specified market
        url = 'http://' + market[0] + '.craigslist.org/search/' + '/apa'

        # Make the request
        response = requests.post(url)

        #place data in Beautiful soup object
        soup = BeautifulSoup(response.text)

        #Determine how many pages (listings) exist to redefine number of times to run code
        pages_of_listings = int(soup.find_all('span', class_='totalcount')[0].text)
        return pages_of_listings

In [4]:
#get list of craiglist listing urls
def get_listing_url_list(market, pages_of_listings):
    listing_ids = []  #create list of all the listing ids - to create links to each listing page
        
    for page in xrange(0, pages_of_listings, 100):

        #create url for each specified market
        if page == 0:
            url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'
        else: 
            url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa?s=' + str(page) + '&'

        # delay script each time it gets a new set of listing urls (gets 100 new listings for each page)
        time.sleep(0.2) 

        # Make the request
        response = requests.post(url)

        #place data in Beautiful soup object
        soup = BeautifulSoup(response.text)

        #Add links to each listing page
        data_pid = soup.find_all('p', class_='row')
        for listing_id in data_pid:
            listing_ids.append(listing_id['data-pid'])

        print page, url

    return listing_ids

In [5]:
# get data from craigslist for the single specified listing id
def get_craigslist_listing(id, market):
    url = 'http://' + market[0] + '.craigslist.org/' + market[1] + '/apa/' + id + '.html'
    
    # Make the request
    response = requests.post(url)

    # Confirm the response worked
    response.status_code
    
    #place data in Beautiful soup object
    soup = BeautifulSoup(response.text)

    return soup, url

In [25]:
def get_property_attributes(property):

    #create empty property dict to collect property attributes
    attribute_list = {}

    try:
        attributes_data = property.find('p',class_='attrgroup').find_all('span')
        for attribute in attributes_data:

            if 'BR' in attribute.text:
                attribute_list['bedroom'] = attribute.text.split('/')[0].replace('BR','')
            if 'Ba' in attribute.text:
                attribute_list['bathroom'] = attribute.text.split('/')[1].replace('Ba','')
            if 'ft' in attribute.text:
                attribute_list['square_footage'] = attribute.text.replace('ft2','')
            if attribute.text in ['apartment', 'condo', 'cottage/cabin', 'duplex', 'flat', 
                'house', 'in-law', 'loft', 'townhouse','manufactured', 'assisted living', 'land']:
                attribute_list['housing_type'] = attribute.text
            if 'cat' in attribute.text:
                attribute_list['cat'] = attribute.text
            if 'dog' in attribute.text:
                attribute_list['dog'] = attribute.text
            if attribute.text in ['w/d in unit','laundry in bldg','laundry on site','w/d hookups']:
                attribute_list['laundry'] = attribute.text 
            if attribute.text in ['carport', 'attached garage', 'detached garage',  'off-street parking', 'street parking', 'valet parking']:
                attribute_list['parking'] = attribute.text
            if 'smoking' in attribute.text:
                attribute_list['smoking'] = attribute.text

        attribute_list['availability'] = property.find('span', class_='housing_movein_now property_date')['today_msg']
        attribute_list['date_available'] = property.find('span', class_='housing_movein_now property_date')['date']

        return attribute_list
    except: pass

In [26]:
def get_property_price(property):

    price = property.find('h2')
    price = price.contents[3].text
    price = price.split()
    return price[0]

In [27]:
def get_posting_date_and_time(property):
    
    posting_time = property.find('p', class_='postinginfo').find('time').text
    return posting_time

In [28]:
def get_property_description(property):

    #get the written user description of the listing
    listing_description = property.find('section', class_='userbody').find('section')
    return listing_description.text

In [29]:
def get_image_data(property):

    #Create a list of dicts containing image number, image link, and image size
    average_image_size = 0
    image_number = 0
    
    try:
        images = property.find('figure').find_all('div')
        for pic in images[-1]:                
            image_number = int(pic['title']) #find out number of images in listing - only keep the last (max) number
            image_size = pic['href'].split('_')[-1].split('.')[0].split('x')
            average_image_size += int(image_size[0]) * int(image_size[1]) #get sum of image size (Width * Height)
    except: pass

    try:    
        average_image_size = average_image_size / image_number
    except: pass

    return image_number, average_image_size

In [30]:
def get_property_location(property):

    #create a dict with  location data: country, state, city, address, longitude, latitude, and location accuracy metric

    #get longitude, latitude, and location accuracy metric
    location = property.find('div', class_='viewposting')
     
    try:
        location_data_accuracy = property.find('div', class_='viewposting')['data-accuracy']
    except: pass
    
    try:
        latitude = property.find('div', class_='viewposting')['data-latitude']
    except: pass
    
    try:
        longitude = property.find('div', class_='viewposting')['data-longitude']
    except: pass

    #get country, state, city
    
    try:
        map = property.find('p', class_='mapaddress').find_all('a')[1]['href'] #get yahoo maps link - easier to extract data than google maps
        country = map.split('country=')[1]
        state = map.split('csz=')[1].split('&')[0].split('+')[1]
        city  = map.split('csz=')[1].split('&')[0].split('+')[0]
    except: pass
    
    #get address
    try:
        address = property.find('div', class_='mapaddress').text
    except: pass
    
    #place all location data into dict
    location_dict = {}
    
    try:
        location_dict['location_data_accuracy'] = location_data_accuracy
    except: pass
    
    try:
        location_dict['latitude'] = latitude
    except: pass
    
    try:
        location_dict['longitude'] = longitude
    except: pass
    
    try:
        location_dict['country'] = country
    except: pass
    
    try:
        location_dict['state'] = state
    except: pass
    
    try:
        location_dict['city'] = city
    except: pass
        
    latitude = ""
    longitude = ""
    country = ""
    state = ""
    city = ""
    location_data_accuracy = ''

    return location_dict

In [32]:
from IPython.display import HTML
HTML('<iframe src=http://washingtondc.craigslist.org/apa/ width=1000 height=500></iframe>')

In [None]:
#Main class - loop though all listings in listings_ids and extract features

#initialize lists to collect attribute and location data
property_attributes_data = []
property_location_data = []

markets = define_markets_to_search() #Define which markets to search (e.g. Washington DC)

for market in markets:    
    pages_of_listings = count_number_of_listings(market) #Count number of listings in specified market
    listing_ids = get_listing_url_list(market, pages_of_listings) #Get a list of all the individual listing to search

    for id in listing_ids:

        time.sleep(0.2) # delay script each time it get a new listing
        try:
            #get the listing HTML and the listing URL
            property, url = get_craigslist_listing(id, market) 
        except: pass

        try:
            #create initial dict with property attributes
            property_attributes = get_property_attributes(property)
            property_attributes['url'] = url
    
            #add price, description, and image data to dict
            property_attributes['price'] = get_property_price(property)
            property_attributes['description'] = get_property_description(property).encode('utf-8')
            property_attributes['time_of_posting'] = get_posting_date_and_time(property)

            image_number, average_image_size = get_image_data(property)
            property_attributes['image_number'] = image_number
            property_attributes['average_image_size'] = average_image_size

            #add property attributes to property_attributes_data list
            property_attributes = pd.Series(property_attributes, name=id)
            property_attributes_data.append(property_attributes)
        except: pass

        try:
            #add location data to location data list
            location = get_property_location(property)
            location = pd.Series(location, name=id)    
            property_location_data.append(location)
        except: pass

    #put lists into DataFrames
    property_attributes_dataframe = pd.DataFrame(property_attributes_data)
    property_location_dataframe = pd.DataFrame(property_location_data)

0 http://washingtondc.craigslist.org/search/nva/apa
100 http://washingtondc.craigslist.org/search/nva/apa?s=100&
200 http://washingtondc.craigslist.org/search/nva/apa?s=200&
300 http://washingtondc.craigslist.org/search/nva/apa?s=300&
400 http://washingtondc.craigslist.org/search/nva/apa?s=400&
500 http://washingtondc.craigslist.org/search/nva/apa?s=500&
600 http://washingtondc.craigslist.org/search/nva/apa?s=600&
700 http://washingtondc.craigslist.org/search/nva/apa?s=700&
800 http://washingtondc.craigslist.org/search/nva/apa?s=800&
900 http://washingtondc.craigslist.org/search/nva/apa?s=900&
1000 http://washingtondc.craigslist.org/search/nva/apa?s=1000&
1100 http://washingtondc.craigslist.org/search/nva/apa?s=1100&
1200 http://washingtondc.craigslist.org/search/nva/apa?s=1200&
1300 http://washingtondc.craigslist.org/search/nva/apa?s=1300&
1400 http://washingtondc.craigslist.org/search/nva/apa?s=1400&
1500 http://washingtondc.craigslist.org/search/nva/apa?s=1500&
1600 http://washingto

In [34]:
dat = pd.merge(property_location_dataframe,property_attributes_dataframe, left_index=True, right_index=True)

In [37]:
#check results
len(dat)
dat[0:2]

Unnamed: 0,city,country,latitude,location_data_accuracy,longitude,state,availability,average_image_size,bathroom,bedroom,...,dog,housing_type,image_number,laundry,parking,price,smoking,square_footage,time_of_posting,url
5001091161,Aldie,US,38.92376,0,-77.554379,DC,available now,270000,3,4,...,dogs are OK - wooof,townhouse,14,w/d in unit,attached garage,$1950,,1850,2015-04-28 9:16pm,http://washingtondc.craigslist.org/nva/apa/500...
5001100578,McLean,US,38.923478,0,-77.226526,DC,available now,270000,1,1,...,dogs are OK - wooof,apartment,19,w/d in unit,,$1650,no smoking,992,2015-04-28 9:24pm,http://washingtondc.craigslist.org/nva/apa/500...


In [None]:
dat.to_csv(r'C:\Users\alsherman\Desktop\GitHub\DataScience_GeneralAssembly\Data\Craigslist_Data_Apr_28_.csv')

In [None]:
##unused to collect image links

#Create a list of dicts containing image number, image link, and image size
#image_data = []

#images = soup.find('figure').find_all('div')
#for pic in images[-1]:    
#    pic_dict = {}
#    pic_dict['image_number'] = pic['title']
#    pic_dict['href'] = pic['href']
#    pic_dict['image_size'] = pic['href'].split('_')[-1]
    
#    image_data.append(pic_dict)


In [26]:
#display the first five listing ids
listing_ids[0:5]

100

In [30]:
#confirm get_craigslist_listing works
print(get_craigslist_listing(listing_ids[0])[0].prettify())

<!DOCTYPE html>
<html class="no-js">
 <head>
  <title>
   Spacious 1br in a resort style gated community
  </title>
  <meta content="NOARCHIVE,NOFOLLOW" name="robots"/>
  <meta content="If you are looking for affordable Resort Style living in the heart of Alexandria, consider this spacious 1br apartment. All utilities included in the rent, internet, in the process of updating the..." name="description"/>
  <meta content="preview" name="twitter:card"/>
  <meta content="If you are looking for affordable Resort Style living in the heart of Alexandria, consider this spacious 1br apartment. All utilities included in the rent, internet, in the process of updating the..." property="og:description"/>
  <meta content="http://images.craigslist.org/01616_9Y2NpN93A06_600x450.jpg" property="og:image"/>
  <meta content="craigslist" property="og:site_name"/>
  <meta content="Spacious 1br in a resort style gated community" property="og:title"/>
  <meta content="article" property="og:type"/>
  <meta co