In [25]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
import time
import random

In [2]:
def scrape_housing_links(dist, postal,
    url = "https://sandiego.craigslist.org/d/apartments-housing-for-rent/search/apa?s={page}&availabilityMode=0&postal={zip_code}&search_distance={miles}",
    page_size = 120,
    delay = 2
    ):
    """
    Collect the links to individual postings from a craigslist housing search.

    Input:
        dist: value substituted into the {miles} field of url
        postal: value substituted into the {zip_code} field of url
        url: search URL template with {page}, {zip_code} and {miles} fields
        page_size: amount added to the {page} offset after each request
        delay: seconds to sleep between two consecutive requests
    Returns: a set with the href of every 'result-title hdrlnk' anchor found
    on the result pages. If the first page carries no readable 'totalcount'
    span, only the links found on that first page are returned.
    Raises: ValueError if page_size is smaller than 1.
    """
    if page_size < 1:
        # A non-positive step would never advance the offset.
        raise ValueError("page_size must be at least 1")
    posting_links = set()
    current, total_count = 0, None
    while total_count is None or current < total_count:
        if current:
            # Pause between requests; nothing precedes the first one.
            time.sleep(delay)
        # Access the website and parse the webpage
        page_url = url.format(miles = dist, zip_code = postal, page = current)
        listing = requests.get(page_url)
        soup = BeautifulSoup(listing.text, 'html.parser')
        if total_count is None:
            # The first page doubles as the source of the total result count,
            # so it does not have to be downloaded a second time.
            counter = soup.find('span', attrs = {'class': 'totalcount'})
            try:
                total_count = int(counter.text) if counter is not None else 0
            except ValueError:
                total_count = 0
        # Find all of the links that redirect to the posting on craigslist
        for anchor in soup.find_all('a', attrs = {'class': 'result-title hdrlnk'}):
            href = anchor.get('href')
            if href:
                # Anchors without an href would otherwise add None to the set.
                posting_links.add(href)
        current += page_size
    return posting_links

In [3]:
# Gather posting links using search distance 10 and postal code 92037.
links = scrape_housing_links(10, 92037)

In [5]:
# Keep only the links whose URL contains 'sandiego.craigslist'; every other link is dropped.
links2 = [link for link in links if 'sandiego.craigslist' in link]

In [6]:
len(links2)

1300

In [8]:
def download_webpages(links):
    """
    Download every posting and parse it into a soup object.

    Input: links is an iterable of posting URLs
    Returns: a list of BeautifulSoup objects, one per link, in iteration order
    """
    pages = []
    for link in links:
        # Wait a random 2-10 seconds before each request.
        time.sleep(random.randint(2, 10))
        listing = requests.get(link)
        # Name the parser explicitly (the same one scrape_housing_links uses):
        # without it BeautifulSoup picks whichever parser happens to be
        # installed and emits a warning, so results can differ per machine.
        pages.append(BeautifulSoup(listing.text, 'html.parser'))
    return pages

In [9]:
# Fetch and parse every filtered link; download_webpages sleeps 2-10 seconds per link, so this cell is slow.
downloaded_pages = download_webpages(links2)

In [14]:
len(downloaded_pages)

1300

In [71]:
def scrape_basic_info(post):
    """
    Input: post takes in a soup object of a craigslist posting
    Returns: the list [post_id, post_date], or None when the og:url meta tag,
    the numeric id in that URL, or the 'date timeago' time tag is missing.
    post_id is the run of digits directly in front of ".html" in the og:url;
    post_date is the 'datetime' attribute of the time tag.
    """
    try:
        url = post.find("meta", property="og:url").get('content')
        # Raw string with an escaped dot: the id must be followed by a literal
        # ".html", so digits inside the title slug (e.g. "450-html") are not
        # mistaken for the post id.
        post_id = re.search(r'(\d+)\.html', url).group(1)
        post_date = post.find('time', attrs = {'class': 'date timeago'}).get('datetime')
    except (AttributeError, TypeError):
        # AttributeError: a lookup returned None; TypeError: the URL was not a
        # string (or post is not a soup object at all).
        return None
    return [post_id, post_date]


In [58]:
def scrape_listing_info(post):
    """
    Input: post takes in a soup object of a craigslist posting
    Returns: the list [num_beds, num_baths, price, address]
        num_beds: int parsed from the part of the first 'shared-line-bubble'
            span before '/', or 0 when it cannot be parsed
        num_baths: string taken from the part after '/', or 0 when there is none
        price: text of the 'price' span without '$' and ',' (a string), or the
            integer -1 when the span is missing
        address: text of the 'mapaddress' div, or None when there is none
    The size of the listing is not scraped yet.
    """
    price = post.find('span', attrs = {'class': 'price'})
    if price is not None:
        price = price.text.strip('$').replace(',', '')
    else:
        price = -1

    num_beds, num_baths = 0, 0
    bubble = post.find('span', attrs = {'class': 'shared-line-bubble'})
    if bubble is not None:
        parts = bubble.text.split('/')
        try:
            beds = int(parts[0].strip('BR '))
        except ValueError:
            # The first bubble is not a "<n>BR / <m>Ba" line; keep both defaults.
            pass
        else:
            num_beds = beds
            # A missing bath segment no longer discards the bedroom count.
            if len(parts) > 1:
                num_baths = parts[1].strip(' Ba')

    address = post.find('div', attrs = {'class': 'mapaddress'}) #unable to scrape address if there is none
    if address is not None:
        address = address.text
    # TODO: scrape the size (sqft) from the second 'shared-line-bubble' span.
    return [num_beds, num_baths, price, address]

In [59]:
def scrape_desc(post):
    """
    Input: post takes in a soup object of a craigslist posting
    Returns: the text of the 'postingbody' section with surrounding whitespace
    removed, or None when the posting has no such section
    """
    body = post.find('section', attrs = {'id': 'postingbody'})
    return None if body is None else body.text.strip()

In [60]:
def scrape_features(post):
    """
    Input: post takes in a soup object of a craigslist posting
    Returns: the list of span tags inside the second 'attrgroup' paragraph
    (the features indicated by the poster), or None when the posting has
    fewer than two 'attrgroup' paragraphs
    """
    groups = post.find_all('p', attrs = {'class': 'attrgroup'})
    # The spans are read from index 1, so at least two groups must exist;
    # a posting with a single group would otherwise raise IndexError.
    if len(groups) < 2:
        return None
    return groups[1].find_all('span')

In [73]:
def scrape_post(post):
    """
    Input: post takes in a soup object of a craigslist posting
    Returns: the list [post_id, post_date, description, info, features], or
    None when scrape_basic_info cannot identify the posting
    """
    basics = scrape_basic_info(post)
    if basics is None:
        return None
    post_id, post_date = basics[0], basics[1]
    # Features are gathered first, then the description, then the listing info.
    features = scrape_features(post)
    return [post_id, post_date, scrape_desc(post), scrape_listing_info(post), features]

In [78]:
def scrape_posting(posts):
    scraped_posts = []
    for post in posts:
        parsed_data = scrape_post(post)
        if parsed_data is None:
            continue
        scraped_posts.append(parsed_data)
    return scraped_posts

In [63]:
def convertToCSV(posts):
    """
    Input: posts is a list of rows ordered as post_id, post_date, description,
    info, features
    Writes the rows, with the frame's index, to "scrapped craigslist post.csv"
    in the current working directory. Returns nothing.
    """
    column_names = ("post_id", "post_date", "description", "info", "features")
    frame = pd.DataFrame(posts, columns = list(column_names))
    frame.to_csv("scrapped craigslist post.csv")

In [79]:
# Parse the downloaded pages into rows; postings that scrape_post rejects (returns None for) are left out.
data = scrape_posting(downloaded_pages)

In [81]:
# Persist the parsed rows to "scrapped craigslist post.csv" in the working directory.
convertToCSV(data)

In [84]:
# Build the same frame in memory for inspection; the column names repeat the ones used in convertToCSV.
headers = ["post_id", "post_date", "description", "info", "features"]
df = pd.DataFrame(data, columns = headers)
# Show the dtype pandas assigned to each column.
df.dtypes

post_id        object
post_date      object
description    object
info           object
features       object
dtype: object

In [85]:
df

Unnamed: 0,post_id,post_date,description,info,features
0,7406258566,2021-11-10T14:23:12-0800,QR Code Link to This Post\n\n\nPOINT LOMA BAY ...,"[1, 1, 1940, 2449 SOTO ST]","[[EV charging], [cats are OK - purrr], [dogs a..."
1,7402763878,2021-11-02T17:49:42-0700,QR Code Link to This Post\n\n\nBedrooms: 3Bath...,"[3, 2, 3911, 7777 Westside Drive]","[[apartment], [w/d in unit], [carport]]"
2,7405170179,2021-11-08T09:03:56-0800,QR Code Link to This Post\n\n\nElan Beachhouse...,"[1, 1, 2995, 2515 Camino del Mar]","[[application fee details: , [$35.00]], [cats ..."
3,7406281281,2021-11-10T15:17:02-0800,QR Code Link to This Post\n\n\n We are open fo...,"[1, 1, 2858, 4855 Ariva Way, #130]","[[cats are OK - purrr], [dogs are OK - wooof],..."
4,7405635647,2021-11-09T09:06:20-0800,QR Code Link to This Post\n\n\nEncompass yours...,"[2, 2, 3535, 5280 Fiore Terrace]","[[EV charging], [cats are OK - purrr], [dogs a..."
...,...,...,...,...,...
1292,7403655906,2021-11-04T17:10:53-0700,QR Code Link to This Post\n\n\n2 Bd. 2 Bath (u...,"[2, 2, 1800, 4065 iowa st. near El Cajon Blvd.]","[[cats are OK - purrr], [flooring: , [carpet]]..."
1293,7405944950,2021-11-09T23:37:17-0800,QR Code Link to This Post\n\n\nThese spectacul...,"[1, 1, 2695, Kellogg St near Rosecrans]","[[application fee details: , [$40/adult applic..."
1294,7402527487,2021-11-02T09:47:51-0700,QR Code Link to This Post\n\n\nSan Diego's mos...,"[2, 2, 3819, 5395 Napa St]","[[apartment], [w/d in unit], [detached garage]]"
1295,7405332384,2021-11-08T13:48:34-0800,QR Code Link to This Post\n\n\nSDHC affordable...,"[1, 1, 1500, Washington St]","[[air conditioning], [flooring: , [wood]], [ap..."


In [7]:
# def scrape_posting(posts):
#     quick_info = dict()
#     for post in posts:
#         try:
#             listing = requests.get(post)
#             content = listing.text
#             soup = BeautifulSoup(content)
#             if soup is None:
#                 continue
#             price = soup.find('span', attrs = {'class': 'price'}).text.strip('$').replace(',', '')
#             temp = soup.find('span', attrs = {'class': 'shared-line-bubble'}).text.split('/')
#             num_beds = temp[0].strip('BR ')
#             num_baths = temp[1].strip(' Ba')
#             post_date = soup.find('time', attrs = {'class': 'date timeago'}).get('datetime')
#             try:
#                 post_id = soup.find_all('p', attrs = {'class': 'postinginfo'})[1].text.strip('post id: ')
#             except:
#                 post_id = None
#             try:
#                 sqft = soup.find_all('span', attrs = {'class': 'shared-line-bubble'})[1].text.strip('ft2')
#             except:
#                 sqft = 0
#             # needs additional parsing, considering one hot encoding
#             try:
#                 attributes = set(soup.find_all('p', attrs = {'class': 'attrgroup'})[1].find_all('span')) 
#             except:
#                 attributes = None
#             description = soup.find('section', attrs = {'id': 'postingbody'})
#             try:
#                 address = soup.find('div', attrs = {'class': 'mapaddress'}).text
#             except:
#                 address = "N/A"
#             attrs = {
#                 'post_id': post_id,
#                 'post_date': post_date,
#                 'price': price,
#                 'num_beds': num_beds,
#                 'num_baths': num_baths,
#                 'sqft': sqft,
#                 'address': address,
#                 'attributes': attributes,
#                 'description': description
#             }
#         except:
#             print('scrape failed for {}'.format(post))
#         quick_info[post_id] = attrs
#     return quick_info

In [8]:
# NOTE(review): this cell passes the raw link set, while the scrape_posting defined
# earlier in this notebook expects soup objects and returns a list. The saved output
# ("scraping <url>" lines) and the df.values() call in the next cell suggest it was
# run against an older, dict-returning scrape_posting (compare the commented-out
# draft above) - confirm before re-running; it also rebinds df.
df = scrape_posting(links)

scraping https://sandiego.craigslist.org/nsd/apa/d/solana-beach-your-luxury-beach-studio/7399870869.html
scraping https://sandiego.craigslist.org/csd/apa/d/san-diego-this-br-will-not-last-rent-me/7388318340.html
scraping https://sandiego.craigslist.org/csd/apa/d/san-diego-charming-and-cozy-la-jolla/7402619134.html
scraping https://sandiego.craigslist.org/csd/apa/d/san-diego-hardwood-flooring-amazing/7399931888.html
scraping https://sandiego.craigslist.org/csd/apa/d/san-diego-studio/7393806067.html
scraping https://sandiego.craigslist.org/nsd/apa/d/solana-beach-furnished-short-term/7401624145.html
scraping https://sandiego.craigslist.org/csd/apa/d/san-diego-room-for-rent-in-the-heart-of/7402224053.html
scraping https://sandiego.craigslist.org/csd/apa/d/san-diego-complimentary-coffee-bar/7402289124.html
scraping https://sandiego.craigslist.org/nsd/apa/d/solana-beach-relaxed-living-is-at-elan/7402499391.html
scraping https://sandiego.craigslist.org/csd/apa/d/del-mar-carmel-valley-corner-u

In [10]:
# Build a frame from df.values(); this requires df to be a mapping here, not the DataFrame
# assigned to df earlier in the notebook - presumably the dict from the older scrape_posting, verify.
test = pd.DataFrame(data = df.values())

In [13]:
test.head(2)

Unnamed: 0,post_id,post_date,price,num_beds,num_baths,sqft,address,attributes,description
0,7399870869,2021-10-27T08:40:07-0700,2120,0,1,450,833 S. Cedros Ave. near Via De La Valle,"{[cats are OK - purrr], [off-street parking], ...","[\n, [\n, [QR Code Link to This Post], \n, [],..."
1,7388318340,2021-10-01T16:17:06-0700,4009,3,2,1323,5395 Napa St,"{[attached garage], [apartment], [w/d in unit]}","[\n, [\n, [QR Code Link to This Post], \n, [],..."


In [19]:
# Convert the price column to integers (overwrites the column in place).
test['price'] = test['price'].astype(int)

In [21]:
test['price'].mean()

2904.106365834005

In [23]:
# Write the frame to 'test_craigslist.json' in the working directory.
test.to_json('test_craigslist.json')