In [113]:
# Script for prototyping data flow
from bs4 import BeautifulSoup
import requests

# Set up base info
BASE_URL_PREFIX = "https://www.spareroom.co.uk/flatshare/?offset="
BASE_URL_SUFFIX = "&search_id=962193229&sort_by=age&mode=list"

# Utility function for formatting url
def getFormedURL(prefix, suffix, page_index):
    return prefix + str(page_index) + suffix
    

# Set up request and BS object
url = getFormedURL(BASE_URL_PREFIX, BASE_URL_SUFFIX, 0)
res = requests.get(url)
if int(res.status_code) != 200:
    print("couldnt make a successful request...")
soup = BeautifulSoup(res.text, "html.parser")

In [129]:

# Get articles for page
def getPageListings(soup):
    return soup.find_all("li", class_="listing-result")


listings = getPageListings(soup)

# THINGS WE WANT TO PARSE PER LISTING
parseKeys = [
    "data-listing-id",
    "data-listing-title",
    "data-listing-available",
    "data-listing-status",
    "data-listing-days-old",
    "data-listing-neighbourhood",
    "data-listing-property-type",
    "data-listing-rooms-in-property",
    "data-listing-advertiser-role",
    "data-default-payment"
]

def parseListing(listing, keys):
    res = {}
    for key in keys:
        
        try:
            cleaned_key = "-".join(key.split("-")[2:])
            res[cleaned_key] = listing[key]
        except:
            continue
            
    return res


def parseListings(listings, keys):
    return list(map(lambda listing: parseListing(listing, keys), listings))

def parseListingPrices(soup):
    prices = soup.find_all("strong", class_="listingPrice")
    return list(map(lambda priceElement: priceElement.text, prices))

def parseListingLinks(soup):
    links = soup.find_all("a", class_="more desktop")
    return list(map(lambda link: link["href"], links))
    
listings = parseListings(listings, parseKeys)
prices = parseListingPrices(soup)
links = parseListingLinks(soup)

In [130]:
print(links[0])

/flatshare/fad_click.pl?fad_id=2874990&search_id=962193229&offset=0&city_id=&flatshare_type=offered&search_results=%2Fflatshare%2F%3Foffset%3D0%26search_id%3D962193229%26sort_by%3Dage%26mode%3Dlist&


In [131]:
# Fuse lists into single object
def mergeData(listingInfo, prices, links):

    data = []
    for index, listing in enumerate(listingInfo, 0):
        listingObject = listing
        listingObject["price"] = prices[index]
        listingObject["link"] = links[index]
        data.append(listingObject)
    
    return data

In [132]:
cleaned_listings = mergeData(listings, prices, links)
print(cleaned_listings[0])

{'id': '2874990', 'title': ' AVOID TUBE - Aldgate Zone 1 - Walk to Office', 'available': 'Available Now', 'status': 'new', 'days-old': '1', 'neighbourhood': 'Aldgate', 'property-type': 'flat', 'rooms-in-property': '3', 'advertiser-role': 'live out landlord', 'price': '£650pcm', 'link': '/flatshare/fad_click.pl?fad_id=2874990&search_id=962193229&offset=0&city_id=&flatshare_type=offered&search_results=%2Fflatshare%2F%3Foffset%3D0%26search_id%3D962193229%26sort_by%3Dage%26mode%3Dlist&'}


In [133]:
# Setup Data flow for all the pages of listingin Zone 1
index = 0
scraped_objects = 0

scrapedListings = []


while True:
    if failed_requests == 3:
        break
    # Set up request and BS object
    url = getFormedURL(BASE_URL_PREFIX, BASE_URL_SUFFIX, index)
    res = requests.get(url)
    if int(res.status_code) != 200:
        print("couldnt make a successful request for index %s." %str(index))
        print("retrying with next page")
    soup = BeautifulSoup(res.text, "html.parser")
    
    # Extract and parse Data
    listings = getPageListings(soup)
    listings = parseListings(listings, parseKeys)
    prices = parseListingPrices(soup)
    links = parseListingLinks(soup)
    
    # Merge into single data objects and add
    data = mergeData(listings, prices, links)
    scrapedListings += data
    
    # Progression info
    print(f"Successfully scraped {len(scrapedListings)} listings...")
    
    index += 1
    

Successfully scraped 11 listings...
Successfully scraped 22 listings...
Successfully scraped 33 listings...
Successfully scraped 44 listings...
Successfully scraped 55 listings...
Successfully scraped 66 listings...
Successfully scraped 77 listings...
Successfully scraped 88 listings...
Successfully scraped 99 listings...
Successfully scraped 110 listings...
Successfully scraped 121 listings...
Successfully scraped 132 listings...
Successfully scraped 143 listings...
Successfully scraped 154 listings...
Successfully scraped 165 listings...
Successfully scraped 176 listings...
Successfully scraped 187 listings...
Successfully scraped 198 listings...
Successfully scraped 209 listings...
Successfully scraped 220 listings...
Successfully scraped 231 listings...
Successfully scraped 242 listings...
Successfully scraped 253 listings...
Successfully scraped 264 listings...
Successfully scraped 275 listings...
Successfully scraped 286 listings...
Successfully scraped 297 listings...
Successful

Successfully scraped 2330 listings...
Successfully scraped 2341 listings...
Successfully scraped 2352 listings...
Successfully scraped 2363 listings...
Successfully scraped 2374 listings...
Successfully scraped 2385 listings...
Successfully scraped 2396 listings...
Successfully scraped 2407 listings...
Successfully scraped 2418 listings...
Successfully scraped 2429 listings...
Successfully scraped 2440 listings...
Successfully scraped 2451 listings...
Successfully scraped 2461 listings...
Successfully scraped 2470 listings...
Successfully scraped 2478 listings...
Successfully scraped 2485 listings...
Successfully scraped 2491 listings...
Successfully scraped 2496 listings...
Successfully scraped 2500 listings...
Successfully scraped 2503 listings...
Successfully scraped 2505 listings...
Successfully scraped 2516 listings...
Successfully scraped 2527 listings...
Successfully scraped 2538 listings...
Successfully scraped 2549 listings...
Successfully scraped 2560 listings...
Successfully

Successfully scraped 4571 listings...
Successfully scraped 4582 listings...
Successfully scraped 4593 listings...
Successfully scraped 4604 listings...
Successfully scraped 4615 listings...
Successfully scraped 4626 listings...
Successfully scraped 4637 listings...
Successfully scraped 4648 listings...
Successfully scraped 4659 listings...
Successfully scraped 4670 listings...
Successfully scraped 4681 listings...
Successfully scraped 4692 listings...
Successfully scraped 4703 listings...
Successfully scraped 4714 listings...
Successfully scraped 4725 listings...
Successfully scraped 4736 listings...
Successfully scraped 4747 listings...
Successfully scraped 4758 listings...
Successfully scraped 4769 listings...
Successfully scraped 4780 listings...
Successfully scraped 4791 listings...
Successfully scraped 4802 listings...
Successfully scraped 4813 listings...
Successfully scraped 4824 listings...
Successfully scraped 4835 listings...
Successfully scraped 4846 listings...
Successfully

Successfully scraped 6812 listings...
Successfully scraped 6823 listings...
Successfully scraped 6834 listings...
Successfully scraped 6845 listings...
Successfully scraped 6856 listings...
Successfully scraped 6867 listings...
Successfully scraped 6878 listings...
Successfully scraped 6889 listings...
Successfully scraped 6900 listings...
Successfully scraped 6911 listings...
Successfully scraped 6922 listings...
Successfully scraped 6933 listings...
Successfully scraped 6944 listings...
Successfully scraped 6955 listings...
Successfully scraped 6966 listings...
Successfully scraped 6977 listings...
Successfully scraped 6988 listings...
Successfully scraped 6999 listings...
Successfully scraped 7010 listings...
Successfully scraped 7021 listings...
Successfully scraped 7032 listings...
Successfully scraped 7043 listings...
Successfully scraped 7054 listings...
Successfully scraped 7065 listings...
Successfully scraped 7076 listings...
Successfully scraped 7087 listings...
Successfully

Successfully scraped 9090 listings...
Successfully scraped 9100 listings...
Successfully scraped 9110 listings...
Successfully scraped 9120 listings...
Successfully scraped 9129 listings...
Successfully scraped 9137 listings...
Successfully scraped 9144 listings...
Successfully scraped 9150 listings...
Successfully scraped 9155 listings...
Successfully scraped 9159 listings...
Successfully scraped 9162 listings...
Successfully scraped 9164 listings...
Successfully scraped 9165 listings...
Successfully scraped 9175 listings...
Successfully scraped 9185 listings...
Successfully scraped 9195 listings...
Successfully scraped 9205 listings...
Successfully scraped 9215 listings...
Successfully scraped 9225 listings...
Successfully scraped 9235 listings...
Successfully scraped 9245 listings...
Successfully scraped 9255 listings...
Successfully scraped 9265 listings...
Successfully scraped 9275 listings...
Successfully scraped 9285 listings...
Successfully scraped 9295 listings...
Successfully

Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 10275 listings...
Successfully scraped 1027

KeyboardInterrupt: 

In [140]:
scrapedListings[7640]

{'id': '14984098',
 'title': 'Luxury Suite in Paddington',
 'available': 'Available Now',
 'status': 'new',
 'days-old': '1',
 'neighbourhood': 'Paddington',
 'property-type': 'flat',
 'rooms-in-property': '1',
 'advertiser-role': 'agent',
 'price': '£675pcm',
 'link': '/flatshare/flatshare_detail.pl?flatshare_id=14984098&search_id=962193229&city_id=&flatshare_type=offered&search_results=%2Fflatshare%2F%3Foffset%3D731%26search_id%3D962193229%26sort_by%3Dage%26mode%3Dlist&'}

In [142]:
df = pd.DataFrame(scrapedListings)
df

Unnamed: 0,advertiser-role,available,days-old,id,link,neighbourhood,price,property-type,rooms-in-property,status,title
0,live out landlord,Available Now,1,2874990,/flatshare/fad_click.pl?fad_id=2874990&search_...,Aldgate,£650pcm,flat,3,new,AVOID TUBE - Aldgate Zone 1 - Walk to Office
1,agent,Available Now,0,12741853,/flatshare/flatshare_detail.pl?flatshare_id=12...,Hoxton,\n£650pcm,flat,5,boosted,Nice single Rm in modern Flat - Hoxton/Shoreditch
2,agent,Available Now,0,15039631,/flatshare/flatshare_detail.pl?flatshare_id=15...,London SW3,£400pcm,flat,4,new today,🔷Classy Double Room in Chelsea area🔷
3,agent,Available Now,0,14138806,/flatshare/flatshare_detail.pl?flatshare_id=14...,Hoxton,\n£400pcm,flat,5,boosted,Modern Double in new-built Flat - Hoxton/Shore...
4,agent,Available Now,0,15039616,/flatshare/flatshare_detail.pl?flatshare_id=15...,London W8,"£870- £1,050pcm",flat,5,new today,🏢Special Double Room in Kensington area🏢
5,live out landlord,9 Jun,0,15039457,/flatshare/flatshare_detail.pl?flatshare_id=15...,Clerkenwell,"\n£870- £1,050pcm",flat,5,new today,Bright double room in friendly clean flat
6,agent,Available Now,0,15039577,/flatshare/flatshare_detail.pl?flatshare_id=15...,London SW1V,£500pcm,flat,5,new today,🌅En-suite Double Room in Victoria area🌅
7,agent,Available Now,0,15039394,/flatshare/flatshare_detail.pl?flatshare_id=15...,Marylebone,\n£500pcm,flat,3,new today,🔵 Promotion! Marylebone
8,agent,Available Now,0,15039535,/flatshare/flatshare_detail.pl?flatshare_id=15...,London NW1,"£950- £1,060pcm",flat,4,new today,🌇Elegant Double Room in Baker Street area🌇
9,agent,Available Now,0,15014524,/flatshare/flatshare_detail.pl?flatshare_id=15...,Marylebone,"\n£950- £1,060pcm",flat,2,boosted,Amazing Rooms Marylebone


In [144]:
df.to_csv("data/listings/raw")