## Required imports

In [77]:
# imports
from lxml import html, etree
import requests
import pandas as pd
import datetime as dt

## Function

In [79]:
def rightmove_webscrape(rightmove_url, results_per_page=24):
    """Scrape every results page of a rightmove.co.uk search into a DataFrame.

    Parameters
    ----------
    rightmove_url : str
        URL of a results page of the search. The value of its ``&index=``
        parameter is replaced to step through the result pages; if the
        parameter is missing it is appended to the URL.
    results_per_page : int, optional
        Number of listings per results page. It is used to work out how many
        pages to request and as the step of the ``index`` parameter.
        Defaults to 24.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns ``price`` (numeric, every
        non-digit character removed), ``type``, ``address``, ``url`` and
        ``date`` (the day the site was queried, formatted ``dd/mm/YYYY``).

    Raises
    ------
    IndexError
        If the result count cannot be found on the first page.
    """
    columns = ['price', 'type', 'address', 'url']

    # Split the url around the *whole* index value (it may have several
    # digits, e.g. '&index=24') so that it can be substituted page by page.
    url_start, separator, remainder = rightmove_url.partition('&index=')
    url_start = url_start + '&index='
    url_end = remainder.lstrip('0123456789') if separator else ''

    # Get the total number of results from the search.
    page = requests.get(rightmove_url, timeout=30)
    tree = html.fromstring(page.content)
    xp_result_count = '//span[@class="searchHeader-resultCount"]/text()'
    count_text = tree.xpath(xp_result_count)
    if not count_text:
        raise IndexError('Result count not found in page: ' + rightmove_url)
    # Counts of 1,000 or more contain a thousands separator.
    result_count = int(count_text[0].replace(',', ''))

    # Number of pages to request: ceiling division, kept as an integer so
    # that range() accepts it on Python 3 as well as Python 2.
    page_total = -(-result_count // results_per_page)

    # Set the Xpath variables for the loop (same order as `columns`).
    xp_prices = '//span[@class="propertyCard-priceValue"]/text()'
    xp_titles = '//div[@class="propertyCard-details"]//a[@class="propertyCard-link"]//h2[@class="propertyCard-title"]/text()'
    xp_addresses = '//address[@class="propertyCard-address"]/text()'
    xp_weblinks = '//div[@class="propertyCard-details"]//a[@class="propertyCard-link"]/@href'
    xpaths = [xp_prices, xp_titles, xp_addresses, xp_weblinks]

    # Loop through the search result web pages, keeping one frame per page.
    page_frames = []
    for page_number in range(page_total):
        page_url = url_start + str(page_number * results_per_page) + url_end
        page = requests.get(page_url, timeout=30)
        tree = html.fromstring(page.content)

        # One list per column; transposing turns them into one row per
        # listing (shorter lists are padded with missing values).
        data = [list(tree.xpath(xp)) for xp in xpaths]
        page_df = pd.DataFrame(data).transpose()
        page_df.columns = columns

        # Drop empty rows which come from placeholders in the html file.
        page_df = page_df[page_df['url'] != '/property-for-sale/property-0.html']

        if not page_df.empty:
            page_frames.append(page_df)

    # Join the page frames once, renumbering the index so it has no
    # duplicate values.
    if page_frames:
        df = pd.concat(page_frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    # Convert price column to numeric values for analysis. The result is
    # assigned back instead of mutating `df.price` in place, which is not
    # guaranteed to update the DataFrame.
    digits_only = df['price'].replace(to_replace=r'\D', value=r'', regex=True)
    df['price'] = pd.to_numeric(digits_only)

    # Add in date column with date website was queried (i.e. today's date).
    df['date'] = dt.datetime.today().strftime("%d/%m/%Y")

    # Optional line to export the results to CSV if you wish to inspect them
    # in an alternative program.
    #     df.to_csv('rightmove_df.csv',encoding='utf-8')

    return df

## Using the function

In [80]:
# Search to scrape: run a search on rightmove.co.uk and paste the url of its
# first results page here. The url is written one query parameter per line so
# that individual settings are easy to spot and edit.
rightmove_url = (
    'http://www.rightmove.co.uk/property-to-rent/find.html'
    '?locationIdentifier=REGION%5E70417'
    '&numberOfPropertiesPerPage=24'
    '&radius=0.0'
    '&sortType=6'
    '&index=0'
    '&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow'
    '&maxDaysSinceAdded=7'
    '&includeLetAgreed=false'
    '&viewType=LIST'
    '&currencyCode=GBP'
)

In [81]:
# Scrape every results page of the search into a single DataFrame.
df = rightmove_webscrape(rightmove_url)

# Preview the first five listings.
df.head(n=5)

## Analysis

In [83]:
# print() with a single argument gives the same output on Python 2 and
# Python 3, whereas the print statement is a SyntaxError on Python 3.
print('Different types of housing returned in the search:')
print('')
print(df['type'].unique())

array(['1 bedroom flat', '2 bedroom maisonette', '1 bedroom apartment',
       '2 bedroom apartment', '2 bedroom flat', '3 bedroom apartment',
       'Studio flat', '3 bedroom terraced house',
       '4 bedroom terraced house', '1 bedroom property', '3 bedroom flat',
       '2 bedroom duplex', '1 bedroom detached house',
       '4 bedroom apartment', '2 bedroom semi-detached house'], dtype=object)

In [None]:
# print() with a single argument gives the same output on Python 2 and
# Python 3; the amount is formatted into the string so the line still reads
# as the pound sign, a space and the mean rounded to 2 decimal places.
print('Average rent pcm - all results:')
print('')
print('£ {}'.format(round(df.price.mean(), 2)))

In [None]:
# error checking

# The below is a method for exporting the full html text from the url for inspection if required.
page = requests.get(rightmove_url)
tree = html.fromstring(page.content)
html_text = etree.tostring(tree)

# etree.tostring() returns bytes, so the file is opened in binary mode (text
# mode rejects bytes on Python 3). The with-block closes the file even if the
# write fails, and the handle no longer shadows the Python 2 builtin `file`.
with open("html.txt", "wb") as html_file:
    html_file.write(html_text)