In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
! python --version

Python 3.10.4


# Rightmove Webscraper

In [3]:
import datetime as dt
from lxml import html, etree
import pandas as pd
import requests

#from rightmove_webscraper import rightmove_data
from property_scraper.rightmove_data import RightmoveData

## Python class to scrape property listings from the <a href="http://www.rightmove.co.uk">rightmove</a> website

In [4]:
# Basic for-sale search URL (searchType=SALE) for location REGION%5E94346, with no further filters.
url = "https://www.rightmove.co.uk/property-for-sale/find.html?searchType=SALE&locationIdentifier=REGION%5E94346"
# Build a RightmoveData instance on that URL.
# NOTE(review): `rm` is not referenced again in the visible notebook — confirm whether this cell is still needed.
rm = RightmoveData(url)

### Example scraping properties for sale

URL returned from searching with the following criteria:

* Property for SALE
* Area = "Battersea Power Station"
* Radius = within 3 miles
* Price range, bedrooms = no min or max
* Property type = Any
* Added to site within =  last 7 days

In [5]:
# Search URL for the for-sale example. Adjacent string literals inside the
# parentheses are joined by the parser, so each query parameter group can sit
# on its own line without backslash continuations.
sale_url = (
    "http://www.rightmove.co.uk/property-for-sale/find.html?searchType=SALE"
    "&locationIdentifier=REGION%5E94346&insId=1"
    "&radius=3.0"
    "&minPrice=&maxPrice="
    "&minBedrooms=&maxBedrooms="
    "&displayPropertyType="
    "&maxDaysSinceAdded=7"
    "&_includeSSTC=on"
    "&sortByPriceDescending="
    "&primaryDisplayPropertyType="
    "&secondaryDisplayPropertyType="
    "&oldDisplayPropertyType="
    "&oldPrimaryDisplayPropertyType="
    "&newHome="
    "&auction=false"
)

In [6]:
# Create the instance of the class on the search URL.
sale_object = RightmoveData(sale_url)

ConnectionError: HTTPConnectionPool(host='www.rightmove.co.uk', port=80): Max retries exceeded with url: /property-for-sale/find.html?searchType=SALE&locationIdentifier=REGION%5E94346&insId=1&radius=3.0&minPrice=&maxPrice=&minBedrooms=&maxBedrooms=&displayPropertyType=&maxDaysSinceAdded=7&_includeSSTC=on&sortByPriceDescending=&primaryDisplayPropertyType=&secondaryDisplayPropertyType=&oldDisplayPropertyType=&oldPrimaryDisplayPropertyType=&newHome=&auction=false&index=288 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f83b19632b0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [None]:
# The .rent_or_sale attribute tells you what type of properties the search URL is for.
sale_object.rent_or_sale

In [None]:
# The .results_count attribute returns the number of search results returned by the URL.
sale_object.results_count

In [None]:
# The .page_count attribute returns the number of search pages returned by the URL.
# Note that the rightmove website limits the number of results pages to a maximum of 42
sale_object.page_count

In [None]:
# The .get_results attribute (accessed without call parentheses) returns all listings in a pandas dataframe.
sale_results = sale_object.get_results
# Preview the first few listings.
sale_results.head()

## Analysis of the London Rental Property Market
#### Toby Petty - 11 March 2018
Analysis of the London rental property market using all rental listings added to <a href="http://www.rightmove.co.uk" target="_blank">rightmove</a> in the last 24 hours.

### Example scraping properties for rent

URL returned from searching with the same criteria as above, except property for RENT instead of for sale, added in the last 24 hours:

In [None]:
# Rental search URL: REGION%5E87490, radius 0.0, added within the last day.
# NOTE: the following cell assigns `rent_url` again, so this value is
# overwritten before it is used when the notebook is run top to bottom.
rent_url = (
    "http://www.rightmove.co.uk/property-to-rent/find.html?searchType=RENT&"
    "locationIdentifier=REGION%5E87490&insId=1"
    "&radius=0.0"
    "&minPrice=&maxPrice="
    "&minBedrooms=&maxBedrooms="
    "&displayPropertyType="
    "&maxDaysSinceAdded=1"
    "&sortByPriceDescending="
    "&_includeLetAgreed=on"
    "&primaryDisplayPropertyType="
    "&secondaryDisplayPropertyType="
    "&oldDisplayPropertyType="
    "&oldPrimaryDisplayPropertyType="
    "&letType="
    "&letFurnishType="
    "&houseFlatShare="
)

In [None]:
# Rental search URL: REGION%5E94346, radius 3.0, added within the last 7 days.
# Adjacent literals are concatenated by the parser, replacing the
# backslash-continued single literal.
rent_url = (
    'http://www.rightmove.co.uk/property-to-rent/find.html?searchType=RENT'
    '&locationIdentifier=REGION%5E94346&insId=3'
    '&radius=3.0'
    '&minPrice=&maxPrice='
    '&minBedrooms=&maxBedrooms='
    '&displayPropertyType='
    '&maxDaysSinceAdded=7'
    '&sortByPriceDescending='
    '&_includeLetAgreed=on'
    '&primaryDisplayPropertyType='
    '&secondaryDisplayPropertyType='
    '&oldDisplayPropertyType='
    '&oldPrimaryDisplayPropertyType='
    '&letType='
    '&letFurnishType='
    '&houseFlatShare=false'
)

Create the instance of the class on the search URL to scrape rightmove.

In [None]:
rent_object = RightmoveData(rent_url)

In [None]:
# The .rent_or_sale attribute tells you what type of properties the search URL is for.
rent_object.rent_or_sale

In [None]:
# The .results_count attribute returns the number of search results returned by the URL.
rent_object.results_count

In [None]:
# The .page_count attribute returns the number of search pages returned by the URL.
# Note that the rightmove website limits the number of results pages to a maximum of 42
rent_object.page_count

In [None]:
# The .get_results attribute (accessed without call parentheses) returns all listings in a pandas dataframe.
rent_results = rent_object.get_results
# Preview the first few listings.
rent_results.head()

## Using the function

To use the function you first need to go to http://www.rightmove.co.uk/ and perform your search based on your desired criteria (e.g. 1 bedroom flats to rent in London Fields added to the website in the last 7 days). When the first page of results comes up copy the long url from the browser window and set it as the *rightmove_url* variable (in this example the search is for all residential properties to rent in London added to the website today):

In [None]:
# Example query: London fields, to rent, added last 7 days:
'http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E70417&numberOfPropertiesPerPage=24&radius=0.0&sortType=6&index=0&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow&maxDaysSinceAdded=7&includeLetAgreed=false&viewType=LIST&currencyCode=GBP'

In [None]:
# Example query: All London, to rent, added today:
'http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&numberOfPropertiesPerPage=24&radius=0.0&sortType=6&index=0&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow&maxDaysSinceAdded=1&includeLetAgreed=false&viewType=LIST&currencyCode=GBP'

In [None]:
rightmove_url = 'http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&numberOfPropertiesPerPage=24&radius=0.0&sortType=6&index=0&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow&maxDaysSinceAdded=1&includeLetAgreed=false&viewType=LIST&currencyCode=GBP'

Then simply run the function on the url variable to create the dataframe. Here we'll assign the results to the *df* variable:

In [None]:
df = rent_object.rightmove_webscrape(rightmove_url)

We can look at the first few rows of data to check that the function worked as expected:

In [None]:
df.head()

And export the full results to *csv* for analysis:

In [None]:
df.to_csv('../output/search_results.csv',encoding='utf-8',index=False)

## Optional html export

In the event that the search does not return results as expected it may be that the Xpaths have changed and need updating; alternatively you may wish to add in additional Xpaths to collect more data. The below will export the full html text file from whichever url you set as the variable *rightmove_url*.

In [None]:
# Export the full HTML of `rightmove_url` so it can be inspected in detail
# (e.g. to check whether the XPaths still match the page).
page = requests.get(rightmove_url)
tree = html.fromstring(page.content)
html_text = etree.tostring(tree)  # returns bytes under Python 3

# `html_text` is bytes, so the file must be opened in binary mode; writing
# bytes to a text-mode ("w") handle raises TypeError.
with open("../data/html.txt", "wb") as f:
    f.write(html_text)

In [None]:
# Read in the csv file for analysis
df = pd.read_csv('../output/search_results.csv')
df.head()

## Analysis

Quick look at the shape of the data.

In [None]:
df.describe()

In [None]:
sorted(df.number_bedrooms.unique())

See which 'types' do not have bedroom number extracted

In [None]:
df[df.number_bedrooms.isnull()].type.unique()

In [None]:
# The column maximum is the most expensive listing's price; no need to filter
# the frame on that value and then index into the result.
print(f"Most expensive: {df['price'].max()} GBP.")

Create a data frame with summary statistics by number of bedrooms.

In [None]:
# Summary statistics per bedroom count: number of listings and mean price.
#
# A groupby keys each row on the actual `number_bedrooms` value. The previous
# loop used the row position `i` as the bedroom count, which is only right
# when the distinct values are exactly 0, 1, 2, ... with no gaps; it also
# assigned through chained indexing (`df['count'].loc[i] = ...`), which is not
# guaranteed to write back to the frame.
df_by_bedroom = (
    df.dropna(subset=['number_bedrooms'])                       # listings without a bedroom count are excluded
      .assign(number_bedrooms=lambda listings: listings['number_bedrooms'].astype(int))
      .groupby('number_bedrooms')['price']                      # groupby sorts by bedroom count
      .agg(count='size', average_price='mean')                  # size = number of listings in the group
      .reset_index()                                            # columns: number_bedrooms, count, average_price
)

Plots to visualise results by number of bedrooms.

In [None]:
rent_object.plot_by_bedroom(df_by_bedroom)

## Using the postcode csv to find the property borough

Read in the csv

In [None]:
london_postcodes_df = pd.read_csv('../data/london_postcodes.csv')
london_postcodes_df.head()

In [None]:
# Tidy up column names

rent_object.fix_characters_in_column_names(london_postcodes_df)
london_postcodes_df.head()

Create new columns for the stem & end of the postcodes.

In [None]:
# Split every postcode on the space into two string columns, then attach them
# to the postcode table by row index.
postcode_split = (
    london_postcodes_df['Postcode']
    .str.split(' ', expand=True)
    .astype(str)
)
postcode_split.columns = ['pc_stem', 'pc_end']
pc_df = london_postcodes_df.merge(
    postcode_split,
    how='left',
    left_index=True,
    right_index=True,
)
pc_df.head()

Create a pivot table of the postcode stem by how many full postcodes sit in each borough

In [None]:
pivot = pd.DataFrame(pd.pivot_table(pc_df, values=['Postcode'], index=['pc_stem'], columns=['London_Borough'], aggfunc='count'))
pivot.head()

Unfortunately many postcode stems have full postcodes in more than one borough; e.g. N1 has postcodes in both Islington and Hackney. We'll just default to whichever borough has the greatest number of postcodes for each postcode stem.

Create dataframe of postcode stems & boroughs

In [None]:
# For each postcode stem, pick the borough column with the highest postcode count.
top_borough = pivot.idxmax(axis=1)
# `pivot` was built with values=['Postcode'] (a list), so its columns are a
# MultiIndex and idxmax yields ('Postcode', <borough>) tuples. Keep only the
# borough label so the lookup holds plain borough names rather than tuples.
top_borough = top_borough.map(
    lambda label: label[-1] if isinstance(label, tuple) else label
)
# The Series is unnamed, so the value column is called 0; the merge cell
# further down renames column 0 to 'borough'.
pc_borough_lookup = pd.DataFrame(top_borough).reset_index()

Merge borough column into main df

In [None]:
df.postcode.unique()

In [None]:
# Cast both join keys to str so the left join compares like with like.
pc_borough_lookup['pc_stem'] = pc_borough_lookup['pc_stem'].astype(str)
df['postcode'] = df['postcode'].astype(str)
# Left join keeps every listing; the lookup's value column (named 0) becomes 'borough'.
df = (
    df.merge(pc_borough_lookup, how='left', left_on='postcode', right_on='pc_stem')
      .rename(columns={0: 'borough'})
)

In [None]:
df.head()

In [None]:
print(f"Number of records in full dataframe: {len(df)}")
print(f"Number of records with borough data: {len(df[~pd.isnull(df['borough'])])}")

#### Focus on 1 beds by borough

In [None]:
# Listings that have a borough and exactly one bedroom.
has_borough = df['borough'].notnull()
is_one_bed = df['number_bedrooms'] == 1
indices = df.index[has_borough & is_one_bed]
# Select those rows and store the bedroom count as an integer.
df_1beds = df.loc[indices].astype({'number_bedrooms': int})
df_1beds.head()

In [None]:
# Count and mean price of one-bed listings per borough.
borough_pivot = pd.pivot_table(data=df_1beds, values='price', index='borough', aggfunc=('count', 'mean'))
display(borough_pivot.head())
# Order by mean price (highest first), give the columns readable names and
# make the listing count an integer.
borough_pivot = (
    borough_pivot
    .sort_values(by='mean', ascending=False)
    .rename(columns={'count':'number of property listings', 'mean':'mean price'})
    .astype({'number of property listings': int})
)
borough_pivot

Number of Bedrooms

In [None]:
# Average prices by number of bedrooms
pd.pivot_table(df, values='price', index='number_bedrooms', aggfunc=('mean','count'),dropna=True)

In [None]:
df.count()

In [None]:
print(f"Average rent pcm - all results: {round(df.price.mean(), 2)} GBP.")

### How many listings are there for each type of apartment?
E.g. Studios, 1-beds, 2-beds etc.

In [None]:
rent_object.plot_by_type()

## Which postcode areas have the most listings?

In [None]:
rent_object.plot_by_postcode(number_to_plot = 25)

## What are the average prices and number of listings in each London borough?

In [None]:
#rent_object.add_inner_outer(df, '../data/inner_outer_london.csv')

First off for all types of apartment:

In [None]:
# Postcode file used to add borough data to the listings.
filename = "../data/london_postcodes.csv"
postcodes_df = pd.read_csv(filename, low_memory = False)

# Same pipeline as a single nested call, unrolled innermost-first:
# add borough -> add inner/outer flag -> drop rows lacking a borough -> summarise price by borough.
# NOTE(review): `add_inner_outer` receives the postcode file here, while the commented-out
# cell above passes '../data/inner_outer_london.csv' — confirm which file it expects.
listings_with_borough = rent_object.add_borough(df, postcodes_df)
listings_with_zone = rent_object.add_inner_outer(listings_with_borough, filename)
listings_borough_known = rent_object.remove_null_rows(listings_with_zone, "borough")
price_by_borough_df = rent_object.summary_df(listings_borough_known, "borough", "price", filename)

rent_object.borough_scatterplot(
    price_by_borough_df, "count", "mean", "borough",
    "London Rentals: Average Price by Number of Listings",
    "Number of Listings", "Average Price", "inner_outer",
)

Next just for one-beds:

In [None]:
# Restrict to one-bedroom listings. The comparison is against the integer 1:
# the rest of the notebook treats `number_bedrooms` as numeric (the earlier
# one-bed-by-borough cell filters with `== 1`), so comparing with the string
# "1" selects no rows. `.copy()` gives an independent frame so that any column
# the helper methods add is not written to a slice of `df`.
one_beds = df[df["number_bedrooms"] == 1].copy()

# Same pipeline as for all property types:
# add borough -> add inner/outer flag -> drop rows lacking a borough -> summarise price by borough.
# NOTE(review): the all-types cell passes `filename` as a fourth argument to
# `summary_df`; this call does not — confirm which form the method expects.
one_beds_with_borough = rent_object.add_borough(one_beds, postcodes_df)
one_beds_with_zone = rent_object.add_inner_outer(one_beds_with_borough, filename)
one_beds_borough_known = rent_object.remove_null_rows(one_beds_with_zone, "borough")
one_beds_borough_df = rent_object.summary_df(one_beds_borough_known,
                                 col_to_group = "borough", col_to_summarise = "price")

rent_object.borough_scatterplot(one_beds_borough_df, "count", "mean", "borough",
                    "London Rentals: Average One-Bed Price by Number of Listings",
                    "Number of Listings", "Average Price", "inner_outer")

Wandsworth and Tower Hamlets look like cheaper inner-London boroughs with plentiful listings ...
## Find some promising listings

In [None]:
# List 10 listings restricted to the two boroughs singled out above.
# NOTE(review): the other analysis cells in this section call methods on `rent_object`,
# while this one uses `sale_object` — confirm that sale listings are intended here.
sale_object.cheap_listings(postcodes_df, boroughs = ["Wandsworth", "Tower Hamlets"], number = 10)