# Notebook Overview

TL;DR: Scrape data, create CSV file

Scrape Data:
* Request HTML from [My Student Halls](https://www.mystudenthalls.com/) site
* Scrape for Property, Owner/Operator, Address, Link URL
* Scrape for 1 x Type, Rent (p/w vs pcm), weeks
* Add data for 1 to master df
* Scrape for N x (Type, Rent (p/w vs pcm), weeks)
* Add data for N to master df
* Save as CSV
* Scrape for Area (sqm vs sqft)

# Import Libraries

In [1]:
# DataFrame
import pandas as pd

# Webscraping
from bs4 import BeautifulSoup as bs
import requests

# Creation of DataFrames

In [2]:
# Column headings shared by the master and temporary DataFrames
cols = [
    'Property',
    'Owner / Operator',
    'Type',
    'Rent pw',
    'Rent pcm',
    'Rent pa',
    'Weeks',
    'Area sqm',
    'Area sqft',
    '£psf pa',
    'Address',
    'url',
]

# Lookup table of property names and their URLs, filled by the first scrape
df_urls = pd.DataFrame(columns=['Property', 'url'])

# Master DataFrame (accumulates every property) and the temporary DataFrame
# (holds one property at a time); two separate, empty frames
df, df_temp = (pd.DataFrame(columns=cols) for _ in range(2))

# Which University?

In [3]:
from urllib.parse import quote_plus

# To be filled in by user: the search term for the university
university = input().strip()

# Percent-encode the term so spaces, '&', '#', etc. cannot corrupt the
# query string of the search URL
url_home = 'https://www.mystudenthalls.com/?s=' + quote_plus(university)

Leeds


# Web Scrape

## DF_URL (Scrape Property Names & associated URLs)

In [4]:
# Make a get request to retrieve the search-results page.
# The timeout stops the notebook hanging forever on an unresponsive server.
html_page = requests.get(url_home, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
html_page.raise_for_status()

# Pass the page contents to beautiful soup for parsing
soup = bs(html_page.text, 'html.parser')

### Scrape all property names at the university

In [5]:
# Retrieve all property listing elements (<div class="listing-detail">)
all_properties = soup.find_all('div', attrs={'class': 'listing-detail'})

In [6]:
# List of all the property names: the text of the <h2> in each listing
property_names = [listing.find("h2").get_text() for listing in all_properties]

# Add list of all property names to DataFrame
df_urls['Property'] = property_names

### Scrape all URLs for the properties

In [7]:
# Create a list of the properties' url links: the href of the <a> inside
# each listing's <h2>
links = [listing.h2.a['href'] for listing in all_properties]

# Add list of properties' url links to df_urls
df_urls['url'] = links

# Preview DataFrame
df_urls.head(2)

Unnamed: 0,Property,url
0,The Foundry,https://www.mystudenthalls.com/student-accommo...
1,Asa Briggs House,https://www.mystudenthalls.com/student-accommo...


## DF_Temp (temporary dataframe to collect all data for 1 property)

In [8]:
n = 0

# nth property
url = df_urls['url'][n]

# Make a get request to retrieve the page.
# The timeout stops the notebook hanging forever on an unresponsive server.
html_page = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
html_page.raise_for_status()

# Pass the page contents to beautiful soup for parsing
soup = bs(html_page.text, 'html.parser')

### Type

In [9]:
# # List for all the types of room at this property
# rooms = soup.findAll('p', attrs={'class':'room_title'})

# types = []
# for room_num in list(range(len(rooms))):
#     types.append(rooms[room_num].text)
    
# # Add Room Type to df_temp
# df_temp['Type'] = types

In [22]:
def func_type(soup):
    '''Return the room-type names found on a property page.

    soup: beautiful soup of a single property's page
    returns: list with the text of every <p class="room_title"> element,
             in the order find_all returns them (empty list if none)'''
    # find_all is the current spelling of the legacy findAll alias and
    # matches the call used for the search-results page
    rooms = soup.find_all('p', attrs={'class': 'room_title'})

    return [room.text for room in rooms]

In [23]:
# Add Room Type to df_temp: the list returned by func_type becomes the
# 'Type' column, one entry per room type found on the page
df_temp['Type'] = func_type(soup)

### Rent PA

In [10]:
# # HTML that contains info we need for Rent PA
# rents_pa = soup.findAll('li', attrs={'class':'tenancy'})

# # Blank list to gather the different rents PA
# rent_pa = []

# # For each room, add the rent price to the blank list above
# for rent_pa_num in list(range(len(rents_pa))):
#     room_rent = rents_pa[rent_pa_num].text
#     rent_pa.append(int(room_rent[13:-3].replace(',','')))

# # Add Rent PA to df_temp
# df_temp['Rent pa'] = rent_pa

In [26]:
def func_rent_pa(soup):
    '''Return the annual rent of every room listed on a property page.

    soup: beautiful soup of a single property's page
    returns: list of ints, one per <li class="tenancy"> element
             (empty list if none)
    raises: ValueError if the sliced text is not a whole number'''
    # HTML that contains info we need for Rent PA
    rents_pa = soup.find_all('li', attrs={'class': 'tenancy'})

    # The figure is taken from a fixed window of each item's text: the first
    # 13 and the last 3 characters are dropped, then thousands separators
    # are removed.
    # NOTE(review): presumably the prefix is a label and the suffix the
    # pence -- confirm against the live page markup.
    return [int(item.text[13:-3].replace(',', '')) for item in rents_pa]

### Weeks

In [11]:
# # HTML that contains info we need for Weeks
# weeks_data = soup.findAll('li', attrs={'class':'availability'})

# # List of number of weeks room is available for in the year
# weeks = []
# for n in list(range(len(weeks_data)))[1::2]: #availablity tag also returns good/limited info of room, so we only want every other one
#     # add only the number of weeks to the list
#     weeks.append(int(weeks_data[n].text[7:9]))

# # Add number of weeks to the dataframe    
# df_temp['Weeks'] = weeks

In [28]:
def func_weeks(soup):
    '''Return the number of weeks each room is let for.

    soup: beautiful soup of a single property's page
    returns: list of ints, read from every second <li class="availability">
             element (empty list if there are fewer than two)
    raises: ValueError if characters 7-8 of an element's text are not digits'''
    # HTML that contains info we need for Weeks
    weeks_data = soup.find_all('li', attrs={'class': 'availability'})

    # The availability tag also returns good/limited info of the room, so
    # only every other element (positions 1, 3, 5, ...) is wanted; the week
    # count is the two characters at text positions 7 and 8
    return [int(item.text[7:9]) for item in weeks_data[1::2]]

### Rent PW

In [12]:
# Populate the inputs first: the inline cells that used to fill 'Rent pa'
# and 'Weeks' are commented out, so without these two calls both columns
# are still empty and every weekly rent comes out as NaN
df_temp['Rent pa'] = func_rent_pa(soup)
df_temp['Weeks'] = func_weeks(soup)

# Weekly rent = annual rent divided by the number of weeks
df_temp['Rent pw'] = df_temp['Rent pa'] / df_temp['Weeks']

### Rent PCM

In [13]:
# Monthly rent = annual rent spread evenly over 12 months
df_temp['Rent pcm'] = df_temp['Rent pa'] / 12

### Address

In [14]:
# HTML that contains info we need for Address
address_data = soup.find_all('div', attrs={'class': 'inner'})

# The address is read from the <h3> of the sixth 'inner' div, with the first
# 9 characters of its text dropped.
# NOTE(review): both the position (5) and the prefix length (9) are
# hard-coded -- presumably a fixed label precedes the address; confirm
# against the live page markup.
df_temp['Address'] = address_data[5].h3.text[9:]

### Owner/Operator

In [15]:
# Scrape the Owner/Operator name: the text of the <h2> inside the first
# <aside id="sidebar-detail"> element
owner = soup.find_all('aside', attrs={'id': 'sidebar-detail'})[0].h2.text

# Add Owner/Operator name to df_temp (the same value on every row)
df_temp['Owner / Operator'] = owner

### URL

In [16]:
# Record the page this data was scraped from (the same url on every row)
df_temp['url'] = url

### Property

In [17]:
# Name of the nth property, looked up in df_urls (the same value on every row)
df_temp['Property'] = df_urls['Property'][n]

## Function to populate df_temp

In [29]:
def func_build_df_temp(n, soup):
    '''Build and return a temporary dataframe for one property
    n: the property number as it appears in the list from the url_home;
       used to look up the name and url in the module-level df_urls
    soup: beautiful soup of the nth property's url, to be parsed
    returns: a new DataFrame with the standard columns; 'Area sqm',
             'Area sqft' and '£psf pa' are not populated here'''

    # Column headings to be used in DataFrame
    cols = ['Property', 'Owner / Operator', 'Type', 'Rent pw',
        'Rent pcm', 'Rent pa', 'Weeks', 'Area sqm', 'Area sqft',
        '£psf pa', 'Address', 'url']

    # Temporary DataFrame to be populated and returned to the caller
    df_temp = pd.DataFrame(columns=cols)

    # Per-room columns, each scraped as a list by its helper
    df_temp['Type'] = func_type(soup)
    df_temp['Rent pa'] = func_rent_pa(soup)
    df_temp['Weeks'] = func_weeks(soup)

    # Derived rents: per week over the weeks column, per month over 12 months
    df_temp['Rent pw'] = df_temp['Rent pa'] / df_temp['Weeks']
    df_temp['Rent pcm'] = df_temp['Rent pa'] / 12

    # Address: <h3> of the sixth 'inner' div, minus its first 9 characters.
    # NOTE(review): position and prefix length are hard-coded -- confirm
    # against the live page markup.
    address_heading = soup.find_all('div', attrs={'class': 'inner'})[5].h3
    df_temp['Address'] = address_heading.text[9:]

    # Owner/Operator: <h2> of the first <aside id="sidebar-detail">
    sidebar = soup.find_all('aside', attrs={'id': 'sidebar-detail'})[0]
    df_temp['Owner / Operator'] = sidebar.h2.text

    # URL and Property name of the nth entry in df_urls
    df_temp['url'] = df_urls['url'][n]
    df_temp['Property'] = df_urls['Property'][n]

    return df_temp

In [33]:
n = 1

# Make a get request to retrieve the page.
# The timeout stops the notebook hanging forever on an unresponsive server.
html_page = requests.get(df_urls['url'][n], timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
html_page.raise_for_status()

# Pass the page contents to beautiful soup for parsing
soup = bs(html_page.text, 'html.parser')

# Build (and display) the dataframe for the nth property
func_build_df_temp(n, soup)

Unnamed: 0,Property,Owner / Operator,Type,Rent pw,Rent pcm,Rent pa,Weeks,Area sqm,Area sqft,£psf pa,Address,url
0,Asa Briggs House,Asa Briggs House,Bronze En-Suite,169.0,718.25,8619,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
1,Asa Briggs House,Asa Briggs House,Bronze Plus En-Suite,173.0,735.25,8823,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
2,Asa Briggs House,Asa Briggs House,Silver En-Suite,180.0,765.0,9180,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
3,Asa Briggs House,Asa Briggs House,Gold En-Suite,187.0,794.75,9537,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
4,Asa Briggs House,Asa Briggs House,Bronze Studio,210.0,892.5,10710,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
5,Asa Briggs House,Asa Briggs House,Silver Studio,221.0,939.25,11271,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
6,Asa Briggs House,Asa Briggs House,Gold Studio,231.0,981.75,11781,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
7,Asa Briggs House,Asa Briggs House,Platinum Studio,245.0,1041.25,12495,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
8,Asa Briggs House,Asa Briggs House,Platinum Plus Studio,255.0,1083.75,13005,51,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...
9,Asa Briggs House,Asa Briggs House,Bronze Plus En-suite,177.0,649.0,7788,44,,,,"Asa Briggs House, 6 St John's Rd, Woodhouse, L...",https://www.mystudenthalls.com/student-accommo...


## Add df_temp to df and clear df_temp

In [18]:
# Concatenate (add) df_temp to df. ignore_index renumbers the rows so that
# row labels stay unique once more than one property has been added
df = pd.concat([df, df_temp], ignore_index=True)

# Clear df_temp
df_temp = pd.DataFrame(columns=cols)

Unnamed: 0,Property,Owner / Operator,Type,Rent pw,Rent pcm,Rent pa,Weeks,Area sqm,Area sqft,£psf pa,Address,url
0,Symons House,Abodus Student Living,Standard En-suite,153.0,650.25,7803,51,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
1,Symons House,Abodus Student Living,Standard En-suite,163.0,597.666667,7172,44,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
2,Symons House,Abodus Student Living,Standard Plus En-Suite,163.0,692.75,8313,51,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
3,Symons House,Abodus Student Living,Standard Plus En-Suite,168.0,616.0,7392,44,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
4,Symons House,Abodus Student Living,Premium En-Suite,168.0,714.0,8568,51,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
5,Symons House,Abodus Student Living,Premium En-Suite,173.0,634.333333,7612,44,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
6,Symons House,Abodus Student Living,Deluxe En-suite,179.0,760.75,9129,51,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
7,Symons House,Abodus Student Living,Standard Studio,195.0,828.75,9945,51,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
8,Symons House,Abodus Student Living,Standard Plus Studio,239.0,1015.75,12189,51,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
9,Symons House,Abodus Student Living,Standard Plus Studio Dual Occupancy,279.0,1185.75,14229,51,,,,"The Foundry, Cavendish Street, Leeds, LS3 IBN",https://www.mystudenthalls.com/student-accommo...
