# Notebook Overview 

Project Steps:

* Blank DataFrame to be populated
* Scrape relevant Data
* Populate DataFrame

# Install Libraries

In [1]:
!pip install BeautifulSoup4



In [2]:
!pip install requests



# Import Libraries

In [3]:
# DataFrame
import pandas as pd

# Webscraping
from bs4 import BeautifulSoup as bs
import requests

# Blank Pandas DataFrame

In [4]:
# Column headings to be used in DataFrame
cols = ['Property', 'Owner / Operator', 'Type', 'Rent pw',
        'Rent pcm', 'Rent pa', 'Weeks', 'Area sqm', 'Area sqft',
        '£psf pa', 'Address', 'url']

In [5]:
# Initial DataFrame to scrape the URLs
df_urls = pd.DataFrame(columns=['Property', 'url'])

# Master DataFrame to be populated
df = pd.DataFrame(columns=cols)

# Temporary DataFrame to be populated, added to Master DF then cleared
df2 = pd.DataFrame(columns=cols)

# Scrape

* Scrape Data:
    * Request HTML from [My Student Halls](https://www.mystudenthalls.com/) site
    * Scrape for Property, Owner/Operator, Address, Link URL
    * Scrape for 1 x Type, Rent (p/w vs pcm), weeks
    * Add data for 1 to master df
    * Scrape for N x (Type, Rent (p/w vs pcm), weeks)
    * Add data for N to master df
    * Save as CSV
    * Scrape for Area (sqm vs sqft)

In [6]:
# To be filled in by user
# university = input()
university = 'leeds'

In [7]:
url_home = 'https://www.mystudenthalls.com/?s=' + university

In [8]:
# Make a get request to retrieve the page
html_page = requests.get(url_home)

# Pass the page contents to beautiful soup for parsing
soup = bs(html_page.text, 'html.parser')

## Property Names

In [9]:
# Retreive all property names
all_properties = soup.find_all('div', class_='listing-detail')
property_one = all_properties[0] 

# Show the first example
property_one 

<div class="listing-detail">
<h3><strong>Leeds •</strong> Fresh </h3>
<h2><a href="https://www.mystudenthalls.com/student-accommodation/leeds/the-refinery/">The Refinery</a></h2>
<div class="listing-price-holder">
<a href="https://www.mystudenthalls.com/student-accommodation/leeds/the-refinery/">
<span>Rooms from</span>
<span class="listing-price"><span><em>£159<span class="pence">.00</span></em> Per Week</span></span>
</a>
</div>
<div class="shortlist-indicator">
<a class="add-to-shortlist" data-action="add_quick_contact" data-id="38589" data-price="£159.00" data-title="The Refinery" href=""><span class="letter icon sprite"></span>Add to Shortlist</a>
</div>
</div>

In [10]:
# Name of first property
property_one.find("h2").get_text()

'The Refinery'

In [11]:
# Names of all properties
property_names = []
for property_ in list(range(len(all_properties))):
    # Scrape the name of the nth property
    property_name = all_properties[property_].find("h2").get_text()
    # Add property name to the list of property names
    property_names.append(property_name)
    
# Add property names to DataFrame
df_urls['Property'] = property_names

# Preview the DataFrame
df_urls.head(2)

Unnamed: 0,Property,url
0,The Refinery,
1,iQ Marsden House,


## URLs

In [12]:
# Create a list of the property url links
links = []
for property_ in list(range(len(all_properties))):
    link = all_properties[property_].h2.a['href']
    links.append(link)

In [13]:
# Populate DataFrame for URLs    
df_urls['url'] = links

# Preview DataFrame
df_urls.head(2)

Unnamed: 0,Property,url
0,The Refinery,https://www.mystudenthalls.com/student-accommo...
1,iQ Marsden House,https://www.mystudenthalls.com/student-accommo...


## Owner/Operator

In [14]:
n = 0

# nth property
url = df_urls['url'][n]

# Make a get request to retrieve the page
html_page = requests.get(url)

# Pass the page contents to beautiful soup for parsing
soup = bs(html_page.text, 'html.parser')

In [15]:
# Scrape the Owner/Operator name
owner = soup.findAll('aside', attrs={'id':'sidebar-detail'})
owner = owner[0].h2.text

df2['Owner / Operator'] = owner

In [16]:
df2.head(2)

Unnamed: 0,Property,Owner / Operator,Type,Rent pw,Rent pcm,Rent pa,Weeks,Area sqm,Area sqft,£psf pa,Address,url


## Type 

In [17]:
rooms = soup.findAll('p', attrs={'class':'room_title'})
rooms[0].text

'Bronze En Suite'

In [18]:
types = []
for room_num in list(range(len(rooms))):
    types.append(rooms[room_num].text)
types

['Bronze En Suite',
 'Bronze En Suite',
 'Silver En Suite',
 'Silver En Suite',
 'Gold En Suite',
 'Gold En Suite',
 'Platinum En Suite',
 'Platinum En Suite',
 'Bronze Studio',
 'Silver Studio',
 'Gold Studio',
 'Platinum Studio',
 '2021-22 Rooms']

In [19]:
df_urls['Property'][n]

'The Refinery'

In [20]:
df2 = pd.DataFrame(columns=['Property', 'Type'])
# Add room types to dataframe
df2['Type'] = types
# Fill Property name
df2['Property'] = df_urls['Property'][n]
df2.head(2)

Unnamed: 0,Property,Type
0,The Refinery,Bronze En Suite
1,The Refinery,Bronze En Suite


## Rent PA

In [21]:
# Gather HTML that contains info we need for Rent PA
rents_pa = soup.findAll('li', attrs={'class':'tenancy'})

# Blank list to gather the different rents PA
rent_pa = []

# For each room, add the rent price to the blank list above
for rent_pa_num in list(range(len(rents_pa))):
    room_rent = rents_pa[rent_pa_num].text
    rent_pa.append(int(room_rent[13:-3].replace(',','')))
    
rent_pa

[7436,
 8364,
 7524,
 8466,
 7656,
 8619,
 7832,
 7612,
 10710,
 10965,
 11220,
 11475,
 6996]

In [22]:
# Add Rent PA to our DataFrame
df2['Rent pa'] = rent_pa

In [23]:
df2.head(2)

Unnamed: 0,Property,Type,Rent pa
0,The Refinery,Bronze En Suite,7436
1,The Refinery,Bronze En Suite,8364


## Weeks

In [24]:
# Gather HTML that contains info we need for Weeks
weeks_data = soup.findAll('li', attrs={'class':'availability'})

In [25]:
weeks = []
for n in list(range(len(weeks_data)))[1::2]: #availablity tag also returns good/limited info of room, so we only want every other one
    # add only the number of weeks to the list
    weeks.append(int(weeks_data[n].text[-2:]))

# Add number of weeks to the dataframe    
df2['Weeks'] = weeks

In [26]:
df2.head(2)

Unnamed: 0,Property,Type,Rent pa,Weeks
0,The Refinery,Bronze En Suite,7436,44
1,The Refinery,Bronze En Suite,8364,51


### Rent pw

In [27]:
df2['Rent pw'] = df2['Rent pa'] / df2['Weeks']
df2.head(2)

Unnamed: 0,Property,Type,Rent pa,Weeks,Rent pw
0,The Refinery,Bronze En Suite,7436,44,169.0
1,The Refinery,Bronze En Suite,8364,51,164.0


### Rent pcm

In [28]:
df2['Rent pcm'] = df2['Rent pa'] / 12
df2.head(2)

Unnamed: 0,Property,Type,Rent pa,Weeks,Rent pw,Rent pcm
0,The Refinery,Bronze En Suite,7436,44,169.0,619.666667
1,The Refinery,Bronze En Suite,8364,51,164.0,697.0


## Address

In [29]:
# Gather HTML that contains info we need for Address
address_data = soup.findAll('div', attrs={'class':'inner'})
address_data[5].h3.text[9:]

'Bingley Street, Leeds, Leeds, LS3 1BZ'

In [30]:
df2['Address'] = address_data[5].h3.text[9:]
df2.head(2)

Unnamed: 0,Property,Type,Rent pa,Weeks,Rent pw,Rent pcm,Address
0,The Refinery,Bronze En Suite,7436,44,169.0,619.666667,"Bingley Street, Leeds, Leeds, LS3 1BZ"
1,The Refinery,Bronze En Suite,8364,51,164.0,697.0,"Bingley Street, Leeds, Leeds, LS3 1BZ"


## URL

In [31]:
df2['url'] = url
df2.head(2)

Unnamed: 0,Property,Type,Rent pa,Weeks,Rent pw,Rent pcm,Address,url
0,The Refinery,Bronze En Suite,7436,44,169.0,619.666667,"Bingley Street, Leeds, Leeds, LS3 1BZ",https://www.mystudenthalls.com/student-accommo...
1,The Refinery,Bronze En Suite,8364,51,164.0,697.0,"Bingley Street, Leeds, Leeds, LS3 1BZ",https://www.mystudenthalls.com/student-accommo...


# Function to repeat making df2 and combine them all

In [32]:
df2.head(2)

Unnamed: 0,Property,Type,Rent pa,Weeks,Rent pw,Rent pcm,Address,url
0,The Refinery,Bronze En Suite,7436,44,169.0,619.666667,"Bingley Street, Leeds, Leeds, LS3 1BZ",https://www.mystudenthalls.com/student-accommo...
1,The Refinery,Bronze En Suite,8364,51,164.0,697.0,"Bingley Street, Leeds, Leeds, LS3 1BZ",https://www.mystudenthalls.com/student-accommo...


# Save master df to a csv file