# Notebook Overview

TL;DR: Scrape data, create CSV file

Scrape Data:
* Request HTML from [My Student Halls](https://www.mystudenthalls.com/) site
* Scrape for Property, Owner/Operator, Address, Link URL
* Scrape for 1 x Type, Rent (p/w vs pcm), weeks
* Add data for 1 to master df
* Scrape for N x (Type, Rent (p/w vs pcm), weeks)
* Add data for N to master df
* Save as CSV
* Scrape for Area (sqm vs sqft)

# Import Libraries

In [1]:
# DataFrame
import pandas as pd

# Webscraping
from bs4 import BeautifulSoup as bs
import requests

# Creation of DataFrames

In [2]:
# Column layout shared by the master and the per-property DataFrames
cols = [
    'Property', 'Owner / Operator', 'Type',
    'Rent pw', 'Rent pcm', 'Rent pa', 'Weeks',
    'Area sqm', 'Area sqft', '£psf pa',
    'Address', 'url',
]

# Lookup table of property names and their URLs, filled by the first scrape
df_urls = pd.DataFrame(columns=['Property', 'url'])

# df: master DataFrame to be populated with every property.
# df_temp: temporary DataFrame holding one property at a time.
# Both start empty with the shared column layout (two separate objects).
df, df_temp = (pd.DataFrame(columns=cols) for _ in range(2))

# Which University?

In [3]:
# Standard-library helper for escaping the search term (imported locally so
# this cell stays self-contained)
from urllib.parse import quote_plus

# To be filled in by user. Surrounding whitespace is dropped so it cannot
# leak into the search query or into the CSV file name built from this value.
university = input('University to search for: ').strip()

# Escape the term so spaces, '&', '#', etc. cannot corrupt the query string
url_home = 'https://www.mystudenthalls.com/?s=' + quote_plus(university)

Oxford


# Web Scrape

## DF_URL (Scrape Property Names & associated URLs)

In [4]:
# Make a get request to retrieve the search-results page.
# The timeout stops the notebook hanging forever on a stalled connection.
html_page = requests.get(url_home, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
html_page.raise_for_status()

# Pass the page contents to beautiful soup for parsing
soup = bs(html_page.text, 'html.parser')

### Scrape all property names at the university

In [5]:
# Retrieve every <div class="listing-detail"> element on the results page;
# the property names and links are read from these elements
all_properties = soup.find_all('div', attrs={'class': 'listing-detail'})

In [6]:
# List of all the property names: the text of each listing's <h2> heading
property_names = [listing.find("h2").get_text() for listing in all_properties]

# Add list of all property names to DataFrame
df_urls['Property'] = property_names

### Scrape all URLs for the properties

In [7]:
# List of the properties' url links: the href of the <a> inside each <h2>
links = [listing.h2.a['href'] for listing in all_properties]

# Add list of properties' url links to df_urls
df_urls['url'] = links

# Preview DataFrame
df_urls.head(2)

Unnamed: 0,Property,url
0,"Between Towns Court, Oxford",https://www.mystudenthalls.com/student-accommo...
1,Student Castle Oxford,https://www.mystudenthalls.com/student-accommo...


## DF_Temp (temporary dataframe to collect all data for 1 property)

### Type

In [8]:
def func_type(soup):
    '''Return the room types listed on a property page.

    soup: beautiful soup of the property's page, to be parsed

    Returns a list holding the text of every <p class="room_title">
    element, in the order found (an empty list if there are none).'''
    # find_all is the current name of the deprecated findAll alias
    rooms = soup.find_all('p', attrs={'class': 'room_title'})

    return [room.text for room in rooms]

### Rent PA

In [9]:
def func_rent_pa(soup):
    '''Return the annual rent of every room listed on a property page.

    soup: beautiful soup of the property's page, to be parsed

    Returns a list of ints, one per <li class="tenancy"> element: the
    element's text with its first 13 and last 3 characters dropped and
    the thousands commas removed.'''
    tenancy_items = soup.findAll('li', attrs={'class':'tenancy'})

    # [13:-3] presumably trims a fixed label and a trailing suffix so only
    # the amount remains -- verify against the live page markup
    return [
        int(item.text[13:-3].replace(',', ''))
        for item in tenancy_items
    ]

### Weeks

In [10]:
def func_weeks(soup):
    '''Return the number of weeks each room on a property page is let for.

    soup: beautiful soup of the property's page, to be parsed

    Returns a list of ints, one per room.
    Raises ValueError if a weeks entry contains no number at all.'''
    import re  # local import: only this function needs it

    # HTML that contains info we need for Weeks
    weeks_data = soup.find_all('li', attrs={'class': 'availability'})

    # List of number of weeks room is available for in the year
    weeks = []
    # availability tag also returns good/limited info of room, so we only
    # want every other one (the odd positions)
    for item in weeks_data[1::2]:
        text = item.text
        # Take the first run of digits wherever it sits in the text, rather
        # than a fixed character position: the wording varies between pages,
        # and an entry such as "44 or 51 weeks" should yield the first number
        match = re.search(r'\d+', text)
        if match is None:
            raise ValueError(
                'No number of weeks found in availability text: {!r}'.format(text)
            )
        weeks.append(int(match.group()))

    return weeks

## Function to populate df_temp

In [11]:
def func_build_df_temp(n, soup):
    '''Populate the df_temp dataframe

    n: the property number as it appears in the list from the url_home
       (used as the row label when reading df_urls)
    soup: beautiful soup of the nth property's url, to be parsed

    Returns a new DataFrame with one row per room type at the property.
    The area and £psf columns are not filled in here and stay empty.'''

    # Column headings to be used in DataFrame
    cols = ['Property', 'Owner / Operator', 'Type', 'Rent pw',
        'Rent pcm', 'Rent pa', 'Weeks', 'Area sqm', 'Area sqft',
        '£psf pa', 'Address', 'url']

    # Temporary DataFrame to be populated and returned to the caller
    df_temp = pd.DataFrame(columns=cols)

    # Per-room columns come first: these lists set the number of rows
    # Type
    df_temp['Type'] = func_type(soup)
    # RentPA
    df_temp['Rent pa'] = func_rent_pa(soup)
    # Weeks
    df_temp['Weeks'] = func_weeks(soup)

    # RentPW, rounded to 2 decimal places
    df_temp['Rent pw'] = (df_temp['Rent pa'] / df_temp['Weeks']).round(2)
    # RentPCM, rounded to 2 decimal places. The rounding must act on
    # df_temp's own column, not on the global master df.
    df_temp['Rent pcm'] = (df_temp['Rent pa'] / 12).round(2)

    # Property-level values below are scalars, repeated on every room row
    # Address: read from the <h3> of the sixth <div class="inner">; [9:]
    # presumably drops a leading label -- verify against the page markup
    df_temp['Address'] = soup.find_all('div', attrs={'class':'inner'})[5].h3.text[9:]
    # Owner/Operator: the <h2> of the sidebar-detail <aside>
    df_temp['Owner / Operator'] = soup.find_all('aside', attrs={'id':'sidebar-detail'})[0].h2.text
    # URL
    df_temp['url'] = df_urls['url'][n]
    # Property
    df_temp['Property'] = df_urls['Property'][n]

    return df_temp

In [12]:
# Per-property DataFrames are gathered here and combined once at the end;
# growing df with pd.concat inside the loop re-copies it on every pass
property_frames = []

for n, property_url in df_urls['url'].items():
    # Make a get request to retrieve the nth property's url page; the
    # timeout stops one stalled connection from hanging the whole run
    html_page = requests.get(property_url, timeout=30)
    # Fail loudly on an HTTP error instead of scraping an error page
    html_page.raise_for_status()

    # Pass the page contents to beautiful soup for parsing
    soup = bs(html_page.text, 'html.parser')

    # Build df_temp of nth property
    df_temp = func_build_df_temp(n, soup)

    # Keep it for the final concatenation (skip properties with no rooms)
    if not df_temp.empty:
        property_frames.append(df_temp)

# Concatenate (add) every property to df. Rows already in df are kept, and
# ignore_index gives the result one continuous 0..N-1 index instead of
# repeating each property's own 0..k labels.
if property_frames:
    if not df.empty:
        property_frames.insert(0, df)
    df = pd.concat(property_frames, ignore_index=True)

# Clear df_temp
df_temp = pd.DataFrame(columns=cols)

# Export df as csv file

In [13]:
import os  # local to this cell: only needed to prepare the output folder

# Output path, named after the university that was searched for
csv_name = '../data/{}_student_accommodation_data.csv'.format(university)

# Create the output folder if it is missing; to_csv does not create
# parent directories and would otherwise fail with an OSError
os.makedirs(os.path.dirname(csv_name), exist_ok=True)

# index=False: the row index carries no information worth exporting
df.to_csv(csv_name, index=False)

STILL TO FIX

* The Edinburgh example hit an issue with the text extraction for weeks: the extracted text wasn't a valid base-10 integer. We may need to extract just the digits, and only the first number at that (in case the text reads "44 or 55 weeks").

* Make it really easy for James to use