In [1]:
from bs4 import BeautifulSoup # For HTML parsing
import requests # Website connections
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
import pandas as pd # For converting results to a dataframe and bar chart plots
import json # For parsing json
%matplotlib inline
from tqdm import tqdm

In [2]:
url = requests.get('https://www.royallepage.ca/en/search/homes/on/toronto/?property_type=&house_type=&features=&listing_type=&lat=43.648690000000045&lng=-79.38543999999996&bypass=&address=Toronto&address_type=city&city_name=Toronto&prov_code=ON&display_type=gallery-view&da_id=&travel_time=&school_id=&search_str=Toronto%2C+Ontario%2C+CAN&travel_time_min=30&travel_time_mode=drive&travel_time_congestion=&min_price=0&max_price=5000000%2B&min_leaseprice=0&max_leaseprice=5000%2B&beds=0&baths=0&transactionType=SALE&keyword=')
soup = BeautifulSoup(url.content)   

In [7]:
url.status_code

AttributeError: 'str' object has no attribute 'status_code'

In [9]:
url = 'https://www.royallepage.ca/en/search/homes/on/toronto/1/?property_type=&house_type=&features=&listing_type=&lat=43.648690000000045&lng=-79.38543999999996&bypass=&address=Toronto&address_type=city&city_name=Toronto&prov_code=ON&display_type=gallery-view&da_id=&travel_time=&school_id=&search_str=Toronto%2C+Ontario%2C+CAN&travel_time_min=30&travel_time_mode=drive&travel_time_congestion=&min_price=0&max_price=5000000%2B&min_leaseprice=0&max_leaseprice=5000%2B&beds=0&baths=0&transactionType=SALE&keyword='

In [8]:
def get_home_info(link):

    
    col = ['price', 'address', 'city and province', 'house_type', 'beds', 'bath'] 
    df = pd.DataFrame(columns = col)

    prices = []
    street_add = []
    city_prov = []
    house_type = []
    beds = []
    bath = []
    
    count = 1
    
    for i in range(23):
        
        link.replace(f'{count}', f'{count+1}')
        url = requests.get(link)
        
        soup = BeautifulSoup(url.content)


        for price, address, cityprov, contents in zip(soup.find_all(class_='price'), soup.find_all(class_='address-1'), 
                                                    soup.find_all(class_='card__address-2'), 
                                                    soup.find_all(class_='listing-meta listing-meta--small')):

            prices.append(price.get_text(strip=True).replace(',', '').replace('$', ''))
            street_add.append(address.get_text(strip=True))
            city_prov.append(cityprov.get_text(strip=True).replace('\n', '').replace(' ', ''))
            house_type.append(contents.get_text(strip=True).replace('\n', '').replace(' ', '').split(',')[0][:-4])
            beds.append(contents.get_text(strip=True).replace('\n', '').replace(' ', '').split(',')[0][-4:][:1])
            try:
                bath.append(contents.get_text(strip=True).replace('\n', '').replace(' ', '').split(',')[1])
            except:
                bath.append('no bath found')
    
    df['price'] = prices
    df['address'] = street_add
    df['city and province'] = city_prov
    df['house_type'] = house_type
    df['beds'] = beds
    df['bath'] = bath
    
    return df

In [10]:
%%time
df = get_home_info(url)

Wall time: 24.3 s


In [11]:
address_list = [x.replace(' ', '-') for x in df['address'].tolist()]

address_list

['64-Highland-Ave',
 '60-Codsell-Ave',
 '#5203--311-Bay-St',
 '130-Albany-Ave',
 '11-Riderwood-Dr',
 '445-Oriole-Pkwy',
 '126-Princess-Anne-Cres',
 '2970-Bayview-Ave',
 '370-Woburn-Ave',
 '#413--39-Queens-Quay-E',
 '230-Royal-York-Rd',
 '230-Royal-York-Rd',
 '153-Medland-St',
 '127-Eastville-Ave',
 '22b-Corinth-Gdns',
 '43-Castle-Knock-Rd',
 '66-Shippigan-Cres',
 '#809--130-Carlton-St',
 '#106--380-Macpherson-Ave',
 '#6009--1-Bloor-St-E',
 '#801--41-Dovercourt-Rd',
 '#411--1815-Yonge-St',
 '#901--41-Dovercourt-Rd',
 '#Lot-133--75-Frederick-Tisdale-Dr',
 '160b-Audrey-Ave',
 '#1010--360-Bloor-St-E',
 '490-Gladstone-Ave',
 '29-Tiverton-Ave',
 '472-Westmount-Ave',
 '120-Clansman-Blvd',
 '#22--240-Broadway-Ave',
 '29-Appleton-Ave',
 '563-Craven-Rd',
 '6-Minto-Street',
 '#2112--10-Navy-Wharf-Crt',
 '23-Mccormack-St',
 '5-Bisley-St',
 '#3206--750-Bay-St',
 '615-Craven-Rd',
 '#Ph1--500-Richmond-St-W',
 '96-Evelyn-Wiggins-Dr',
 '71-Christina-Cres',
 '99-Sufi-Cres',
 '351a-Oakwood-Ave',
 '#128--

In [12]:
def get_propinfo(addresses):
    
    property_info = []
    
    for address in addresses:
        try:
            link = 'https://www.royallepage.ca/en/property/ontario/toronto/' + address.lower()

#            print(link)

            url = requests.get(link)

            soup = BeautifulSoup(url.content)   

            info = soup.find(class_='body-15 body-15--light').get_text(strip=True)

            property_info.append(info)
       
        except:
            property_info.append('link not valid')
        
        
    return property_info

In [13]:
%%time
property_info = get_propinfo(address_list)

df['property_info'] = property_info

df

Wall time: 5min 43s


Unnamed: 0,price,address,city and province,house_type,beds,bath,property_info
0,6849000,64 Highland Ave,"Toronto,ON",House,8bds,8bths,Beautifully Renovated Home On One Of Rosedale'...
1,3449000,60 Codsell Ave,"Toronto,ON",House,5bds,7bths,"Exquisite Design, Layout And Finishes. Boastin..."
2,2995000,#5203 -311 Bay St,"Toronto,ON",Condo,2bds,3bths,link not valid
3,2795000,130 Albany Ave,"Toronto,ON",House,4bds,4bths,"High Income Producing Victorian Gem, Boasting ..."
4,2680000,11 Riderwood Dr,"Toronto,ON",House,5bds,4bths,link not valid
...,...,...,...,...,...,...,...
1053,960000,71 Christina Cres,"Toronto,ON",House,3bds,1bath,Highly Sought After Neighbourhood & Street. La...
1054,929888,99 Sufi Cres,"Toronto,ON",House,4bds,5bths,Location! Location! Location! Beautiful 3 Stor...
1055,929000,351a Oakwood Ave,"Toronto,ON",House,3bds,2bths,link not valid
1056,899900,#128 -20 Echo Pt,"Toronto,ON",Townhouse,4bds,2bths,link not valid


In [34]:
df['beds'] = [x.replace('bds', '') for x in df['beds'].tolist()]
df['bath'] = [x.replace('bths', '') for x in df['bath'].tolist()]
df['house_type'] = [x.replace('1', '') for x in df['house_type'].tolist()]

In [35]:
df['house_type'].value_counts()

House        690
Condo        276
Townhouse     69
Invest        23
Name: house_type, dtype: int64