In [None]:
import datetime as dt
import os
import re
import time
from datetime import date
from getpass import getpass

import googlemaps
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select


## Scraping

In [None]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment variable to
# run this notebook on another machine; the fallback is the original hard-coded location.
path = os.environ.get("CHROMEDRIVER_PATH", r"C:\Users\ajk51\Desktop\Apartment Search\chromedriver")
ser = Service(path)  # also used by find_availability() when it launches its own browsers

# These switches turn off Chrome's automation indicators
# (presumably so the site does not treat the session as a bot - confirm).
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--disable-blink-features=AutomationControlled")

# Opens a visible browser window on the site's landing page; later cells drive this window.
driver  = webdriver.Chrome(service = ser, options=options)
driver.get("https://www.furnishedfinder.com/")


In [None]:
#Types the destination into the search form and selects the max budget option to pull in all options,
#then submits the search. Requires the `driver` window opened by the previous cell.

city = input('Enter destination city, state')  # typed into the search field exactly as entered

search  = driver.find_element(By.ID,'where')  # element with id 'where' receives the destination text
search.send_keys(city)

budget = Select(driver.find_element(By.ID, 'maxbudget'))  # budget dropdown wrapped for option selection
budget.select_by_visible_text('$3,400+')  # chosen by its visible label; breaks if the site renames the option

go = driver.find_element(By.ID, 'Go')
go.click()  # submits the search form


In [None]:
#Deselect room&hotel options, select pet friendly. Extract page state at that time, and create soup object

room = driver.find_element(By.ID, 'Shared1')
hotel = driver.find_element(By.ID, 'Hotel1')
# Positional XPath: the 4th filter <div> is taken to be the pet-friendly toggle; a layout change breaks it.
pets = driver.find_element(By.XPATH,'//*[@id="ctl00_ContentPlaceHolder1_divRowFilter"]/div/div/div[4]/label')
#view = driver.find_element(By.CLASS_NAME, 'map_view')

#view.click()
# One-second pauses between clicks give the page time to react to each filter change.
room.click()
time.sleep(1)
hotel.click()
time.sleep(1)
pets.click()

time.sleep(15) #Required for proper function. Page needs some time to load in assets

# Snapshot the rendered HTML; everything after this point works on the static copy.
page_source = driver.page_source
soup = bs(page_source, 'lxml')

# quit() (rather than close()) ends the WebDriver session and its chromedriver process,
# not just the browser window, so nothing is left running in the background.
driver.quit()

In [None]:
#Every search result sits in a 'table_container' div. Within each one, the first descendant that
#carries an id attribute supplies the property ID once its leading character is dropped.

results = soup.find_all('div', class_='table_container')
key_list = [card.find(id=True).get('id')[1:] for card in results]

#Empty frame indexed by property ID, with one column per scraped field.
cols = [
    'Property Type', 'Bedrooms', 'Rent', 'Utilities Included', 'Min Stay',
    'Bathrooms', 'Fees', 'Yard', 'Parking',
]
listings = pd.DataFrame(index=key_list, columns=cols)

In [None]:
def _label_value(soup, label):
    '''Return the stripped text of the <div> that shares a parent with the <label> reading `label`.'''
    return soup.find(name='label', text=label).parent.div.text.strip()


def extract_details(prop_id):
    '''Using ID number, fetch the details page for a property and extract data to fill the dataframe.

    Parameters:
        prop_id: property ID as used in the listing URL.

    Returns a tuple of 9 values, in this order:
        property_type (str), bedrooms (str, '0.5' for a studio), rent (str, leading '$' removed),
        util_included (bool), min_stay (str), bathrooms (str, first character only),
        fees (str joined with '/', or None when the page has no fee list),
        yard (bool, True when the description mentions a yard),
        parking (str with the matched parking phrase, or False when none is mentioned).

    Never raises for ordinary failures: on any error a message is printed and a tuple of
    nine 'ERR' strings is returned so the caller can filter the row out later.
    '''
    try:
        url = F"https://www.furnishedfinder.com/property/{prop_id}"
        # The parser name belongs to BeautifulSoup, not to requests.get (where a second
        # positional argument is sent as query parameters). The timeout keeps one stalled
        # request from hanging the whole scrape.
        page = requests.get(url, timeout=30)
        soup = bs(page.text, 'lxml')

        property_type = _label_value(soup, 'Type')

        min_stay = _label_value(soup, 'Minimum Term')

        bedrooms = _label_value(soup, 'Bedrooms')[0]
        if bedrooms.isalpha(): #Classifies studio apts as 0.5 bedroom
            bedrooms = '0.5'

        bathrooms = _label_value(soup, 'Bathrooms')[0]

        util_included = 'Yes' in soup.find(name='label', text='Utilities Included?').parent.text

        rent = soup.find(id='price1Div').text.strip().strip('$')

        try:
            fees = (soup.find(name='h3', text=re.compile('Property Fees'))
                    .parent.parent.ul.get_text(strip=True, separator='/'))
        except AttributeError:
            # A missing heading or list makes find()/attribute access yield None.
            fees = None

        #Searches 'Accomodations'
        parking_match = re.search(r'[\w ]*[pP]arking[\w ]*', str(soup.find(class_='AccList')))
        if parking_match is None:
            parking = False
        else:
            parking = parking_match.group().strip()

        #Searches property description
        yard = re.search('[yY]ard', soup.find(id='collapseOne3').p.text) is not None

        return property_type, bedrooms, rent, util_included, min_stay, bathrooms, fees, yard, parking

    except Exception:
        print(F"Extract details failure on {prop_id}")
        return ('ERR',) * 9
        

In [None]:
def find_availability(prop_id):
    '''Using ID number, navigates to availability page for property and extracts "Not available" dates.
    Webdriver required to navigate into iframe.

    Parameters:
        prop_id: property ID as used in the listing URL.

    Returns:
        A list of (start, end) tuples of 'YYYY-MM-DD' strings, one per "Not Available" entry
        found (empty list when there are none), or the string 'ERR' when anything fails
        (a message is printed in that case).

    A fresh headless Chrome is started for every call and is always shut down again,
    whether or not the scrape succeeds.
    '''
    driver = None
    try:
        options = Options()
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument("--disable-blink-features=AutomationControlled")
        # Passing the flag directly works on every Selenium 4 release; the `headless`
        # property was deprecated and later removed from the Options class.
        options.add_argument("--headless")
        driver  = webdriver.Chrome(service = ser, options=options)
        driver.get(F"https://www.furnishedfinder.com/property/{prop_id}/avail")

        frame = driver.find_element(By.TAG_NAME, 'iframe')
        driver.switch_to.frame(frame)

        time.sleep(1)

        # The third-from-last <script> in the iframe is taken to hold the calendar data.
        # This is position-based, so a change in the page's script order will break it.
        scripts = driver.find_elements(By.TAG_NAME, 'script')
        avail = scripts[-3].get_attribute('outerHTML')

        # For each "Not Available" entry capture the first two ISO dates after "start:".
        return re.findall(r"(?:Not Available.*?start:.*?)(\d{4}-\d{2}-\d{2})(?:.*?)(\d{4}-\d{2}-\d{2})", avail)

    except Exception:
        print(F"Find availability failure on {prop_id}")
        return 'ERR'

    finally:
        # Always end the session, including on failure, so browsers do not pile up
        # while this function is applied across every listing.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                print(F"Could not shut down browser for {prop_id}")

In [None]:
def extract_coordinates(prop_id):
    '''Using ID number, navigates to location page and extracts lat & lon data.

    Parameters:
        prop_id: property ID as used in the listing URL.

    Returns:
        [lat, lon] as a list of two strings taken from the page's
        `{center: new google.maps.LatLng(lat, lon)` map setup, or the string "ERR"
        when the page cannot be fetched or parsed (a message is printed in that case).
    '''

    try:
        url = F"https://www.furnishedfinder.com/property/{prop_id}/loc"
        # The parser name goes to BeautifulSoup; as a second positional argument to
        # requests.get it would be sent as query parameters instead.
        page = requests.get(url, timeout=30)
        loc_soup = bs(page.text, 'lxml')

        text = str(loc_soup(attrs={'data-js-optimize':'minify'})[0]) #Should always return one match

        # [^)]* stops at the parenthesis that closes LatLng(...). A greedy .* would run on
        # to the last ')' on the line and drag unrelated script text into the result.
        match = re.search(r'\{center: new google\.maps\.LatLng\(([^)]*)\)', text)
        return match[1].split(', ')

    except Exception:
        print(F"Extract coords failure on {prop_id}")
        return "ERR"

In [None]:
#Run the three scrapers over every listing; the row label (x.name) is the property ID.

#extract_details returns a 9-tuple per row: expand it into positional columns 0-8,
#then put the descriptive headers from `cols` back on.
listings = (
    listings
    .apply(lambda row: extract_details(row.name), axis=1, result_type='expand')
    .rename(columns=dict(enumerate(cols)))
)

listings['Availability'] = listings.apply(lambda row: find_availability(row.name), axis=1)
listings['Location'] = listings.apply(lambda row: extract_coordinates(row.name), axis=1)
listings['Link'] = listings.apply(lambda row: F"https://furnishedfinder.com/property/{row.name}", axis=1)



## Data Cleaning

In [None]:
def filter_dates(row):
    '''Designed for use on a series. Keeps only the reservations that still matter.

    Parameters:
        row: iterable of (start, end) pairs of ISO 'YYYY-MM-DD' date strings.

    Returns:
        A list with the entries, in their original order, whose reservation is either
        in progress today (start <= today < end) or starts in the future.
        Reservations that have already ended are dropped.
    '''
    results = []
    today = dt.date.today()

    for entry in row:
        start = date.fromisoformat(entry[0])
        end = date.fromisoformat(entry[1])

        # start <= today (not <) so a reservation that begins today is not lost.
        if today < start or start <= today < end:
            results.append(entry)

    return results
            

In [None]:
def availability_mask(row, start_date, end_date):
    '''Designed for use on a series. Tells whether a property is free for the whole requested stay.

    Parameters:
        row: iterable of (start, end) pairs of ISO 'YYYY-MM-DD' strings, one per reservation.
        start_date: datetime.date on which the stay begins.
        end_date: datetime.date on which the stay ends.

    Returns:
        False as soon as any reservation overlaps the stay, otherwise True
        (including when there are no reservations at all). A reservation that ends on
        start_date, or begins on end_date, does not count as an overlap.
    '''
    for entry in row:
        res_start = date.fromisoformat(entry[0])
        res_end = date.fromisoformat(entry[1])

        # Two intervals overlap exactly when each one starts before the other ends.
        # This single test covers partial overlap on either side, a reservation inside
        # the stay, one that contains the stay, and one that matches it exactly.
        if res_start < end_date and start_date < res_end:
            return False

    return True

In [None]:
#Deletes records with ERR entries, removes reservations that have already ended, saves a snapshot,
#then keeps only the listings that are free for the requested dates.

#Each scraper reports failure with the literal string 'ERR'. Rows where any of the three failed are
#dropped: a failed Location or details scrape would otherwise break the drive-time and export cells.
scraped_ok = (
    (listings['Availability'] != 'ERR')
    & (listings['Location'] != 'ERR')
    & (listings['Property Type'] != 'ERR')
)
listings = listings[scraped_ok].copy()  # copy so the column assignments below act on an independent frame

listings['Availability'] = listings['Availability'].apply(filter_dates)

listings.to_pickle(input('Enter name for pickle file')+ '.pkl')

#Dates must be typed exactly as 'YYYY, MM, DD' (comma + space separated).
job_start = dt.datetime.strptime(input('Start date, YYYY, MM, DD'), '%Y, %m, %d').date()
job_end = dt.datetime.strptime(input('End date, YYYY, MM, DD'), '%Y, %m, %d').date()

listings['Is Available'] = listings['Availability'].apply(availability_mask, args=(job_start, job_end))
listings = listings[listings['Is Available'] == True].copy()




## Google API Cells

In [None]:
#googlemaps API
#Set up the googlemaps client and define the work position.
#The API key is read from the GOOGLE_MAPS_API_KEY environment variable, with a hidden prompt as
#fallback, so the secret never lives in the notebook.
#NOTE(review): a key was previously committed in this cell; treat it as leaked and revoke it.

geolocator = googlemaps.Client(
    key=os.environ.get('GOOGLE_MAPS_API_KEY') or getpass('Google Maps API key: ')
)
work = (input('lat, lon of work'))  # kept as the raw string typed by the user

In [None]:
#Run apply across all rows, using lat and lon values from each listing. RUN ONLY ONCE
#(one Directions API request is made per listing)

#The client documents departure_time as an int or datetime.datetime, while job_start is a plain
#date, so it is converted to a datetime at midnight on that day. Pass a different dt.time(...)
#to model a specific commute hour.
departure = dt.datetime.combine(job_start, dt.time())

listings['Time To Work'] = (listings.apply(lambda x:
                           geolocator.directions((x['Location']), work, #directions requires lat & lon tuples
                             mode='driving', departure_time=departure)
                           [0]['legs'][0]['duration_in_traffic']['value'] ,axis=1)) #indexing into the records get drive time

In [None]:
#Save the final listings as a pickle, then clean the Rent and Location columns and export to csv
#for use in Data Studio.

listings.to_pickle(input('Name for final pickle file') + '.pkl')

listings = listings.copy()  # work on an independent frame so the assignments below are unambiguous

#Keep the first number in the rent text and drop its thousands separators, e.g.
#'1,200/Month' -> '1200' and '950/Month' -> '950'. Works for any number of digits, unlike
#fixed-position slicing. Text without a number becomes NaN.
listings['Rent'] = (
    listings['Rent']
    .str.extract(r'(\d[\d,]*)', expand=False)
    .str.replace(',', '', regex=False)
)

#Location is normally a [lat, lon] list of strings; a value that is already a string is left
#untouched instead of being split into characters.
listings['Location'] = listings['Location'].apply(lambda x: x if isinstance(x, str) else ', '.join(x))

listings.to_csv(input('File name for csv export') + '.csv', index_label='ID')