### Setup

Install Selenium, fake_useragent and chromium-chromedriver in Google Colab

In [None]:
# Python packages: selenium provides the webdriver API, fake_useragent the random User-Agent strings
!pip install selenium
!pip install fake_useragent
# Refresh the apt package index, then install the chromium-chromedriver system package
!apt-get update
!apt install chromium-chromedriver

Import libraries, then initialise the web driver, shared variables and the Google Sheets client

In [3]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent

import re
import pandas as pd

from google.auth import default
from google.colab import auth
import gspread

In [5]:
# Command-line switches passed to the Chrome instance, applied in this order
chrome_options = webdriver.ChromeOptions()
for switch in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--start-maximized',
    '--window-size=1920,1080',
):
    chrome_options.add_argument(switch)

# User-agent rotator; one random agent is fixed into the launch options
ua = UserAgent()
chrome_options.add_argument(f"user-agent={ua.random}")

driver = webdriver.Chrome(options=chrome_options)

In [6]:
# Authenticate the current Colab user (google.colab.auth)
auth.authenticate_user()

# google.auth.default() returns a pair; only its first item (the credentials) is used
creds, _ = default()
# gspread client authorised with those credentials
gs_client = gspread.authorize(creds)

In [7]:
# Listing page the crawl starts from; '?page=N' is appended to it later
olx_url = (
    'https://www.olx.ua/uk/nedvizhimost/kvartiry/'
    'dolgosrochnaya-arenda-kvartir/'
)

In [8]:
def get_element(by, name, timeout=30):
    """Wait until an element is present in the DOM and return it.

    Args:
        by: Selenium locator strategy (a ``By.*`` constant).
        name: Locator value interpreted according to ``by``.
        timeout: Maximum number of seconds to wait; defaults to the
            previously hard-coded 30 seconds.

    Returns:
        The element returned by the ``presence_of_element_located`` condition.

    Raises:
        selenium.common.exceptions.TimeoutException: if the element does not
            appear within ``timeout`` seconds.
    """
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, name)))

### Parse available pages

Get last page

In [9]:
# Open the listing and wait for the pagination widget to be present
driver.get(olx_url)
pages = get_element(By.CLASS_NAME, 'pagination-list')
# The text of the final <li> is read as the number of the last page
page_items = pages.find_elements(By.TAG_NAME, 'li')
last_page = int(page_items[-1].text)

Generate all pages

In [10]:
# One listing URL per page number, from 1 up to and including last_page
pages_url = []
for num in range(1, last_page + 1):
    pages_url.append(f'{olx_url}?page={num}')
pages_url

['https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=1',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=2',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=3',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=4',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=5',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=6',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=7',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=8',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=9',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=10',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/?page=11',
 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgos

Get the post URLs from each page

In [11]:
# JavaScript run in the page by driver.execute_script(); returns a list of hrefs
get_cards_url_script = '''
// Collect the link of every listing card on the current page.
const url_list = []

document.querySelectorAll('[data-testid="l-card"]').forEach(card => {
    // `const` keeps the variable local instead of leaking an implicit global;
    // cards without a usable link are skipped instead of throwing.
    const link = card.querySelector('a')
    if (link && link.href)
        url_list.push(link.href)
})

return url_list
'''

In [12]:
# Accumulator for the post URLs found on the listing pages
post_urls = list()

Parse first 5 pages

In [13]:
# The same post URL may be returned for more than one page, and this cell may be
# re-run; remembering what is already collected keeps post_urls free of duplicates
# so that no post is scraped (and stored) twice later on.
seen_urls = set(post_urls)

for page in pages_url[:5]:
    driver.get(page)
    for card_url in driver.execute_script(get_cards_url_script):
        if card_url not in seen_urls:
            seen_urls.add(card_url)
            post_urls.append(card_url)

In [14]:
# Number of post URLs collected from the listing pages
len(post_urls)

260

### Extract data from each post

In [15]:
# Empty frame that accumulates one row per scraped post
post_df = pd.DataFrame(
    columns=[
        'url',
        'price',
        'location',
        'floor',
        'max_floor',
        'area',
    ]
)

In [16]:
# JavaScript run in the post page by driver.execute_script().
# Raw string: the regex below contains `\d`, which is an invalid escape sequence
# in a normal Python string literal.
post_parse_script = r'''
// Both lookups throw when the block is missing; the Python caller treats that
// as a failed attempt for this post.
const price = document.querySelector('[data-testid="ad-price-container"]').innerText
const side_section = document.querySelector('[data-testid="aside"]').innerText

// First run of digits in `text`, or NaN when the text contains no digits
// (instead of throwing on a null match).
const firstNumber = text => {
    const found = text.match(/\d+/)
    return found ? found[0] : NaN
}

let floor = NaN
let max_floor = NaN
let area = NaN

for (const item of document.querySelectorAll('li')) {
    const text = item.innerText

    // 'Поверховість' starts with 'Поверх', so it has to be tested first.
    if (text.startsWith('Поверховість'))
        max_floor = firstNumber(text)
    else if (text.startsWith('Поверх'))
        floor = firstNumber(text)
    else if (text.startsWith('Загальна площа'))
        area = firstNumber(text)
}

// Location = text that follows the heading (offset 18 past its start), up to the first comma.
let fin_location = NaN
const pos_loc = side_section.indexOf('МІСЦЕЗНАХОДЖЕННЯ')

if (pos_loc !== -1) {
    const k_location = side_section.slice(pos_loc + 18)
    const coma = k_location.indexOf(',')
    // Without a comma keep the whole remainder rather than chopping its last character.
    fin_location = coma === -1 ? k_location.trim() : k_location.slice(0, coma)
}

return {fin_location, price, floor, max_floor, area}
'''

In [17]:
def parse_data(url):
    """Scrape a single post page and return its fields as a one-row DataFrame.

    Args:
        url: Address of the post page to open.

    Returns:
        pandas.DataFrame with one row and the columns ``url``, ``price``,
        ``location``, ``floor``, ``max_floor`` and ``area``. ``price`` is the
        raw text of the price block; the other scraped values are whatever
        ``post_parse_script`` returned for them.

    Raises:
        Exceptions raised by Selenium are not handled here (for example a
        ``TimeoutException`` when the price block never appears); the caller
        decides whether to retry.
    """
    # Start every request without cookies and with a new random User-Agent
    driver.delete_all_cookies()
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": ua.random})

    driver.get(url)

    # Scroll to the needed field of view using JavaScript executor
    driver.execute_script("window.scrollTo(0, 900);")

    page_wait = WebDriverWait(driver, 10)
    page_wait.until(lambda d: d.execute_script('return document.readyState') == 'complete')
    # driver.implicitly_wait() does not pause execution (it only changes the
    # element-lookup timeout for the whole session), so wait explicitly for the
    # price block that the parsing script reads first.
    page_wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[data-testid="ad-price-container"]'))
    )

    post = driver.execute_script(post_parse_script)

    return pd.DataFrame.from_dict({
        'url': [url],
        'price': [post['price']],
        'location': [post['fin_location']],
        'floor': [post['floor']],
        'max_floor': [post['max_floor']],
        'area': [post['area']]
    })


Parse each post page until all of them are processed. Each post is attempted at most 5 times.

In [18]:
# Work on a copy: the retry loop shrinks this list, which must not alter post_urls
post_parse_temp = list(post_urls)

In [None]:
# Number of completed passes over the pending URLs (at most 5)
soft_stop = 0

while post_parse_temp and soft_stop < 5:

    print("Urls to parse left:", len(post_parse_temp))

    parsed_rows = []   # one-row frames scraped during this pass
    dead_urls = set()  # posts that show the 404 page; they are not retried

    # dict.fromkeys() gives every pending URL once, in order, as a separate
    # snapshot, so nothing is skipped or scraped twice within a pass.
    for url in dict.fromkeys(post_parse_temp):
        try:
            row_df = parse_data(url)
        except Exception:
            # Reading the title may fail as well; that must not abort the crawl
            try:
                page_title = driver.title
            except Exception:
                page_title = ''
            # 404
            if page_title == 'Хьюстон, в нас проблема.':
                dead_urls.add(url)
            print(f"ERROR: {page_title} : {url}")
            continue

        print(row_df.to_string())
        parsed_rows.append(row_df)

    # One concat per pass instead of one per row
    if parsed_rows:
        post_df = pd.concat([post_df, *parsed_rows])

    # subtract parsed (and dead) urls from list
    # try to parse webpages with errors again (max 5 times)
    finished = dead_urls.union(post_df.url.to_list())
    post_parse_temp = [url for url in dict.fromkeys(post_parse_temp) if url not in finished]
    soft_stop += 1

In [21]:
# Display the scraped rows (every row keeps index label 0 because the frames are concatenated as-is)
post_df

Unnamed: 0,url,price,location,floor,max_floor,area
0,https://www.olx.ua/d/uk/obyavlenie/orenda-4k-k...,3 800 $,Київ,4,9,170
0,https://www.olx.ua/d/uk/obyavlenie/srochnaya-a...,8 999 грн.,Черкаси,7,10,34
0,https://www.olx.ua/d/uk/obyavlenie/sdam-2k-met...,10 000 грн.,Харків,3,5,50
0,https://www.olx.ua/d/uk/obyavlenie/orenda-2-k-...,13 000 грн.,Львів,3,4,67
0,https://www.olx.ua/d/uk/obyavlenie/sdam-2k-kva...,8 000 грн.,Запоріжжя,6,9,52
...,...,...,...,...,...,...
0,https://www.olx.ua/d/uk/obyavlenie/zdatsya-2k-...,10 000 грн.,Хмельницький,4,9,63
0,https://www.olx.ua/d/uk/obyavlenie/orenda-zati...,26 000 грн.,Софіївська Борщагівка,5,10,67
0,https://www.olx.ua/d/uk/obyavlenie/dovgotrival...,13 500 грн.,Львів,4,5,55
0,https://www.olx.ua/d/uk/obyavlenie/zdam-kvarti...,200 $,Хуст,1,2,50


### Expand price features into three columns:
- price (number)
- currency
- negotiated

In [34]:
# Replace the repeated 0 labels left by the concatenation with a fresh 0..n-1 index
post_df = post_df.reset_index(drop=True)

In [60]:
def process_price(price_str):
    """Split the raw price text of a post into price, currency and negotiated.

    Args:
        price_str: Text of the price block, e.g. ``'3 800 $'`` or
            ``'8 999 грн.\\nДоговірна'``.

    Returns:
        dict with the keys
        ``price``      - first run of digits after all whitespace is removed,
                         as a string, or ``None`` when the text has no digits;
        ``currency``   - ``'доллар'`` if the text contains ``$``, otherwise ``'гривня'``;
        ``negotiated`` - ``True`` if the text contains ``'Договірна'``.
    """
    # \s matches every Unicode whitespace character, so non-breaking or thin
    # spaces used as thousands separators are removed as well.
    compact = re.sub(r'\s+', '', price_str)
    digits = re.search(r'\d+', compact)

    return {
        'price': digits.group(0) if digits else None,
        'currency': 'доллар' if '$' in compact else 'гривня',
        'negotiated': 'Договірна' in compact,
    }

In [61]:
# Parse every raw price text once and spread the result over three columns.
# `columns=` keeps the frame well-formed even when post_df has no rows.
price_parts = pd.DataFrame(
    post_df.price.apply(process_price).tolist(),
    index=post_df.index,
    columns=['price', 'currency', 'negotiated'],
)

# NOTE: `price` is overwritten last; re-running this cell would parse the already
# cleaned numbers and lose the currency / negotiated information.
post_df['currency'] = price_parts['currency'].to_numpy()
post_df['negotiated'] = price_parts['negotiated'].to_numpy()
post_df['price'] = price_parts['price'].to_numpy()

In [62]:
# Display the frame with the price split into price / currency / negotiated
post_df

Unnamed: 0,url,price,location,floor,max_floor,area,currency,negotiated
0,https://www.olx.ua/d/uk/obyavlenie/orenda-4k-k...,3800,Київ,4,9,170,доллар,False
1,https://www.olx.ua/d/uk/obyavlenie/srochnaya-a...,8999,Черкаси,7,10,34,гривня,False
2,https://www.olx.ua/d/uk/obyavlenie/sdam-2k-met...,10000,Харків,3,5,50,гривня,False
3,https://www.olx.ua/d/uk/obyavlenie/orenda-2-k-...,13000,Львів,3,4,67,гривня,False
4,https://www.olx.ua/d/uk/obyavlenie/sdam-2k-kva...,8000,Запоріжжя,6,9,52,гривня,False
...,...,...,...,...,...,...,...,...
244,https://www.olx.ua/d/uk/obyavlenie/zdatsya-2k-...,10000,Хмельницький,4,9,63,гривня,False
245,https://www.olx.ua/d/uk/obyavlenie/orenda-zati...,26000,Софіївська Борщагівка,5,10,67,гривня,False
246,https://www.olx.ua/d/uk/obyavlenie/dovgotrival...,13500,Львів,4,5,55,гривня,False
247,https://www.olx.ua/d/uk/obyavlenie/zdam-kvarti...,200,Хуст,1,2,50,доллар,False


### Load final data into Google Spreadsheet

In [63]:
# Create a new spreadsheet and write the header row followed by the data rows
# into its first worksheet
spreadsheet = gs_client.create('OLX Parse Results')
worksheet = spreadsheet.get_worksheet(0)
worksheet.update([list(post_df.columns), *post_df.to_numpy().tolist()])

{'spreadsheetId': '1JvFZ2QIa3KgIBzrUdwAFcsfJaywrqPmP_NTAgxQzLcI',
 'updatedRange': "'Аркуш1'!A1:H250",
 'updatedRows': 250,
 'updatedColumns': 8,
 'updatedCells': 2000}

In [64]:
# Link to the spreadsheet created above
spreadsheet.url

'https://docs.google.com/spreadsheets/d/1JvFZ2QIa3KgIBzrUdwAFcsfJaywrqPmP_NTAgxQzLcI'