In [2]:
# load all events
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import time
import re

from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome("chromedriver_win32\chromedriver.exe", options=options)

def get_h3s(page_source):
    # get event dates
    soup = BeautifulSoup(page_source, "html.parser")
    h3s = soup.find_all("h3", class_=re.compile('^jss[0-9]{3}'))
    return [h3.text for h3 in h3s]
    

In [15]:
from time import sleep
def get_events_driver(url: str, driver, delay=10):
    driver.get(url)
    try: 
        WebDriverWait(driver, delay).until(
            EC.presence_of_element_located(
#                 (By.XPATH, "//*[contains(text(), 'Load More..')]")
                (By.XPATH, '//*[@id="app"]/div[1]/div[1]/div[2]/div/div[1]/div[2]/div[1]/div/div/h3')
            )
        )
        print("Loaded page successfully")
    except Exception as exc:
        print(f"Timed out while loading page with exception {exc}")
        raise
            
        
    return driver

driver = get_events_driver("https://mookh.com/tickets", driver)
h3s = []
# iterate through all pages, use number of h3 elements found on page to determine end of scrolling
retry = True
while True:
    try:
        pagination_button = driver.find_element_by_xpath("//*[contains(text(), 'Load More..')]")
        #     pagination_button.click()
        driver.execute_script("arguments[0].click();", pagination_button)
    except Exception as exc: # an exception will be raised if no load more button is found, in that case wait a little bit more and try to get the button again
        sleep(10)
        pagination_button = driver.find_element_by_xpath("//*[contains(text(), 'Load More..')]")
        #   
        driver.execute_script("arguments[0].click();", pagination_button) # https://stackoverflow.com/questions/48665001/can-not-click-on-a-element-elementclickinterceptedexception-in-splinter-selen

    sleep(5)
    h3s_ = get_h3s(driver.page_source)
    is_same_page = len(h3s_) == len(h3s)
    
    if is_same_page and retry is True:
        retry = False
        # page took long to load wait a little bit more and load again
        sleep(3)
        continue
    
    if is_same_page and retry is False:
        print("loaded all events")
        retry = True
        break
        
    h3s = h3s_

Loaded page successfully
loaded all events
125


In [17]:
page_source = driver.page_source

In [18]:
from dataclasses import dataclass
import datetime

@dataclass(frozen=True)
class Event:
    name: str
    location: str
    start_date: datetime.date
    end_date: datetime.date
    start_time: datetime.time
    end_time: datetime.time
    banner_url: str
    site_name: str
    url: str

    def to_json(self):
        return json.dumps(
            {
                "name": self.name,
                "location": self.location,
                "start_date": self.start_date,
                "end_date": self.end_date,
                "start_time": self.start_time,
                "end_time": self.end_time,
                "banner_url": self.banner_url,
                "site_name": self.site_name,
                "url": self.url,
            }
        )

    @classmethod
    def create_event(
        cls,
        event_name: str,
        event_location: str,
        start_date: datetime.date,
        end_date: datetime.date,
        start_time: datetime.time,
        end_time: datetime.time,
        banner_url: str,
        site_name: str,
        event_url: str,
    ):
        return cls(
            event_name,
            event_location,
            start_date,
            end_date,
            start_time,
            end_time,
            banner_url,
            site_name,
            event_url,
        )
    

In [20]:
def load_event(url: str, delay=10):
    driver.get(url)
    try: 
        WebDriverWait(driver, delay).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="imageDisplay"]')
            )
        )
        print("Loaded page successfully")
    except Exception as exc:
        print(f"Timed out while loading page with exception {exc}")
    return driver.page_source

In [27]:
import re
soup = BeautifulSoup(page_source, "html.parser")
event_links = soup.find_all("a", href=re.compile(r"^/event/[a-zA-Z]*"))
event_links = [i for i in set([event_link.get('href') for event_link in event_links])]
print(event_links)

['/event/hip-hop-garage-ground-zero', '/event/vivid-vibrations', '/event/club-comedy-kickofff', '/event/mxt-tour-2019-3rd-edition', '/event/aqua-splash', '/event/everyday-people-nairobi-2', '/event/wine-music-fashion-art-affair', '/event/cupcakes-condoms-the-conversation', '/event/unganisha-festival', '/event/mavoko-open-mic-chukua-selfie-edition-2', '/event/kaggia-show-6-sun-dec-15th-6pm', '/event/art-in-motion-12th-dec', '/event/money-at-the-museum-2', '/event/rukundo-egumeho-8th-edition', '/event/strictly-silk-4th-edition', '/event/family-meeting-8th-deci', '/event/tafaria-castle-day-trip-6th-edition', '/event/ngong-hills-yoga-and-picnic', '/event/nba-trap', '/event/2020-end-polio-bikeathon', '/event/game-devs-kenya-teens-novdecember-bootcamp', '/event/reminisce-soul-factor-5', '/event/zenfest', '/event/family-meeting-12th-dec', '/event/introduction-to-sports-disputes-resolution', '/event/kijiji-festival-2019', '/event/rotary-polio-end-game-olympics', '/event/kaggia-show-3-sat-dec-1

In [186]:


event_links = [i for i in set([event_link.get('href') for event_link in event_links])]
# fetch every page from the link a parse html
base_url = "https://mookh.com"
event_url = base_url + '/event/7-days-in-november-workshops'

# # throttle network
# driver.set_network_conditions(
#     offline=False,
#     latency=5*60,  # additional latency (ms)
#     download_throughput=500 * 1024,  # maximal throughput
#     upload_throughput=500 * 1024)  # maximal throughput

page_source = load_event(event_url)
event_soup = BeautifulSoup(page_source, "html.parser")
# print(event_soup.find_all('h5'))
event_organiser = event_soup.find('h5', class_="owner").text
event_name = event_soup.find('h4').text
event_date = event_soup.find('p', class_='eventDate').text
event_location = event_soup.find('span', class_="location-marker").text
image_tag = event_soup.find('div', id='imageDisplay')
banner_url = re.search(r'https://files.mookh.com/uploads/[a-zA-Z0-9./-_]+', image_tag.attrs['style']).group(0)
event_time = event_soup.find('p', class_='owner').find('span').text
start_time, end_time = [t.replace(' ', '') for t in event_time.split('-')]
t_format = '%I:%M%p'
start_time = datetime.datetime.strptime(start_time, t_format).time()
end_time = datetime.datetime.strptime(end_time, t_format).time()
dates = event_date.split("-")
# print(dates)
start_date = end_date = dates[0]
if len(dates) > 1:
    end_date = dates[1]
    
def parse_date(dt: str, d_format = '%a%b%d%Y')-> datetime.date:
    # remove all formatings and spaces
    dt = re.sub(r'\s+|,|:|st|nd|rd|th', '', dt)
    return datetime.datetime.strptime(dt, d_format).date()

start_date = parse_date(start_date)
end_date = parse_date(end_date)
site_name = "mookh.com"
event = Event.create_event(
    event_name,
            event_location,
            start_date,
            end_date,
            start_time,
            end_time,
            banner_url,
            site_name,
            event_url,
)
print(event)

Loaded page successfully
Event(name='7 DAYS IN NOVEMBER: WORKSHOPS', location='Mandera Road', start_date=datetime.date(2019, 11, 25), end_date=datetime.date(2019, 11, 29), start_time=datetime.time(9, 0), end_time=datetime.time(18, 30), banner_url='https://files.mookh.com/uploads/7_DAYS_2019_WORKSHOP', site_name='mookh.com', url='https://mookh.com/event/7-days-in-november-workshops')
