In [None]:
# Install the third-party packages this notebook imports, using pipenv.
# NOTE(review): versions are not pinned here — consider pinning them for reproducible runs.
!pipenv install selenium pandas numpy matplotlib tqdm

In [2]:
# Start page that the WebDriver opens first (topic browser on apd.usos.agh.edu.pl).
URL = "https://apd.usos.agh.edu.pl/topics/browse/"

In [3]:
# Imports: Selenium WebDriver API plus data-handling and utility libraries
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [4]:
# Start a new Microsoft Edge WebDriver session and open the start page.
# `driver` is a notebook-level global: the helper functions defined in later
# cells (change_language, change_language_to_pl, get_main_table) read it directly.
driver = webdriver.Edge()
driver.get(URL)

In [5]:
def login_wait(driver, retries=5):
    """Wait until the page shows a logout link, or quit the browser on failure.

    Being logged in is detected by the presence of any element whose text
    contains "logout" or "wyloguj się", for example
    ``<a href="/auth/cas/logout/"><span>wyloguj się</span></a>``.

    Each attempt waits up to 5 seconds for the English marker and then up to
    1 second for the Polish one. Only a wait timeout counts as "not logged in
    yet"; any other error (closed browser, Ctrl-C, ...) propagates to the
    caller instead of being silently retried.

    Args:
        driver: Selenium WebDriver whose current page is polled.
        retries: Number of attempts before giving up (default 5).

    Returns:
        None. Prints progress messages; after the last failed attempt it
        prints "Failed to log in" and calls ``driver.quit()``.
    """
    # (timeout in seconds, XPath of the element that only exists when logged in)
    login_markers = (
        (5, '//*[contains(text(), "logout")]'),
        (1, '//*[contains(text(), "wyloguj się")]'),
    )

    print("Waiting for login...")
    for _ in range(retries):
        for timeout, xpath in login_markers:
            try:
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
            except TimeoutException:
                # This marker did not show up in time; try the next one.
                continue
            print("Logged in")
            return
        print("Retrying...")

    print("Failed to log in")
    driver.quit()

In [6]:
# Markup of the language switcher this cell relies on. Judging by the check in
# change_language_to_pl, the link is titled with the language it switches TO:
#
# <a id="langSwitch" href="/pl/topics/browse/" title="Wersja polska" lang="pl">
#   <span class="sr-only">Wersja polska</span>
#   <img src="/static/usos/css/img/layout/lang-pl.svg" alt="Wersja polska">
# </a>
def change_language():
    """Click the page's language switcher (element with id ``langSwitch``).

    Best effort: if the element cannot be found or clicked, the WebDriver
    error is reported as "Failed to change language" and nothing is raised.
    Errors that are not WebDriver errors (for example a missing ``driver``
    global or a keyboard interrupt) are no longer swallowed.

    Uses the notebook-level ``driver``. Returns None.
    """
    try:
        driver.find_element(By.ID, "langSwitch").click()
    except WebDriverException:
        print("Failed to change language")
def change_language_to_pl():
    """Switch the site to Polish unless it is already shown in Polish.

    Looks up the ``langSwitch`` link through the notebook-level ``driver``.
    When the link's title is exactly "Wersja polska" the link is clicked;
    for any other title "Already in Polish" is printed and nothing is clicked.
    A missing link is not handled here, so the lookup error propagates.
    """
    switcher = driver.find_element(By.XPATH, '//*[@id="langSwitch"]')
    offers_polish = switcher.get_attribute("title") == "Wersja polska"
    if offers_polish:
        switcher.click()
        return
    print("Already in Polish")

In [None]:
# Load the previously saved topic table; its "url" column is what the scraping loop iterates over.
# NOTE(review): unpickling can execute arbitrary code — only load a file you produced yourself.
tematy_df = pd.read_pickle("tematy_df.pickle")
tematy_df.sample(2)

In [11]:
# Switch the site UI to Polish (prints a notice instead when it already is).
change_language_to_pl()

In [51]:
# Navigate the browser to the URL of one randomly sampled topic.
driver.get(tematy_df.sample(1).iloc[0]["url"])

In [8]:
def get_main_table():
    """Return the main table of the page currently open in ``driver``.

    The main table is taken to be the first element carrying the CSS class
    ``frame``. The lookup error propagates if no such element exists.
    """
    frame_locator = (By.CLASS_NAME, "frame")
    return driver.find_element(*frame_locator)

In [46]:
def get_topic(table):
    """Return the thesis topic in its original language.

    The topic is expected to be the first element inside ``table`` whose
    class attribute is exactly "td width-100".

    The XPath starts with ``.//`` so the search is limited to descendants of
    ``table``. A leading ``//`` searches the whole document no matter which
    element the lookup is called on, which made the ``table`` argument
    meaningless.

    Args:
        table: Selenium WebElement to search in (see ``get_main_table``).

    Returns:
        The cell's visible text with surrounding whitespace stripped.

    Raises:
        IndexError: if ``table`` contains no matching cell.
    """
    cells = table.find_elements(By.XPATH, './/*[@class="td width-100"]')
    return cells[0].text.strip()


In [47]:
def get_description(table):
    """Return the topic description found in ``table``.

    The description is expected to be the second element inside ``table``
    whose class attribute is exactly "td width-100".

    The XPath starts with ``.//`` so the search is limited to descendants of
    ``table``. A leading ``//`` searches the whole document no matter which
    element the lookup is called on, which made the ``table`` argument
    meaningless.

    Args:
        table: Selenium WebElement to search in (see ``get_main_table``).

    Returns:
        The cell's visible text with surrounding whitespace stripped.

    Raises:
        IndexError: if ``table`` contains fewer than two matching cells.
    """
    cells = table.find_elements(By.XPATH, './/*[@class="td width-100"]')
    return cells[1].text.strip()
    

In [None]:
# Show the raw inner HTML of the main table on the page currently open in the browser.
get_main_table().get_attribute("innerHTML")

In [None]:
# Try the topic extractor on the page currently open in the browser.
get_topic(get_main_table())

In [None]:
# Try the description extractor on the page currently open in the browser.
get_description(get_main_table())

In [None]:
# Poll the page until a logout link appears; login_wait quits the browser if it never does.
# Presumably the login itself is completed by hand in the browser window — confirm.
login_wait(driver)

In [13]:
# Switch the site UI to Polish after the login check (prints a notice instead when it already is).
change_language_to_pl()

## Run the actual scraping

In [21]:
# Visit every topic page and collect its [topic, description] pair.
# `rows` is filled in the same order as tematy_df["url"], so row i of the
# result belongs to the i-th URL. `time` and `tqdm` come from the import cell.
starttime = time.perf_counter()  # monotonic clock: immune to system clock adjustments

rows = []
total_rows = tematy_df.shape[0]
urls = tematy_df["url"]
for url in tqdm(urls, total=total_rows):
    driver.get(url)
    table = get_main_table()
    topic = get_topic(table)
    description = get_description(table)
    rows.append([topic, description])

print('That took {} seconds'.format(time.perf_counter() - starttime))

In [24]:
# Turn the scraped rows into a DataFrame (pandas is imported in the import cell).
# The frame has exactly two columns; the URL is NOT stored in it. Row i
# corresponds to the i-th entry of tematy_df["url"], because the scraping loop
# appended to `rows` in that order.
columns = ["topic", "description"]
df = pd.DataFrame(rows, columns=columns)

In [26]:
# Preview two randomly chosen rows of the scraped result.
df.sample(2)

In [29]:
# Persist the scraped table in the working directory. DataFrame.to_pickle is the
# counterpart of the pd.read_pickle call that loads tematy_df, and it needs
# no separate `pickle` import or manual file handling.
df.to_pickle("descriptions_df.pickle")