In [1]:
!pipenv install selenium pandas numpy matplotlib tqdm

Installing selenium...
Resolving selenium...
[    ] Installing...
Installation Succeeded
[    ] Installing selenium...
[    ] Installing selenium...

Installing pandas...
Resolving pandas...
[    ] Installing...
Installation Succeeded
[    ] Installing pandas...
[    ] Installing pandas...

Installing numpy...
Resolving numpy...
[    ] Installing...
Installation Succeeded
[    ] Installing numpy...
[    ] Installing numpy...

Installing matplotlib...
Resolving matplotlib...
[    ] Installing...
Installation Succeeded
[    ] Installing matplotlib...
[    ] Installing matplotlib...

Installing tqdm...
Resolving tqdm...
[    ] Installing...
Installation Succeeded
[    ] Installing tqdm...
[    ] Installing tqdm...

Installing dependencies from Pipfile.lock (4da58c)...


In [2]:
URL="https://apd.usos.agh.edu.pl/topics/browse/"

In [3]:
#create new Edge webdriver
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import os

In [4]:
driver = webdriver.Edge()
driver.get(URL)

In [5]:
# function to wait for login or exit if failed
# wait for login = presence of element containing phrase or "logout" or "wyloguj się"
#"<a href="/auth/cas/logout/"><span>wyloguj się</span></a>"
def login_wait(driver):
    retries=5
    print("Waiting for login...")
    while retries>0:
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[contains(text(), "logout")]')))
            print("Logged in")
            break
        except:
            try:
                WebDriverWait(driver, 1).until(EC.presence_of_element_located((By.XPATH, '//*[contains(text(), "wyloguj się")]')))
                print("Logged in")
                break
            except:
                retries-=1
                print("Retrying...")

    if retries==0:
        print("Failed to log in")
        driver.quit()

In [6]:
# function to change language
'''
<a id="langSwitch" href="/pl/topics/browse/" title="Wersja polska" lang="pl">
  <span class="sr-only">Wersja polska</span>
  <img src="/static/usos/css/img/layout/lang-pl.svg" alt="Wersja polska">
</a>
'''
def change_language():
    try:
        lang = driver.find_element(By.XPATH, '//*[@id="langSwitch"]')
        lang.click()
    except:
        print("Failed to change language")
def change_language_to_pl():
    lang = driver.find_element(By.XPATH, '//*[@id="langSwitch"]')
    #get title attribute
    title = lang.get_attribute("title")
    if title!="Wersja polska":
        print("Already in Polish")
    else:
        lang.click()

In [7]:
# function to get main table from page
# find main table that is of type <table class="nav">
# table = driver.find_element_by_class_name("nav")
#'WebDriver' object has no attribute 'find_element_by_class_name'
def get_main_table():
    return driver.find_element(By.CLASS_NAME, "nav")

In [8]:
#function to get all rows from table
def get_data_rows(table):
    tbody = table.find_element(By.TAG_NAME, "tbody")
    return tbody.find_elements(By.TAG_NAME, "tr")
# print("Found", len(rows), "rows")

In [9]:
# header html and function to get column names
'''
<thead>
  <tr>
    <th class="width-100">
      <div class="nav-header">
        <div>
          <div>
            Thesis title in original language
          </div>
          <a class="ord" href="?page=1&amp;order=title">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort ascending by: Thesis title in original language">
          </a>
          <a class="ord" href="?page=1&amp;order=-title">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort descending by: Thesis title in original language">
          </a>
        </div>
        <div>
          <div>
            Organizational unit
          </div>
          <a class="ord" href="?page=1&amp;order=department">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort ascending by: Organizational unit">
          </a>
          <a class="ord" href="?page=1&amp;order=-department">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort descending by: Organizational unit">
          </a>
        </div>
      </div>
    </th>
    <th class="width-0">
      <div class="nav-header">
        <div>
          <div>
            <img src="/static/usos/img/tooltip/info-white.svg" class="usos-tooltip valign-middle" title="An employee who provided the topic in APD on his own behalf or on behalf of a person who does not have such capabilities. May be, but does not have to be, a future thesis supervisor. Check the content of the field &quot;Expected thesis supervisor&quot;." aria-labelledby="sro-2" alt="Tip" tabindex="0">
            <span id="sro-2" class="screen-reader-only">An employee who provided the topic in APD on his own behalf or on behalf of a person who does not have such capabilities. May be, but does not have to be, a future thesis supervisor. Check the content of the field "Expected thesis supervisor".</span>
          </div>
          <div>
            Person providing<br>the topic
          </div>
          <a class="ord" href="?page=1&amp;order=supervisor">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort ascending by: Person providing<br/>the topic">
          </a>
          <a class="ord" href="?page=1&amp;order=-supervisor">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort descending by: Person providing<br/>the topic">
          </a>
        </div>
      </div>
    </th>
    <th class="width-0">
      <div class="nav-header">
        <div>
          <div>
            Study fields
          </div>
        </div>
      </div>
    </th>
    <th class="width-0">
      <div class="nav-header">
        <div>
          <div>
            Thesis type
          </div>
        </div>
      </div>
    </th>
    <th class="width-0">
      <div class="nav-header">
        <div>
          <div>
            <img src="/static/usos/img/tooltip/info-white.svg" class="usos-tooltip valign-middle" title="<ul class=&quot;no-bullets&quot; style=&quot;padding-left: 0;&quot;>                                                            <li>Available</li>                                                            <li>Partially available</li>                                                            <li>Under negotiation</li>                                                            <li>Taken</li>                                                     </ul>" aria-labelledby="sro-3" alt="Tip" tabindex="0">
            <span id="sro-3" class="screen-reader-only">
              <ul class="no-bullets" style="padding-left: 0;">
                <li>Available</li>
                <li>Partially available</li>
                <li>Under negotiation</li>
                <li>Taken</li>
              </ul>
            </span>
          </div>
          <div>
            Topic<br>status
          </div>
          <a class="ord" href="?page=1&amp;order=status">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort ascending by: Topic<br/>status">
          </a>
          <a class="ord" href="?page=1&amp;order=-status">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort descending by: Topic<br/>status">
          </a>
        </div>
      </div>
    </th>
    <th class="width-0">
      <div class="nav-header">
        <div>
          <div>
            Date<br>of update
          </div>
          <div class="ord">
            <div></div>
          </div>
          <a class="ord" href="?page=1&amp;order=-update_date">
            <img src="/static/usos/img/tab/ord-empty.png" alt="Sort descending by: Date<br/>of update">
          </a>
        </div>
      </div>
    </th>
  </tr>
</thead>
'''

def get_columns_from_header_row(table):
    #get header row
    row = table.find_elements(By.TAG_NAME, "thead")[1].find_element(By.TAG_NAME, "tr")
    # print(row.get_attribute("innerHTML"))
    columns=[]
    #get title column name from header row
    columns.append(row.find_elements(By.TAG_NAME, "th")[0].find_elements(By.TAG_NAME, "div")[2].text)
    #get organizational unit column name from header row
    columns.append(row.find_elements(By.TAG_NAME, "th")[0].find_elements(By.TAG_NAME, "div")[4].text)
    #get person providing the topic column name from header row
    columns.append(row.find_elements(By.TAG_NAME, "th")[1].find_elements(By.TAG_NAME, "div")[1].text)
    #get study fields column name from header row
    columns.append(row.find_elements(By.TAG_NAME, "th")[2].find_elements(By.TAG_NAME, "div")[0].text)
    #get thesis type column name from header row
    columns.append(row.find_elements(By.TAG_NAME, "th")[3].find_elements(By.TAG_NAME, "div")[0].text)
    #get topic status column name from header row
    columns.append(row.find_elements(By.TAG_NAME, "th")[4].find_elements(By.TAG_NAME, "div")[1].text)
    #get date of update column name from header row
    columns.append(row.find_elements(By.TAG_NAME, "th")[5].find_elements(By.TAG_NAME, "div")[0].text)
    return [column.replace("\n", " ") for column in columns]


In [17]:
# sample html row and function to extract data from it
'''
<tr>
  <td class="align-left width-100">
    <div>
      <a href="/topics/show/15221/">
        <span lang="pl" class="block">Na Badanie wpływu nagrań binauralnych na aktywację kory ruchowej mózgu</span>
      </a>
    </div>
    <div style="margin-top:10px">
      <a target="_blank" style="color: black; font-size: smaller" href="https://web.usos.agh.edu.pl/kontroler.php?_action=actionx:katalog2/jednostki/pokazJednostke(kod:130-000)&amp;lang=2" fac_id="130-000">
        [130-000] Faculty of Mechanical Engineering and Robotics
      </a>
    </div>
  </td>
  <td class="align-left">
    <a href="/users/33443/" user_id="33443">
      Dorota Młynarczyk
    </a>
  </td>
  <td class="strong align-left width-0">
    Acoustic Engineering
  </td>
  <td class="strong align-left width-0">
    master
  </td>
  <td class="strong align-left width-0">
    <nobr>
      Taken
    </nobr>
  </td>
  <td class="strong align-left width-0">
    Oct. 7, 2024
  </td>
</tr>
'''
def get_data_from_row(row):
    data = []
    #get all td's from row
    tds = row.find_elements(By.TAG_NAME, "td")
    #get thesis title
    data.append(tds[0].find_element(By.TAG_NAME, "span").text)
    #get organizational unit
    data.append(tds[1].find_element(By.TAG_NAME, "a").text)
    #get person providing the topic
    data.append(tds[1].find_element(By.TAG_NAME, "a").text)
    #get study fields
    data.append(tds[2].text)
    #get thesis type
    data.append(tds[3].text)
    #get topic status
    data.append(tds[4].text)
    #get date of update
    data.append(tds[5].text)
    return data


In [11]:
# header row and functions to get next page button end extract total number of rows
'''
<div class="nav-bar">
  <div aria-disabled="true" role="button" tabindex="0" aria-label="Pierwsza strona">
    <svg class="disabled">
      <use xlink:href="/static/usos/img/tab/nav-all.svg#first_page"></use>
    </svg>
  </div>
  <div aria-disabled="true" role="button" tabindex="0" aria-label="Poprzednia strona">
    <svg class="disabled">
      <use xlink:href="/static/usos/img/tab/nav-all.svg#chevron_left"></use>
    </svg>
  </div>

  <div>
    Elementy <span class="red" style="margin: 0 0.25rem;">1..30</span>
    z <span class="red" style="margin: 0 0.25rem;">5307</span>
  </div>

  <a href="?page=2&amp;order=update_date" aria-label="Następna strona">
    <svg>
      <title>Następna strona</title>
      <use xlink:href="/static/usos/img/tab/nav-all.svg#chevron_right"></use>
    </svg>
  </a>
  <a href="?page=177&amp;order=update_date" aria-label="Ostatnia strona">
    <svg>
      <title>Ostatnia strona</title>
      <use xlink:href="/static/usos/img/tab/nav-all.svg#last_page"></use>
    </svg>
  </a>
</div>'''
def get_next_page_button(table):
    #get div with class nav-bar
    next_button = table.find_element(By.CLASS_NAME, "nav-bar").find_elements(By.TAG_NAME, "a")[-2]
def get_total_rows(table):
    #get div with class nav-bar
    nav_bar = table.find_element(By.CLASS_NAME, "nav-bar")
    #get span with class red that contains total number of rows
    total_rows = nav_bar.find_elements(By.TAG_NAME, "span")[-1].text
    return int(total_rows)

In [12]:
login_wait(driver)

Waiting for login...
Retrying...
Retrying...
Logged in


In [13]:
change_language_to_pl()

In [14]:
get_columns_from_header_row(get_main_table())

['Tytuł pracy w języku oryginału',
 'Jednostka organizacyjna',
 'Osoba zgłaszająca temat',
 'Kierunki studiów',
 'Typ pracy',
 'Status tematu',
 'Data aktualizacji']

In [18]:
get_data_from_row(get_data_rows(get_main_table())[0])

['Projekt zabezpieczenia skarp głębokiego wykopu przy budowie krajowej drogi ekspresowej',
 'Projekt zabezpieczenia skarp głębokiego wykopu przy budowie krajowej drogi ekspresowej',
 'Joanna Jakóbczyk',
 'Budownictwo',
 'inżynierska',
 'Zarezerwowany',
 '7 października 2024']