In [1]:
!pipenv install selenium pandas numpy matplotlib tqdm

Installing selenium...
Resolving selenium...
[    ] Installing...
Installation Succeeded
[    ] Installing selenium...
[    ] Installing selenium...

Installing pandas...
Resolving pandas...
[    ] Installing...
Installation Succeeded
[    ] Installing pandas...
[    ] Installing pandas...

Installing numpy...
Resolving numpy...
[    ] Installing...
Installation Succeeded
[    ] Installing numpy...
[    ] Installing numpy...

Installing matplotlib...
Resolving matplotlib...
[    ] Installing...
Installation Succeeded
[    ] Installing matplotlib...
[    ] Installing matplotlib...

Installing tqdm...
Resolving tqdm...
[    ] Installing...
Installation Succeeded
[    ] Installing tqdm...
[    ] Installing tqdm...

Installing dependencies from Pipfile.lock (395a0f)...


In [2]:
# Entry page of the AGH APD thesis-topic browser; the browser session opens here first.
URL = "https://apd.usos.agh.edu.pl/topics/browse/"

In [3]:
# imports: Selenium (browser automation), data handling, plotting, progress bar
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [4]:
# Start an Edge browser session; `driver` is the notebook-wide handle that the
# helper functions defined in later cells read as a global.
driver = webdriver.Edge()
# Open the topic browser page.
driver.get(URL)

In [5]:
# function to wait for login or exit if failed
# wait for login = presence of element containing phrase or "logout" or "wyloguj się"
#"<a href="/auth/cas/logout/"><span>wyloguj się</span></a>"
def login_wait(driver):
    """Wait until the user is logged in; quit the browser if that never happens.

    Login is detected by the presence of an element whose text contains
    "logout" (waited for up to 5 s) or "wyloguj się" (waited for up to 1 s).
    This pair of checks is attempted up to 5 times.

    Args:
        driver: Selenium WebDriver showing the page to watch.

    Returns:
        None. Progress is reported with ``print``. When every attempt times
        out, "Failed to log in" is printed and ``driver.quit()`` is called.

    Raises:
        Any error other than ``TimeoutException`` raised while waiting (for
        example ``KeyboardInterrupt`` or a broken browser session) propagates
        instead of being silently counted as a failed attempt.
    """
    # (timeout in seconds, XPath of the logout marker) for each UI language.
    probes = (
        (5, '//*[contains(text(), "logout")]'),
        (1, '//*[contains(text(), "wyloguj się")]'),
    )
    retries = 5
    print("Waiting for login...")
    while retries > 0:
        for timeout, xpath in probes:
            try:
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
            except TimeoutException:
                # Marker not found in this language; try the next probe.
                continue
            print("Logged in")
            return
        retries -= 1
        print("Retrying...")

    # Every attempt timed out for both markers.
    print("Failed to log in")
    driver.quit()

def fast_wait_pl(driver):
    """Block until an element containing "wyloguj się" is present (30 s limit).

    A single wait with no retry loop, which makes it quicker than
    ``login_wait``. It only looks for the Polish logout text, so it is meant
    to be called after the interface has been switched to Polish.

    Args:
        driver: Selenium WebDriver showing the page to watch.

    Returns:
        None. Whatever ``WebDriverWait.until`` raises when the wait expires
        propagates to the caller.
    """
    logout_marker = (By.XPATH, '//*[contains(text(), "wyloguj się")]')
    marker_present = EC.presence_of_element_located(logout_marker)
    WebDriverWait(driver, 30).until(marker_present)

In [6]:
# function to change language
'''
<a id="langSwitch" href="/pl/topics/browse/" title="Wersja polska" lang="pl">
  <span class="sr-only">Wersja polska</span>
  <img src="/static/usos/css/img/layout/lang-pl.svg" alt="Wersja polska">
</a>
'''
def change_language():
    """Click the page's language switch link (the element with id ``langSwitch``).

    Operates on the notebook-level ``driver``. This is best effort: if
    Selenium cannot find or click the link, "Failed to change language" is
    printed and the function returns normally.

    Returns:
        None.

    Raises:
        Errors that are not Selenium ``WebDriverException``s (for example a
        ``NameError`` because ``driver`` was never created, or a
        ``KeyboardInterrupt``) propagate instead of being reported as a
        failed language change.
    """
    try:
        driver.find_element(By.XPATH, '//*[@id="langSwitch"]').click()
    except WebDriverException:
        # Only browser-side failures are tolerated; anything else is a real bug.
        print("Failed to change language")
def change_language_to_pl():
    """Switch the interface to Polish unless it already is.

    Operates on the notebook-level ``driver``. The ``langSwitch`` link's
    title names the language it switches *to*, so the title
    "Wersja polska" means the page is not yet in Polish and the link is
    clicked. Any other title is treated as "already Polish" and only a
    message is printed.

    Returns:
        None.
    """
    switch_link = driver.find_element(By.XPATH, '//*[@id="langSwitch"]')
    offers_polish = switch_link.get_attribute("title") == "Wersja polska"
    if offers_polish:
        switch_link.click()
        return
    print("Already in Polish")

In [7]:
# Load the previously saved topic list; its "url" column drives the scraping below.
# NOTE(review): unpickling can execute arbitrary code - only load a file you produced yourself.
tematy_df = pd.read_pickle("tematy_df.pickle")
# Peek at two random rows as a sanity check.
tematy_df.sample(2)

Unnamed: 0,Tytuł pracy w języku oryginału,url,Jednostka organizacyjna,Osoba zgłaszająca temat,Kierunki studiów,Typ pracy,Status tematu,Data aktualizacji
44,Dłoń jako interfejs dla wirtualnej rzeczywistości,https://apd.usos.agh.edu.pl/topics/show/4585/,"[120-000] Wydział Elektrotechniki, Automatyki,...",Igor Wojnicki,"Computer Science,\ninformatyka,\nInformatyka","inżynierska,\nmagisterska",Zarezerwowany,3 października 2024
2578,Projekt i implementacja systemu rezerwacji pok...,https://apd.usos.agh.edu.pl/topics/show/9424/,"[120-000] Wydział Elektrotechniki, Automatyki,...",Paweł Skrzyński,"Informatyka,\nInformatyka",inżynierska,Dostępny,16 maja 2023


In [8]:
# Switch the interface to Polish (prints a message instead if it already is).
change_language_to_pl()

In [9]:
# Open one randomly chosen topic page to develop the extraction helpers against.
driver.get(tematy_df.sample(1).iloc[0]["url"])
# Wait for the Polish logout text to appear before inspecting the page.
fast_wait_pl(driver)

In [10]:
#function to get the main table
# the main table has class "frame"
def get_main_table():
    """Return the first element with class ``frame`` on the current page.

    Operates on the notebook-level ``driver``. The element is the container
    that the topic and description helpers search in.

    Returns:
        The element returned by ``driver.find_element``.
    """
    frame_locator = (By.CLASS_NAME, "frame")
    return driver.find_element(*frame_locator)

In [11]:
# function that will find Temat pracy w języku oryginału
def get_topic(table):
    """Return the thesis topic (title in the original language) from ``table``.

    The topic is the text of the first element inside ``table`` whose class
    attribute is exactly ``"td width-100"``.

    Args:
        table: Selenium element to search in, as returned by ``get_main_table``.

    Returns:
        The element's text with surrounding whitespace stripped.

    Raises:
        IndexError: if ``table`` contains no matching element.
    """
    # The leading "." makes the XPath relative to `table`. A bare "//" is
    # evaluated against the whole document even when called on an element,
    # which would silently ignore the `table` argument.
    cells = table.find_elements(By.XPATH, './/*[@class="td width-100"]')
    return cells[0].text.strip()


In [12]:
# function that will find description from table
def get_description(table):
    """Return the topic description from ``table``.

    The description is the text of the second element inside ``table`` whose
    class attribute is exactly ``"td width-100"`` (the first one holds the
    topic title).

    Args:
        table: Selenium element to search in, as returned by ``get_main_table``.

    Returns:
        The element's text with surrounding whitespace stripped.

    Raises:
        IndexError: if ``table`` contains fewer than two matching elements.
    """
    # The leading "." makes the XPath relative to `table`. A bare "//" is
    # evaluated against the whole document even when called on an element.
    # NOTE(review): assumes the description cell lives inside `table` - confirm
    # on a live page.
    cells = table.find_elements(By.XPATH, './/*[@class="td width-100"]')
    return cells[1].text.strip()
    

In [13]:
# Inspect the raw HTML of the main table to see which elements hold the data.
get_main_table().get_attribute("innerHTML")

'\n        <div>Informacje o temacie pracy dyplomowej</div>\n        <div>\n            <table class="tab tab-no-outer-border info highlight tab-desc width-100">\n                <tbody>\n                    <tr>\n                        <td>\n                            Preferowany język pracy:\n                        </td>\n                        <td>\n                            Polski [PL]\n                        </td>\n                    </tr>\n                    <tr>\n                        <td>\n                            Temat pracy w języku oryginału:\n                        </td>\n                        <td>\n                            <div class="tab width-100">\n                                <div class="tr">\n                                    \n                                    <div name="language_flag" class="td width-0 align-right padding-0">\n    \n        <img src="/static/usos/img/languages/pl.png" style="margin:5px 0 0 0;width:1.4375rem;height:0.8125re

In [14]:
# Try the topic extraction on the currently open page.
get_topic(get_main_table())

'Zastosowanie procesu spiekania do łączenia materiałów'

In [15]:
# Try the description extraction on the currently open page.
get_description(get_main_table())

'Praca będzie obejmowała badanie możliwości zastosowania procesu spiekania do łączenia materiałów. Porównane zostaną dwie metody: bez i z użyciem dodatkowych czynników. Po procesie łączenia zostanie określona mikrostruktura złącza oraz jego podstawowe własności.'

In [16]:
# Confirm the session is logged in (login_wait quits the browser if it is not).
login_wait(driver)

Waiting for login...
Logged in


In [17]:
# Make sure the interface is Polish: fast_wait_pl only looks for the Polish logout text.
change_language_to_pl()

Already in Polish


## Do the actual scraping

In [18]:
# Accumulator for scraped [topic, description, url] records.
# Kept in its own cell so the scraping cell below can be re-run to resume an
# interrupted run without discarding rows that were already collected.
rows = []

In [19]:
import time

# Wall-clock reference shared by both timing messages below.
starttime = time.time()

# Resume support: `rows` survives re-runs of this cell, so skip every URL that
# already has a scraped record (the URL is stored at index 2 of each record).
scraped_urls = {record[2] for record in rows}
urls = tematy_df["url"]
urls = urls[~urls.isin(scraped_urls)]
print("Url's filtered in {} seconds".format(time.time() - starttime))

for url in tqdm(urls):
    driver.get(url)
    # Block until the Polish logout text is present before reading the page.
    fast_wait_pl(driver)
    table = get_main_table()
    topic, description = get_topic(table), get_description(table)
    rows.append([topic, description, url])

print('That took {} seconds'.format(time.time() - starttime))

Url's filtered in 0.0039408206939697266 seconds


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5311/5311 [35:03<00:00,  2.53it/s]

That took 2103.2968134880066 seconds





In [22]:
import pandas as pd

# Turn the scraped records into a DataFrame. Each record is
# [topic, description, url], so the url ends up as the third column.
columns = ["topic", "description", "url"]
df = pd.DataFrame(data=rows, columns=columns)

In [23]:
# Spot-check two random scraped rows.
df.sample(2)

Unnamed: 0,topic,description,url
999,Analiza pracy sieci dystrybucyjnej z instalacj...,Praca o charakterze analitycznym. W ramach pra...,https://apd.usos.agh.edu.pl/topics/show/13579/
516,System pomiarowy i akwizycji danych do analizy...,Praca obejmuje projekt i budowę systemu pomiar...,https://apd.usos.agh.edu.pl/topics/show/5506/


In [24]:
import pickle  # retained: cells outside this view may still reference `pickle`

# Persist the scraped descriptions. DataFrame.to_pickle mirrors the
# pd.read_pickle call used to load tematy_df and manages the file handle itself.
df.to_pickle("descriptions_df.pickle")