In [222]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import urllib.request
import os

In [223]:
vocabs = []
folder_final = './videos'
last_page = 65 # adjust for testing on small batches

In [224]:
# get the link of a video page and the name of the vocabulary being shown
def get_entries_from_list(url):
    answer = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'})
    html = BeautifulSoup(answer.text, 'html.parser')

    # <a href="link" title="Vokabel">Vokabel</a>
    elements = [elem.div.a for elem in html.find_all('div', class_='record')]

    for elem in elements:
        d = {
            'title': elem.get('title'),
            'link': elem.get('href')
        }
        vocabs.append(d)

In [225]:
def cleanup_vocabs_titles():
    changed_titles = []

    # cleanup title extras and change / and : occurances
    for i in range(len(vocabs)):
        vocabs[i]['title'] = vocabs[i]['title'].replace('Details zu \'', '')
        vocabs[i]['title'] = vocabs[i]['title'].replace('\' anzeigen', '')
        vocabs[i]['title'] = vocabs[i]['title'].replace('/', '&sol;')
        vocabs[i]['title'] = vocabs[i]['title'].replace(':', '&colon;')
        vocabs[i]['title'] = vocabs[i]['title'].strip()

    # add _1 to title duplicates
    for i in range(len(vocabs)):
        if(i == 0):
            continue
        if(vocabs[i]['title'].lower() == vocabs[i - 1]['title'].lower()):
            vocabs[i]['title'] = vocabs[i]['title'] + "_1"
            changed_titles.append(vocabs[i]["title"])

    # print(changed_titles)

In [226]:
def get_video_data(url):
    answer = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'})
    html = BeautifulSoup(answer.text, 'html.parser')

    elements = [elem.source for elem in html.find_all('video')]
    video_urls = [elem.get('src') for elem in elements]

    labels = [elem.string for elem in html.find_all('td', class_='label')]
    values = [elem.string for elem in html.find_all('td', class_='value')]

    for i in range(len(labels)):
        labels[i] = labels[i].replace(u'\xa0', '')
        labels[i] = labels[i].replace(':', '')
        values[i] = values[i].replace(u'\xa0', '')
        values[i] = values[i].replace('\n', '')
        values[i] = values[i].replace('\t', '')
    
    meta = dict()
    
    for i, label in enumerate(labels):
        meta[label] = values[i]

    return video_urls, meta

In [227]:
def cleanup_edge_cases():

    # remove vocabs without videos
    for i, vocab in enumerate(vocabs):
        if(len(vocab['video_urls']) == 0):
            vocabs.pop(i)
    
    # very specific edge cases
    vocabs[532]['meta_data']['Name'] = vocabs[532]['meta_data']['Name'].replace(', \u03c72', '')
    vocabs[532]['title'] = vocabs[532]['title'].replace(', \u03c72', '')
    vocabs[1057]['meta_data']['Name'] = 'Fluidität'

In [228]:
# create folder vor vocabulary, fill it with metadata as a txt and all mp4s that belong to that vocab
def download_video_and_meta(vocab):
    folder_path = folder_final + f'/{vocab["title"]}'

    # create a folder
    os.mkdir(folder_path)

    folder_path_file = folder_path + f'/{vocab["title"]}'

    # write metadata in txt
    with open(folder_path_file + '.txt', 'w') as f:
        f.write(str(vocab['meta_data']))

    # download the video(s)
    if(len(vocab['video_urls']) == 0):
        return
    elif(len(vocab['video_urls']) == 1):
        url = vocab['video_urls'][0]
        filename = folder_path_file + '.mp4'
        urllib.request.urlretrieve(url, filename)
        return

    for i, video_url in enumerate(vocab['video_urls']):
        url = video_url
        filename = folder_path_file + f'_{chr(97 + i)}.mp4'
        urllib.request.urlretrieve(url, filename)

In [229]:
vocabs = []
last_page = 65

print('filling vocabs array with title & link...')
get_entries_from_list('https://fachgebaerden.tsc.tuwien.ac.at/gebaerden/action/list/')
for i in tqdm(range(1, last_page, 1)):
    get_entries_from_list(f'https://fachgebaerden.tsc.tuwien.ac.at/gebaerden/action/list/page/{i}/')

print('\n cleaning up vocabs')
cleanup_vocabs_titles()

print('\n getting all mp4 urls and the metadata of each vocab')
for i in tqdm(range(len(vocabs))):
    vocabs[i]['video_urls'], vocabs[i]['meta_data'] = get_video_data(vocabs[i]['link'])

print('\n removing vocabs without video')
cleanup_edge_cases()

vocabs

filling vocabs array with title & link...


100%|██████████| 64/64 [00:40<00:00,  1.56it/s]



 cleaning up vocabs

 getting all mp4 urls and the metadata of each vocab


100%|██████████| 3234/3234 [22:31<00:00,  2.39it/s]



 removing vocabs without video


[{'title': 'Abbildungsverzeichnis',
  'link': 'https://fachgebaerden.tsc.tuwien.ac.at/gebaerden/abbildungsverzeichnis/',
  'video_urls': ['https://fachgebaerden.tsc.tuwien.ac.at/fileadmin/videos/abbildungsverzeichnis_3.mp4',
   'https://fachgebaerden.tsc.tuwien.ac.at/fileadmin/videos/abbildungsverzeichnis_4.mp4'],
  'meta_data': {'Name': 'Abbildungsverzeichnis',
   'Herkunft': 'Neue Gebärde',
   'Themengebiete': 'Informatik'}},
 {'title': 'Abbruchbedingung',
  'link': 'https://fachgebaerden.tsc.tuwien.ac.at/gebaerden/phylogenetisch/',
  'video_urls': ['https://fachgebaerden.tsc.tuwien.ac.at/fileadmin/videos/phylogenetisch_2.mp4'],
  'meta_data': {'Name': 'Abbruchbedingung',
   'Herkunft': 'Existierende Gebärde',
   'Themengebiete': 'Informatik'}},
 {'title': 'Abdichtung',
  'link': 'https://fachgebaerden.tsc.tuwien.ac.at/gebaerden/abdichtung/',
  'video_urls': ['https://fachgebaerden.tsc.tuwien.ac.at/fileadmin/videos/abdichtung_1.mp4'],
  'meta_data': {'Name': 'Abdichtung',
   'Herkunf

In [230]:
folder_final = './videos'
for vocab in tqdm(vocabs):
    download_video_and_meta(vocab)

vocabs

100%|██████████| 3214/3214 [31:50<00:00,  1.68it/s] 


[{'title': 'Abbildungsverzeichnis',
  'link': 'https://fachgebaerden.tsc.tuwien.ac.at/gebaerden/abbildungsverzeichnis/',
  'video_urls': ['https://fachgebaerden.tsc.tuwien.ac.at/fileadmin/videos/abbildungsverzeichnis_3.mp4',
   'https://fachgebaerden.tsc.tuwien.ac.at/fileadmin/videos/abbildungsverzeichnis_4.mp4'],
  'meta_data': {'Name': 'Abbildungsverzeichnis',
   'Herkunft': 'Neue Gebärde',
   'Themengebiete': 'Informatik'}},
 {'title': 'Abbruchbedingung',
  'link': 'https://fachgebaerden.tsc.tuwien.ac.at/gebaerden/phylogenetisch/',
  'video_urls': ['https://fachgebaerden.tsc.tuwien.ac.at/fileadmin/videos/phylogenetisch_2.mp4'],
  'meta_data': {'Name': 'Abbruchbedingung',
   'Herkunft': 'Existierende Gebärde',
   'Themengebiete': 'Informatik'}},
 {'title': 'Abdichtung',
  'link': 'https://fachgebaerden.tsc.tuwien.ac.at/gebaerden/abdichtung/',
  'video_urls': ['https://fachgebaerden.tsc.tuwien.ac.at/fileadmin/videos/abdichtung_1.mp4'],
  'meta_data': {'Name': 'Abdichtung',
   'Herkunf