## Webscraping
Website: [Maturaarbeiten Archiv](https://maturitaetsarbeiten.ch/cms/archiv.html)

In [1]:
from algoliasearch.search_client import SearchClient
import boto3

from bs4 import BeautifulSoup
from uuid import uuid4
import requests
import random
import string
import sqlite3
import json
import re
import os

In [3]:
# --- Notebook-wide configuration and shared clients -------------------------
# Root of the site being scraped; relative hrefs found on its pages are
# appended to this.
BASE_URL = "https://maturitaetsarbeiten.ch"
# Base URL of the local HTTP API that receives the scraped records
# (presumably a PocketBase instance, given the /api/collections/... routes
# used below - confirm).
PB_URL = "http://127.0.0.1:8090"
# Path of the A-Z index page that links to the individual thesis pages.
HOME_URL = "/cms/archiv/a-z-namen-2020.html"
# NOTE(review): HOME_FILE is not referenced anywhere in the visible cells.
HOME_FILE = "home.html"

# Download and parse the index page once; `links` drives the scrape loop.
page = requests.get(BASE_URL + HOME_URL)
soup = BeautifulSoup(page.content, "html.parser")

# NOTE(review): `container` is looked up but never used in the visible cells;
# `links` is collected from the whole document, not from `container`.
container = soup.find("div", class_="uk-child-width-1-1 uk-child-width-1-1@m uk-grid-match")
links = soup.find_all("a", class_="uk-link-reset")

# NOTE(review): the SQLite connection and cursor are opened here but not used
# in the visible cells.
conn = sqlite3.connect("pb_data/data.db")
cursor = conn.cursor()

# NOTE(review): the Algolia application ID and API key are hard-coded in the
# notebook; consider loading them from the environment and rotating the key.
client = SearchClient.create("K4983KEQZ8", "0599d99b7c0aa6c92daab423161b4acd")
index = client.init_index("arbeiten")

# S3 client used by upload_file(); credentials come from boto3's default
# configuration since none are passed here.
s3_client = boto3.client("s3")

In [15]:
def upload_file(url):
  """Download `url` and store its content in the "matura-archive" S3 bucket.

  The object is uploaded under a fresh random key with a public-read ACL and
  a content type derived from the file extension in the URL path.

  Args:
    url: Absolute URL of a pdf, jpg, jpeg or png file.

  Returns:
    A dict with the keys "name", "key", "url", "type" and "size" describing
    the uploaded object.

  Raises:
    ValueError: If the URL path does not end in a supported extension.
    requests.HTTPError: If the download answers with an error status.
  """
  import io
  import posixpath
  from urllib.parse import urlparse

  content_types = {
    "pdf": "application/pdf",
    "jpg": "image/jpeg",  # "image/jpg" is not a registered MIME type
    "jpeg": "image/jpeg",
    "png": "image/png",
  }

  # Look only at the URL path so a query string or fragment cannot end up in
  # the extension.
  ending = posixpath.splitext(urlparse(url).path)[1].lstrip(".").lower()
  if ending not in content_types:
    raise ValueError(f"Unsupported file extension {ending!r} in URL: {url}")
  content_type = content_types[ending]

  r = requests.get(url)
  # Do not upload an HTML error page in place of the requested file.
  r.raise_for_status()

  key = uuid4().hex
  new_url = "https://matura-archive.s3.eu-central-1.amazonaws.com/" + key
  extra_args = {'ACL': 'public-read', 'ContentType': content_type}

  # Upload straight from memory instead of going through a scratch file in
  # the working directory.
  s3_client.upload_fileobj(io.BytesIO(r.content), "matura-archive", key, ExtraArgs=extra_args)

  return {"name": "Anhang", "key": key, "url": new_url, "type": content_type, "size": len(r.content)}

In [16]:
def _create_record(collection, payload):
  """POST `payload` to the given collection and return the new record's id."""
  r = requests.post(PB_URL + "/api/collections/" + collection + "/records", json=payload)
  # Surface the server's error instead of failing later with KeyError: 'id'.
  r.raise_for_status()
  return r.json()["id"]


def create_entry(page):
  """Store one parsed thesis page and everything it references.

  Every subject, supervisor, link and attachment is first created as its own
  record; the corresponding values in `page` are replaced in place by the new
  record ids. The thesis file and all attachments are uploaded through
  upload_file(). Finally `page` itself is posted to the "archiv" collection
  and either 200 or the response body is printed.

  Args:
    page: Dict with at least the keys "fach", "betreuer", "links", "anhang"
      (lists) and "arbeit" (URL of the thesis file). Modified in place.

  Raises:
    requests.HTTPError: If creating one of the referenced records fails.
  """
  # (key in `page`, target collection, field name in the created record)
  simple_relations = (
    ("fach", "fach", "name"),
    ("betreuer", "betreuer", "name"),
    ("links", "link", "url"),
  )
  for page_key, collection, field in simple_relations:
    values = page[page_key]
    for i, value in enumerate(values):
      values[i] = _create_record(collection, {field: value})

  attachments = page["anhang"]
  for i, attachment_url in enumerate(attachments):
    attachments[i] = _create_record("datei", upload_file(attachment_url))

  page["arbeit"] = _create_record("datei", upload_file(page["arbeit"]))

  r = requests.post(PB_URL+"/api/collections/archiv/records", json=page)

  print(200 if r.status_code == 200 else r.text)

In [17]:
def parse_fach(f):
  """Split a subject heading into a list of clean subject names.

  The text after the first ": " is taken as the subject list. Parenthesised
  notes are dropped, and ", ", " und " and "/" are all treated as separators.

  Args:
    f: Object with a `.text` attribute, e.g. the tag holding
      "Fach: Biologie, Chemie".

  Returns:
    List of subject names without surrounding whitespace or empty entries.

  Raises:
    IndexError: If the text contains no ": " separator.
  """
  subjects = f.text.strip().split(": ")[1]
  # Non-greedy so that "A (x) / B (y)" only loses the two notes, not " / B ".
  subjects = re.sub(r"\(.+?\)", "", subjects)
  subjects = subjects.replace(", ", " / ").replace(" und ", " / ")

  # Stripping each part removes the whitespace left behind by dropped notes;
  # skipping empty parts also covers a dangling trailing "/".
  return [part.strip() for part in subjects.split("/") if part.strip()]

In [18]:
def parse_page(url, BASE_URL, i):
  """Fetch one thesis page and extract its metadata.

  Args:
    url: Path of the thesis page, appended to `BASE_URL`.
    BASE_URL: Site root; also prefixed to the thesis file and attachment hrefs.
    i: Running index of the page; only used to pick "r", "g" or "b" as colour.

  Returns:
    Dict with the keys "jahr", "schule", "sprache", "farbe", "name", "titel",
    "untertitel", "betreuer", "fach", "intro", "abstract", "arbeit", "anhang"
    and "links".
  """
  response = requests.get(BASE_URL + url)
  document = BeautifulSoup(response.content, "html.parser")

  # Header tile: two <h2> (year, author), the <h1> title, optional <h3>.
  header = document.find("div", class_="uk-tile-primary uk-tile")
  year_tag, author_tag = header.find_all("h2")
  title_tag = header.find("h1")
  subtitle_tag = header.find("h3")

  # People grid: every panel but the last is a supervisor, the last the school.
  people = document.find("div", class_="tm-grid-expand uk-grid-collapse")
  *supervisor_tags, school_tag = people.find_all("div", class_="uk-panel uk-margin")

  # Body tile: the first anchor is the thesis file, the rest are extras.
  body = document.find("div", class_="uk-tile-default uk-tile")
  thesis_anchor, *extra_anchors = body.find_all("a")
  subject_tag = body.find("h6")
  intro_tag = body.find("div", class_="uk-panel uk-text-lead uk-margin")
  if not intro_tag:
    # Some pages use the emphasised variant of the lead paragraph instead.
    intro_tag = body.find("div", class_="uk-panel uk-text-lead uk-text-emphasis uk-margin")
  abstract_tag = body.find("div", class_="uk-panel uk-margin")

  # Anchors inside centred blocks anywhere on the page count as extras too.
  for centred in document.find_all("div", class_="uk-margin uk-text-center"):
    extra_anchors.extend(centred.find_all("a"))

  # Site-local hrefs ("/cms...") are attachments, everything else is a link.
  attachments = []
  external_links = []
  for anchor in extra_anchors:
    href = anchor["href"]
    if href.startswith("/cms"):
      attachments.append(BASE_URL + href)
    else:
      external_links.append(href)

  return {
    "jahr": int(year_tag.text.strip()),
    "schule": school_tag.text.strip().replace("Schule: ", ""),
    "sprache": "Deutsch",
    "farbe": ["r", "g", "b"][i % 3],
    "name": author_tag.text.strip(),
    "titel": title_tag.text.strip(),
    "untertitel": getattr(subtitle_tag, "text", "").strip(),
    "betreuer": [tag.text.strip().split(": ")[1] for tag in supervisor_tags],
    "fach": parse_fach(subject_tag),
    "intro": getattr(intro_tag, "text", "").strip(),
    "abstract": abstract_tag.text.strip(),
    "arbeit": BASE_URL + thesis_anchor["href"],
    "anhang": attachments,
    "links": external_links,
  }

In [19]:
def parse_hit(hit):
  """Convert one fetched record into the object that gets indexed.

  Bookkeeping fields ("collectionId", "collectionName", "updated", "expand",
  "id") and the file/link relations ("anhang", "arbeit", "links") are dropped.
  Where expanded "betreuer" / "fach" records are present, they replace the
  ids at the same positions, stripped of the bookkeeping fields and of
  "created". The record id is stored under "objectID".

  The input dict and its lists are left untouched.

  Args:
    hit: Record dict; must contain "id". "expand" and the relation keys are
      optional.

  Returns:
    A new dict ready for indexing.

  Raises:
    KeyError: If `hit` has no "id".
  """
  record_id = hit["id"]
  expand = hit.get("expand") or {}

  bookkeeping = {"collectionId", "collectionName", "updated", "expand", "id"}
  dropped_top_level = bookkeeping | {"anhang", "arbeit", "links"}
  # The top-level record keeps "created"; the nested relation records do not.
  dropped_nested = bookkeeping | {"created"}

  record = {k: v for k, v in hit.items() if k not in dropped_top_level}

  for relation in ("betreuer", "fach"):
    expanded = expand.get(relation, [])
    if not expanded:
      continue
    # Work on a copy so the caller's list of ids is not overwritten.
    merged = list(record[relation])
    for i, obj in enumerate(expanded):
      merged[i] = {k: v for k, v in obj.items() if k not in dropped_nested}
    record[relation] = merged

  record["objectID"] = record_id

  return record

In [27]:
# Scrape every thesis page linked from the index and store it.
# A failure on one entry is printed and the loop carries on with the next
# link, so a single broken page does not abort the whole run.
for n, i in enumerate(links):
  try:
    # n doubles as the running index that parse_page uses to pick a colour.
    page = parse_page(i["href"], BASE_URL, n)
    create_entry(page)
    # Progress marker: index of the entry that was just processed.
    print(n)
  except Exception as err:
    print(f"Error {n}:", err)

200
0
200
1
200
2
200
3
200
4
200
5
200
6
200
7
200
8
200
9
200
10
200
11
200
12
200
13
200
14
200
15
200
16
200
17
200
18
200
19
200
20
200
21
200
22
200
23
200
24
200
25
200
26
200
27
200
28
200
29
200
30
200
31
200
32
200
33
200
34
200
35
200
36
200
37
200
38
200
39
200
40
200
41
200
42
200
43
200
44
200
45
200
46
200
47
200
48
200
49
200
50
200
51
200
52
200
53


In [31]:
# Fetch stored thesis records with their relations expanded, reshape them
# with parse_hit() and push them to the search index.
# NOTE(review): only result page 2 (100 records per page) is requested here -
# confirm that this is the intended slice.
hits = requests.get(
  PB_URL + "/api/collections/archiv/records",
  params={
    "page": 2,
    "perPage": 100,
    "sort": "created,id",
    "expand": "betreuer,fach,links,arbeit,anhang",
  },
).json()

hits = [parse_hit(item) for item in hits["items"]]

index.save_objects(hits)

<algoliasearch.responses.IndexingResponse at 0x7fcc70104d30>

In [26]:
len(links)

54