# Les 07: Webscraping

## Pagina als platte tekst binnennemen

In [None]:
import requests
request = requests.get("https://nl.wikipedia.org/wiki/Vork_(bestek)")
print(request.text)

## Aan de slag met BeautifulSoup
pip install beautifulsoup4

In [None]:
from bs4 import BeautifulSoup
import requests

request = requests.get("https://nl.wikipedia.org/wiki/Vork_(bestek)")
print(request.status_code)
soup = BeautifulSoup(request.text)
print(soup.prettify())


## Pagina bewaren als .html

In [None]:
import requests

request = requests.get("https://nl.wikipedia.org/wiki/Vork_(bestek)")
print(request.status_code)

file = open("vork.html", 'w', encoding="utf-16")
file.write(request.text)
file.close()

print("File is gemaakt!")


## Wat zit er allemaal in de soep?

In [None]:
from bs4 import BeautifulSoup
import requests

request = requests.get("https://nl.wikipedia.org/wiki/Vork_(bestek)")
print(request.status_code)
soup = BeautifulSoup(request.text) # platte HTML code --> request.text

print(soup.title)
print(soup.title.get_text())
print(len(soup.get_text()))
print(soup.get_text()) # pure tekst vanop webpagina

## Lijst van enkel de woorden
Geef een lijst van enkel de woorden!!
https://regexr.com/

In [16]:
from bs4 import BeautifulSoup
import requests
import re

request = requests.get("https://nl.wikipedia.org/wiki/Vork_(bestek)")
print(request.status_code)
soup = BeautifulSoup(request.text) # platte HTML code --> request.text

# ik wil een lijst van gewone woorden!!!
regex = r'\w+' 
result = re.findall(regex, soup.get_text())

print(result)

200
['Vork', 'bestek', 'Wikipedia', 'Vork', 'bestek', 'Uit', 'Wikipedia', 'de', 'vrije', 'encyclopedie', 'Naar', 'navigatie', 'springen', 'Naar', 'zoeken', 'springen', '1', 'Gebaksvork', '2', 'Kaasvork', '3', 'Kindervork', '4', 'Gewone', 'eetvork', '7', 'Vleesvork', 'Gebaksvorkje', 'Een', 'vork', 'is', 'een', 'onderdeel', 'van', 'het', 'bestek', 'en', 'bestaat', 'uit', 'een', 'handvat', 'met', 'daaraan', 'een', 'aantal', 'tanden', 'gewoonlijk', 'drie', 'of', 'vier', 'Een', 'vork', 'wordt', 'vooral', 'gebruikt', 'tijdens', 'het', 'snijden', 'van', 'voedsel', 'om', 'het', 'op', 'de', 'plaats', 'te', 'houden', 'En', 'om', 'afgesneden', 'stukjes', 'naar', 'de', 'mond', 'te', 'brengen', 'Verder', 'overlapt', 'de', 'functie', 'van', 'de', 'vork', 'die', 'van', 'de', 'lepel', 'ten', 'dele', 'veel', 'voedsel', 'kan', 'zowel', 'met', 'een', 'lepel', 'als', 'met', 'een', 'vork', 'naar', 'de', 'mond', 'gebracht', 'worden', 'Traditioneel', 'is', 'een', 'vork', 'gemaakt', 'van', 'metaal', 'maar', '

## Elementen te scrapen

elementen inspecteren : CTRL SHIFT C
<br>developerstools : F12 of CTRL SHIFT i 

< a href="https://an.wikipedia.org/wiki/Forqueta" title="Forqueta – Aragonees" lang="an" hreflang="an" class="interlanguage-link-target">

< a href="https://es.wikipedia.org/wiki/Tenedor" title="Tenedor – Spaans" lang="es" hreflang="es" class="interlanguage-link-target">Español</a>

In [24]:
from bs4 import BeautifulSoup
import requests

request = requests.get("https://nl.wikipedia.org/wiki/Vork_(bestek)")
print(request.status_code)
soup = BeautifulSoup(request.text) # platte HTML code --> request.text

# ik wil verschillende vertalingen van het woord vork

tags = soup.findAll('a',attrs={ "class" : "interlanguage-link-target"})
for tag in tags:
    print(tag["title"])
    # print(tag.get("title"))

# PYTHONIC CODE
names = [a["title"] for a in tags]
print(names)

200
Forqueta – Aragonees
شوكة (أداة) – Arabisch
Tenedor – Asturisch
Patcakihamakan – Atikamekw
Çəngəl – Azerbeidzjaans
Šakotė – Samogitian
Відэлец – Belarussisch
Відэлец – Belarusian (Taraškievica orthography)
Вилица – Bulgaars
কাঁটা চামচ – Bengaals
ཁ་དབྲག – Tibetaans
Fourchetez – Bretons
Forquilla – Catalaans
Vidlička – Tsjechisch
Gaffel (bestik) – Deens
Çatale – Zazaki
Πιρούνι – Grieks
Fursèina – Emiliano-Romagnolo
Fork – Engels
Forko – Esperanto
Tenedor – Spaans
Kahvel – Estisch
Sardexka – Baskisch
چنگال – Perzisch
Haarukka – Fins
Fourchette – Frans
Gabhlóg – Iers
Garfo – Galicisch
מזלג – Hebreeuws
Vilica – Kroatisch
Fouchèt – Haïtiaans Creools
Villa (evőeszköz) – Hongaars
Պատառաքաղ (սպասք) – Armeens
Garpu – Indonesisch
Forketo – Ido
Gaffall – IJslands
Forchetta – Italiaans
フォーク (食器) – Japans
Garpu – Javaans
Tuzart – Kabylisch
Шанышқы – Kazachs
포크 – Koreaans
Çatel – Koerdisch
Furca (cibus) – Latijn
Forschett – Luxemburgs
Force – Lingua Franca Nova
Pirù – Lombardisch
Šakutė – Litouws

## Countries scrapen

https://scrapethissite.com/pages/simple/

In [25]:
# Code Miuno

request = requests.get("https://scrapethissite.com/pages/simple/")

soup = BeautifulSoup(request.text)
tags =soup.findAll('span', attrs={"class" : "country-area"})

lst = []
i = 0
for tag in tags:
    lst.append(tag.get_text())

sorted_lst = sorted(lst)

while i < 10:
    print(sorted_lst[i])
    i+=1

0.0
0.44
1.4E7
1.71E7
1.95
10.0
1001.0
1001450.0
102.0
102.0


In [33]:
# code Kato

from bs4 import BeautifulSoup
import requests


request = requests.get("https://scrapethissite.com/pages/simple/")
print(request.status_code)
soup = BeautifulSoup(request.text) # platte HTML code --> request.text

# landen eruit krijgen
tags = soup.findAll('div',attrs={"class" : "col-md-4 country"})
for tag in tags:
    country = tag('h3',attrs={"class":'country-name'}) # country
    area = tag('span',attrs={"class":'country-area'}) # area
    print(area)
    for land in country:
        landen = land.get_text()
        landen = landen.strip()
        print(landen)
    for gebied in area:
        ruimte = gebied.get_text()
        ruimte = ruimte.strip()
        print(ruimte)

200
[<span class="country-area">468.0</span>]
Andorra
468.0
[<span class="country-area">82880.0</span>]
United Arab Emirates
82880.0
[<span class="country-area">647500.0</span>]
Afghanistan
647500.0
[<span class="country-area">443.0</span>]
Antigua and Barbuda
443.0
[<span class="country-area">102.0</span>]
Anguilla
102.0
[<span class="country-area">28748.0</span>]
Albania
28748.0
[<span class="country-area">29800.0</span>]
Armenia
29800.0
[<span class="country-area">1246700.0</span>]
Angola
1246700.0
[<span class="country-area">1.4E7</span>]
Antarctica
1.4E7
[<span class="country-area">2766890.0</span>]
Argentina
2766890.0
[<span class="country-area">199.0</span>]
American Samoa
199.0
[<span class="country-area">83858.0</span>]
Austria
83858.0
[<span class="country-area">7686850.0</span>]
Australia
7686850.0
[<span class="country-area">193.0</span>]
Aruba
193.0
[<span class="country-area">1580.0</span>]
Åland
1580.0
[<span class="country-area">86600.0</span>]
Azerbaijan
86600.0
[<span

In [31]:
# code Jonas

from bs4 import BeautifulSoup
import requests

request = requests.get("https://scrapethissite.com/pages/simple/")
print(request.status_code)
soup = BeautifulSoup(request.text) # platte HTML code --> request.text

regex=r'[\n\r]*Capital:[ \t]*([^\n\r]*)[\n\r]*Population:[ \t]*([^\n\r]*)'
result = re.findall(regex,soup.get_text())
print(result)

200
[('Andorra la Vella', '84000'), ('Abu Dhabi', '4975593'), ('Kabul', '29121286'), ("St. John's", '86754'), ('The Valley', '13254'), ('Tirana', '2986952'), ('Yerevan', '2968000'), ('Luanda', '13068161'), ('None', '0'), ('Buenos Aires', '41343201'), ('Pago Pago', '57881'), ('Vienna', '8205000'), ('Canberra', '21515754'), ('Oranjestad', '71566'), ('Mariehamn', '26711'), ('Baku', '8303512'), ('Sarajevo', '4590000'), ('Bridgetown', '285653'), ('Dhaka', '156118464'), ('Brussels', '10403000'), ('Ouagadougou', '16241811'), ('Sofia', '7148785'), ('Manama', '738004'), ('Bujumbura', '9863117'), ('Porto-Novo', '9056010'), ('Gustavia', '8450'), ('Hamilton', '65365'), ('Bandar Seri Begawan', '395027'), ('Sucre', '9947418'), ('Kralendijk', '18012'), ('Brasília', '201103330'), ('Nassau', '301790'), ('Thimphu', '699847'), ('None', '0'), ('Gaborone', '2029307'), ('Minsk', '9685000'), ('Belmopan', '314522'), ('Ottawa', '33679000'), ('West Island', '628'), ('Kinshasa', '70916439'), ('Bangui', '4844927'