In [None]:
import bs4

# Load the example page from disk; the file is closed when the block exits.
with open('./data/example.html') as f:
    example_html = f.read()

# Name the parser explicitly. Without it bs4 emits a warning and picks
# whichever parser happens to be installed, so the parsed tree (and the
# prettified output below) can differ from one machine to the next.
soup = bs4.BeautifulSoup(example_html, 'html.parser')
print(type(soup))
print(soup.prettify())

## Exercise with BeautifulSoup
Use BeautifulSoup to extract the titles of all radio programs listed at https://www.dr.dk/radio/programmer:
1. First find how many pages there are
2. Then find all titles on https://www.dr.dk/radio/programmer?side=1
3. Then find all titles on all pages


In [95]:
import bs4
import requests

# Fetch the programme overview page; raise_for_status() raises if the server
# answered with an HTTP error status.
r = requests.get('https://www.dr.dk/radio/programmer')
r.raise_for_status()
soup = bs4.BeautifulSoup(r.text, 'html.parser')

# Step 1 of the exercise: read the page count from the `data-pages` attribute
# of the first element that carries the CSS class `pagination`.
# NOTE(review): pagination[0] raises IndexError when no such element exists,
# and .get() hands back the attribute as it appears in the markup (a str, or
# None when the attribute is missing) -- convert with int() before using it
# as a loop bound.
pagination = soup.select('.pagination')
max_pages = pagination[0].get('data-pages')
print(max_pages)

40


### How to Extract Dates and Prices from Strings.

Remember that the raw data we extracted from the web pages is all of type `str`. To compute statistics about a possible correlation between start times and entry fees, we need to convert the corresponding tuple fields into datetimes and integers, respectively.


Since dates given on the web do not necessarily conform to standardized time formats, we can apply the `dateparser` (https://pypi.python.org/pypi/dateparser) module, which tries to parse arbitrary strings into datetimes.

You can install the module via:

```bash
pip install dateparser
```

You can read more about the module and its capabilities at https://dateparser.readthedocs.io/en/latest/.

In [None]:
%%bash
#pip install dateparser

In [None]:
from tqdm import tqdm
import re
from dateparser import parse
from datetime import datetime


def get_dates_and_prices(scraped_events):
    """
    Clean up scraped event data: get the date as a datetime and the price
    as an integer.

    NOTE: no longer works well with the Kultunaut website, which changed its
    layout and now hides the prices behind a JavaScript function.

    args:
        scraped_events: iterable of four-element tuples
            ``(title, place, date, price)`` whose fields are strings.

    returns:
        A list of two-element tuples, each with a datetime representing the
        start time of an event and an integer representing the price in DKK.
        Events whose date cannot be parsed, or whose time of day is not
        later than 03:00, are left out.
    """
    # (?P<price>...) is a *named capturing group* (not a lookbehind): the
    # text it matches is retrieved with m.group('price'). \d+ matches one or
    # more digits, so the first run of digits in the string is the price.
    price_regexp = re.compile(r"(?P<price>\d+)")

    data_points = []
    # 03:00 as a plain time of day; it does not depend on the current clock.
    three_at_night = datetime.min.replace(hour=3).time()
    for event_data in tqdm(scraped_events):
        _title_str, _place_str, date_str, price_str = event_data

        if 'Free admission' in price_str:
            price = 0
        else:
            # re.search returns None when the string holds no digits; treat
            # that as "no price given" instead of catching every exception.
            m = price_regexp.search(price_str)
            price = int(m.group('price')) if m else 0

        # Keep only the first part when several are given ("a & b",
        # "a - b") and replace '.' with ':' (e.g. "19.30" -> "19:30").
        date_str = date_str.strip().strip('.')
        date_str = date_str.split('&')[0].split('-')[0].replace('.', ':')

        date = parse(date_str)
        # NOTE(review): the 03:00 cut-off presumably drops entries without a
        # real time of day -- confirm the intent.
        if date and date.time() > three_at_night:
            data_points.append((date, price))

    return data_points

def get_dates(scraped_events):
    """
    Clean up the data: turn the raw date string of every event into a date.

    args:
        scraped_events: iterable of three-element tuples
            ``(title, place, date)``.

    returns:
        A list of datetimes, one per event whose date string could be
        parsed and whose time of day is later than 03:00.
    """
    cutoff = datetime.min.replace(hour=3).time()

    def _start_time(event_data):
        # Only the date field is needed; the unpacking still insists on
        # exactly three fields per event.
        _title, _place, raw_date = event_data
        # First part of "a & b" / "a - b", with '.' replaced by ':'.
        cleaned = raw_date.strip().strip('.')
        cleaned = cleaned.split('&')[0].split('-')[0].replace('.', ':')
        return parse(cleaned)

    parsed = (_start_time(event_data) for event_data in tqdm(scraped_events))
    return [moment for moment in parsed if moment and moment.time() > cutoff]

# Parse the start times of all scraped events and show them.
# NOTE(review): `scraped_events` is not defined in this part of the notebook;
# it presumably comes from an earlier scraping cell -- confirm before running.
dates = get_dates(scraped_events)
print(dates)

In [None]:
# Peek at a slice of the parsed start times (indices 10 through 19).
dates[10:20]

## Scraping Images from a Page

In the following code you will use Beautiful Soup to extract all links to images, which are in `img` tags on a web page.

In [None]:
import bs4
import os
import sys
import requests
import shutil


def collect_img_links(url):
    """Request the page at `url` and return the image links it contains.

    Only `img` tags whose `src` attribute is present and starts with 'http'
    contribute to the returned list; the links keep their order of
    appearance in the page. An HTTP error status makes the request raise.
    """
    response = requests.get(url)
    response.raise_for_status()
    page = bs4.BeautifulSoup(response.text, 'html.parser')

    found = []
    for img_tag in page.select('img'):
        src = img_tag.get('src')
        if src and src.startswith('http'):
            found.append(src)
    return found


def download_imgs(links, out_folder="./data/test/"):
    """Download all images from a list of image links into `out_folder`.

    The files are named img1.jpg, img2.jpg, ... in the order of `links`;
    the '.jpg' suffix is used whatever the real image format is.
    `out_folder` is created when it does not exist yet, and it may be given
    with or without a trailing slash.

    Args:
        links: iterable of image URLs.
        out_folder: directory that receives the downloaded files.

    Raises:
        requests.HTTPError: when a link answers with an HTTP error status,
            so that an error page is never saved as an image.
    """
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)

    for img_no, link in enumerate(links, start=1):
        target = os.path.join(out_folder, 'img' + str(img_no) + '.jpg')
        # stream=True copies the body straight to disk instead of holding it
        # in memory; the `with` block releases the connection even when the
        # download or the write fails.
        with requests.get(link, stream=True) as r:
            r.raise_for_status()
            # Have the raw stream undo any gzip/deflate content encoding.
            r.raw.decode_content = True
            with open(target, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        
# Collect the image links from a Google image search page for "minions",
# show them, and download them into the default folder of download_imgs.
links = collect_img_links('https://www.google.dk/search?site=&tbm=isch&source=hp&biw=1163&bih=812&q=minions&oq=minions')
print(links)
download_imgs(links)

# Exercise 2: Writing a Simple Web Crawler

Write a simple web crawler, i.e., a program that recursively extracts all links from web pages. The result of running the web crawler is a dictionary in which each key is the URL of a web page and the corresponding value holds the outgoing links found on that page.


If a page returns a status code other than `200`, we simply disregard that page. See https://en.wikipedia.org/wiki/List_of_HTTP_status_codes for more details on the various HTTP status codes.

In [None]:
def scrape_links(from_url, for_depth, all_links=None):
    """Recursively collect the outgoing links of web pages (exercise stub).

    Intended contract, as stated by the exercise text: the result is a
    dictionary whose keys are page URLs and whose values are the outgoing
    links of that page; pages that do not answer with status code 200 are
    disregarded.

    Args:
        from_url: URL of the page to start from.
        for_depth: presumably the number of link levels to follow --
            to be fixed by the implementation.
        all_links: dictionary of links collected so far; a fresh dictionary
            is created for each call when it is omitted.

    Returns:
        Currently None: the crawling logic is left for the exercise.
    """
    # A `{}` default is created once and shared by every call, so the links
    # of one crawl would leak into the next; build the dictionary per call.
    if all_links is None:
        all_links = {}
    # This is what the exercise asks you to implement!
    pass


# Page from which the crawl starts.
start_url = 'https://www.version2.dk/artikel/google-deepmind-vi-oeger-sikkerheden-mod-misbrug-sundhedsdata-1074452'

# Run the crawler from the start page with for_depth=2.
# NOTE(review): link_dict stays None until scrape_links is implemented.
link_dict = scrape_links(from_url=start_url, for_depth=2)

The web crawler that you wrote above is perhaps not the most performant. If you are interested in more web scraping and in applications of crawlers, have a look at the `scrapy` module (https://scrapy.org).