# Extract government SOU dataset at https://sou.kb.se (Kungliga biblioteket)

This notebook extracts all PDF links from https://sou.kb.se. The links are written to "sou_pdf_links.txt" and, as a batch of wget commands with filesystem-safe output names, to "wget_all.sh". Edit wget_all.sh to limit the download to a smaller range of years if needed – the dataset is very large (hundreds of gigabytes). Make the script executable with chmod +x and then run it in a Linux/Unix environment.

```python
# ASCII-art banner for the notebook.
# Built from one literal per row so the trailing spaces stay visible, and the
# row containing backslashes is a raw string: "\|" is not a valid escape
# sequence and triggers a warning in a normal string literal on newer Pythons.
# The resulting value ends with a newline.
CDHU = "\n".join((
    " ____ ____ ____ ____ ________ ",
    "||C |||D |||H |||U |||       ",
    "||__|||__|||__|||__|||_______",
    r"|/__\|/__\|/__\|/__\|/_______",
    "",
    " EXTRACT SOU / BEAUTIFUL SOUP",
    "",
))
```



In [117]:
# Code: Matts L/CDHU
# Requires: urllib, BeautifulSoup4, pandas
from urllib import request
from bs4 import BeautifulSoup
import re
import pandas as pd 

# Connect to the website and parse its index page.
url = "https://sou.kb.se"
# Read the body inside a `with` block so the HTTP response is closed as soon
# as it has been consumed instead of being left to the garbage collector.
with request.urlopen(url) as index_page:
    response = index_page.read()
soup = BeautifulSoup(response, "html.parser")
# All <a> tags whose href contains "sou-" followed by a digit.
links = soup.find_all('a', href=re.compile(r'(sou-\d)'))

In [118]:
# Walk every SOU link on the index page, request its sub page and collect the
# PDF link(s) found there.
#
# Results go into three parallel lists (name, URN sub-page link, PDF link).
# They are always appended together, one entry per PDF link, so the lists have
# equal length and the same index refers to the same document in all three.
TESTING = False
sou_urn_links = []
sou_names = []
sou_pdf_links = []

# Gets all A tags with SOU:s (href contains "sou-" followed by a digit)
a_tags = soup.find_all('a', href=re.compile(r'(sou-\d)'))

count = 0
limit = 1500  # limit requests when testing

# Compiled once instead of on every iteration; matches hrefs containing ".pdf".
pdf_href = re.compile(r'(\.pdf)')

# Iterates all A tags
for sou_anchor in a_tags:
    count = count + 1
    if count >= limit and TESTING:
        print("testing, limit=" + str(limit))
        break

    # SOU name (link text) and link to its URN sub page
    sou_page = sou_anchor.get('href')
    sou_link_text = sou_anchor.get_text()

    # Request URN page and extract PDF link(s)
    print('++ Requesting URN-sub page ('
          + sou_link_text
          + ') and extracting pdf-link... ',
          end='')
    # Close each sub-page response once it has been read.
    with request.urlopen(sou_page) as sub_page:
        response = sub_page.read()
    soup2 = BeautifulSoup(response, "html.parser")
    found_pdf_links = [
        pdf_anchor.get('href')
        for pdf_anchor in soup2.find_all('a', href=pdf_href)
    ]

    if not found_pdf_links:
        # Recording a name/URN without a PDF link would leave the three lists
        # with different lengths, so the page is skipped entirely.
        print('no pdf-link found, skipping.')
        continue

    for pdf_link in found_pdf_links:
        sou_names.append(sou_link_text)
        sou_urn_links.append(sou_page)
        sou_pdf_links.append(pdf_link)
    print('Done.')


++ Requesting URN-sub page (1922:1 första serien) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:1) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:2 första serien) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:2) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:3 första serien) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:3) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:4 första serien) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:4) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:5 första serien) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:5) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:6 första serien) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:6) and extracting pdf-link... Done.
++ Requesting URN-sub page (1922:7 första serien) and extracting pdf-link... Done.
++ Re

In [119]:
# Package the scraped lists into a DataFrame and write three output files:
#   sous.csv          - table of title, PDF link and URN link
#   wget_all.sh       - one wget command per PDF, with "# <decade>" markers
#   sou_pdf_links.txt - one PDF link per line
import shlex  # shell-safe quoting of the arguments written to wget_all.sh

DECADE = 1920  # set starting decade

# package with pandas (built inline so the builtin `dict` is not shadowed)
df = pd.DataFrame({'titel': sou_names, 'pdf': sou_pdf_links, 'urn': sou_urn_links})

# write to file
# and/or output as string
sous_csv_string = df.to_csv()
try:
    df.to_csv('sous.csv')
except OSError as err:
    # Report the actual cause instead of hiding it behind a bare except.
    print('-- Error!', err)
else:
    print('++ Wrote sous.csv')

# Four-digit year directly followed by ":" in the title, e.g. "1922:1".
year_pattern = re.compile(r'(\d\d\d\d)(\:)')
current_decade = DECADE
try:
    # Titles contain non-ASCII characters, so the encoding is set explicitly
    # rather than depending on the platform default.
    with open('wget_all.sh', 'w', encoding='utf-8') as fp:
        fp.write("#!/bin/bash\n")
        for df_titel, pdf_url in zip(df['titel'], df['pdf']):
            result = year_pattern.search(df_titel)
            # A title without a recognisable year gets no decade marker but
            # still gets its wget line.
            if result is not None:
                title_decade = int(result.group(1)) // 10 * 10
                # Emit a marker whenever a later decade is reached, also when
                # the first title of that decade is not from year xxx0.
                if title_decade > current_decade:
                    current_decade = title_decade
                    fp.write("# " + str(current_decade) + "\n")

            # sanitize
            # TODO: should use more regex and zfill() for better, correctly
            # sortable, filename numbers here...
            out_file = df_titel.replace(" ", "_")
            out_file = out_file.replace(":", "-")
            out_file = out_file.replace("/", "-")
            out_file = 'sou-' + out_file + '.pdf'
            # Both arguments come from a scraped web page; quote them so shell
            # metacharacters cannot alter the command.
            command = ('wget --continue -O ' + shlex.quote(out_file)
                       + ' ' + shlex.quote(pdf_url))
            # write each item on a new line
            fp.write("%s\n" % command)
except OSError as err:
    print("-- Error!", err)
else:
    print('++ Wrote wget_all.sh')

# Write pdf links for wget
# (To parallelize the download, you can use something like:
#  cat sou_pdf_links.txt | parallel --gnu "wget {}")
# Useful => metadata in SOU-pdfs: pdfinfo -enc UTF-8 <filename.pdf> gets full swedish titles and libris ID.
try:
    with open('sou_pdf_links.txt', 'w', encoding='utf-8') as fp:
        # write each item on a new line
        fp.writelines("%s\n" % item for item in sou_pdf_links)
except OSError as err:
    print("-- Error!", err)
else:
    print('++ Wrote sou_pdf_links.txt')

++ Wrote sous.csv
++ Wrote wget_all.sh
++ Wrote sou_pdf_links.txt
