# scrape dta

- öffne Autoren-Seite aus Author-table
- suche alle Einträge mit "Bellistrik :: [Novelle, Roman, Erzählung, usw. Prosa halt]"
- sammle von gefundenen Einträgen Metadaten
- verbinde Metadaten von Eintrag mit Metadaten aus Author-table
- aus Bellistrikkorpus verschiebe Eintrag nach corpus-folder


## Problems

- lots of empty files like Maler Nolten
- other stuff (written in commentary)

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
import re
import time

In [2]:
authors_df = pd.read_csv("src/author_table.tsv", encoding="utf8", sep = "\t")

In [3]:
def get_html(website):
    try:
        with urllib.request.urlopen(website) as f:
           html_text = f.read()#.decode('utf-8')
    except urllib.error.URLError as e:
       print(e.reason)
    return html_text

In [4]:
list_of_genres = ["/Roman/",
                  "/Romane/",
                  "/Erz%C3%A4hlungen/", # Erzählungen
                  "/M%C3%Archen/",
                  "/Erz%C3%Ahlprosa/",
                  "/Prosa/",
                  "/Epik/",
                  "/Legenden/",
                  "/Sagen/"] 

In [5]:
# get metadata

titles = []
authors = []
genres = []
filenames = []
sources = []

for website in list(authors_df.dropna(subset=["Zeno"])["Zeno"]):
    author = authors_df["Voller_Name"].loc[authors_df["Zeno"]==website].values[0]
    
    html_string = get_html(website)
    soup = BeautifulSoup(html_string, 'html.parser')
    
    div = soup.find_all("div", {"class": "zenoTRNavBottom"})
    
    if len(div)>=1:
        links = div[0].find_all("a")      
    else:
        continue
    
    # print(author)
    # print(website)
    
    # find relevant prose works 
    for ele in links:
        for genre in list_of_genres:
            if genre in ele.get("href"):
                # placeholder metadata
                genres.append(genre.replace("/", ""))
                authors.append(author)
                
                # metadata
                titles.append(ele.text)
                
                source = "http://www.zeno.org" + ele.get("href")
                sources.append(source)
                
                filename = author.split()[-1] +"_"+ ele.text.replace(" ", "").replace(".", "").replace(r"*", "") +"_zeno.txt"
                filenames.append(filename)
                
                # debugging
                # print("\t" + source)
                # print("\t" + genre)
                # print("\t" + ele.text)
                # print("\t" + filename)
                # print("\t" + "=========")
            else:
                continue        

In [6]:
df = pd.DataFrame()
df["title"]=titles
df["author"]=authors
df["filename"]=filenames
df["genre"]=genres
df["sources"]=sources

df.head()

Unnamed: 0,title,author,filename,genre,sources
0,"Armut, Reichtum, Schuld und Buße der Gräfin Do...",Achim von Arnim,"Arnim_Armut,Reichtum,SchuldundBußederGräfinDol...",Romane,"http://www.zeno.org/Literatur/M/Arnim,+Ludwig+..."
1,Die Kronenwächter,Achim von Arnim,Arnim_DieKronenwächter_zeno.txt,Romane,"http://www.zeno.org/Literatur/M/Arnim,+Ludwig+..."
2,Der Wintergarten,Achim von Arnim,Arnim_DerWintergarten_zeno.txt,Erz%C3%A4hlungen,"http://www.zeno.org/Literatur/M/Arnim,+Ludwig+..."
3,Isabella von Ägypten,Achim von Arnim,Arnim_IsabellavonÄgypten_zeno.txt,Erz%C3%A4hlungen,"http://www.zeno.org/Literatur/M/Arnim,+Ludwig+..."
4,Die drei liebreichen Schwestern und der glückl...,Achim von Arnim,Arnim_DiedreiliebreichenSchwesternundderglückl...,Erz%C3%A4hlungen,"http://www.zeno.org/Literatur/M/Arnim,+Ludwig+..."


In [7]:
len(df)

503

In [8]:
len(df[df.duplicated(["filename"], keep=False)])

0

In [9]:
# df.to_csv("metadata_Zeno.csv", encoding="utf8", sep=";")

In [10]:
def get_chapter_list(li_soup):
    chapters = []
    
    for li in li_soup:
            if li.b:
                continue
            else:
                if li.a:
                    for genre in list_of_genres:
                        if genre in li.a.get("href"):
                            chapters.append("http://www.zeno.org" + li.a.get("href"))
    return chapters

In [11]:
for text_website in list(df["sources"]):
    filename = df["filename"].loc[df["sources"]==text_website].values[0]
    
    text = ""
    
    html_string = get_html(text_website)
    soup = BeautifulSoup(html_string, 'html.parser') 
    
    lis = soup.find_all("li",{'class': False, 'id': False})
    
    chapters = get_chapter_list(lis)
    
    if len(chapters)>0:
        for chapter in chapters:
            sub_html_string = get_html(chapter)
            sub_soup = BeautifulSoup(sub_html_string, 'html.parser')
            
            elements = sub_soup.find_all("p")
            
            for element in elements:
                text = text + element.text + "\n"
    else:
        elements = soup.find_all("p")

        for element in elements:
            text = text + element.text + "\n"
            
    
    if len(text.split())<5:
        print(filename)
    
    with open("zeno_files/" + filename, "w", encoding="utf8") as f:
                f.write(text)
    time.sleep(1)

Wieland_DieAbenteuerdesDonSylviovonRosalva_zeno.txt
Willkomm_WeisseSclavenoderdieLeidendesVolkes_zeno.txt
Lewald_VonGeschlechtzuGeschlecht_zeno.txt
Freytag_DieverloreneHandschrift_zeno.txt
Wassermann_DasGänsemännchen_zeno.txt
Gutzkow_DerZauberervonRom_zeno.txt
