In [1]:
import pandas as pd

In [2]:
# Read the three CSV inputs; each one becomes its own DataFrame.
chapters, titles, authors = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/chapters_2nd.csv',
        'data/titles_2nd.csv',
        'data/authors_2nd.csv',
    )
)

In [3]:
# Left-join titles onto chapters via title_url, then authors via author_url,
# so every chapter row is kept even when no title/author row matches.
df = pd.merge(
    pd.merge(chapters, titles, on='title_url', how='left'),
    authors,
    on='author_url',
    how='left',
)

In [4]:
# Number of missing values per column of the merged frame.
df.isnull().sum()

title_url        0
chapter_num      0
chapter        923
author_url       0
title            0
year             0
author           0
born             0
died             0
dtype: int64

In [5]:
# Keep only the rows that have a chapter value.
df = df.loc[df.chapter.notna()]

In [6]:
# Drop chapters whose text mentions 'Projekt Gutenberg'.
# regex=False: the pattern is a plain substring, so match it literally.
# .copy(): the result is mutated in place by later cells (df.loc[...] = ...);
# taking an explicit copy avoids SettingWithCopyWarning on those assignments.
df = df.loc[~df.chapter.str.contains('Projekt Gutenberg', regex=False)].copy()

### Imputing years
Looking up publication years might be a task for an API, but because of a hard limit on the number of calls I'll probably impute some dates manually from time to time.
For now we'll remove the titles without a proper year (`o.J.`).

In [7]:
# Give every Zweig row still marked 'o.J.' the year 1904, so these rows are
# not removed by the 'o.J.' filter that follows.
# NOTE(review): 1904 looks like a provisional value that later cells override
# per title - confirm.
zweig_undated = (df.author_url == 'zweig') & (df.year == 'o.J.')
df.loc[zweig_undated, 'year'] = 1904

In [8]:
# Remove all rows whose year is still the 'o.J.' marker.
# .copy(): the next cells assign into this frame in place; an explicit copy
# keeps those writes from triggering SettingWithCopyWarning.
df = df.loc[df.year != 'o.J.'].copy()

In [9]:
# Convert the year column to a numeric dtype so it can be compared with numbers below.
df = df.assign(year=pd.to_numeric(df['year']))

#### Stefan Zweig

In [10]:
# Year assigned to individual Stefan Zweig titles (title -> year).
zweig_years = {
    'Amok': 1922,
    'Die unsichtbare Sammlung': 1925,
    'Erstes Erlebnis': 1911,
    'Marceline Desbordes-Valmore': 1920,
    'Reise nach Rußland': 1928,
    'Sternstunden der Menschheit': 1927,
    'Silberne Saiten': 1901,
    'Die frühen Kränze': 1906,
}
is_zweig = df.author_url == 'zweig'
for zweig_title, zweig_year in zweig_years.items():
    df.loc[is_zweig & (df.title == zweig_title), 'year'] = zweig_year

#### Gottfried August Bürger

In [11]:
# Bürger's 'Gedichte' gets the year 1778.
buerger_gedichte = (df.author_url == 'buerger') & (df.title == 'Gedichte')
df.loc[buerger_gedichte, 'year'] = 1778

#### Ludwig Tieck

In [12]:
# Distinct Tieck titles whose year is still below 10.
df.loc[(df.author_url == 'tieck') & (df.year < 10), 'title'].unique()

array(['Das Jüngste Gericht', 'Des Lebens Überfluß',
       'Das Fest zu Kenelworth',
       'Leben des berühmten Kaisers Abraham Tonelli', 'Pietro von Abano',
       'Tod des Dichters', 'Der Abschied', 'Der Aufzug der Romanze',
       'Fortunat', 'Leben und Tod der heiligen Genoveva',
       'Einzelne Gedichte', 'Gedichte', 'Coriolanus', 'Cymbeline',
       'Macbeth', 'Frauendienst', 'Chronologie der Werke Tiecks',
       'Über Leben und Werk Tiecks'], dtype=object)

In [13]:
# Manually assigned years for Ludwig Tieck titles (title -> year).
# Each title appears exactly once. The previous version of this cell assigned
# 'Tod des Dichters' twice (1834, then 1792), so the second value silently
# overwrote the first and 'Der Abschied' never received a year; 1792 is now
# applied to 'Der Abschied' instead.
# NOTE(review): confirm 1792 for 'Der Abschied' against the source used for
# the other years.
tieck_years = {
    'Des Lebens Überfluß': 1839,
    'Das Fest zu Kenelworth': 1828,
    'Pietro von Abano': 1825,
    'Tod des Dichters': 1834,
    'Leben und Tod der heiligen Genoveva': 1821,
    'Frauendienst': 1812,
    'Der Abschied': 1792,
}
is_tieck = df.author_url == 'tieck'
for tieck_title, tieck_year in tieck_years.items():
    df.loc[is_tieck & (df.title == tieck_title), 'year'] = tieck_year

#### Wilhelm Raabe

In [14]:
# Distinct Raabe titles whose year is still below 10.
df.loc[(df.author_url == 'raabe') & (df.year < 10), 'title'].unique()

array(['Christoph Pechlin', 'Das letzte Recht', 'Der Dräumling',
       'Der gute Tag', 'Der Marsch nach Hause', 'Des Reiches Krone',
       'Die Gänse von Bützow', 'Die Innerste',
       'Die Kinder von Finkenrode', 'Eine Grabrede aus dem Jahr 1609',
       'Eulenpfingsten', 'Fabian und Sebastian', 'Gedelöcke',
       'Gutmanns Reisen', 'Kloster Lugau',
       'Meister Autor oder Die Geschichten vom versunkenen Garten',
       'Sankt Thomas', 'Prinzessin Fisch', 'Unruhige Gäste',
       'Verworrenes Leben. Novellen und Skizzen', 'Vom alten Proteus'],
      dtype=object)

In [15]:
# Manually assigned years for Wilhelm Raabe titles (title -> year).
# NOTE(review): 1906 ('Die Gänse von Bützow') and 1908 ('Eulenpfingsten') look
# like edition years rather than first-publication years - confirm.
raabe_years = {
    'Christoph Pechlin': 1873,
    'Das letzte Recht': 1862,
    'Der Dräumling': 1872,
    'Der Marsch nach Hause': 1870,
    'Des Reiches Krone': 1873,
    'Die Gänse von Bützow': 1906,
    'Die Innerste': 1876,
    'Die Kinder von Finkenrode': 1859,
    'Eine Grabrede aus dem Jahr 1609': 1862,
    'Eulenpfingsten': 1908,
    'Fabian und Sebastian': 1882,
    'Gedelöcke': 1866,
    'Gutmanns Reisen': 1892,
    'Kloster Lugau': 1894,
    'Meister Autor oder Die Geschichten vom versunkenen Garten': 1874,
    'Sankt Thomas': 1866,
    'Prinzessin Fisch': 1883,
    'Verworrenes Leben. Novellen und Skizzen': 1862,
    'Vom alten Proteus': 1879,
}
is_raabe = df.author_url == 'raabe'
for raabe_title, raabe_year in raabe_years.items():
    df.loc[is_raabe & (df.title == raabe_title), 'year'] = raabe_year

#### Friedrich Gottlieb Klopstock

In [17]:
# Distinct Klopstock titles whose year is still below 10.
df.loc[(df.author_url == 'klopstoc') & (df.year < 10), 'title'].unique()

array(['Gedichte', 'Messias'], dtype=object)

In [22]:
# Klopstock's 'Messias' gets the year 1748.
klopstock_messias = (df.author_url == 'klopstoc') & (df.title == 'Messias')
df.loc[klopstock_messias, 'year'] = 1748

In [39]:
# Per-chapter years for Klopstock's 'Gedichte'.
# Each entry is (first_chapter, last_chapter, year) with both chapter bounds
# inclusive; single chapters use the same number for both bounds.
# Chapter 56 has no entry, so its year is left unchanged.
# NOTE(review): confirm the gap at chapter 56 is intended.
klopstock_gedichte_years = [
    (1, 2, 1747),
    (3, 5, 1748),
    (6, 6, 1747),
    (7, 7, 1767),
    (8, 9, 1748),
    (10, 11, 1750),
    (12, 15, 1752),
    (16, 17, 1753),
    (18, 18, 1758),
    (19, 21, 1759),
    (22, 24, 1764),
    (25, 25, 1766),
    (26, 26, 1750),
    (27, 27, 1751),
    (28, 28, 1766),
    (29, 29, 1775),
    (30, 30, 1780),
    (31, 31, 1764),
    (32, 32, 1770),
    (33, 33, 1764),
    (34, 34, 1767),
    (35, 35, 1773),
    (36, 36, 1781),
    (37, 37, 1782),
    (38, 38, 1782),
    (39, 39, 1784),
    (40, 40, 1784),
    (41, 41, 1796),
    (42, 42, 1790),
    (43, 43, 1792),
    (44, 44, 1793),
    (45, 45, 1793),
    (46, 46, 1799),
    (47, 47, 1795),
    (48, 48, 1784),
    (49, 49, 1789),
    (50, 50, 1795),
    (51, 51, 1796),
    (52, 52, 1796),
    (53, 53, 1797),
    (54, 54, 1797),
    (55, 55, 1800),
    (57, 57, 1752),
]
# The author/title part of the mask does not depend on 'year', so compute it once.
in_klopstock_gedichte = (df.author_url == 'klopstoc') & (df.title == 'Gedichte')
for first_chapter, last_chapter, poem_year in klopstock_gedichte_years:
    # Series.between is inclusive on both ends: first <= chapter_num <= last.
    in_chapter_range = df.chapter_num.between(first_chapter, last_chapter)
    df.loc[in_klopstock_gedichte & in_chapter_range, 'year'] = poem_year

#### Max Dauthendey

In [43]:
# Distinct Dauthendey titles whose year is still below 10.
df.loc[(df.author_url == 'dauthend') & (df.year < 10), 'title'].unique()

array(['Das Märchenbriefbuch der heiligen Nächte im Javanerlande',
       'Des großen Krieges Not', 'Geschichten aus den vier Winden',
       'Insichversunkene Lieder im Laub', 'Novellen und Erzählungen',
       'Raubmenschen', 'Einzelne Gedichte', 'Die ewige Hochzeit',
       'Der brennende Kalender', 'Weltspuk'], dtype=object)

In [44]:
# Manually assigned years for Max Dauthendey titles (title -> year).
dauthendey_years = {
    'Des großen Krieges Not': 1914,
    'Geschichten aus den vier Winden': 1915,
    'Insichversunkene Lieder im Laub': 1908,
    'Raubmenschen': 1911,
    'Die ewige Hochzeit': 1907,
    'Der brennende Kalender': 1905,
    'Weltspuk': 1910,
}
is_dauthendey = df.author_url == 'dauthend'
for dauthendey_title, dauthendey_year in dauthendey_years.items():
    df.loc[is_dauthendey & (df.title == dauthendey_title), 'year'] = dauthendey_year

#### Fritz Reuter

In [46]:
# Distinct Reuter titles whose year is still below 10.
df.loc[(df.author_url == 'reuter') & (df.year < 10), 'title'].unique()

array(['Abendteuer des Entspekter Bräsig',
       'Memoiren eines alten Fliegenschimmels', 'Aus der Franzosenzeit',
       'Zwei Gedichte'], dtype=object)

In [49]:
# Reuter's 'Aus der Franzosenzeit' gets 1859, but only on rows whose year is still below 10.
reuter_franzosenzeit = (
    (df.author_url == 'reuter')
    & (df.title == 'Aus der Franzosenzeit')
    & (df.year < 10)
)
df.loc[reuter_franzosenzeit, 'year'] = 1859

In [53]:
# Jean Paul: distinct titles whose year is still below 10.
df.loc[(df.author_url == 'jeanpaul') & (df.year < 10), 'title'].unique()

array(['Aphoristisches', 'Bemerkungen über uns närrische Menschen',
       'Der Komet', 'Der Papierdrache. Erster Theil',
       'Der Papierdrache. Zweiter Theil',
       'Des Geburtshelfers Walther Vierneissel Nachtgedanken über seine verlorenen Fötus-Ideale',
       'Die Vernichtung',
       'Mein Aufenthalt in der Nepomukskirche während der Belagerung der Reichsfestung Ziebingen',
       'Selina oder über die Unsterblichkeit der Seele'], dtype=object)

In [54]:
# Jean Paul's 'Der Komet' gets 1820, but only on rows whose year is still below 10.
jeanpaul_komet = (
    (df.author_url == 'jeanpaul')
    & (df.title == 'Der Komet')
    & (df.year < 10)
)
df.loc[jeanpaul_komet, 'year'] = 1820

In [56]:
# All rows, across every author, whose year is still below 10.
df[df.year < 10]

Unnamed: 0,title_url,chapter_num,chapter,author_url,title,year,author,born,died
3,/anzengru/erzaehlg/erzaehlg.html,1,"Florian Traidmann hieß das kleine, schwächlich...",anzengru,Drei Kleinere Erzählungen,0,Ludwig Anzengruber,1839,1889
4,/anzengru/erzaehlg/erzaehlg.html,2,"Dort, wo der Wald niedergeht und ein Spitz wie...",anzengru,Drei Kleinere Erzählungen,0,Ludwig Anzengruber,1839,1889
5,/anzengru/erzaehlg/erzaehlg.html,3,Ein dichter Nebel lag über der großen Stadt Lo...,anzengru,Drei Kleinere Erzählungen,0,Ludwig Anzengruber,1839,1889
6,/anzengru/gedichte/gedichte.html,0,Verlag der J. G. Cotta'schen Buchhandlung Na...,anzengru,Gedichte,0,Ludwig Anzengruber,1839,1889
7,/anzengru/gedichte/gedichte.html,1,Ludwig Anzengruber hat seine Gedichte weder fü...,anzengru,Gedichte,0,Ludwig Anzengruber,1839,1889
...,...,...,...,...,...,...,...,...,...
52981,/zweig/rezensio/rezensio.html,31,Welche legendarischen und mythenbildenden Kräf...,zweig,Rezensionen 1902 - 1939,0,Stefan Zweig,1881,1942
52982,/zweig/rezensio/rezensio.html,32,Vor etwa einem halben Jahrhundert brachte Ferd...,zweig,Rezensionen 1902 - 1939,0,Stefan Zweig,1881,1942
52983,/zweig/rezensio/rezensio.html,33,Gebrauchsanweisung: man suche zuerst nach eine...,zweig,Rezensionen 1902 - 1939,0,Stefan Zweig,1881,1942
52984,/zweig/rezensio/rezensio.html,34,Iwan Gontscharow: \n OblomowDie Gesetze d...,zweig,Rezensionen 1902 - 1939,0,Stefan Zweig,1881,1942
