## Data Scraping

As a first step we will scrape the necessary data to proceed, namely:
- the publicly available titles of each author
- publishing year, if it is stored in the metadata
- text strings split into single chapters

In [1]:
from bs4 import BeautifulSoup
import requests

In [1]:
import pandas as pd

In [3]:
from src.scraping import scrape_titles_years
from src.utils import dict_to_csv, csv_to_dict

In [4]:
# Walk over every author URL in the author table and flatten whatever
# scrape_titles_years() yields for each of them into one list of records.
authors_df = pd.read_csv('data/authors.csv')
titles_years_list = [
    record
    for author_url in authors_df.author_url
    for record in scrape_titles_years(author_url)
]

# Each collected record becomes one row of the four-column title table.
df = pd.DataFrame(
    titles_years_list,
    columns=['author_url', 'title', 'title_url', 'year'],
)
df.to_csv('data/titles.csv', index=False)

In [3]:
# Read the title table back from data/titles.csv (the file written by the title-scraping cell).
titles_df = pd.read_csv('data/titles.csv')

In [4]:
from src.scraping import scrape_chapters

In [6]:
# One scrape_chapters() result per title URL, in the order of titles_df.
chapters_list = [scrape_chapters(title_url) for title_url in titles_df.title_url]

# NOTE(review): every result is used as a single three-field row here, while
# the second attempt further down treats scrape_chapters() results as flat
# lists holding several chapters — confirm which shape the function returns.
df = pd.DataFrame(
    chapters_list,
    columns=['title_url', 'chapter_num', 'chapter'],
)
df.to_csv('data/chapters.csv', index=False)

## Using APIs
If scraping the years from projekt-gutenberg.org does not yield any results, we'll have to rely on APIs. I am trying to avoid that, though, because of their hard limits.

In [None]:
from src.utils import PrivateKeysHandler

In [22]:
# Path handed to PrivateKeysHandler; the name says it is relative, presumably
# to the notebook's working directory — TODO confirm against src.utils.
relative_path_to_file = '.env'
keys = PrivateKeysHandler(relative_path_to_file)
# The result is indexed by key name in the (commented-out) request cell below,
# e.g. api_key_dict['google_web_search_key'].
# NOTE(review): 'APIs' presumably selects a section/group of keys — confirm.
api_key_dict = keys.load_keys('APIs')

In [52]:
# import requests

# url = "https://google-web-search.p.rapidapi.com/"

# querystring = {"query":"theodor+fontane+effi+briest","max":"1"}

# headers = {
# 	"X-RapidAPI-Key": api_key_dict['google_web_search_key'],
# 	"X-RapidAPI-Host": "google-web-search.p.rapidapi.com"
# }

# response = requests.request("GET", url, headers=headers, params=querystring)

# print(response.text)

{"search_term":"theodor+fontane+effi+briest","search_site":null,"knowledge_panel":{"name":"Effi Briest","label":"Novel by Theodor Fontane","description":{"text":"Effi Briest is a realist novel by Theodor Fontane. Published in book form in 1895, Effi Briest marks both a watershed and a climax in the poetic realism of literature.","url":"https:\/\/en.wikipedia.org\/wiki\/Effi_Briest","site":"Wikipedia"},"image":{"url":"https:\/\/encrypted-tbn0.gstatic.com\/images?q=tbn:mP-X_DD_yDH2zM","width":181,"height":278,"page_url":"https:\/\/books.google.com\/books\/about\/Effi_Briest.html?id=OYX5JoRGQkoC&source=kp_cover"},"info":[{"title":"Originally published","labels":["1894"]},{"title":"Author","labels":["Theodor Fontane"]},{"title":"Genre","labels":["Novel"]},{"title":"Adaptations","labels":["Effi Briest (1974)","The False Step (1939)"]}]},"results":[{"position":1,"url":"https:\/\/en.wikipedia.org\/wiki\/Effi_Briest","title":"Effi Briest - Wikipedia","description":"Effi Briest is a realist nov

In [59]:
#int(response.json()['knowledge_panel']['info'][0]['labels'][0])

1894

# Second attempt


In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
import pandas as pd

In [3]:
from src.scraping import scrape_titles, scrape_title_years

In [9]:
# Gather the title records of every author in the second author list.
authors_df = pd.read_csv('data/authors_2nd.csv')
titles_list = []
for url in authors_df.author_url:
    titles_list.extend(scrape_titles(url))

# Build the frame from titles_list, i.e. the records collected just above.
# The previous version passed titles_years_list, a variable of the first
# attempt: undefined on a fresh kernel, and holding four-field rows that do
# not fit the three column labels used here.
df = pd.DataFrame(data=titles_list, columns=['author_url', 'title', 'title_url'])
df.to_csv('data/titles_2nd.csv', index=False)

In [None]:
# Re-open the title table and look up the year(s) for every title URL.
df = pd.read_csv('data/titles_2nd.csv')
title_years = [
    year
    for title_url in df['title_url']
    for year in scrape_title_years(title_url)
]

# The flattened list is attached as a column, so it has to contain exactly one
# entry per row of df; the result overwrites the CSV that was just read.
df['year'] = title_years
df.to_csv('data/titles_2nd.csv', index=False)

In [10]:
# Display the author table (read from data/authors_2nd.csv in the title-scraping cell).
# NOTE(review): the 'Unnamed: 0' column in the output looks like a row index
# that was saved into the CSV — confirm, and consider index_col=0 when reading.
authors_df

Unnamed: 0,author,author_url,born,died
0,Ludwig Anzengruber,anzengru,1839,1889
1,Achim von Arnim,arnim,1781,1831
2,Bettina von Arnim,arnimb,1785,1859
3,Johann Jakob Bodmer,bodmer,1698,1783
4,Ludwig Börne,boerne,1786,1837
...,...,...,...,...
105,Georg Weerth,weerth,1822,1856
106,Johann Karl Wezel,wezel,1747,1819
107,Christoph Martin Wieland,wieland,1733,1813
108,Ernst von Wildenbruch,wildenbr,1845,1909


In [11]:
# Read the second-attempt title table back from data/titles_2nd.csv.
titles_df = pd.read_csv('data/titles_2nd.csv')

In [12]:
from src.scraping import scrape_chapters

In [13]:
# One scrape_chapters() result per title URL; the results are kept as returned
# (appended, not flattened) and are regrouped into rows in a later cell.
chapters_list = [scrape_chapters(title_url) for title_url in titles_df.title_url]

Right now the `scrape_chapters` function returns one flat list per title, in which every chapter takes up three consecutive entries.
I should probably extend `chapters_list` instead of appending to it, or let the function return a more structured list. For now, though, I have to go through `chapters_list` again to create my DataFrame and save it.

In [26]:
# Column layout of the chapter table. Every entry of chapters_list is consumed
# in consecutive groups of this many values, one group per chapter row.
CHAPTER_COLUMNS = ['title_url', 'chapter_num', 'chapter']
width = len(CHAPTER_COLUMNS)

chapters = []
for flat_record in chapters_list:
    # Floor division instead of round(): round() rounds up when the length
    # leaves a remainder of 2, which made the loop index past the end of the
    # list (IndexError). Only complete groups are turned into rows; a trailing
    # incomplete group is skipped.
    complete_groups = len(flat_record) // width
    for group in range(complete_groups):
        start = group * width
        chapters.append(list(flat_record[start:start + width]))

df = pd.DataFrame(data=chapters, columns=CHAPTER_COLUMNS)
df.to_csv('data/chapters_2nd.csv', index=False)

In [27]:
# (rows, columns) of the chapter table built in the previous cell.
df.shape

(41022, 3)

We have 41022 chapters, which surprises me, since I scraped more authors this time.

In [28]:
# Column labels of the chapter table.
df.columns

Index(['title_url', 'chapter_num', 'chapter'], dtype='object')

In [29]:
# Number of distinct title URLs that appear in the chapter table.
df.title_url.nunique()

1952