In [134]:
#!pip install -q selenium

In [135]:
import pandas as pd
import re
import codecs
import os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.action_chains import ActionChains

import time, random

# Data

In [136]:
# Directory holding the notebook's data files: the poet URL list (poets.txt),
# the scraped poems (PoetryData.csv) and the exported corpus (style_input.txt).
path_data = "./Data"

## Web scraping

In [137]:
def extract_poem(driver):
    """Return the poem text shown on the page the driver is currently on.

    The text of every ``<p>`` element found inside the ``card-body`` element
    is collected, and the paragraphs are joined with a blank line between them.
    """
    card = driver.find_element(By.CLASS_NAME, "card-body")
    paragraphs = [par.text for par in card.find_elements(By.CSS_SELECTOR, "p")]
    return "\n\n".join(paragraphs)

def iterate_poems(driver, df=None):
    """Scrape every poem linked from the poet page the driver is showing.

    Each link inside the page's table body is opened in a new tab, the poem
    text is read with ``extract_poem`` and the tab is closed again.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver currently positioned on a poet page.
    df : pandas.DataFrame, optional
        Previously scraped poems; links whose text already appears in its
        ``"Title"`` column are skipped.

    Returns
    -------
    tuple
        ``(author, titles, poems)`` where ``author`` is the text of the
        ``poet__name`` element and ``titles`` / ``poems`` are parallel lists
        holding only the entries scraped by this call.

    Notes
    -----
    Exceptions raised while reading a poem still propagate, but the poem tab
    is closed and focus is returned to the listing first.
    """

    def get_elements():
        # Poem links are the anchors inside the listing table body.
        mytable = driver.find_element(By.CSS_SELECTOR, 'tbody')
        return mytable.find_elements(By.TAG_NAME, "a")

    def wait_for_new_tab(known_handles, timeout=5.0, poll=0.25):
        # Return the handle of a window that was not open before, or None if
        # none shows up within `timeout` seconds.
        waited = 0.0
        while True:
            opened = [h for h in driver.window_handles if h not in known_handles]
            if opened:
                return opened[-1]
            if waited >= timeout:
                return None
            time.sleep(poll)
            waited += poll

    poems = []
    titles = []
    auth_elem = driver.find_element(By.CLASS_NAME, "poet__name")
    author = auth_elem.text
    main_window = driver.current_window_handle
    # Build the lookup once instead of scanning the column for every link.
    known_titles = set(df["Title"].values) if df is not None else set()

    for link in get_elements():
        title = link.text
        if title in known_titles:
            continue

        # Open link in new tab.
        # NOTE(review): Ctrl+Enter is the Windows/Linux shortcut; macOS
        # presumably needs Keys.COMMAND -- confirm before running there.
        handles_before = set(driver.window_handles)
        link.send_keys(Keys.CONTROL + Keys.RETURN)
        new_tab = wait_for_new_tab(handles_before)
        if new_tab is None:
            # No tab opened: closing the "current" window now would close
            # the listing page itself, so skip this link instead.
            continue
        driver.switch_to.window(new_tab)

        try:
            # Extract poem
            time.sleep(2)
            poem = extract_poem(driver)
            time.sleep(5)
        finally:
            # Always close the poem tab and put focus back on the listing,
            # even when the poem could not be read.
            driver.close()
            driver.switch_to.window(main_window)

        # Record title and poem together so the two lists stay aligned.
        titles.append(title)
        poems.append(poem)
        time.sleep(2)

    # Scroll back to the author element at the top of the listing.
    ActionChains(driver).move_to_element(auth_elem).perform()

    return author, titles, poems

def iterate_web(driver, web, df=None, max_p=5):
    """Scrape the poems of one poet page, following its pagination.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used for the scrape.
    web : str
        URL of the poet page to load.
    df : pandas.DataFrame, optional
        Poems scraped so far (columns ``Poet``, ``Poem``, ``Title``). New
        rows are appended to it; an empty frame is created when omitted.
    max_p : int, default 5
        Maximum number of listing pages to visit.

    Returns
    -------
    pandas.DataFrame
        ``df`` extended with the poems found on the visited pages.
    """
    driver.get(web)
    time.sleep(1)

    if df is None:
        df = pd.DataFrame({"Poet":[], "Poem":[], "Title":[]})

    more_next = True
    i = 0
    while more_next and i < max_p:
        # Obtain all poems of the author listed on the current page.
        author, titles, poems = iterate_poems(driver, df)

        # Only append when the page produced something new; concatenating an
        # empty frame adds nothing.
        if poems:
            df_aux = pd.DataFrame({"Poet":[author]*len(poems),
                                    "Poem":poems,
                                    "Title":titles})
            df = pd.concat([df, df_aux], ignore_index=True)

        try:
            time.sleep(5)
            # Remove spam covering next
            link = driver.find_element(By.XPATH,
                                        '/html/body/w-div/span')
            link.click()
            time.sleep(0.5)
        except (NoSuchElementException, ElementNotInteractableException):
            # Best effort: the overlay is not always present.
            pass

        try:
            # Click next
            link = driver.find_element(By.CSS_SELECTOR,
                                        '[aria-label="Go to next page"]')
            link.click()
            # Give the next listing time to load before it is scraped, the
            # same way the initial page load is followed by a short wait.
            time.sleep(1)
        except (NoSuchElementException, ElementNotInteractableException):
            # No usable "next" control: this was the last page.
            more_next = False

        i += 1

    return df

def extract_webs(webs, df=None):
    """Scrape every poet page in ``webs`` with a single Chrome session.

    Parameters
    ----------
    webs : iterable of str
        URLs of the poet pages to scrape.
    df : pandas.DataFrame, optional
        Poems scraped on a previous run; new poems are appended to it. An
        empty frame with columns ``Poet``, ``Poem``, ``Title`` is used when
        omitted.

    Returns
    -------
    pandas.DataFrame
        All poems, previous and newly scraped.

    Notes
    -----
    The frame is written to ``PoetryData.csv`` inside ``path_data`` after each
    page, so a failure part-way through keeps the pages already finished.
    """
    driver = webdriver.Chrome()
    try:
        if df is None:
            df = pd.DataFrame({"Poet":[], "Poem":[], "Title":[]})

        # Make sure the checkpoint below has somewhere to write to.
        os.makedirs(path_data, exist_ok=True)

        for web in webs:
            df = iterate_web(driver, web, df)
            df.to_csv(os.path.join(path_data, "PoetryData.csv"),index=False)
    finally:
        # quit() ends the whole session (all windows and the driver process);
        # close() would only close the current window, and only on success.
        driver.quit()
    return df

In [138]:
# Load the poet page URLs, one per line. Lines read from a file keep their
# trailing "\n" (all but possibly the last), so strip each one and ignore
# blank lines before handing the URLs to the browser.
with open(os.path.join(path_data, "poets.txt"), "r") as f:
    webs = [line.strip() for line in f if line.strip()]

print(webs)

['https://poets.org/poet/e-e-cummings']


In [139]:
# Resume from the previous scrape when there is one. On a first run the CSV
# does not exist yet, so start from an empty frame instead of failing in
# read_csv (extract_webs creates the frame when given None).
poetry_csv = os.path.join(path_data, "PoetryData.csv")
df = pd.read_csv(poetry_csv) if os.path.exists(poetry_csv) else None
df = extract_webs(webs, df)

In [162]:
# Write the full frame back to PoetryData.csv, without the index column.
df.to_csv(os.path.join(path_data, "PoetryData.csv"),index=False)

## Feature extraction

In [141]:
# Reload the scraped poems from disk and list the available columns.
df = pd.read_csv(os.path.join(path_data, "PoetryData.csv"))
df.columns

Index(['Poet', 'Poem', 'Title'], dtype='object')

In [142]:
# Keep the three text columns, reordered as Title / Poem / Poet.
df_red = df[["Title", "Poem", "Poet"]]


In [148]:
# Non-null title count per poet, keeping the 11 largest counts
# (ties resolved in favour of the first occurrence).
df2 = df_red.groupby(['Poet'])['Title'].count().nlargest(n=11, keep="first")

In [149]:
# Display the per-poet title counts computed in the previous cell.
df2

Poet
Langston Hughes        50
Naomi Shihab Nye       50
Robert Frost           50
Walt Whitman           50
William Shakespeare    50
W. B. Yeats            39
Emily Dickinson        35
E. E. Cummings         33
John Keats             30
William Wordsworth     30
Name: Title, dtype: int64

In [160]:
# Discard every row that has a missing value in any column.
df_red = df_red.dropna(axis=0, how="any")

# One row per title: the first occurrence is kept and the index renumbered.
a = df_red.drop_duplicates(subset="Title", keep="first", ignore_index=True)

In [161]:
# Title count per poet after de-duplication, 11 largest counts.
a.groupby(['Poet'])['Title'].count().nlargest(n=11, keep="first")

Poet
William Shakespeare    48
Robert Frost           33
Walt Whitman           33
Langston Hughes        30
W. B. Yeats            28
Naomi Shihab Nye       26
Emily Dickinson        22
John Keats             19
E. E. Cummings         14
Name: Title, dtype: int64

In [157]:
# Cap every poet at their first 30 rows and renumber the index from zero.
df2 = df_red.groupby(['Poet']).head(n=30).reset_index(drop=True)

In [158]:
# Display the capped frame (pandas elides the middle rows in the output).
df2

Unnamed: 0,Title,Poem,Poet
0,Nothing Gold Can Stay,"Nature’s first green is gold,\nHer hardest hue...",Robert Frost
1,The Road Not Taken,"Two roads diverged in a yellow wood,\nAnd sorr...",Robert Frost
2,A Line-storm Song,"The line-storm clouds fly tattered and swift, ...",Robert Frost
3,Stopping by Woods on a Snowy Evening,Whose woods these are I think I know.\nHis hou...,Robert Frost
4,"Out, Out–",The buzz-saw snarled and rattled in the yard\n...,Robert Frost
...,...,...,...
265,Songs (I),,E. E. Cummings
266,Amores (VII),,E. E. Cummings
267,Songs (II),,E. E. Cummings
268,Amores (VIII),,E. E. Cummings


In [None]:
# Cleaning: normalise line endings and stray whitespace in the text columns.

# Poems: drop carriage returns, then trailing newlines.
df_red.loc[:, "Poem"] = (
    df_red.loc[:, "Poem"]
    .str.replace("\r", "")
    .str.rstrip("\n")
)

# Titles: remove runs of two or more spaces and any line breaks.
df_red.loc[:, "Title"] = (
    df_red.loc[:, "Title"]
    .str.replace(r" {2,}", "", regex=True)
    .str.replace("\n", "")
    .str.replace("\r", "")
)

# Poet names: remove line breaks.
df_red.loc[:, "Poet"] = (
    df_red.loc[:, "Poet"]
    .str.replace("\n", "")
    .str.replace("\r", "")
)

In [159]:
# Count the missing values in each text column and report them.
nan_poems, nan_poets, nan_title = (
    df_red[column].isna().sum() for column in ("Poem", "Poet", "Title")
)
print(
    f"Num NaN Poems: {nan_poems}"
    f"\nNum NaN Poets: {nan_poets}"
    f"\nNum NaN Title: {nan_title}"
)

Num NaN Poems: 96
Num NaN Poets: 0
Num NaN Title: 0


In [None]:
# Poem text files should be formatted as:
#   TITLE
#   AUTHOR
#   TEXTTEXTTEXT[...]
#   ******
#   TITLE2
#   AUTHOR(2)
#   TEXT...

# Built-in open() replaces the legacy codecs.open(). newline="" writes the
# "\n" separators untranslated, which matches the bytes codecs.open() produced.
with open(os.path.join(path_data, "style_input.txt"), "w",
          encoding="utf-8", newline="") as f:
    # Walk the three columns in parallel instead of building a Series per row.
    for title, poet, poem in zip(df_red["Title"], df_red["Poet"], df_red["Poem"]):
        f.write("\n".join((str(title), str(poet), str(poem), "******\n")))
      