In [None]:
#!pip install -q selenium

In [24]:
import pandas as pd
import re
import codecs
import os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains

import time, random

# Data

In [25]:
# Base directory for the notebook's files: the poets.txt input is read from it
# and PoetryData.csv / style_input.txt are written into it.
path_data = "./Data"

## Web scraping

In [None]:
def extract_poem(driver):
    """Return the poem text of the page the driver is currently showing.

    The poem is read from the element with class ``card-body``: the text of
    every ``<p>`` inside it is collected and the pieces are joined with a
    blank line between them.
    """
    card = driver.find_element(By.CLASS_NAME, "card-body")
    paragraphs = card.find_elements(By.CSS_SELECTOR, "p")
    stanzas = [paragraph.text for paragraph in paragraphs]
    return "\n\n".join(stanzas)

def iterate_poems(driver):
    """Scrape every poem linked from the listing page the driver is on.

    Each ``<a>`` inside the page's ``tbody`` is opened in a new tab
    (Ctrl+Enter), the poem text is extracted there, the tab is closed and
    focus returns to the listing window.

    Parameters
    ----------
    driver
        Selenium WebDriver currently showing a poet's listing page.

    Returns
    -------
    tuple
        ``(author, titles, poems)``: the text of the ``poet__name`` element,
        the link texts, and the extracted poem texts (same order and length
        as ``titles``).

    Raises
    ------
    RuntimeError
        If no new tab appears after a link was activated.
    """

    def get_elements():
        mytable = driver.find_element(By.CSS_SELECTOR, 'tbody')
        return mytable.find_elements(By.TAG_NAME, "a")

    def wait_for_new_tab(known_handles, timeout=10.0, poll=0.2):
        # The tab opens asynchronously and the order of window_handles is not
        # guaranteed, so look for a handle that did not exist before the
        # click instead of trusting window_handles[-1].
        deadline = time.monotonic() + timeout
        while True:
            fresh = [h for h in driver.window_handles if h not in known_handles]
            if fresh:
                return fresh[-1]
            if time.monotonic() >= deadline:
                raise RuntimeError("No new tab was opened for the poem link")
            time.sleep(poll)

    poems = []
    titles = []
    author = driver.find_element(By.CLASS_NAME, "poet__name").text
    main_window = driver.current_window_handle

    for link in get_elements():
        title = link.text

        # Open link in new tab
        known_handles = set(driver.window_handles)
        link.send_keys(Keys.CONTROL + Keys.RETURN)
        driver.switch_to.window(wait_for_new_tab(known_handles))

        try:
            # Extract poem (the pauses pace the requests and let the page render)
            time.sleep(2)
            poem = extract_poem(driver)
            time.sleep(5)
        finally:
            # Always close the poem tab and put focus back on the main window,
            # even when extraction fails, so the session stays usable.
            driver.close()
            driver.switch_to.window(main_window)
        time.sleep(2)

        # Record title and poem together so both lists stay aligned.
        titles.append(title)
        poems.append(poem)

    return author, titles, poems

def iterate_web(driver, web, df=None):
    """Scrape all pages of a poem listing and append the poems to ``df``.

    Parameters
    ----------
    driver
        Selenium WebDriver used for navigation.
    web : str
        URL of the first listing page.
    df : pandas.DataFrame, optional
        Frame with "Poet", "Poem" and "Title" columns to extend. A new empty
        frame is used when omitted.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` followed by one row per scraped poem, re-indexed
        from 0.
    """
    driver.get(web)
    time.sleep(1)

    if df is None:
        df = pd.DataFrame({"Poet":[], "Poem":[], "Title":[]})

    # Gather one frame per page and concatenate once at the end instead of
    # rebuilding the accumulated frame on every page.
    frames = [df]

    more_next = True
    while more_next:
        # Obtain all poems listed on the current page
        author, titles, poems = iterate_poems(driver)
        frames.append(pd.DataFrame({"Poet": [author] * len(poems),
                                    "Poem": poems,
                                    "Title": titles}))

        try:
            # Remove spam covering next
            driver.find_element(By.XPATH, '/html/body/w-div/span').click()
            time.sleep(0.5)
        except NoSuchElementException:
            # No overlay present: nothing to dismiss.
            pass

        try:
            # Click next
            driver.find_element(By.CSS_SELECTOR,
                                '[aria-label="Go to next page"]').click()
        except NoSuchElementException:
            more_next = False
        else:
            # Give the next page the same settling time as the initial load
            # before its links are read.
            time.sleep(1)

    return pd.concat(frames, ignore_index=True)

def extract_webs(webs):
    """Scrape every listing URL in ``webs`` with a single Chrome session.

    Parameters
    ----------
    webs : iterable of str
        URLs handed to ``iterate_web`` one after another.

    Returns
    -------
    pandas.DataFrame
        Frame with "Poet", "Poem" and "Title" columns holding all scraped poems.
    """
    driver = webdriver.Chrome()
    try:
        df = pd.DataFrame({"Poet":[], "Poem":[], "Title":[]})
        for web in webs:
            df = iterate_web(driver, web, df)
        return df
    finally:
        # quit() ends the whole browser session (all windows and the driver
        # process); running it in `finally` releases it even when scraping fails.
        driver.quit()

In [None]:
# Load the listing URLs to scrape from poets.txt
# (assumes one URL per line -- TODO confirm the file layout).
with open(os.path.join(path_data, "poets.txt"), "r") as f:
    # Strip the line terminator so it does not end up inside the URL, and
    # skip blank lines so no empty address is passed to the browser.
    webs = [line.strip() for line in f if line.strip()]

df = extract_webs(webs)

hola
New page
hola
New page
hola
New page


In [26]:
# Persist the scraped poems so the feature-extraction cells can reload them
# without scraping again; the row index is not written.
df.to_csv(
    os.path.join(path_data, "PoetryData.csv"),
    index=False,
)

## Feature extraction

In [27]:
# Reload the poems saved in PoetryData.csv and show the available columns.
df = pd.read_csv(
    os.path.join(path_data, "PoetryData.csv")
)
df.columns

Index(['Poet', 'Poem', 'Title'], dtype='object')

In [28]:
# Keep only the columns used downstream. Take an explicit copy so later column
# assignments on df_red are unambiguous and do not raise SettingWithCopyWarning.
df_red = df[["Title", "Poem", "Poet"]].copy()


In [29]:
# Number of poems per poet, limited to the 100 poets with the most poems.
df2 = (
    df_red
    .groupby(['Poet'])['Poet']
    .count()
    .nlargest(100)
)

In [30]:
# Display the poems-per-poet counts (at most the 100 largest).
df2

Poet
John Wieners                17
Cristin O'Keefe Aptowicz     8
Name: Poet, dtype: int64

In [32]:
# Cleaning: build a new frame instead of writing into df_red through .loc, so
# the cell never assigns into a derived frame (no SettingWithCopyWarning).
# Literal replacements pass regex=False explicitly because the default of
# `regex` differs between pandas versions.
df_red = df_red.assign(
    # Poems: drop carriage returns and trailing newlines.
    Poem=lambda d: (
        d["Poem"]
        .str.replace("\r", "", regex=False)
        .str.rstrip("\n")
    ),
    # Titles: remove runs of two or more spaces, then all line breaks.
    Title=lambda d: (
        d["Title"]
        .str.replace(r" {2,}", "", regex=True)
        .str.replace("\n", "", regex=False)
        .str.replace("\r", "", regex=False)
    ),
    # Poet names: remove all line breaks.
    Poet=lambda d: (
        d["Poet"]
        .str.replace("\n", "", regex=False)
        .str.replace("\r", "", regex=False)
    ),
)

In [31]:
# Count the missing values in each text column and report them.
nan_poems, nan_poets, nan_title = (
    df_red[column].isnull().sum() for column in ("Poem", "Poet", "Title")
)
print(
    f"Num NaN Poems: {nan_poems}"
    f"\nNum NaN Poets: {nan_poets}"
    f"\nNum NaN Title: {nan_title}"
)

Num NaN Poems: 0
Num NaN Poets: 0
Num NaN Title: 0


In [35]:
"""
Poem text files should be formatted as:
TITLE
AUTHOR
TEXTTEXTTEXT[...]
******
TITLE2
AUTHOR(2)
TEXT...
"""

# Write one TITLE / AUTHOR / TEXT record per poem, each terminated by a
# "******" line, encoded as UTF-8. newline="" disables newline translation so
# records are separated by a bare "\n" on every platform.
with open(os.path.join(path_data, "style_input.txt"), "w",
          encoding="utf-8", newline="") as f:
    for title, poet, poem in zip(df_red["Title"], df_red["Poet"], df_red["Poem"]):
        f.write(str(title) + "\n" + str(poet) + "\n" + str(poem) + "\n******\n")
      