In [21]:
import pandas as pd
import re
import codecs

# Data

In [240]:
path = ".\Data\PoetryFoundationData.csv"
df = pd.read_csv(path)
df.columns

Index(['Unnamed: 0', 'Title', 'Poem', 'Poet', 'Tags'], dtype='object')

In [241]:
df_red = df[["Title", "Poem", "Poet"]]


In [252]:
df2 = df_red.groupby(['Poet'])['Poet'].count().nlargest(100)

In [253]:
df2

Poet
Edna St. Vincent Millay    10
Ntozake Shange             10
Quan Barry                 10
Thomas Centolella          10
Kahlil Gibran               9
                           ..
Bei Dao                     3
Bill Zavatsky               3
Charles Olson               3
Chinua Achebe               3
Christian Wiman             3
Name: Poet, Length: 100, dtype: int64

In [243]:
# cleaning
df_red.loc[:, "Poem"] = df_red.loc[:, "Poem"].str.replace("\r","")
df_red.loc[:, "Poem"] = df_red.loc[:, "Poem"].str.rstrip("\n")

df_red.loc[:, "Title"] = df_red.loc[:, "Title"].str.replace(r" {2,}", "", regex=True)
df_red.loc[:, "Title"] = df_red.loc[:, "Title"].str.replace("\n", "")
df_red.loc[:, "Title"] = df_red.loc[:, "Title"].str.replace("\r", "")

df_red.loc[:, "Poet"] = df_red.loc[:, "Poet"].str.replace("\n", "")
df_red.loc[:, "Poet"] = df_red.loc[:, "Poet"].str.replace("\r", "")

In [244]:
nan_poems = df_red["Poem"].isnull().sum()
nan_poets = df_red["Poet"].isnull().sum()
nan_title = df_red["Title"].isnull().sum()
print(f"Num NaN Poems: {nan_poems}" +
      f"\nNum NaN Poets: {nan_poets}" +
      f"\nNum NaN Title: {nan_title}")

Num NaN Poems: 0
Num NaN Poets: 0
Num NaN Title: 0


In [245]:
"""
Poem text files should be formatted as:
TITLE
AUTHOR
TEXTTEXTTEXT[...]
******
TITLE2
AUTHOR(2)
TEXT...
"""

with codecs.open(".\Data\style_input.txt", "w", "utf-8") as f:
    aux = ""
    for _, row in df_red.iterrows():
        new_line = row["Title"] + "\n" +\
                    row["Poet"] + "\n" +\
                    row["Poem"] + "\n******\n"

        f.write(new_line)
      

In [256]:
!pip install -q selenium

In [9]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

import time, random


In [45]:
def extract_poem(driver):
    elem = driver.find_element(By.CLASS_NAME, "card-body")
    return "\n\n".join(p.text for p in elem.find_elements(By.CSS_SELECTOR, "p"))

def iterate_poems(driver):
    web = driver.current_url
    def get_elements(web):
        driver.get(web)
        time.sleep(2) 
        mytable = driver.find_element(By.CSS_SELECTOR, 'tbody')
        return mytable.find_elements(By.TAG_NAME, "a")

    num_elems = len(get_elements(web))
    poems = []
    titles = []
    author = driver.find_element(By.CLASS_NAME, "poet__name").text

    i = 0
    while i<num_elems:
        elements = get_elements(web)
        
        link = elements[i]
        titles.append(link.text)
        link.click()
        time.sleep(random.randint(2, 5))
        poems.append(extract_poem(driver))
        i+=1

    time.sleep(random.randint(2, 5))
    driver.get(web)

    return author, titles, poems

def iterate_web(driver, web, df=None):
    driver.get(web)
    time.sleep(1)

    more_next = True
    if df is None:
        df = pd.DataFrame({"Poet":[], "Poem":[], "Title":[]})

    while more_next:
        try:

            author, titles, poems = iterate_poems(driver)
            df_aux = pd.DataFrame({"Poet":[author]*len(poems),
                                    "Poem":poems,
                                    "Title":titles})
            df = df.append(df_aux)
            time.sleep(1)
            link = driver.find_element(By.CSS_SELECTOR,
                                      '[aria-label="Go to next page"]')
            link.click()
            time.sleep(2)
        except NoSuchElementException:
            more_next = False
        print("New page")

    return df

def extract_webs(webs):
    driver = webdriver.Chrome()
    df = pd.DataFrame({"Poet":[], "Poem":[], "Title":[]})
    for web in webs:
        df = iterate_web(driver, web, df)

    driver.close()
    return df

In [46]:
webs = ["https://poets.org/poet/john-wieners"]
df = extract_webs(webs)

IndexError: list index out of range

In [None]:
df

Unnamed: 0,Poet,Poem,Title
0,John Wieners,Promise you wont forget\neach time we met\nwe ...,In Public
1,John Wieners,"He was as a god,\nstepped out of eternal dream...",Billie
2,John Wieners,"by evening light, at the window, where wind bl...",Reading in Bed
3,John Wieners,A poor man cannot make use of himself.\nHe is ...,Money Is Not Monogamous
4,John Wieners,For Gerrit\n\n \n\nWhat is poetry? an image...,Music
5,John Wieners,What kind of poem would one write if one could...,au rive
6,John Wieners,"For our nerves\nthis drink, a beating\non our ...",With Mr. J. R. Morton
7,John Wieners,"I had a fellowship, but lived poorly\nOn slice...",Charity Balls
8,John Wieners,How my Mother’s embroidered apron unfolds in m...,Not Complete Enough
9,John Wieners,I sit in Lees. At 11:40 PM with\nJimmy the...,A poem for tea heads
