In [5]:
#!pip install -q selenium
#pip install ripser

In [41]:
import pandas as pd
import re
import codecs
import os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.action_chains import ActionChains

import time, random
import numpy as np


import stablerank.srank as sr
from ripser import ripser
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import scipy.spatial as spatial

# Data

In [8]:
# Directory (relative to the working directory) that every cell below reads
# its inputs from and writes its outputs to.
path_data = "./Data"

def extract_text(path):
    """Parse a poem text file into a DataFrame with columns Poet, Poem, Title.

    The file is read as a sequence of records laid out as::

        TITLE
        AUTHOR
        poem text (any number of lines)
        ******

    Parameters
    ----------
    path : str or path-like
        UTF-8 encoded text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record, in file order. ``Title`` and ``Poet`` are the
        stripped first and second lines of the record; ``Poem`` is the
        remaining text with leading/trailing whitespace stripped.

    Notes
    -----
    * A final record that is not terminated by a ``******`` line is kept
      (previously its poem was dropped and building the frame failed because
      the columns had different lengths).
    * Blank lines before a title and separator lines with no pending record
      are skipped, so the three columns always have the same length.
    * A record that has a title but no author line gets an empty ``Poet``.
    """
    separator = "******"
    poets, titles, poems = [], [], []
    title = poet = None
    body = []

    def flush():
        # Store the record collected so far (if any) and reset the state.
        nonlocal title, poet, body
        if title is not None:
            titles.append(title)
            poets.append("" if poet is None else poet)
            poems.append("".join(body).strip())
        title = poet = None
        body = []

    with open(path, "r", encoding="UTF-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped == separator:
                flush()
            elif title is None:
                # Ignore blank padding between a separator and the next title.
                if stripped:
                    title = stripped
            elif poet is None:
                poet = stripped
            else:
                body.append(line)

    # The last record may not be followed by a separator line.
    flush()

    return pd.DataFrame({"Poet": poets, "Poem": poems, "Title": titles})

## Web scraping

In [12]:
def extract_poem(driver):
    """Return the poem text from the page currently loaded in ``driver``.

    The element located by class name ``card-body`` is looked up, the text of
    every ``p`` element found beneath it is collected, and the pieces are
    joined with a blank line between them.
    """
    card = driver.find_element(By.CLASS_NAME, "card-body")
    paragraphs = card.find_elements(By.CSS_SELECTOR, "p")
    texts = [paragraph.text for paragraph in paragraphs]
    return "\n\n".join(texts)

def iterate_poems(driver, df=None):
    """Scrape every poem linked from the page currently loaded in ``driver``.

    Each ``a`` element inside the page's ``tbody`` is opened in a new tab, the
    poem is read with ``extract_poem`` and the tab is closed again.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver already positioned on the page listing the poems.
    df : pandas.DataFrame, optional
        Poems collected so far. Links whose text equals a value already in
        ``df["Title"]`` are skipped.

    Returns
    -------
    tuple
        ``(author, titles, poems)`` where ``author`` is the text of the
        ``poet__name`` element and ``titles`` / ``poems`` are parallel lists
        holding one entry per poem scraped by this call.

    Notes
    -----
    The tab opened for a poem is always closed and focus is returned to the
    main window, even when the extraction raises; the exception still
    propagates to the caller. A title is recorded only once its poem has been
    read, so the two lists cannot get out of step.
    """

    def get_elements():
        table_body = driver.find_element(By.CSS_SELECTOR, 'tbody')
        return table_body.find_elements(By.TAG_NAME, "a")

    poems = []
    titles = []
    auth_elem = driver.find_element(By.CLASS_NAME, "poet__name")
    author = auth_elem.text
    main_window = driver.current_window_handle

    # Build the lookup once instead of scanning the Title column per link.
    known_titles = set() if df is None else set(df["Title"].values)

    for link in get_elements():
        title = link.text
        if title in known_titles:
            continue

        # Open link in new tab
        link.send_keys(Keys.CONTROL + Keys.RETURN)
        driver.switch_to.window(driver.window_handles[-1])

        try:
            # Extract poem (the pauses give the page time to load and
            # throttle the requests)
            time.sleep(2)
            poem = extract_poem(driver)
            time.sleep(5)
        finally:
            # Close current tab and put focus back on the main window
            driver.close()
            driver.switch_to.window(main_window)

        titles.append(title)
        poems.append(poem)
        time.sleep(2)

    ActionChains(driver).move_to_element(auth_elem).perform()

    return author, titles, poems

def iterate_web(driver, web, df=None, max_p=5):
    """Scrape up to ``max_p`` listing pages starting at ``web``.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver used to load and navigate the pages.
    web : str
        URL of the first listing page.
    df : pandas.DataFrame, optional
        Poems collected so far (columns Poet, Poem, Title). A new empty frame
        is created when omitted.
    max_p : int, default 5
        Maximum number of listing pages to visit.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the rows scraped from the visited pages appended.
    """
    driver.get(web)
    time.sleep(1)

    if df is None:
        df = pd.DataFrame({"Poet":[], "Poem":[], "Title":[]})

    for _ in range(max_p):
        # Collect the poems listed on the current page
        author, titles, poems = iterate_poems(driver, df)
        page_rows = pd.DataFrame({
            "Poet": [author] * len(poems),
            "Poem": poems,
            "Title": titles,
        })
        df = pd.concat([df, page_rows], ignore_index=True)

        # Dismiss the overlay that may cover the "next page" control;
        # its absence is not an error.
        try:
            time.sleep(5)
            overlay = driver.find_element(By.XPATH, '/html/body/w-div/span')
            overlay.click()
            time.sleep(0.5)
        except (NoSuchElementException, ElementNotInteractableException):
            pass

        # Move on to the next page, or stop when there is none
        try:
            next_button = driver.find_element(By.CSS_SELECTOR,
                                              '[aria-label="Go to next page"]')
            next_button.click()
        except (NoSuchElementException, ElementNotInteractableException):
            break

    return df

def extract_webs(webs, df=None):
    """Scrape every URL in ``webs`` with a single Chrome session.

    Parameters
    ----------
    webs : iterable of str
        URLs to scrape. Surrounding whitespace (e.g. the newline left by
        ``readlines``) is removed and empty entries are skipped.
    df : pandas.DataFrame, optional
        Poems collected so far (columns Poet, Poem, Title). A new empty frame
        is created when omitted.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the scraped poems appended.

    Notes
    -----
    The frame is written to ``PoetryData.csv`` inside ``path_data`` after each
    URL. The browser session is always shut down with ``quit()``, even when
    scraping raises, so no driver process is left behind.
    """
    if df is None:
        df = pd.DataFrame({"Poet":[], "Poem":[], "Title":[]})

    driver = webdriver.Chrome()
    try:
        for web in webs:
            web = web.strip()
            if not web:
                continue
            df = iterate_web(driver, web, df)
            df.to_csv(os.path.join(path_data, "PoetryData.csv"), index=False)
    finally:
        # quit() ends the whole session; close() would only close the window.
        driver.quit()
    return df

In [35]:
# One URL per line; strip the line terminators and ignore blank lines so that
# only usable URLs are handed to the scraper.
with open(os.path.join(path_data, "poets.txt"), "r") as f:
    webs = [line.strip() for line in f if line.strip()]

print(webs)

['https://poets.org/poet/e-e-cummings']


In [36]:
# Resume from the poems already saved on disk: extract_webs passes this frame
# down to iterate_poems, which skips links whose title is already in it.
df = pd.read_csv(os.path.join(path_data, "PoetryData.csv"))
df = extract_webs(webs, df)
# To start from an empty frame instead:
# df = extract_webs(webs)

In [None]:
# Persist the scraped poems (overwrites PoetryData.csv, no index column).
df.to_csv(os.path.join(path_data, "PoetryData.csv"),index=False)

### Adding bad apples

In [23]:
# Parse the hand-written records in bad_apples.txt and append them to the
# scraped poems. NOTE(review): re-running this cell appends them again.
df_extra = extract_text(os.path.join(path_data, "bad_apples.txt"))
df = pd.concat([df, df_extra], ignore_index=True)

In [27]:
# Persist the combined frame (overwrites PoetryData.csv, no index column).
df.to_csv(os.path.join(path_data, "PoetryData.csv"),index=False)

## Stylistic features classification

### Feature extraction

In [28]:
# Reload the dataset from disk so the feature extraction starts from the
# saved file rather than from whatever is left in the kernel.
df = pd.read_csv(os.path.join(path_data, "PoetryData.csv"))
df.columns

Index(['Title', 'Poem', 'Poet'], dtype='object')

In [29]:
# Number of titles per poet (at most the 20 largest groups are shown).
df.groupby(['Poet'])['Title'].count().nlargest(20)

Poet
Christina Rossetti     30
E. E. Cummings         30
Emily Dickinson        30
John Keats             30
Langston Hughes        30
Naomi Shihab Nye       30
Robert Frost           30
W. B. Yeats            30
Walt Whitman           30
William Shakespeare    30
Name: Title, dtype: int64

In [30]:
# Drop rows with any missing field.
df = df.dropna()

# Keep the first row for each title.
# NOTE(review): duplicates are detected on Title alone, so two different poets
# sharing a title would lose one poem - confirm this is intended.
df = df.drop_duplicates(subset=['Title'], ignore_index=True)

In [31]:
# Same per-poet count as before, recomputed after the clean-up above.
df.groupby(['Poet'])['Title'].count().nlargest(20)

Poet
Christina Rossetti     30
E. E. Cummings         30
Emily Dickinson        30
John Keats             30
Langston Hughes        30
Naomi Shihab Nye       30
Robert Frost           30
W. B. Yeats            30
Walt Whitman           30
William Shakespeare    30
Name: Title, dtype: int64

In [32]:
# Keep at most the first 30 rows of every poet and renumber the index.
df = df.groupby(['Poet']).head(30).reset_index(drop=True)

In [33]:
# Text clean-up.
# Poems: drop carriage returns, then trailing newlines.
df.loc[:, "Poem"] = df.loc[:, "Poem"].str.replace("\r","").str.rstrip("\n")

# Titles: remove every run of two or more spaces.
df.loc[:, "Title"] = df.loc[:, "Title"].str.replace(r" {2,}", "", regex=True)

# Titles and poets: remove line-feed and carriage-return characters.
for column in ("Title", "Poet"):
    for line_break in ("\n", "\r"):
        df.loc[:, column] = df.loc[:, column].str.replace(line_break, "")

In [34]:
# Count the missing values left in each column after the clean-up.
nan_poems = df["Poem"].isna().sum()
nan_poets = df["Poet"].isna().sum()
nan_title = df["Title"].isna().sum()
print(
    f"Num NaN Poems: {nan_poems}"
    f"\nNum NaN Poets: {nan_poets}"
    f"\nNum NaN Title: {nan_title}"
)

Num NaN Poems: 0
Num NaN Poets: 0
Num NaN Title: 0


In [35]:
"""
Poem text files should be formatted as:
TITLE
AUTHOR
TEXTTEXTTEXT[...]
******
TITLE2
AUTHOR(2)
TEXT...
"""

# Write one record per poem in the layout described above: title, poet and
# poem text, each followed by a newline, then a "******" separator line.
# Iterating the three columns in parallel avoids building a Series per row.
with open(os.path.join(path_data, "style_input.txt"), "w", encoding='UTF-8') as f:
    f.writelines(
        f"{title!s}\n{poet!s}\n{poem!s}\n******\n"
        for title, poet, poem in zip(df["Title"], df["Poet"], df["Poem"])
    )

In [64]:
# Open the output file, then drop its first line (presumably a header - TODO
# confirm), the "***" separator rows and any blank lines.
with open(os.path.join(path_data, "out.txt"), "r", encoding='UTF-8', errors='ignore') as f:
    output = f.readlines()
output = [line for line in output[1:] if "***" not in line and line.strip()]

# Keep every "|"-separated field after the first two.
# The line terminator is stripped explicitly: chopping the last character
# unconditionally truncated the final value of a file without a trailing
# newline.
out_list = [line.rstrip("\r\n").split("|")[2:] for line in output]

style_ds = np.array(out_list, dtype=np.float64)

with open(os.path.join(path_data, "out.npy"), 'wb') as f:
    np.save(f, style_ds)

In [65]:
# Reload the feature matrix stored in out.npy and show its dimensions.
style_ds = np.load(os.path.join(path_data, "out.npy"))

style_ds.shape

(300, 84)

### Poems generated with GPT-J

In [84]:
# Alternative source: parse the raw text file instead of the saved CSV.
# df_fake = extract_text(os.path.join(path_data, "fake_input.txt"))
df_fake = pd.read_csv(os.path.join(path_data, "PoetryData_fake.csv"))
df_fake.columns

Index(['Poet', 'Poem', 'Title'], dtype='object')

In [85]:
# Drop rows with missing fields and keep the first row per title, then write
# the result back over the CSV it was loaded from and count titles per poet.
# NOTE(review): the input file is overwritten, so the unfiltered data is not
# recoverable from disk after this cell runs.
df_fake = df_fake.dropna()
df_fake = df_fake.drop_duplicates(subset=['Title'], ignore_index=True)
df_fake.to_csv(os.path.join(path_data, "PoetryData_fake.csv"),index=False)
df_fake.groupby(['Poet'])['Title'].count().nlargest(11)

Poet
Christina Rossetti     10
E. E. Cummings         10
Emily Dickinson        10
John Keats             10
Langston Hughes        10
Naomi Shihab Nye       10
Robert Frost           10
W. B. Yeats            10
Walt Whitman           10
William Shakespeare    10
Name: Title, dtype: int64

In [86]:
# Text clean-up.
# Poems: drop carriage returns, then trailing newlines.
df_fake.loc[:, "Poem"] = df_fake.loc[:, "Poem"].str.replace("\r","").str.rstrip("\n")

# Titles: remove every run of two or more spaces.
df_fake.loc[:, "Title"] = df_fake.loc[:, "Title"].str.replace(r" {2,}", "", regex=True)

# Titles and poets: remove line-feed and carriage-return characters.
for column in ("Title", "Poet"):
    for line_break in ("\n", "\r"):
        df_fake.loc[:, column] = df_fake.loc[:, column].str.replace(line_break, "")

# Count the missing values left in each column after the clean-up.
nan_poems = df_fake["Poem"].isna().sum()
nan_poets = df_fake["Poet"].isna().sum()
nan_title = df_fake["Title"].isna().sum()
print(
    f"Num NaN Poems: {nan_poems}"
    f"\nNum NaN Poets: {nan_poets}"
    f"\nNum NaN Title: {nan_title}"
)

Num NaN Poems: 0
Num NaN Poets: 0
Num NaN Title: 0


In [87]:
"""
Poem text files should be formatted as:
TITLE
AUTHOR
TEXTTEXTTEXT[...]
******
TITLE2
AUTHOR(2)
TEXT...
"""

# Write one record per poem in the layout described above: title, poet and
# poem text, each followed by a newline, then a "******" separator line.
# Iterating the three columns in parallel avoids building a Series per row.
with open(os.path.join(path_data, "fake_input.txt"), "w", encoding='UTF-8') as f:
    f.writelines(
        f"{title!s}\n{poet!s}\n{poem!s}\n******\n"
        for title, poet, poem in zip(df_fake["Title"], df_fake["Poet"], df_fake["Poem"])
    )

In [66]:
# Open the output file, then drop its first line (presumably a header - TODO
# confirm), the "***" separator rows and any blank lines.
with open(os.path.join(path_data, "fake_out.txt"), "r", encoding='UTF-8', errors='ignore') as f:
    fake_output = f.readlines()
fake_output = [line for line in fake_output[1:] if "***" not in line and line.strip()]

# Keep every "|"-separated field after the first two.
# The line terminator is stripped explicitly: chopping the last character
# unconditionally truncated the final value of a file without a trailing
# newline.
fake_out_list = [line.rstrip("\r\n").split("|")[2:] for line in fake_output]

style_fake_ds = np.array(fake_out_list, dtype=np.float64)

with open(os.path.join(path_data, "fake_out.npy"), 'wb') as f:
    np.save(f, style_fake_ds)

In [67]:
# Reload the feature matrix stored in fake_out.npy and show its dimensions.
style_fake_ds = np.load(os.path.join(path_data, "fake_out.npy"))

style_fake_ds.shape

(100, 84)

### Creation of the point clouds

In [None]:
# Disabled draft: the code below is wrapped in a string literal, so nothing in
# it is executed. It uses names that are not defined in the cells shown in
# this notebook (data, timer, timedelta, classification) - presumably carried
# over from another analysis; these must be provided before enabling it.
"""distance = sr.Distance(spatial.distance.pdist(data, "euclidean"))
wisconsin_distances = distance.square_form()
i = 0
while i < len(wisconsin_distances):
    plt.hist(wisconsin_distances[i])
    i += 1

distributions = {}
distributions["0_15"] = sr.get_distribution(name="uniform", interval=[0,15])
distributions["5_20"] = sr.get_distribution(name="uniform", interval=[5,20])
distributions["10_25"] = sr.get_distribution(name="uniform", interval=[10,25])
distributions["10_30"] = sr.get_distribution(name="uniform", interval=[10,30])
distributions["15_30"] = sr.get_distribution(name="uniform", interval=[15,30])

probabilities = {}
for k in distributions.keys():
    probabilities[k] = distributions[k](wisconsin_distances)

number_instances=300
sample_size=30

start = timer()    
h0_sr = {}
h1_sr = {}
for k in  distributions.keys():
    h0_sr[k] = []
    h1_sr[k] = []
    for patient in wisconsin_distances:
        p = distributions[k](patient)
        s = sr.get_sample(number_instances, sample_size, p)
        f = distance.get_h0sr(sample=s,clustering_method="complete")
        b = distance.get_bc(sample=s, maxdim=1)
        g = sr.bc_to_sr(b,degree="H1")
        h0_sr[k].append(f)
        h1_sr[k].append(g)
end = timer()
print(timedelta(seconds=end-start))        

for k in distributions.keys():
    fig = plt.figure(k,figsize=(30,30))
    i = 0
    for f in h0_sr[k]:
        if classification[i] ==2:
            color = "black"
        else:
            color = "red"
        f.plot(color = color)
        i += 1"""


## Semantic classification

### BERT embeddings

In [None]:
# Creation in pipeline_from_csv_to_bert.ipynb
# Extraction from save file
sem_ds = np.load(os.path.join(path_data, "bert.npy"))
fake_sem_ds = np.load(os.path.join(path_data, "fake_bert.npy"))

# A cell only displays its last expression, so both shapes are shown together
# (a bare `sem_ds.shape` in the middle of the cell was silently discarded).
sem_ds.shape, fake_sem_ds.shape