In [1]:
import unicodedata

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import Dataset, DataLoader

# Read the raw poem table and keep only rows without missing values
data = pd.read_csv('PoetryFoundationData.csv').dropna()

data

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
6,6,\n\n Invisible Fish\n\n ...,\n\nInvisible fish swim this ghost ocean now d...,Joy Harjo,"Living,Time & Brevity,Relationships,Family & A..."
7,7,\n\n Don’t Bother the Earth...,\n\nDon’t bother the earth spirit who lives he...,Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fa..."
9,9,"\n\n [""Hour in which I cons...","\n\nHour in which I consider hydrangea, a salt...",Simone White,"Living,Parenthood,The Body,The Mind,Nature,Tre..."
16,16,\n\n scars\n\n,\n\nmy father’s body is a map\n\na record of h...,Truong Tran,"The Body,Family & Ancestors"
17,17,\n\n what remains two\n\n ...,\n\nit has long been forgotten this practice o...,Truong Tran,"Infancy,Parenthood,The Body"
...,...,...,...,...,...
13835,1,\n\n !\n\n,"\n\nDear Writers, I’m compiling the first in w...",Wendy Videlock,"Relationships,Gay, Lesbian, Queer,Arts & Scien..."
13848,12,\n\n 1 January 1965\n\n ...,\n\nThe Wise Men will unlearn your name.\n\nAb...,Joseph Brodsky,"Living,Death,Growing Old,Time & Brevity,Nature..."
13849,13,\n\n 1-800-FEAR\n\n ...,\n\nWe'd like to talk with you about fe...,Jody Gladding,"Living,Social Commentaries,Popular Culture"
13852,0,\n\n 0\n\n,"\n\n Philosophic\n\nin its complex, o...",Hailey Leithauser,"Arts & Sciences,Philosophy"


In [2]:
# Delete the first column.
# NOTE: the drop is positional, so re-running this cell without reloading
# `data` removes one more column each time.
data = data.drop(data.columns[0], axis=1)

# in the titles, drop all '\n\n' occurrences
data['Title'] = data['Title'].str.replace('\n\n', '', regex=False)

# token that stands in for a line break inside a cleaned poem
LINE_MARKER = '<LINE>'


def clean_poem(text, marker=LINE_MARKER):
    """Flatten one raw poem into a single lowercase string.

    Steps, in order:
      1. remove any literal occurrence of ``marker`` already present in the text,
      2. lowercase the text (before the marker is inserted, so the marker keeps
         its case),
      3. split on newlines and collapse runs of whitespace inside each line,
      4. drop empty lines, which removes leading/trailing breaks and collapses
         any number of consecutive breaks into one,
      5. join the remaining lines with ``marker``.

    Args:
        text: raw poem text (str).
        marker: string inserted in place of each line break.

    Returns:
        The cleaned poem as a str; '' if the poem has no non-blank line.
    """
    text = text.replace(marker, '').lower()
    lines = (' '.join(line.split()) for line in text.split('\n'))
    # Filtering on whole lines avoids str.strip(marker), which would treat the
    # marker as a *set of characters* and eat leading letters such as 'I'/'L'.
    return marker.join(line for line in lines if line)


# check how many poems already contain the line marker in their raw text
marker_collisions = data['Poem'].str.contains(LINE_MARKER, regex=False).sum()

# normalise every poem (marker removal, lowercase, line breaks, whitespace)
data['Poem'] = data['Poem'].apply(clean_poem)

# set all tags to lowercase and remove leading/trailing spaces
data['Tags'] = data['Tags'].str.lower().str.strip()

# add poem and tags to new dataframe
poems = data[['Poem', 'Tags']].copy()

# save the poems to a new csv file
poems.to_csv('poems.csv')



In [5]:

class DataProcessor(object):
    """Word-level text clean-up built on NLTK.

    Constructing an instance downloads the NLTK resources listed in
    ``NLTK_RESOURCES``. ``preprocess_text`` tokenizes a string, keeps purely
    alphabetic tokens, lowercases them, removes English stop words and
    lemmatizes what is left.
    """

    # NLTK packages fetched on construction. 'punkt_tab' is the tokenizer data
    # looked up by newer NLTK releases; on releases that do not know it the
    # download is expected to fail softly -- TODO confirm on the pinned version.
    NLTK_RESOURCES = ('omw-1.4', 'punkt', 'punkt_tab', 'wordnet', 'stopwords')

    # line-break markers (original and lowercased spelling) that may be
    # embedded in poem text and must not leak into the output as words
    LINE_MARKERS = ('<LINE>', '<line>')

    # lazily built, shared across calls so they are created only once
    _stop_words = None
    _lemmatizer = None

    def __init__(self, ):
        super().__init__()
        for resource in self.NLTK_RESOURCES:
            nltk.download(resource, quiet=True)

    @classmethod
    def _resources(cls):
        """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words("english"))
        if cls._lemmatizer is None:
            cls._lemmatizer = WordNetLemmatizer()
        return cls._stop_words, cls._lemmatizer

    @staticmethod
    def preprocess_text(text):
        """Clean a single text.

        Args:
            text: input string.

        Returns:
            A single space-joined string of lowercase, lemmatized tokens with
            punctuation, non-alphabetic tokens and English stop words removed.
        """
        # Turn line-break markers into plain spaces before tokenizing.
        for marker in DataProcessor.LINE_MARKERS:
            text = text.replace(marker, ' ')

        stop_words, lemmatizer = DataProcessor._resources()

        # Tokenize, remove punctuation and lowercase
        tokens = [word.lower() for word in nltk.word_tokenize(text) if word.isalpha()]

        # Remove stopwords and lemmatize
        return " ".join(
            lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
        )

    def process_batch(self, texts):
        """Apply ``preprocess_text`` to every element of ``texts`` and return a list."""
        return [self.preprocess_text(d) for d in texts]

Unnamed: 0,Poem,Tags
6,invisible fish swim this ghost ocean now descr...,"[living, time & brevity, relationships, family..."
7,don’t bother the earth spirit who life here. s...,"[religion, the spiritual, mythology & folklore..."
9,"hour in which i consider hydrangea, a salt or ...","[living, parenthood, the body, the mind, natur..."
16,my father’s body is a map^a record of his jour...,"[the body, family & ancestors]"
17,it ha long been forgotten this practice of the...,"[infancy, parenthood, the body]"
...,...,...
13835,"dear writers, i’m compiling the first in what ...","[relationships, gay, lesbian, queer, arts & ..."
13848,the wise men will unlearn your name.^above you...,"[living, death, growing old, time & brevity, n..."
13849,we'd like to talk with you about fear they sai...,"[living, social commentaries, popular culture]"
13852,"philosophic^in it complex, ovoid emptiness,^a ...","[arts & sciences, philosophy]"


In [None]:
class Tokenizer(object):
    """Character-level tokenizer over lowercase a-z, space and two special tokens.

    Id layout: 0 = 'pad', 1..26 = 'a'..'z', 27 = ' ', 28 = 'cls'.

    Args:
        max_length: number of characters kept per text. 0 means "use the
            longest text of each ``encode`` call".
    """

    # characters that Unicode decomposition leaves intact, folded by hand
    _FOLD_OVERRIDES = {'æ': 'a'}

    def __init__(self, max_length=0):
        super().__init__()

        self.max_length = max_length

        self.alphabet_letters = list('abcdefghijklmnopqrstuvwxyz')

        self.alphabet = self.prepare_alphabet()
        self.decoded_alphabet = self.prepare_decoded_alphabet()

    def prepare_alphabet(self):
        """Build the symbol -> id table (see the class docstring for the layout)."""
        alphabet = {'pad': 0}
        for index, letter in enumerate(self.alphabet_letters, start=1):
            alphabet[letter] = index

        # ' ' and 'cls' take the two ids right after the letters
        alphabet[' '] = len(self.alphabet_letters) + 1
        alphabet['cls'] = len(self.alphabet_letters) + 2

        return alphabet

    def prepare_decoded_alphabet(self):
        """Build the id -> symbol table as the exact inverse of ``prepare_alphabet``."""
        return {index: symbol for symbol, index in self.prepare_alphabet().items()}

    def _fold(self, text):
        """Reduce accented characters to their base character.

        The text is NFKD-decomposed and combining marks are dropped
        (e.g. 'é' -> 'e', 'í' -> 'i'); characters listed in
        ``_FOLD_OVERRIDES`` are then substituted.
        """
        decomposed = unicodedata.normalize('NFKD', text)
        return ''.join(
            self._FOLD_OVERRIDES.get(char, char)
            for char in decomposed
            if not unicodedata.combining(char)
        )

    def _lookup(self, char):
        """Return the id of ``char``; raise KeyError naming the offending character."""
        try:
            return self.alphabet[char]
        except KeyError:
            raise KeyError(
                f"character {char!r} is not in the tokenizer alphabet"
            ) from None

    def encode(self, texts):
        """Encode texts into a padded id matrix.

        Args:
            texts: iterable of strings.

        Returns:
            NumPy array of shape (len(texts), max_length + 1) with NumPy's
            default float dtype. Column 0 holds the 'cls' id, followed by one
            id per character; texts longer than ``max_length`` are truncated
            and shorter ones are right-padded with the 'pad' id (0).

        Raises:
            KeyError: if a character within the first ``max_length`` characters
                is still outside the alphabet after accent folding.
        """
        folded = [self._fold(text) for text in texts]

        if self.max_length == 0:
            max_length = max((len(text) for text in folded), default=0)
        else:
            max_length = self.max_length

        # zeros == 'pad' everywhere; only cls and the real characters are written
        tokens = np.zeros((len(folded), max_length + 1))
        tokens[:, 0] = self.alphabet['cls']

        for row, text in enumerate(folded):
            ids = [self._lookup(char) for char in text[:max_length]]
            if ids:
                tokens[row, 1:len(ids) + 1] = ids

        return tokens

    def decode(self, tokens):
        """Decode an id matrix back into strings.

        Args:
            tokens: 2-D collection of ids (NumPy array, tensor, or nested
                lists). Each id is coerced with ``int()`` before lookup so that
                float and tensor elements work as dictionary keys.

        Returns:
            List with one string per row. Decoding of a row stops at the first
            'pad' id; 'cls' ids are skipped.

        Raises:
            KeyError: if an id is not in ``decoded_alphabet``.
        """
        pad_id = self.alphabet['pad']
        texts = []

        for row in tokens:
            chars = []
            for token in row:
                index = int(token)
                if index == pad_id:
                    break
                symbol = self.decoded_alphabet[index]
                if symbol != 'cls':
                    chars.append(symbol)
            texts.append(''.join(chars))

        return texts

In [None]:
SEED = 42         # fixes the shuffle so the split is reproducible across runs
MAX_LENGTH = 149  # number of characters kept per processed poem

dataprocessor = DataProcessor()
tokenizer = Tokenizer(max_length=MAX_LENGTH)

# load the cleaned poems; a poem that is empty in the CSV is read back as NaN,
# which the text preprocessing cannot tokenize, so such rows are dropped
data = pd.read_csv('poems.csv').dropna(subset=['Poem'])

# shuffle the data
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# split the data into training (70%), test (15%) and validation (15%) sets
train_end = int(0.7 * len(data))
test_end = int(0.85 * len(data))
train_data = data.iloc[:train_end]
test_data = data.iloc[train_end:test_end]
val_data = data.iloc[test_end:]
assert len(train_data) + len(test_data) + len(val_data) == len(data)

train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')
val_data.to_csv('val_data.csv')

# reload the persisted splits so the processing below runs on exactly what is on disk
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')
val_data = pd.read_csv('val_data.csv')

# word-level clean-up of every poem
train_poems = dataprocessor.process_batch(train_data['Poem'])
test_poems = dataprocessor.process_batch(test_data['Poem'])
val_poems = dataprocessor.process_batch(val_data['Poem'])

# character-level encoding into integer tensors
train_tokens = torch.from_numpy(tokenizer.encode(train_poems)).long()
test_tokens = torch.from_numpy(tokenizer.encode(test_poems)).long()
val_tokens = torch.from_numpy(tokenizer.encode(val_poems)).long()

