In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import torch
from torch.utils.data import Dataset, DataLoader


# Load the Poetry Foundation dataset and discard every row with a missing value
data = pd.read_csv('PoetryFoundationData.csv').dropna()

# Show the resulting frame
data

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
6,6,\r\n\r\n Invisible Fish\r\n...,\r\n\r\nInvisible fish swim this ghost ocean n...,Joy Harjo,"Living,Time & Brevity,Relationships,Family & A..."
7,7,\r\n\r\n Don’t Bother the E...,\r\n\r\nDon’t bother the earth spirit who live...,Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fa..."
9,9,"\r\n\r\n [""Hour in which I ...","\r\n\r\nHour in which I consider hydrangea, a ...",Simone White,"Living,Parenthood,The Body,The Mind,Nature,Tre..."
16,16,\r\n\r\n scars\r\n\r\n ...,\r\n\r\nmy father’s body is a map\r\n\r\na rec...,Truong Tran,"The Body,Family & Ancestors"
17,17,\r\n\r\n what remains two\r...,\r\n\r\nit has long been forgotten this practi...,Truong Tran,"Infancy,Parenthood,The Body"
...,...,...,...,...,...
13835,1,\r\n\r\n !\r\n\r\n ...,"\r\n\r\nDear Writers, I’m compiling the first ...",Wendy Videlock,"Relationships,Gay, Lesbian, Queer,Arts & Scien..."
13848,12,\r\n\r\n 1 January 1965\r\n...,\r\n\r\nThe Wise Men will unlearn your name.\r...,Joseph Brodsky,"Living,Death,Growing Old,Time & Brevity,Nature..."
13849,13,\r\n\r\n 1-800-FEAR\r\n\r\n...,\r\n\r\nWe'd like to talk with you about...,Jody Gladding,"Living,Social Commentaries,Popular Culture"
13852,0,\r\n\r\n 0\r\n\r\n ...,\r\n\r\n Philosophic\r\n\r\nin its co...,Hailey Leithauser,"Arts & Sciences,Philosophy"


In [2]:
# Delete the first column
data = data.drop(data.columns[0], axis=1)

# In the titles, drop all blank-line breaks. The raw text contains '\r\n' line
# endings (visible in the frame displayed after loading), so normalise them to
# '\n' first; otherwise the '\n\n' pattern never matches.
data['Title'] = (
    data['Title']
    .str.replace('\r\n', '\n', regex=False)
    .str.replace('\n\n', '', regex=False)
)

# Marker that stands for a line break inside a cleaned poem
LINE_MARKER = '<LINE>'


def clean_poem(text):
    """Normalise one raw poem into a single lowercase string.

    Steps:
      1. remove any literal '<LINE>' already present in the text, so the marker
         stays unambiguous;
      2. split on line breaks (str.splitlines handles '\r\n', '\r' and '\n');
      3. collapse runs of whitespace inside each line to a single space;
      4. drop blank lines, which removes leading/trailing breaks and collapses
         repeated breaks of any length;
      5. join the remaining lines with '<LINE>' and lowercase everything
         (the marker therefore ends up as '<line>').

    Whole markers are removed by dropping blank lines rather than with
    str.strip('<LINE>'), because str.strip treats its argument as a set of
    characters and would also eat real leading/trailing letters such as 'I',
    'L', 'N' or 'E'.

    Returns an empty string when the poem contains no text.
    """
    text = text.replace(LINE_MARKER, '')
    lines = (' '.join(line.split()) for line in text.splitlines())
    return LINE_MARKER.join(line for line in lines if line).lower()


# Count how many poems already contain the literal '<LINE>' marker. Kept in a
# variable for inspection; it is not displayed because it is not the last
# expression of the cell.
n_poems_with_marker = data['Poem'].str.contains(LINE_MARKER, regex=False).sum()

# Clean every poem
data['Poem'] = data['Poem'].map(clean_poem)

# Drop poems that have no text left after cleaning: an empty string is written
# to CSV as an empty field, which pandas reads back as NaN and which would then
# break the text preprocessing.
data = data[data['Poem'] != ''].copy()

# Set all tags to lowercase and remove leading and trailing spaces
data['Tags'] = data['Tags'].str.lower().str.strip()

# Add poem and tags to a new dataframe
poems = data[['Poem', 'Tags']].copy()

# Save the poems to a new csv file (the index is written as the first column)
poems.to_csv('poems.csv')



In [3]:

class DataProcessor(object):
    """NLTK-based text normaliser: tokenise, keep alphabetic words, drop
    English stopwords and lemmatise.

    Constructing an instance downloads the NLTK resources the preprocessing
    relies on. No spaCy model is loaded, because nothing in this class uses one.
    """

    # Shared, lazily built resources. Building the stopword set and the
    # lemmatizer once instead of once per text avoids repeating identical work
    # for every poem in a batch.
    _stop_words = None
    _lemmatizer = None

    def __init__(self, ):
        super().__init__()
        # Same packages, same order as before; nltk.download reports packages
        # that are already up to date.
        for resource in ("omw-1.4", "punkt", "wordnet", "stopwords"):
            nltk.download(resource)

    @staticmethod
    def preprocess_text(text):
        """Return `text` as a space-joined string of lemmatised content words.

        The text is tokenised with nltk.word_tokenize; only purely alphabetic
        tokens are kept (this removes punctuation and numbers), they are
        lowercased, English stopwords are removed and the remaining words are
        lemmatised with WordNetLemmatizer.

        Args:
            text: the string to process.

        Returns:
            A single string with the processed words separated by one space.
        """
        if DataProcessor._stop_words is None:
            DataProcessor._stop_words = frozenset(stopwords.words("english"))
        if DataProcessor._lemmatizer is None:
            DataProcessor._lemmatizer = WordNetLemmatizer()

        stop_words = DataProcessor._stop_words
        lemmatize = DataProcessor._lemmatizer.lemmatize

        # Tokenize, remove punctuation and lowercase
        tokens = (word.lower() for word in nltk.word_tokenize(text) if word.isalpha())

        # Remove stopwords and lemmatize
        return " ".join(lemmatize(word) for word in tokens if word not in stop_words)

    def process_batch(self, texts):
        """Apply preprocess_text to every item of `texts` and return a list."""
        return [self.preprocess_text(d) for d in texts]

In [4]:
class Tokenizer(object):
    """Character-level tokenizer over lowercase ASCII letters, space and
    three special tokens.

    Vocabulary (token -> id):
        'pad' -> 0, 'a'..'z' -> 1..26, ' ' -> 27, 'cls' -> 28, 'line' -> 29

    'cls' is written at position 0 of every encoded sequence, 'line' stands for
    a newline character and 'pad' fills the tail of short sequences.

    Args:
        max_length: number of characters kept per text. 0 means "use the length
            of the longest text passed to each encode call".
    """

    # Accented characters that are folded onto a plain letter before lookup.
    _ACCENT_FALLBACKS = {
        'é': 'e',
        'í': 'i',
        'á': 'a',
        'ó': 'o',
        'æ': 'a',
        'ä': 'a',
    }

    def __init__(self, max_length=0):
        super().__init__()

        self.max_length = max_length

        self.alphabet_letters = list('abcdefghijklmnopqrstuvwxyz')

        self.alphabet = self.prepare_alphabet()
        self.decoded_alphabet = self.prepare_decoded_alphabet()

    def prepare_alphabet(self):
        """Build the token -> id dictionary described in the class docstring."""
        alphabet = {'pad': 0}

        for token_id, letter in enumerate(self.alphabet_letters, start=1):
            alphabet[letter] = token_id

        # Special tokens follow the letters; each one gets its own id so that
        # 'cls' and 'line' can be told apart when decoding.
        next_id = len(self.alphabet_letters) + 1
        alphabet[' '] = next_id
        alphabet['cls'] = next_id + 1
        alphabet['line'] = next_id + 2

        return alphabet

    def prepare_decoded_alphabet(self):
        """Build the id -> token dictionary as the exact inverse of the
        token -> id mapping, so the two can never drift apart."""
        return {token_id: token for token, token_id in self.prepare_alphabet().items()}

    def _char_to_id(self, char):
        """Map a single character to its token id.

        A newline becomes 'line', known accented characters are folded onto
        their plain letter, and anything else outside the alphabet (uppercase
        letters, digits, punctuation, ...) becomes a space.
        """
        if char == '\n':
            return self.alphabet['line']
        char = self._ACCENT_FALLBACKS.get(char, char)
        return self.alphabet.get(char, self.alphabet[' '])

    def encode(self, texts):
        """Encode a sequence of strings into a 2-D array of token ids.

        Args:
            texts: sized iterable of strings.

        Returns:
            A float numpy array of shape (len(texts), max_length + 1). Column 0
            holds the 'cls' id, followed by one id per character; texts longer
            than max_length are truncated and shorter ones are padded with the
            'pad' id (0).
        """
        if self.max_length == 0:
            max_length = max((len(text) for text in texts), default=0)
        else:
            max_length = self.max_length

        # np.zeros already fills every cell with the 'pad' id (0).
        tokens = np.zeros((len(texts), max_length + 1))
        tokens[:, 0] = self.alphabet['cls']

        for row, text in enumerate(texts):
            for col, char in enumerate(text[:max_length], start=1):
                tokens[row, col] = self._char_to_id(char)

        return tokens

    def decode(self, tokens):
        """Decode a 2-D collection of token ids back into a list of strings.

        Each row is read until the first 'pad' id; 'cls' is skipped and 'line'
        is rendered as a newline. Ids are converted with int(), so numpy
        arrays (float or integer) and torch tensors are all accepted.

        Raises:
            KeyError: if a row contains an id outside the vocabulary.
        """
        pad_id = self.alphabet['pad']
        texts = []

        for row in tokens:
            chars = []
            for token in row:
                token_id = int(token)
                if token_id == pad_id:
                    break
                symbol = self.decoded_alphabet[token_id]
                if symbol == 'cls':
                    continue
                chars.append('\n' if symbol == 'line' else symbol)
            texts.append(''.join(chars))

        return texts

In [5]:
dataprocessor = DataProcessor()
tokenizer = Tokenizer(max_length=149)

# Fixed seed so the shuffle, and therefore the split files written below, are
# the same on every run
SEED = 42

# load the cleaned poems
data = pd.read_csv('poems.csv')

# a poem that is empty in the csv is read back as NaN and cannot be tokenized
data = data.dropna(subset=['Poem'])

# shuffle the data
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# split the data into training (70%), test (15%) and validation (15%) sets
n_rows = len(data)
train_end = int(0.7 * n_rows)
test_end = int(0.85 * n_rows)

train_data = data[:train_end]
test_data = data[train_end:test_end]
val_data = data[test_end:]

# persist the splits
train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')
val_data.to_csv('val_data.csv')

# reload the splits from disk
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')
val_data = pd.read_csv('val_data.csv')

# process the data
# NOTE(review): preprocess_text returns space-joined alphabetic tokens, so the
# processed text never contains '\n' and the tokenizer's 'line' token is not
# produced here - confirm whether line breaks are meant to be kept.
train_poems = dataprocessor.process_batch(train_data['Poem'])
test_poems = dataprocessor.process_batch(test_data['Poem'])
val_poems = dataprocessor.process_batch(val_data['Poem'])

# encode each split as a tensor of integer token ids
train_tokens = torch.from_numpy(tokenizer.encode(train_poems)).long()
test_tokens = torch.from_numpy(tokenizer.encode(test_poems)).long()
val_tokens = torch.from_numpy(tokenizer.encode(val_poems)).long()



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\20182672\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20182672\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20182672\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20182672\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
