In [11]:
import unicodedata

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import Dataset, DataLoader

# import data 
data = pd.read_csv('PoetryFoundationData.csv')
data = data.dropna()
data = data.drop(data.columns[0], axis=1)
#only keep the poem and tags
data = data[['Poem', 'Tags']]
#remove poems with no tags
data = data[data['Tags'].notna()]

data.to_csv('poetry.csv', index=False)

In [27]:
class DataProcessor(object):
    def __init__(self, ):
        super().__init__()
        nlp = spacy.load("en_core_web_sm")
        nltk.download('omw-1.4')
        nltk.download("punkt")
        nltk.download("wordnet")
        nltk.download("stopwords")

    @staticmethod
    def preprocess_text(text):
        # Replace newline characters with a unique placeholder
        #placeholder = ' NEW'
        #text = text.replace('\n', placeholder)
        
        # Tokenize, remove punctuation and lowercase
        try:
            tokens = nltk.word_tokenize(text, preserve_line=True)
        except TypeError as e:
            print("Error in tokenizing text \"%s\": %s", text, str(e))
            return ""

        #tokens = [word.lower() for word in tokens if word.isalpha()]

        # Remove stopwords and lemmatize
        stop_words = set(stopwords.words("english"))
        lemmatizer = WordNetLemmatizer()
        #processed_text = [
        #    lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
        #]
        #processed_text = [
         #   lemmatizer.lemmatize(word.lower()) if word.isalpha() else word for word in tokens if word.lower() not in stop_words or not word.isalpha()
        #]

        processed_text = []
        for word in tokens:
            if word.isalpha():  # Process only alphabetic tokens
                word = word.lower()
                if word not in stop_words:
                    lemmatized_word = lemmatizer.lemmatize(word)
                    processed_text.append(lemmatized_word)
            elif word in {'.', ',', '!', '?', ';', ':', '-', '(', ')', '"', "'"}:
                # Keep punctuation marks
                processed_text.append(word)

        return " ".join(processed_text)

        # Replace the placeholder back to newline characters
        #processed_text = [word.replace(placeholder, '%') for word in processed_text]
        
        return " ".join(processed_text)

    def process_batch(self, texts):
        return [self.preprocess_text(d) for d in texts]

In [28]:
class Tokenizer(object):
    def __init__(self, max_length=512, special_characters=[]):
        super().__init__()

        self.max_length = max_length
        self.special_characters = special_characters
        self.alphabet_letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
        self.punctuation_marks = ['!', '?', '.', ',', ';', ':']

        self.alphabet = self.prepare_alphabet()
        self.decoded_alphabet = self.prepare_decoded_alphabet()
        self.char_map = self.prepare_char_map()

    def prepare_alphabet(self):
        # PREPARE THE ALPHABET (CHAR->INT)
        # as a dictionary
        alphabet = {}
        alphabet['pad'] = 0  # add 'pad'
        count = 1

        for letter in self.alphabet_letters:
            alphabet[letter] = count
            count += 1

        # add ' ', 'cls' tokens
        alphabet[' '] = count
        alphabet['cls'] = count + 1

        # add puncuation tokens
        for mark in self.punctuation_marks:
            count += 1
            alphabet[mark] = count

        return alphabet

    def prepare_decoded_alphabet(self):
        # PREPARE DECODED ALPHABET (INT->CHAR)
        decoded_alphabet_ints = [i for i in range(len(self.alphabet_letters))]

        decoded_alphabet = {}
        decoded_alphabet[0] = 'pad'

        for i in decoded_alphabet_ints:
            decoded_alphabet[i + 1] = self.alphabet_letters[i]

        offset = len(decoded_alphabet_ints) + 1
        decoded_alphabet[offset] = ' '
        decoded_alphabet[offset + 1] = 'cls'

        for j, mark in enumerate(self.punctuation_marks):
            decoded_alphabet[offset + 2 + j] = mark

        return decoded_alphabet

    def prepare_char_map(self):
        # Mapping of special characters to corresponding alphabet characters
        return {
            'é': 'e', 'í': 'i', 'á': 'a', 'ó': 'o', 'æ': 'a', 'ä': 'a', 'ū': 'u',
            'à': 'a', 'ç': 'c', 'ë': 'e', 'ñ': 'n', 'ö': 'o', 'ü': 'u', 'ú': 'u',
            'û': 'u', 'å': 'a', 'œ': 'o', 'ß': 's', 'ø': 'o', 'è': 'e', 'ï': 'i',
            'â': 'a', 'ê': 'e', 'î': 'i', 'ô': 'o', 'ō': 'o', 'ā': 'a', 'ī': 'i',
            'ē': 'e', 'ồ': 'o', 'ế': 'e', 'π': 'p', '∞': 'i', '∑': 's', '√': 'r',
            '∫': 'i', '≈': 'a', 'ﬂ': 'f', 'ﬁ': 'f', 'ﬀ': 'f', 'ﬃ': 'f', 'α': 'a',
            'β': 'b', 'γ': 'g', 'δ': 'd', 'ε': 'e', 'ζ': 'z', 'η': 'e', 'θ': 't',
            'ι': 'i', 'κ': 'k', 'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
            'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'u', 'φ': 'f', 'χ': 'c', 'ψ': 'p',
            'ω': 'w'
        }

    def encode(self, texts):
        N = len(texts)

        if self.max_length == 0:
            max_length = max(len(text) for text in texts)
        else:
            max_length = self.max_length

        tokens = np.zeros((N, max_length + 1))

        for i, text in enumerate(texts):
            len_i = len(text)
            for j in range(-1, max_length):
                if j == -1:
                    tokens[i, j + 1] = self.alphabet['cls']
                elif j >= len_i:
                    tokens[i, j + 1] = self.alphabet['pad']
                else:
                    char = text[j]
                    if char in self.char_map:
                        tokens[i, j + 1] = self.alphabet[self.char_map[char]]
                    elif char in self.special_characters:
                        tokens[i, j + 1] = self.alphabet['q']
                    elif char in self.punctuation_marks:
                        tokens[i, j + 1] = self.alphabet[char]
                    elif char.isalpha() == False:
                        break
                    else:
                        tokens[i,j+1] = self.alphabet[texts[i][j]]

        return tokens

    def decode(self, tokens):
        texts = []

        for i in range(len(tokens)):
            tokens_i = tokens[i,:]
            text_i = ''
            for j in range(len(tokens_i)):
                if tokens_i[j] == 0:
                    break
                else:
                    if self.decoded_alphabet[tokens_i[j]] != 'cls':
                        text_i += self.decoded_alphabet[tokens_i[j]]
            texts.append(text_i)

        return texts

In [29]:
# Initialize the DataProcessor (its constructor downloads the NLTK resources it uses)
processor = DataProcessor()

# Clean every entry of the 'Poem' column; the result is a list with one string per poem
processed_texts = processor.process_batch(data['Poem'].tolist())

# Initialize the Tokenizer with its defaults (max_length=512, no special characters)
tokenizer = Tokenizer()

# Encode the processed text into a numpy array of token ids, one row per poem
encoded_texts = tokenizer.encode(processed_texts)

# Print a sample of the encoded texts (first five rows)
print(encoded_texts[:5])

# Decode the encoded texts back into strings to verify the round trip
decoded_texts = tokenizer.decode(encoded_texts)
print(decoded_texts[:5])

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\20182672\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20182672\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20182672\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20182672\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyError: 'έ'