In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

In [2]:
import re
from pathlib import Path

# read the txt
shakespeare_path = Path("poem_data/shakespeare.txt")
spenser_path = Path("poem_data/spenser.txt")
syllable_dict_path = Path("E:\AschoolCLASS\BA3-2_UCSD_UPS\课程资料\HW3_Public\HW3_Public\poem_data\Syllable_dictionary.txt")

with open("E:\AschoolCLASS\BA3-2_UCSD_UPS\课程资料\HW3_Public\HW3_Public\poem_data\shakespeare.txt", 'r', encoding='utf-8') as f:
    shakespeare_text = f.read()

with open("E:\AschoolCLASS\BA3-2_UCSD_UPS\课程资料\HW3_Public\HW3_Public\poem_data\spenser.txt", 'r', encoding='utf-8') as f:
    spenser_text = f.read()

combined_text = shakespeare_text + "\n" + spenser_text

# read the dict
syllable_dict = {}
with open(syllable_dict_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split()
        if parts:
            word = parts[0].lower()  # all lower
            syllables = parts[1:]
            syllable_dict[word] = syllables

# clean the txt and keep , and - 
def clean_line(line):
    line = re.sub(r"[^a-zA-Z0-9'\-\s]", '', line)  # delete 
    line = re.sub(r'\s+', ' ', line).strip()       # make the space suitable
    return line

# tokenize function
def tokenize_poem(text):
    lines = text.strip().split('\n')
    tokenized_lines = []

    for line in lines:
        line = clean_line(line)
        if line:
            tokens = line.split()
            tokenized_lines.append(tokens)
    return tokenized_lines

# get syllables function
def get_syllables(tokens):
    result = []
    for token in tokens:
        token_lc = token.lower()
        if token_lc in syllable_dict:
            result.append((token, syllable_dict[token_lc]))
        else:
            # each aeiou counts to one rough
            rough_count = len(re.findall(r'[aeiouy]+', token_lc))
            result.append((token, [str(max(1, rough_count))]))
    return result

# execute the process
tokenized_lines = tokenize_poem(combined_text)
token_lines_with_syllables = [get_syllables(line) for line in tokenized_lines]

# print the token lines with syllables
for line in token_lines_with_syllables[:5]:
    print(line)


[('1', ['1'])]
[('From', ['1']), ('fairest', ['2']), ('creatures', ['2']), ('we', ['1']), ('desire', ['2']), ('increase', ['2'])]
[('That', ['1']), ('thereby', ['2']), ("beauty's", ['2']), ('rose', ['1']), ('might', ['1']), ('never', ['E1', '2']), ('die', ['1'])]
[('But', ['1']), ('as', ['1']), ('the', ['1']), ('riper', ['2']), ('should', ['1']), ('by', ['1']), ('time', ['1']), ('decease', ['2'])]
[('His', ['1']), ('tender', ['2']), ('heir', ['1']), ('might', ['1']), ('bear', ['1']), ('his', ['1']), ('memory', ['3'])]


In [3]:
 

import torch
import torch.nn as nn
import torch.nn.functional as F
import random
 
raw_text = ' '.join([' '.join([word for word, _ in line]) for line in token_lines_with_syllables])
chars = sorted(list(set(raw_text)))
vocab_size = len(chars)

char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

 
encoded_text = torch.tensor([char_to_ix[c] for c in raw_text], dtype=torch.long)
 
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size=128):
        super(CharLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, vocab_size)  # One-hot like embedding
        self.lstm = nn.LSTM(vocab_size, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_seq, hidden=None):
        embedded = self.embed(input_seq)
        output, hidden = self.lstm(embedded, hidden)
        logits = self.fc(output)
        return logits, hidden

 
seq_length = 40
step = 3
hidden_size = 128
lr = 1e-3
epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



inputs = []
targets = []
for i in range(0, len(encoded_text) - seq_length, step):
    inputs.append(encoded_text[i:i+seq_length])
    targets.append(encoded_text[i+1:i+1+seq_length])

 
model = CharLSTM(vocab_size, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

 
for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0
    for x, y in zip(inputs, targets):
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        output, _ = model(x.unsqueeze(1))
        output = output.squeeze(1)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch} - Loss: {total_loss / len(inputs):.4f}")



KeyboardInterrupt: 

In [None]:

 
def sample_poem(model, seed_text, temperature=1.0, max_lines=14):
    model.eval()
    input_seq = torch.tensor([char_to_ix[c] for c in seed_text], dtype=torch.long).to(device)
    hidden = None
    output_text = seed_text
    current_line = ''
    line_count = 0
    current_syllables = 0

    def count_syllables(word):
        word = re.sub(r'[^a-zA-Z\-]', '', word).lower()
        if word in syllable_dict:
            return int(syllable_dict[word][0])
        else:
            return max(1, len(re.findall(r'[aeiouy]+', word)))

    with torch.no_grad():
        for i in range(len(seed_text) - 1):
            _, hidden = model(input_seq[i].unsqueeze(0).unsqueeze(1), hidden)
        input_char = input_seq[-1]

        while line_count < max_lines:
            output, hidden = model(input_char.unsqueeze(0).unsqueeze(1), hidden)
            probs = F.softmax(output.squeeze() / temperature, dim=0)
            next_idx = torch.multinomial(probs, 1).item()
            next_char = ix_to_char[next_idx]

            if next_char == '\n':
                continue  # avoid accidental newlines

            current_line += next_char
            output_text += next_char
            input_char = torch.tensor(next_idx).to(device)

            # if we hit space or end of line, check syllables
            if next_char == ' ':
                words = current_line.strip().split()
                if words:
                    current_syllables += count_syllables(words[-1])
                if current_syllables >= 10:
                    output_text += '\n'
                    line_count += 1
                    current_line = ''
                    current_syllables = 0

    return output_text
 
seed = "SHALL I COMPARE THEE TO A SUMMER'S DAY?\n"
print("\n--- Temperature 1.5 ---\n")
print(sample_poem(model, seed_text=seed, temperature=1.5))

print("\n--- Temperature 0.75 ---\n")
print(sample_poem(model, seed_text=seed, temperature=0.75))

print("\n--- Temperature 0.25 ---\n")
print(sample_poem(model, seed_text=seed, temperature=0.25))


--- Temperature 1.5 ---



KeyError: '?'