## Jane-Austen Dataset

refer: Project Gutenberg
https://xpmethod.columbia.edu/knowledge-design-studio/2019-10-26-corpus-db.html

In [1]:
import json
import requests

In [2]:
# create queries for Corpus-DB
baseURL = "http://corpus-db.org"

In [3]:
def getTextAndMeta(author):
    metaResponse = requests.get(baseURL+"/api/author/"+author)
    textResponse = requests.get(baseURL+"/api/author/"+author+"/fulltext")
    meta = json.loads(metaResponse.text)
    texts = json.loads(textResponse.text)
    return meta, texts

In [4]:
austenMeta, austenTexts = getTextAndMeta("Austen, Jane")

In [5]:
len(austenMeta), len(austenTexts)

(20, 20)

In [6]:
[(book['id'], book['title']) for book in austenMeta]

[('105.0', 'Persuasion'),
 ('121.0', 'Northanger Abbey'),
 ('141.0', 'Mansfield Park'),
 ('158.0', 'Emma'),
 ('161.0', 'Sense and Sensibility'),
 ('946.0', 'Lady Susan'),
 ('1212.0', 'Love and Freindship [sic]'),
 ('1342.0', 'Pride and Prejudice'),
 ('21839.0', 'Sense and Sensibility'),
 ('25946.0', 'Gevoel en verstand'),
 ('31100.0',
  'The Complete Project Gutenberg Works of Jane Austen: A Linked Index of all PG Editions of Jane Austen'),
 ('33388.0', "Raison et sensibilité, ou les deux manières d'aimer (Tome 1)"),
 ('35151.0', "Raison et sensibilité, ou les deux manières d'aimer (Tome 2)"),
 ('35163.0', "Raison et sensibilité, ou les deux manières d'aimer (Tome 3)"),
 ('36777.0', 'Persuasion'),
 ('37431.0', "Pride and Prejudice, a play founded on Jane Austen's novel"),
 ('37634.0', "Raison et sensibilité, ou les deux manières d'aimer (Tome 4)"),
 ('42078.0',
  'The Letters of Jane Austen: Selected from the compilation of her great nephew, Edward, Lord Bradbourne'),
 ('42671.0', 'Pri

In [7]:
# keeping only english texts
myAustenCollection = [105, 121, 141, 158, 161, 946, 1212, 1342]

In [8]:
austenMetaSubset = [book for book in austenMeta if int(float(book['id'])) in myAustenCollection]
austenTextSubset = [book for book in austenTexts if int(book['id']) in myAustenCollection]

In [9]:
len(austenMetaSubset), len(austenTextSubset)

(8, 8)

In [10]:
combined_text = ""
for book in austenTextSubset:
    combined_text += book['text']

In [11]:
import re

# Replace sequences of one or more whitespace characters with a single space
whitespace_normalized_text = re.sub(r'\s+', ' ', combined_text)

# Strip leading and trailing whitespace
whitespace_normalized_text = whitespace_normalized_text.strip()

print(f"Length of combined_text: {len(combined_text)}")
print(f"Length of whitespace_normalized_text: {len(whitespace_normalized_text)}")
print(f"First 500 characters of whitespace_normalized_text:\n{whitespace_normalized_text[:500]}")

Length of combined_text: 4337498
Length of whitespace_normalized_text: 4318309
First 500 characters of whitespace_normalized_text:
by Al Haines. Persuasion by Jane Austen (1818) Chapter 1 Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who, for his own amusement, never took up any book but the Baronetage; there he found occupation for an idle hour, and consolation in a distressed one; there his faculties were roused into admiration and respect, by contemplating the limited remnant of the earliest patents; there any unwelcome sensations, arising from domestic affairs changed naturally into pity and contempt 


In [12]:
cleaned_text = whitespace_normalized_text

# Remove specific introductory/concluding phrases
cleaned_text = re.sub(r'by Al Haines\.', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'Finis', '', cleaned_text, flags=re.IGNORECASE)

# Remove Project Gutenberg disclaimers and similar meta-information
cleaned_text = re.sub(r'Project Gutenberg-tm is synonymous with the free distribution of electronic texts.', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'The Project Gutenberg EBook of [^\.]+\. This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever\. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www\.gutenberg\.org', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'ADVERTISEMENT BY THE AUTHORESS(?:, TO THE SECOND EDITION)?', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'End of the Project Gutenberg EBook of [^\.]+\.', '', cleaned_text, flags=re.IGNORECASE)

# Remove book titles and author names that appear as headers/footers but are not part of narrative
# This pattern needs to be more careful, but for now, focus on clear header-like structures
cleaned_text = re.sub(r'Persuasion by Jane Austen \(\d{4}\)', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'NORTHANGER ABBEY by Jane Austen \(\d{4}\)', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'MANSFIELD PARK \(\d{4}\) By Jane Austen', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'EMMA by Jane Austen \(\d{4}\)', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'SENSE AND SENSIBILITY by Jane Austen \(\d{4}\)', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'Lady Susan by Jane Austen \(\d{4}\)', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'LOVE AND FREINDSHIP \[sic\] by Jane Austen \(\d{4}\)', '', cleaned_text, flags=re.IGNORECASE)
cleaned_text = re.sub(r'PRIDE AND PREJUDICE By Jane Austen', '', cleaned_text, flags=re.IGNORECASE)


# Further normalize whitespace after removals
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

print(f"Length of whitespace_normalized_text: {len(whitespace_normalized_text)}")
print(f"Length of cleaned_text: {len(cleaned_text)}")
print(f"First 500 characters of cleaned_text:\n{cleaned_text[:500]}")


Length of whitespace_normalized_text: 4318309
Length of cleaned_text: 4317593
First 500 characters of cleaned_text:
Chapter 1 Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who, for his own amusement, never took up any book but the Baronetage; there he found occupation for an idle hour, and consolation in a distressed one; there his faculties were roused into admiration and respect, by contemplating the limited remnant of the earliest patents; there any unwelcome sensations, arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations 


In [13]:
output_filename = "austen_combined_cleaned.txt"
with open(output_filename, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"Cleaned text saved to {output_filename}")

Cleaned text saved to austen_combined_cleaned.txt


## Bigram Language Model

In [14]:
with open('austen_combined_cleaned.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [15]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  4317593


In [16]:
# look at the first 1000 characters
print(text[:1000])

Chapter 1 Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who, for his own amusement, never took up any book but the Baronetage; there he found occupation for an idle hour, and consolation in a distressed one; there his faculties were roused into admiration and respect, by contemplating the limited remnant of the earliest patents; there any unwelcome sensations, arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the last century; and there, if every other leaf were powerless, he could read his own history with an interest which never failed. This was the page at which the favourite volume always opened: "ELLIOT OF KELLYNCH HALL. "Walter Elliot, born March 1, 1760, married, July 15, 1784, Elizabeth, daughter of James Stevenson, Esq. of South Park, in the county of Gloucester, by which lady (who died 1800) he has issue Elizabeth, born June 1, 1785; Anne, born August 9, 1787; a still-born son, November 

In [17]:
# unique characters that appear in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

 !"&'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz
79


In [18]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[60, 61, 61, 0, 72, 60, 57, 70, 57]
hii there


In [19]:
# encoding the entire data set and storing into Torch.Tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earlier will to the GPT look like this

torch.Size([4317593]) torch.int64
tensor([26, 60, 53, 68, 72, 57, 70,  0, 12,  0, 42, 61, 70,  0, 46, 53, 64, 72,
        57, 70,  0, 28, 64, 64, 61, 67, 72,  8,  0, 67, 58,  0, 34, 57, 64, 64,
        77, 66, 55, 60,  0, 31, 53, 64, 64,  8,  0, 61, 66,  0, 42, 67, 65, 57,
        70, 71, 57, 72, 71, 60, 61, 70, 57,  8,  0, 75, 53, 71,  0, 53,  0, 65,
        53, 66,  0, 75, 60, 67,  8,  0, 58, 67, 70,  0, 60, 61, 71,  0, 67, 75,
        66,  0, 53, 65, 73, 71, 57, 65, 57, 66, 72,  8,  0, 66, 57, 74, 57, 70,
         0, 72, 67, 67, 63,  0, 73, 68,  0, 53, 66, 77,  0, 54, 67, 67, 63,  0,
        54, 73, 72,  0, 72, 60, 57,  0, 25, 53, 70, 67, 66, 57, 72, 53, 59, 57,
        22,  0, 72, 60, 57, 70, 57,  0, 60, 57,  0, 58, 67, 73, 66, 56,  0, 67,
        55, 55, 73, 68, 53, 72, 61, 67, 66,  0, 58, 67, 70,  0, 53, 66,  0, 61,
        56, 64, 57,  0, 60, 67, 73, 70,  8,  0, 53, 66, 56,  0, 55, 67, 66, 71,
        67, 64, 53, 72, 61, 67, 66,  0, 61, 66,  0, 53,  0, 56, 61, 71, 72, 70,
      

In [20]:
# train-val split
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [21]:
block_size = 8
train_data[:block_size+1]

tensor([26, 60, 53, 68, 72, 57, 70,  0, 12])

In [22]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([26]) the target: 60
when input is tensor([26, 60]) the target: 53
when input is tensor([26, 60, 53]) the target: 68
when input is tensor([26, 60, 53, 68]) the target: 72
when input is tensor([26, 60, 53, 68, 72]) the target: 57
when input is tensor([26, 60, 53, 68, 72, 57]) the target: 70
when input is tensor([26, 60, 53, 68, 72, 57, 70]) the target: 0
when input is tensor([26, 60, 53, 68, 72, 57, 70,  0]) the target: 12


In [23]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel
block_size = 8 # what is the maximum context length for predictions

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[56, 57, 70,  0, 53,  0, 55, 67],
        [61, 59, 60, 54, 73, 70, 77,  0],
        [66, 57, 71, 72, 64, 77,  0, 72],
        [70, 55, 77, 23,  2,  0, 31, 57]])
targets:
torch.Size([4, 8])
tensor([[57, 70,  0, 53,  0, 55, 67, 66],
        [59, 60, 54, 73, 70, 77,  0, 60],
        [57, 71, 72, 64, 77,  0, 72, 70],
        [55, 77, 23,  2,  0, 31, 57,  0]])
----
when input is [56] the target: 57
when input is [56, 57] the target: 70
when input is [56, 57, 70] the target: 0
when input is [56, 57, 70, 0] the target: 53
when input is [56, 57, 70, 0, 53] the target: 0
when input is [56, 57, 70, 0, 53, 0] the target: 55
when input is [56, 57, 70, 0, 53, 0, 55] the target: 67
when input is [56, 57, 70, 0, 53, 0, 55, 67] the target: 66
when input is [61] the target: 59
when input is [61, 59] the target: 60
when input is [61, 59, 60] the target: 54
when input is [61, 59, 60, 54] the target: 73
when input is [61, 59, 60, 54, 73] the target: 70
when input is [61,

In [24]:
# input to the model
print(xb)

tensor([[56, 57, 70,  0, 53,  0, 55, 67],
        [61, 59, 60, 54, 73, 70, 77,  0],
        [66, 57, 71, 72, 64, 77,  0, 72],
        [70, 55, 77, 23,  2,  0, 31, 57]])


In [25]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 79])
tensor(4.6063, grad_fn=<NllLossBackward0>)
 D.(CHWmYgV6CiL4Q8k]MKC,?'pSZ2RvV3_rgj-3wI: HE-sTbK?(Sd7xk-T1c5n:u,._ykK?2PQYVIs]o,k!nZ"U5fY0vDH4bhcK


In [26]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [29]:
batch_size = 32
for steps in range(10000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


2.3879520893096924


In [30]:
# sampling from the output
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

 ofouly te, caroor varsug ave "f wherenorotthitheat om iourtowiaswangir t villllwn omambestrecod t."tho hind d Mivese. s Bry henoushas insorughongge; bldnecug. thinourellyey tortyisg oualise--fennend, hisUurets icutot hioober, unsul e urigr thadit I wr.--nllasendgit d Sh ul nemy Thel---mon blm, d, Mr anthig thepatoritalyof l bu wind osteverenime otabeald ccllad by " o artimpohe cicathild Jurs ovelWe pldontls, teras t nd weay suppreat sout Heanok anchilnwhinde. pe d gicalknd sh warice oc:4Byer ath
