## Shakespeare Corpus cleaning for fine tuning with GPT3

In [99]:
#pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0
Note: you may need to restart the kernel to use updated packages.


In [100]:
import re
from statistics import mean
import json
import jsonlines

## Import corpus

In [147]:
# import Hamlet
with open('../data/Hamlet.txt') as f:
    hamlet = f.readlines()

# import MacBeth
with open('../data/Macbeth.txt') as f:
    macbeth = f.readlines()

## Preprocessing

In [151]:
def clean_prep_corpus(corpus):
    # remove irrelevant lines
    for line in corpus:
        filter_caps = re.match("[A-Z].*[A-Z]", line) # match character name in all caps, "ACT", "SCENE".
        filter_brackets = re.match("\s\[([^]]+)\]", line) # match stage directions - between brackets.
        if filter_caps or filter_brackets:
            corpus.remove(line)

    # remove new lines
    while '\n' in corpus:
        corpus.remove('\n')

    # add one white space at the beggining of each line (necessary for fine tuning with openAI)
    corpus = [ ' {}'.format(line) for line in corpus]

    print(f"Cleaned corpus sample:\n\n{corpus[:50]}")
    return corpus


def get_corpus_metrics(cleaned_corpus, corpus_name):
    # tokenize corpus
    tokenized_corpus = []
    total_token_number = 0
    for lines in cleaned_corpus:
        tokens = lines.split()
        tokenized_corpus.extend(tokens)
        total_token_number += len(tokens)

    avg = total_token_number / len(cleaned_corpus)

    print(f"\nTokenized corpus sample:\n\n{tokenized_corpus[:50]}")
    print(f"\nMetrics\n\nThe cleaned {corpus_name} corpus has a total of {len(cleaned_corpus)} lines and {len(tokenized_corpus)} tokens.\nThe lines have an average of {avg} tokens.")
    

### Hamlet

In [152]:
corpus = hamlet.copy()
corpus_name = 'Hamlet'

hamlet_clean = clean_prep_corpus(corpus)
metrics = get_corpus_metrics(hamlet_clean, corpus_name)

Cleaned corpus sample:

['  Enter Francisco and Barnardo, two sentinels.\n', ' Who’s there?\n', ' Nay, answer me. Stand and unfold yourself.\n', ' Long live the King!\n', ' Barnardo?\n', ' He.\n', ' You come most carefully upon your hour.\n', ' ’Tis now struck twelve. Get thee to bed, Francisco.\n', ' For this relief much thanks. ’Tis bitter cold,\n', ' Have you had quiet guard?\n', ' Not a mouse stirring.\n', ' Well, good night.\n', ' The rivals of my watch, bid them make haste.\n', '  Enter Horatio and Marcellus.\n', ' I think I hear them. Stand, ho! Who is there?\n', ' Friends to this ground.\n', ' And liegemen to the Dane.\n', ' Give you good night.\n', ' O, farewell, honest soldier, who hath reliev’d you?\n', ' Barnardo has my place. Give you good-night.\n', ' Holla, Barnardo!\n', ' Say, what, is Horatio there?\n', ' A piece of him.\n', ' Welcome, Horatio. Welcome, good Marcellus.\n', ' What, has this thing appear’d again tonight?\n', ' I have seen nothing.\n', ' Horatio says ’tis

### Macbeth

In [153]:
corpus = macbeth.copy()
corpus_name = 'Macbeth'

macbeth_clean = clean_prep_corpus(corpus)
metrics = get_corpus_metrics(macbeth_clean, corpus_name)

Cleaned corpus sample:

['  Thunder and Lightning. Enter three Witches.\n', ' When shall we three meet again?\n', ' In thunder, lightning, or in rain?\n', ' When the hurlyburly’s done,\n', ' When the battle’s lost and won.\n', ' That will be ere the set of sun.\n', ' Where the place?\n', ' Upon the heath.\n', ' There to meet with Macbeth.\n', ' I come, Graymalkin!\n', ' Paddock calls.\n', ' Anon.\n', ' Fair is foul, and foul is fair:\n', ' Hover through the fog and filthy air.\n', '  Alarum within. Enter King Duncan, Malcolm, Donalbain, Lennox, with\n', '  Attendants, meeting a bleeding Captain.\n', ' What bloody man is that? He can report,\n', ' As seemeth by his plight, of the revolt\n', ' The newest state.\n', ' This is the sergeant\n', ' Who, like a good and hardy soldier, fought\n', ' ’Gainst my captivity.—Hail, brave friend!\n', ' As thou didst leave it.\n', ' Doubtful it stood;\n', ' As two spent swimmers that do cling together\n', ' (Worthy to be a rebel, for to that\n', ' The 

## Define training and validation sets

In [155]:
# batch 1 - Hamlet
train_batch1 = hamlet_clean[:1001] # first batch for training containing 1000 sentences (approx 7526 tokens)

# batch 2 - Hamlet + Macbeth
dataset = macbeth_clean + hamlet_clean
train_batch2 = dataset[:4001] # 4000 pairs, about half-half from each corpus
validate_batch2 = dataset[4001: 5001] # 1000 - from Hamlet

In [156]:
# create prompt-completion pairs and save as jsonlines

def generate_pairs(batch):
    pairs_list = []
    for line in batch:
        train_line_dict = {'prompt':'', 'completion':line}
        pairs_list.append(train_line_dict)
    
    print(pairs_list[:20])
    return pairs_list


def save_as_jsonlines(pairs_list, path):
    with jsonlines.open(path, 'w') as writer:
        writer.write_all(pairs_list)
    print(f"Dataset saved at {path}")

In [159]:
batch_pairs = generate_pairs(validate_batch2)
saved_pairs = save_as_jsonlines(batch_pairs, '../data/hamlet_macbeth_validate_batch2.jsonl')

[{'prompt': '', 'completion': ' And what’s in prayer but this twofold force,\n'}, {'prompt': '', 'completion': ' To be forestalled ere we come to fall,\n'}, {'prompt': '', 'completion': ' My fault is past. But O, what form of prayer\n'}, {'prompt': '', 'completion': ' That cannot be; since I am still possess’d\n'}, {'prompt': '', 'completion': ' My crown, mine own ambition, and my queen.\n'}, {'prompt': '', 'completion': ' May one be pardon’d and retain th’offence?\n'}, {'prompt': '', 'completion': ' In the corrupted currents of this world\n'}, {'prompt': '', 'completion': ' Offence’s gilded hand may shove by justice,\n'}, {'prompt': '', 'completion': ' And oft ’tis seen the wicked prize itself\n'}, {'prompt': '', 'completion': ' There is no shuffling, there the action lies\n'}, {'prompt': '', 'completion': ' In his true nature, and we ourselves compell’d\n'}, {'prompt': '', 'completion': ' Even to the teeth and forehead of our faults,\n'}, {'prompt': '', 'completion': ' Try what repen