## Shakespeare Corpus cleaning for fine tuning with GPT3

In [93]:
import re
from statistics import mean
import json

In [61]:
# import hamlet
with open('../data/Hamlet.txt') as f:
    hamlet = f.readlines()

In [67]:
print(hamlet[:100])

['ACT I\n', '\n', 'SCENE I. Elsinore. A platform before the Castle.\n', '\n', ' Enter Francisco and Barnardo, two sentinels.\n', '\n', 'BARNARDO.\n', 'Who’s there?\n', '\n', 'FRANCISCO.\n', 'Nay, answer me. Stand and unfold yourself.\n', '\n', 'BARNARDO.\n', 'Long live the King!\n', '\n', 'FRANCISCO.\n', 'Barnardo?\n', '\n', 'BARNARDO.\n', 'He.\n', '\n', 'FRANCISCO.\n', 'You come most carefully upon your hour.\n', '\n', 'BARNARDO.\n', '’Tis now struck twelve. Get thee to bed, Francisco.\n', '\n', 'FRANCISCO.\n', 'For this relief much thanks. ’Tis bitter cold,\n', 'And I am sick at heart.\n', '\n', 'BARNARDO.\n', 'Have you had quiet guard?\n', '\n', 'FRANCISCO.\n', 'Not a mouse stirring.\n', '\n', 'BARNARDO.\n', 'Well, good night.\n', 'If you do meet Horatio and Marcellus,\n', 'The rivals of my watch, bid them make haste.\n', '\n', ' Enter Horatio and Marcellus.\n', '\n', 'FRANCISCO.\n', 'I think I hear them. Stand, ho! Who is there?\n', '\n', 'HORATIO.\n', 'Friends to this ground.\n', 

In [88]:
corpus = hamlet.copy()

# remove irrelevant lines
for line in corpus:
    filter_caps = re.match("[A-Z].*[A-Z]", line) # match character name in all caps, "ACT", "SCENE".
    filter_brackets = re.match("\s\[([^]]+)\]", line) # match stage directions - between brackets.
    if filter_caps or filter_brackets:
        corpus.remove(line)

# remove new lines
while '\n' in corpus:
    corpus.remove('\n')

# add one white space at the beggining of each line (necessary for fine tuning with openAI)
corpus = [ ' {}'.format(line) for line in corpus]


print(corpus[:100])

['  Enter Francisco and Barnardo, two sentinels.\n', ' Who’s there?\n', ' Nay, answer me. Stand and unfold yourself.\n', ' Long live the King!\n', ' Barnardo?\n', ' He.\n', ' You come most carefully upon your hour.\n', ' ’Tis now struck twelve. Get thee to bed, Francisco.\n', ' For this relief much thanks. ’Tis bitter cold,\n', ' Have you had quiet guard?\n', ' Not a mouse stirring.\n', ' Well, good night.\n', ' The rivals of my watch, bid them make haste.\n', '  Enter Horatio and Marcellus.\n', ' I think I hear them. Stand, ho! Who is there?\n', ' Friends to this ground.\n', ' And liegemen to the Dane.\n', ' Give you good night.\n', ' O, farewell, honest soldier, who hath reliev’d you?\n', ' Barnardo has my place. Give you good-night.\n', ' Holla, Barnardo!\n', ' Say, what, is Horatio there?\n', ' A piece of him.\n', ' Welcome, Horatio. Welcome, good Marcellus.\n', ' What, has this thing appear’d again tonight?\n', ' I have seen nothing.\n', ' Horatio says ’tis but our fantasy,\n', ' 

In [89]:
# get metrics about lines in corpus
avg = mean([len(line) for line in corpus])
print(f"The average length of a line is: {avg} tokens")


The average length of a line is: 41.59392345895414 tokens


In [90]:
# tokenize corpus
tokenized_corpus = []
total_token_number = 0
for lines in corpus:
    tokens = lines.split()
    tokenized_corpus.extend(tokens)
    total_token_number += len(tokens)

avg = total_token_number / len(corpus)

print(tokenized_corpus[:200])
print(f"\nThe cleaned Hamlet corpus has a total of {len(tokenized_corpus)} tokens.\nThe lines have an average of {avg} tokens.")

['Enter', 'Francisco', 'and', 'Barnardo,', 'two', 'sentinels.', 'Who’s', 'there?', 'Nay,', 'answer', 'me.', 'Stand', 'and', 'unfold', 'yourself.', 'Long', 'live', 'the', 'King!', 'Barnardo?', 'He.', 'You', 'come', 'most', 'carefully', 'upon', 'your', 'hour.', '’Tis', 'now', 'struck', 'twelve.', 'Get', 'thee', 'to', 'bed,', 'Francisco.', 'For', 'this', 'relief', 'much', 'thanks.', '’Tis', 'bitter', 'cold,', 'Have', 'you', 'had', 'quiet', 'guard?', 'Not', 'a', 'mouse', 'stirring.', 'Well,', 'good', 'night.', 'The', 'rivals', 'of', 'my', 'watch,', 'bid', 'them', 'make', 'haste.', 'Enter', 'Horatio', 'and', 'Marcellus.', 'I', 'think', 'I', 'hear', 'them.', 'Stand,', 'ho!', 'Who', 'is', 'there?', 'Friends', 'to', 'this', 'ground.', 'And', 'liegemen', 'to', 'the', 'Dane.', 'Give', 'you', 'good', 'night.', 'O,', 'farewell,', 'honest', 'soldier,', 'who', 'hath', 'reliev’d', 'you?', 'Barnardo', 'has', 'my', 'place.', 'Give', 'you', 'good-night.', 'Holla,', 'Barnardo!', 'Say,', 'what,', 'is', 'H

In [92]:
# create a first batch for training containing 1000 sentences (approx 7526 tokens)
train_batch1 = corpus[:1001]
train_batch1_list = []

for line in train_batch1:
    train_line_dict = {'prompt':'', 'completion':line}
    train_batch1_list.append(train_line_dict)

print(train_batch1_list[:100])

# save as json
json.dumps(train_batch1_list)

[{'prompt': '',
  'completion': '  Enter Francisco and Barnardo, two sentinels.\n'},
 {'prompt': '', 'completion': ' Who’s there?\n'},
 {'prompt': '', 'completion': ' Nay, answer me. Stand and unfold yourself.\n'},
 {'prompt': '', 'completion': ' Long live the King!\n'},
 {'prompt': '', 'completion': ' Barnardo?\n'},
 {'prompt': '', 'completion': ' He.\n'},
 {'prompt': '', 'completion': ' You come most carefully upon your hour.\n'},
 {'prompt': '',
  'completion': ' ’Tis now struck twelve. Get thee to bed, Francisco.\n'},
 {'prompt': '',
  'completion': ' For this relief much thanks. ’Tis bitter cold,\n'},
 {'prompt': '', 'completion': ' Have you had quiet guard?\n'},
 {'prompt': '', 'completion': ' Not a mouse stirring.\n'},
 {'prompt': '', 'completion': ' Well, good night.\n'},
 {'prompt': '',
  'completion': ' The rivals of my watch, bid them make haste.\n'},
 {'prompt': '', 'completion': '  Enter Horatio and Marcellus.\n'},
 {'prompt': '',
  'completion': ' I think I hear them. Sta