### Sampling sentences (as for manual coding), this time for speaker recogniton using GPT4 (and ??).

In [1]:
# TO_EXCLUDE = ['Mini Beasties', 'Ten in the Bed and Other Counting Rhymes', 'The Rescue Party', 'One Snowy Night', 'The Night Before Christmas', 'One Starry Night'] # Books that currently has issues in the excel file
TO_EXCLUDE = []

In [2]:
import os
import pdfplumber
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import string
import spacy
from spacy import displacy
from spacy.lang.en.examples import sentences 

%matplotlib inline

### Does the transformer pipeline find different sentences? Not yet tested!

In [517]:
nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

#nlp.add_pipe(nlp.create_pipe('sentencizer'))
#nlp.add_pipe(nlp.create_pipe("parser"))

In [539]:
labels = pd.read_excel('./Book-List-Final-NONA.xlsx', sheet_name='Sheet1')
labels = labels.rename(columns={'Author ': 'Author'})
labels = labels.loc[~labels.Title.isin(TO_EXCLUDE)]

In [540]:
os.chdir('../text_pdfs')

In [541]:
df = pd.DataFrame()

def grab_text(title, labels):
    
    start = labels.loc[labels.Title==title]['Starting Page']
    if len(start)==0:
        print(title, "no start")
        start = 0
    else:
        start = start.values[0]
    end = labels.loc[labels.Title==title]['Ending Page']
    if len(end)==0:
        print(title, "no end")
        end = 0
    else:
        end = end.values[0]
    
    title = title + '.pdf'
    all_text = ''
    with pdfplumber.open(title) as pdf:
        for i, page in enumerate(pdf.pages):
            if i+1 >= start and i < end:
                single_page_text = page.extract_text()

                if single_page_text is not None:
                    all_text = all_text + '\n' + single_page_text
                
    return all_text

df['Title'] = [file.split('.')[0] for file in os.listdir() if file.split('.')[1]=='pdf']
df['Text'] = [grab_text(title, labels) for title in df.Title]

In [542]:
df.head()

Unnamed: 0,Title,Text
0,The Night Before Christmas,\n'Twas the night before Christmas\nwhen all t...
1,Sugarlump and the Unicorn,"\nThe unicorn has a silver horn, Her\neyes are..."
2,The Gruffalo,\nA mouse took a stroll through the deep dark ...
3,The Monstrous Tale of Celery Crumble,\nHave you met Celery Crumble?\nThat’s her rig...
4,Peace at Last,"\nThe hour was late.\nMr Bear was tired, Mrs B..."


In [543]:
df = df.loc[df.Text != ''].reset_index()

In [544]:
len(df)

196

### Converting the full dataset into a dataframe of sentences

#### Note: using strip() here to remove trailing or leading spaces for improved performance.

In [545]:
sentences = pd.DataFrame()

book_col = []
sentences_col = []
length_col = []
index_col = []

for title, text in zip(df.Title, df.Text):
    text = text.replace('\n', ' ') # This is only safe provided the line break is not being used to separate sentences w/o puntctuation...
    text = text.replace('\t', ' ') # This allows us to save as tsv (and simplifies the whitespace)
    text = ' '.join(text.split())
    
    
    doc = nlp(text)
    sentence_list = list(doc.sents)
    
    for si, sen in enumerate(sentence_list):
        book_col.append(title)
        
        doc = sen #nlp(sen.text.strip())
        sentences_col.append(doc)
        length_col.append(len(doc.text.translate(str.maketrans('', '', string.punctuation)).split(' ')))
        index_col.append(si)

    
sentences['book'] = book_col
sentences['sentence_length'] = length_col
sentences['sentence'] = sentences_col
sentences['sentence_index'] = index_col

In [546]:
coding_sample = sentences.sample(frac=0.15, axis=0, random_state=42)

In [547]:
coding_sample

Unnamed: 0,book,sentence_length,sentence,sentence_index
12493,What The Ladybird Heard Next,22,"(Now, the, fat, red, hen, with, her, thin, bro...",1
1350,The Enormous Crocodile,11,"("", I, have, secret, plans, and, clever, trick...",83
3656,Gordon's Great Escape,11,"(She, took, him, to, the, park, to, play, on, ...",13
11731,Jesus' Christmas Party,1,"(“, There, .)",10
4560,Sir Charlie Stinky Socks and the Really Big Ad...,21,"(Inside, the, tower, a, windy, ,, windy, stair...",2
...,...,...,...,...
3784,Dogger,9,"(Dogger, had, just, been, bought, by, a, littl...",86
3076,Whatever Next!,25,"(He, found, a, space, helmet, on, the, drainin...",5
7799,Ravi's Roar,4,"(Even, Biscuits, the, dog, !)",2
12890,The Gruffalo's Child,4,"(“, You, ’re, not, the, Mouse, ., ”)",25


#### Check that this sample contains the same sentences that were manually coded!

In [548]:
os.chdir('../code_new_version/')

In [549]:
manually_coded = pd.read_csv('./sentences_for_coding/sample_15pc.csv', delimiter='\t', index_col=0)

In [550]:
text_equal = [
    i == j.text
    for i,j in
    zip(manually_coded.sentence, coding_sample.sentence)
]    

In [551]:
assert sum(text_equal) == len(text_equal)

In [84]:
sentences

Unnamed: 0,book,sentence_length,sentence,sentence_index
0,The Night Before Christmas,19,"(', Twas, the, night, before, Christmas, when,...",0
1,The Night Before Christmas,18,"(The, stockings, were, hung, by, the, chimney,...",1
2,The Night Before Christmas,17,"(The, children, were, nestled, all, snug, in, ...",2
3,The Night Before Christmas,1,"(as&ij-, ”)",3
4,The Night Before Christmas,41,"(And, mamma, in, her, kerchief, ,, and, I, in,...",4
...,...,...,...,...
14161,Little Monkey,9,"(“, And, very, lucky, !, ”, said, the, rest, o...",42
14162,Little Monkey,6,"(Little, Monkey, knew, they, were, right, .)",43
14163,Little Monkey,11,"(Because, the, smaller, you, are, ,, the, larg...",44
14164,Little Monkey,7,"(But, best, of, all, ., ., .)",45


In [85]:
prompt_string = f"""
    I provide you below with the full text of a children's book as a single string. 
    I then also provide a list of sentences and corresponding sentence ids from the book. 
    For each setence in the list, I want you to tell me if it contains direct speech. And if it does,
    to tell me who the speaker is and who they are speaking to. Be aware that a sentence might be contained 
    within a larger section of speech, and so when considered in isolation it might not have a speech mark attached. 
    You will therefore need to search for the sentence within the context of the full book text.
    
    Please provide the results in csv format
    with the following fields: sentence_id, direct_speech_flag, speaker, recipient, speech_text. 
    
    Book text: {df.iloc[0].Text}
    
    
    
    List of sentences: {list(sentences[sentences.book == df.iloc[0].Title].sentence)}
    List of sentence ids: {list(sentences[sentences.book == df.iloc[0].Title].sentence_index)}
"""

In [552]:
# prompt_string

In [174]:
df.iloc[0].Title

'The Night Before Christmas'

In [88]:
book_sentences = sentences[sentences.book == df.iloc[0].Title][['sentence_index', 'sentence']]

In [89]:
si = 7
book_sentences.iloc[si].sentence

More rapid than eagles, his coursers they came, And he whistled and shouted and called them by name: "Now, Dasher!

In [68]:
si = 8
book_sentences.iloc[si].sentence

now, Dancer!

In [50]:
si = 9
book_sentences.iloc[si].sentence

now, Prancer and Vixen!

In [51]:
si = 10
book_sentences.iloc[si].sentence

On Comet!

In [52]:
si = 29
book_sentences.iloc[si].sentence

But I heard him exclaim, 'ere he drove out of sight, "Happy Christmas to all, and to all a good night!"

In [53]:
book_sentences.iloc[si].sentence_index

29

In [58]:
# 13,True,St. Nicholas,Reindeer,"Now, Dasher! now, Dancer! now, Prancer and Vixen! On Comet! on Cupid! on Donner and Blitzen! To the top of the porch, to the top of the wall, Now, dash away! Dash away! Dash away all!”

#### GPT is finding the speech but it is not abel to match them to individual sentences at the moment (getting confused when a section of speech spans multiple sentences.) Let's try improving the prompt:

Note: in fact the issue above was caused by using comma for list separation in the list of sentences (we now used '<end>'). But still the below approach is probably better and more logical - identify full sections of speech first, and then map these to SpaCys sentences (for comparison with human coding and for analysis). Doing it the other way round - asking GPT4 to look at individual sentences first risks perhaps loosing context and might give worse results.

In [55]:
separator = '<end>'

In [69]:
improved_prompt_string = f"""
    I will provide you below with the full text of a children's book as a single string. 
    
    Please identify any sections of direct speech, and for each one tell me who is the speaker and who is the recipient.
    Provide the results in csv format with the following fields: speaker, recipient, speech_text, speech_section_id
    (where speech_section_id counts the number of sections of speech in this book)
    
    I then also provide you with a list of sentences and corresponding sentence ids from the book.
    In this list, each sentence is separated by the string: f{separator}
    For each sentence in the list that is part of one of the speech sections that you detected previously,  
    please it in csv format with the following fields: sentence_id, speech_text, speech_section_id
    
    Book text: {df.iloc[0].Text}
    
    List of sentences: [{separator.join(map(str,sentences[sentences.book == df.iloc[0].Title].sentence))}]
    List of sentence ids: {list(sentences[sentences.book == df.iloc[0].Title].sentence_index)}
"""

In [553]:
# improved_prompt_string

### Trying with JSON for input and output:

Example of using JSON mode here: https://community.openai.com/t/openai-api-guide-using-json-mode/557265

In [94]:
import json

In [685]:
book_id = 21
df.iloc[book_id].Title

'Harry and the Dinosaurs Go Wild'

In [503]:
book_id = 7

In [504]:
data = {
    'full_text': df.iloc[book_id].Text,
    'sentences': dict(
        zip(
            sentences[sentences.book == df.iloc[book_id].Title].sentence_index,
            [span.text for span in sentences[sentences.book == df.iloc[book_id].Title].sentence]
        )
    )
}

In [660]:
json_schema = {
    "speech_sections": {
        "speaker": "string",
        "recipient": "string",
        "speech_text": "string",
        "speech_section_id": "integer"
    },
    "sentences_with_speech": {
        "sentence_id (integer)": "speech_section_id (integer)"
    }
}
    
json_schema_str = ', '.join([f"'{key}': {value}" for key, value in json_schema.items()])

In [661]:
input_json_schema = {
    "full_text": "string",
    "sentences": {
        "sentence_id (integer)": "sentence (string)"
    }
}
    
input_json_schema_str = ', '.join([f"'{key}': {value}" for key, value in input_json_schema.items()])

In [507]:
#Please output the block of text that contains the speech, and also extract the spoken words only as 'speech_text'.
def get_prompt_string(data):
    
    return f"""
        I will provide you below with the following data in JSON format: {input_json_schema_str}

        where the dictionary elements are defined as follows:
            full_text: The full text of a children's book as a single string. 
            sentences: a dictionary of sentences from the book.

        Using the full text, please identify any sections of direct speech, and for each one tell me who is the speaker and who is the recipient.
        Remember that a section of direct speech can be broken up by information about who is speaking (see the answere here for an example: https://english.stackexchange.com/questions/421389/how-to-break-direct-speech-into-two-parts-right)     
        Provide the results in JSON format with the following fields: speaker, recipient, speech_text, speech_section_id
        (where speech_section_id counts the number of sections of speech in this book)

        Then, for each sentence in the 'sentences' dictionary that contains part of one of the speech sections that you detected previously,  
        please list it in JSON format with the following fields: sentence_id, speech_section_id

        Please use '\n' as the newline character and reproduce these as the occur.
        Please reproduce punctuation as it is written using regular double quotes "" for speech marks.

        Data: {data}
    """

In [508]:
improved_prompt_string = get_prompt_string(data)

In [694]:
results_dict

{'The Night Before Christmas': {'speech_sections': [{'speaker': 'St. Nicholas',
    'recipient': 'Reindeer',
    'speech_text': '"Now, Dasher! now, Dancer! now, Prancer and Vixen! On Comet! on Cupid! on Donner and Blitzen! To the top of the porch, to the top of the wall, Now, dash away! Dash away! Dash away all!”',
    'speech_section_id': 1},
   {'speaker': 'St. Nicholas',
    'recipient': 'Everyone',
    'speech_text': '"Happy Christmas to all, and to all a good night!"',
    'speech_section_id': 2}],
  'sentences_with_speech': {'7': 1,
   '8': 1,
   '9': 1,
   '10': 1,
   '11': 1,
   '12': 1,
   '13': 1,
   '14': 1,
   '15': 1,
   '29': 2}},
 'Sugarlump and the Unicorn': {'speech_sections': [{'speaker': 'Sugarlump',
    'recipient': 'himself',
    'speech_text': '"Here in the children\'s bedroom\nIs where I want to be.\nHappily rocking to and fro.\nThis is the life for me!"',
    'speech_section_id': 1},
   {'speaker': 'Sugarlump',
    'recipient': 'himself',
    'speech_text': '"Oh

#### Automating this using the Chat GPT API:

## TODO:
 - move deifnition of input json to system prompt?
 - add character/alias mapping to prompt for each book: use primary name only
 - When in pipeline to spellcheck/ correct typos? (e.g. Hany in the Dinosaurs (book 21).
 - what to do about inconsitent sentence detection? e.g "Now Dasher!" being at the end of sentence 7 was causing GPT confusion...
 - add an instruction about how to refer to 'general audience' or 'narrator' or 'I'
 - provide example of input and what the output should look like (within the prompt)
 - should temp be close to 0 (but not exactly 0)?
 - ask for output of reaosning/thought process?
 - ask for a confidence score?
 - do we need to specify (in system prompt), not to use MD or any other formatting in the json output?
 
## Note: ideas to explore if we need performance boost...

- system message to edit assistant role
- vary temperature or top_p parameter
- fine_tuning a model with bespoke training data (how much is necessary?)
- improved instructions or prompt engineering (see e.g. paper on iterative prompting)
- compare results with gpt-3.5-turbo? - does not seem to work weel for our use case!

In [509]:
with open('./key.txt', 'r') as infile:
    key = infile.read().splitlines()[0]

In [510]:
from openai import OpenAI

In [511]:
client = OpenAI(api_key=key)

completion = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "system", "content": f"You are a data analysis assistant, capable of accurate and precise natural language processing. Output your response in JSON format using the following schema: {json_schema_str}. When reproducing text data please preserve newline characters and punctuation. Please start all indexing of lists and arrays at 0 rather than 1."},
    {"role": "user", "content": r"{}".format(improved_prompt_string)}
  ],
 temperature=0.0,
 response_format={"type": "json_object"},
)

In [512]:
print(completion.choices[0].message.content)

{
  "speech_sections": [
    {
      "speaker": "troll",
      "recipient": "tiny black creature",
      "speech_text": "“Who’s that trip-trapping over my bridge?”",
      "speech_section_id": 0
    },
    {
      "speaker": "tiny black creature",
      "recipient": "troll",
      "speech_text": "“I’m not trip-trapping, I’m scuttling,”",
      "speech_section_id": 1
    },
    {
      "speaker": "tiny black creature",
      "recipient": "troll",
      "speech_text": "“And I’m a spider.”",
      "speech_section_id": 2
    },
    {
      "speaker": "troll",
      "recipient": "spider",
      "speech_text": "“Oh bother, I thought you were a goat,”",
      "speech_section_id": 3
    },
    {
      "speaker": "spider",
      "recipient": "troll",
      "speech_text": "“No - goats have fur,”",
      "speech_section_id": 4
    },
    {
      "speaker": "troll",
      "recipient": "spider",
      "speech_text": "“Never mind, I’ll eat you anyway,”",
      "speech_section_id": 5
    },
    {
   

#### Running for several books to test outputs, save format etc:

In [414]:
import datetime

In [554]:
client = OpenAI(api_key=key)

book_df = {
    'title': [],
    'speech_section_count': 0,
    'completion_tokens': [],
    'prompt_tokens': [],
    'total_tokens': [],
    'runtime_seconds': []
}
results_dict = {
    
}

for book_id in range(10):
    
    print("Book: ", df.iloc[book_id].Title)
    start = datetime.datetime.now()    
    
    data = {
        'full_text': df.iloc[book_id].Text,
        'sentences': dict(
            zip(
                sentences[sentences.book == df.iloc[book_id].Title].sentence_index,
                [span.text for span in sentences[sentences.book == df.iloc[book_id].Title].sentence]
            )
        )
    }
    prompt = get_prompt_string(data)
    
    completion = client.chat.completions.create(
      model="gpt-4o",
      messages=[
        {"role": "system", "content": f"You are a data analysis assistant, capable of accurate and precise natural language processing. Output your response in JSON format using the following schema: {json_schema_str}. When reproducing text data please preserve newline characters and punctuation."},
        {"role": "user", "content": r"{}".format(prompt)}
      ],
     temperature=0.0,
     response_format={"type": "json_object"},
    )
    
    json_response = json.loads(completion.choices[0].message.content)
    
    book_df['completion_tokens'].append(completion.usage.completion_tokens)
    book_df['prompt_tokens'].append(completion.usage.prompt_tokens)
    book_df['total_tokens'].append(completion.usage.total_tokens)
    book_df['title'].append(df.iloc[book_id].Title)
    book_df['runtime_seconds'].append((datetime.datetime.now() - start).seconds)
    book_df['speech_section_count'] += len(json_response['speech_sections'])
    
    results_dict[df.iloc[book_id].Title] = json_response
    
book_df = pd.DataFrame(book_df)

Book:  The Night Before Christmas
Book:  Sugarlump and the Unicorn
Book:  The Gruffalo
Book:  The Monstrous Tale of Celery Crumble
Book:  Peace at Last
Book:  Sing A Song Of Bottoms
Book:  Barry The Fish With Fingers
Book:  The Troll
Book:  The Storm Whale In Winter
Book:  There's A Monster In Your Book


In [555]:
book_df

Unnamed: 0,title,speech_section_count,completion_tokens,prompt_tokens,total_tokens,runtime_seconds
0,The Night Before Christmas,154,234,1949,2183,3
1,Sugarlump and the Unicorn,154,1114,2295,3409,17
2,The Gruffalo,154,2821,3055,5876,37
3,The Monstrous Tale of Celery Crumble,154,802,2631,3433,13
4,Peace at Last,154,754,1855,2609,11
5,Sing A Song Of Bottoms,154,71,1427,1498,1
6,Barry The Fish With Fingers,154,595,1530,2125,9
7,The Troll,154,2891,4854,7745,52
8,The Storm Whale In Winter,154,274,1544,1818,4
9,There's A Monster In Your Book,154,254,1351,1605,4


In [557]:
results_dict.keys()

dict_keys(['The Night Before Christmas', 'Sugarlump and the Unicorn', 'The Gruffalo', 'The Monstrous Tale of Celery Crumble', 'Peace at Last', 'Sing A Song Of Bottoms', 'Barry The Fish With Fingers', 'The Troll', 'The Storm Whale In Winter', "There's A Monster In Your Book"])

In [419]:
results_dict['Sugarlump and the Unicorn']

{'speech_sections': [{'speaker': 'Sugarlump',
   'recipient': 'children',
   'speech_text': '"Here in the children\'s bedroom\nIs where I want to be.\nHappily rocking to and fro.\nThis is the life for me!"',
   'speech_section_id': 1},
  {'speaker': 'Sugarlump',
   'recipient': 'himself',
   'speech_text': '"Oh to be out in the big wide world!\nI wish I could trot," he said.',
   'speech_section_id': 2},
  {'speaker': 'unicorn',
   'recipient': 'Sugarlump',
   'speech_text': '"Done!" came a voice, and there stood a beast\nWith a twisty silver horn.\n"I can grant horses\' wishes," Said the snow-\nwhite unicorn.',
   'speech_section_id': 3},
  {'speaker': 'Sugarlump',
   'recipient': 'himself',
   'speech_text': '"Here in the open countryside is where\nI like to be.\nClippety-dop, clippety-dop, This is the\nlife for me!"',
   'speech_section_id': 4},
  {'speaker': 'Sugarlump',
   'recipient': 'himself',
   'speech_text': '"Oh to be free of this heavy load.\nI wish I could gallop!"',
   '

In [642]:
import pickle as pk 
with open('./results/gpt4o_results_dict.pk', 'wb') as outfile:
    pk.dump(results_dict, outfile)

In [643]:
book_df.to_csv('./results/gpt4o_book_summary.csv')

#### We now extend the conversation to pull out only the spoken words only:

In [644]:
new_json_schema = {
    "speaker": "string",
    "recipient": "string",
    "spoken_words_only": "string",
    "speech_section_id": "integer"
}
    
new_json_schema_str = ', '.join([f"'{key}': {value}" for key, value in new_json_schema.items()])

In [None]:
For the speech sections that you just found, which I reproduce below, please pull out the words that are spoken
        and add them as a new field in the JSON called spoken_words_only, replacing the speech_text field.
        So you will need to remove all non-speech words such as 'she said' etc.
        
        Please use provide your response in JSON.
        Please reproduce punctuation as it is written using regular double quotes "" for speech marks.

        Data: {previous_response}

In [678]:
def get_second_prompt_string(previous_response):
    
    return f"""
        For the speech sections that you just found, please pull out the words that are spoken
        and add them as a new field in the JSON called spoken_words_only, replacing the speech_text field.
        So you will need to remove all non-speech words such as 'she said' etc.
        
        Please use provide your response in JSON.
        Please reproduce punctuation as it is written using regular double quotes "" for speech marks.
    """

In [686]:
book_id = 21
df.iloc[book_id].Title

'Harry and the Dinosaurs Go Wild'

In [687]:
data = {
    'full_text': df.iloc[book_id].Text,
    'sentences': dict(
        zip(
            sentences[sentences.book == df.iloc[book_id].Title].sentence_index,
            [span.text for span in sentences[sentences.book == df.iloc[book_id].Title].sentence]
        )
    )
}

In [688]:
client = OpenAI(api_key=key)

completion = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "system", "content": f"You are a data analysis assistant, capable of accurate and precise natural language processing. Output your response in JSON format using the following schema: {json_schema_str}. When reproducing text data please preserve newline characters and punctuation. Please start all indexing of lists and arrays at 0 rather than 1."},
    {"role": "user", "content": r"{}".format(get_prompt_string(data))}
  ],
 temperature=0.0,
 response_format={"type": "json_object"},
)

In [689]:
completion.usage

CompletionUsage(completion_tokens=992, prompt_tokens=2418, total_tokens=3410)

In [690]:
second_completion = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "system", "content": f"You are a data analysis assistant, capable of accurate and precise natural language processing. Output your response in JSON format using the following schema: {json_schema_str}. When reproducing text data please preserve newline characters and punctuation. Please start all indexing of lists and arrays at 0 rather than 1."},
    {"role": "user", "content": r"{}".format(get_prompt_string(data))},
    {"role": "assistant", "content": completion.choices[0].message.content},
    {"role": "system", "content": f"Please use the following schema for your JSON response: {new_json_schema}"},
    {"role": "user", "content": r"{}".format(get_second_prompt_string(json.loads(completion.choices[0].message.content)))}
  ],
 temperature=0.0,
 response_format={"type": "json_object"},
)

In [691]:
second_completion.usage

CompletionUsage(completion_tokens=944, prompt_tokens=3557, total_tokens=4501)

In [693]:
json.loads(completion.choices[0].message.content)

{'speech_sections': [{'speaker': 'Hany',
   'recipient': 'Apatosaurus',
   'speech_text': '“That’s a\nrhinoceros,” said Hany.',
   'speech_section_id': 0},
  {'speaker': 'Harry',
   'recipient': 'Mum',
   'speech_text': '“I\nwant to save some animals,” he said.\n“What can I do, Mum?”',
   'speech_section_id': 1},
  {'speaker': 'Sam',
   'recipient': 'Harry',
   'speech_text': '“Tuh! What a waste of time!”',
   'speech_section_id': 2},
  {'speaker': 'Harry',
   'recipient': 'Pterodactyl',
   'speech_text': '“Wait till I’ve finished my blue whale,” said Harry.\n“Blue whales are bigger than trains, bigger than\ndinosaurs, bigger than thirty-two elephants!”',
   'speech_section_id': 3},
  {'speaker': 'Triceratops',
   'recipient': 'Stegosaurus',
   'speech_text': '“Army tanks don’t need saving!” said Triceratops.\n“Do a tree frog instead.”',
   'speech_section_id': 4},
  {'speaker': 'Nan',
   'recipient': 'Harry',
   'speech_text': '“Why not talk to Mr Bopsom?\nHe might put up a poster in 

In [692]:
json.loads(second_completion.choices[0].message.content)

{'speech_sections': [{'speaker': 'Hany',
   'recipient': 'Apatosaurus',
   'spoken_words_only': '“That’s a rhinoceros,”',
   'speech_section_id': 0},
  {'speaker': 'Harry',
   'recipient': 'Mum',
   'spoken_words_only': '“I want to save some animals,” “What can I do, Mum?”',
   'speech_section_id': 1},
  {'speaker': 'Sam',
   'recipient': 'Harry',
   'spoken_words_only': '“Tuh! What a waste of time!”',
   'speech_section_id': 2},
  {'speaker': 'Harry',
   'recipient': 'Pterodactyl',
   'spoken_words_only': '“Wait till I’ve finished my blue whale,” “Blue whales are bigger than trains, bigger than dinosaurs, bigger than thirty-two elephants!”',
   'speech_section_id': 3},
  {'speaker': 'Triceratops',
   'recipient': 'Stegosaurus',
   'spoken_words_only': '“Army tanks don’t need saving!” “Do a tree frog instead.”',
   'speech_section_id': 4},
  {'speaker': 'Nan',
   'recipient': 'Harry',
   'spoken_words_only': '“Why not talk to Mr Bopsom? He might put up a poster in his shop window! Then

## Now trialling format for manual validation:

We have already used the student manual coding for validate speech detection, so now we can just focus on detected speech...

1. Select book at random, select passage of detected speech at random. 
2. Show user the passage and some of the text either side of the passage
3. Ask is it speech? Is speaker correct? Is recipient correct? [Give option to view more text]
4. Save result.

#### Note: handle case when sentence is not found in text e.g. The Troll sentence 7 is split across two setences (7 and 8) due to bad pdfplumber output.

In [393]:
from IPython.display import display, Markdown
from random import randint

In [628]:
# selection = randint(0, 1)
# book_selection = 4  # Peace at Last
book_selection = 1  # Sugarlump

In [629]:
selected_book = list(results_dict.keys())[book_selection]
selected_book

'Sugarlump and the Unicorn'

In [630]:
# speech_sections = json.loads(completion.choices[0].message.content)['speech_sections']
speech_sections = results_dict[selected_book]['speech_sections']

In [631]:
def validate(v_vec):
    pass

In [632]:
def display_section(df, res, speech_section_result, padding=200):

    speech_section = speech_section_result['speech_text']
    book_text = df[df.Title == selected_book].iloc[0].Text
    this_text = book_text[0:res] + '**' + book_text[res:res+len(speech_section)] + '**' + book_text[res+len(speech_section):]
    this_text = this_text[max(res-padding-2, 0):min(res+len(speech_section)+padding+2, len(this_text))]
    display(Markdown(this_text.replace('\n', '<br>')))

    display(Markdown('**' + 'Result:' + '**'))
    display(speech_section_result)

In [639]:
# section_selection = randint(0, len(speech_sections))
section_selection = 0

In [640]:
res = df[df.Title == selected_book].iloc[0].Text.find(speech_sections[section_selection]['speech_text']) 
res

251

In [641]:
display_section(df, res, speech_sections[section_selection])

ht and blue. And when<br>she hears a horse's wish, She can<br>make that wish come tine.<br>Sugarlump was a rocking horse.<br>He belonged to a girl and boy.<br>To and fro, to and fro,<br>They rode on their favourite toy.<br>**"Here in the children's bedroom<br>Is where I want to be.<br>Happily rocking to and fro.<br>This is the life for me!"**<br><br>But when the children were out at school Sugarlump hung his head.<br>"Oh to be out in the big wide world!<br>I wish I could trot," he said.<br><br>"Done!" came a voice, and there stood a beast<br>With a twisty s

**Result:**

{'speaker': 'Sugarlump',
 'recipient': 'himself',
 'speech_text': '"Here in the children\'s bedroom\nIs where I want to be.\nHappily rocking to and fro.\nThis is the life for me!"',
 'speech_section_id': 1}

### Please indicate with '1' which are correct: [speech, speaker, recipient]

In [440]:
validation_vector = [1, 1, 1]
validate(validation_vector)