### Manual validation of GPT4o results.

In [19]:
VALIDATION_RUN_NAME = 'cm_validation_newrun_190924_100_42'
# VALIDATION_RUN_NAME = 'cm_validation_250_42'

In [3]:
import os
import json
import pdfplumber
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import string
import spacy
from spacy import displacy
from spacy.lang.en.examples import sentences 
from openai import OpenAI
import pickle
from copy import copy
import re

%matplotlib inline

In [4]:
from IPython.display import display, Markdown, clear_output
from ipywidgets.widgets import Button, Layout, ButtonStyle, GridBox
from ipywidgets import widgets
from random import randint

In [5]:
with open('data/gpt4_speakers_recipients_processed.json', 'r') as outfile:
    speakers = pd.read_json(outfile)

In [6]:
with open('data/tempdf.pickle', 'rb') as outfile:
    df = pickle.load(outfile)

## Manual validation of GPT4o output:

We have already used the student manual coding for validate speech detection, so now we can just focus on detected speech.

1. Select book at random, select passage of detected speech at random. 
2. Show user the passage and some of the text either side of the passage. Assume correct unless otherwise indicated.
3. Ask is it speech? Is speaker correct? Is recipient correct? (both initial and matched) [Give option to view more text]
4. Ask is there abmiguity? (self talk, unclear, ??) 
6. Save result.

Note: record error if speech section not found!

In [7]:
books = speakers.book.unique()

In [18]:
100 * 200 / len(speakers)

6.842285323297982

In [8]:
validation_sample = speakers.sample(200, random_state=42, replace=False)
# debug_sample_ids = [3504, 414, 2998, 402]
# validation_sample = validation_sample.loc[debug_sample_ids]

In [8]:
validation_sample.to_json(f"data/validation_sample_{VALIDATION_RUN_NAME}.json")

In [9]:
def show_running_totals(arg):
    clear_output(wait=True)
    make_running_totals()
    
    c = copy(validation_result_dict)
    c['validation_sample_id'] = c['validation_sample_id'][:-1]
    c = pd.DataFrame(c)
    
    display(c)
    print(pd.DataFrame(c).sum(axis=0)/len(c))

In [10]:
def get_gtp_output(df, speakers, _validation_index, split_min_len=3):
    speech_section = speakers.loc[_validation_index]['speech_text']
    selected_book = speakers.loc[_validation_index]['book']
    
    display(Markdown(f"**Book: {selected_book}**"))
    display(Markdown(f"**Validation index: {_validation_index}**"))
    display(Markdown(f"**Example number: {validation_sample.index.get_loc(_validation_index)}**"))
    display(Markdown(f"**Speech text:** {speech_section}"))
    display(Markdown(f"**------------------------**"))
    display(Markdown(f"**In context:**"))
    
    # find the speech section in the full book text:
    book_text = df[df.Title == selected_book].iloc[0].Text
    res = book_text.find(speech_section)
    if res == -1:
        res = book_text.replace('\n', ' ').find(speech_section)
        
    if res == -1:
        split_speech = re.split('[?.,!]', speech_section)
        split_speech = [i for i in split_speech if len(i)>0]
        if len(split_speech[0].split(' ')) >= split_min_len or len(split_speech)==1:
            split_speech = split_speech[0]
        elif len(split_speech[1].split(' ')) >= split_min_len or len(split_speech)==2:
            split_speech = split_speech[1]
        else:
            split_speech = split_speech[2]
            
        speech_section = split_speech.strip()
        res = book_text.find(speech_section)
        
    display_section(book_text, res, speech_section)
    
    display(Markdown('**' + 'GPT4o Result:' + '**'))
    print(f"Speaker: {speakers.loc[_validation_index]['speaker']}")
    print(f"Speaker matched: {speakers.loc[_validation_index]['speaker_matched']}")
    print('\n')
    print(f"Recipient: {speakers.loc[_validation_index]['recipient']}")
    print(f"Recipient matched: {speakers.loc[_validation_index]['recipient_matched']}")
          
#     return split_speech

In [11]:
def display_section(book_text, res, speech_section, padding=200):
    
    this_text = book_text[0:res] + '**' + book_text[res:res+len(speech_section)] + '**' + book_text[res+len(speech_section):]
    this_text = this_text[max(res-padding-2, 0):min(res+len(speech_section)+padding+2, len(this_text))]
    display(Markdown(this_text.replace('\n', '<br>')))

In [12]:
error  = Button(description='Error (e.g. speech section not found or wrong section highlighted)',
                 layout=Layout(width='auto', grid_area='error'),
                 style=ButtonStyle(button_color='salmon'))

indirect  = Button(description='Indirect speech',
                 layout=Layout(width='auto', grid_area='indirect'),
                 style=ButtonStyle(button_color='moccasin'))

speaker_wrong = Button(description='Speaker wrong',
                 layout=Layout(width='auto', grid_area='speaker_wrong'),
                 style=ButtonStyle(button_color='palegreen'))
speaker_matched_wrong = Button(description='Speaker_matched wrong',
                 layout=Layout(width='auto', grid_area='speaker_matched_wrong'),
                 style=ButtonStyle(button_color='moccasin'))

speaker_ambiguity = Button(description='Speaker ambiguity',
                 layout=Layout(width='auto', grid_area='speaker_ambiguity'),
                 style=ButtonStyle(button_color='lightblue'))

recipient_wrong = Button(description='Recipient wrong',
                 layout=Layout(width='auto', grid_area='recipient_wrong'),
                 style=ButtonStyle(button_color='palegreen'))
recipient_matched_wrong = Button(description='Recipient_matched wrong',
                 layout=Layout(width='auto', grid_area='recipient_matched_wrong'),
                 style=ButtonStyle(button_color='moccasin'))

recipient_ambiguity = Button(description='Recipient ambiguity',
                 layout=Layout(width='auto', grid_area='recipient_ambiguity'),
                 style=ButtonStyle(button_color='lightblue'))

def clicked(arg):
    d = arg.description
    
    val_vec_map = {
        'Error (e.g. speech section not found or wrong section highlighted)': 0,
        'Speaker wrong': 1,
        'Speaker_matched wrong': 2, 
        'Speaker ambiguity': 3,
        'Recipient wrong': 4,
        'Recipient_matched wrong': 5, 
        'Recipient ambiguity': 6,
        'Indirect speech': 7
    }
    
    validation_vector[val_vec_map[d]] = 1

error.on_click(clicked)
indirect.on_click(clicked)
speaker_wrong.on_click(clicked)
speaker_matched_wrong.on_click(clicked)
speaker_ambiguity.on_click(clicked)
recipient_wrong.on_click(clicked)
recipient_matched_wrong.on_click(clicked)
recipient_ambiguity.on_click(clicked)

def make_buttons():
    display(
        GridBox(children=[
            error, indirect, speaker_wrong, speaker_matched_wrong, speaker_ambiguity, 
            recipient_wrong, recipient_matched_wrong, recipient_ambiguity, 
        ],
            layout=Layout(
                width='100%',
                grid_template_rows='auto auto auto',
                grid_template_columns='30% 40% 30%',
                grid_template_areas='''
                "error error indirect"
                "speaker_wrong speaker_matched_wrong speaker_ambiguity"
                "recipient_wrong recipient_matched_wrong recipient_ambiguity"
                ''')
           )
    )
    
def make_next():
    next_button = widgets.Button(description = 'Next example.')   
    next_button.on_click(next_example)
    display(next_button)
    
def make_running_totals():
    run_tot_button = widgets.Button(description = 'Show running totals.')   
    run_tot_button.on_click(show_running_totals)
    display(run_tot_button)

In [13]:
validation_iterator = iter(validation_sample.index)
validation_vector = [0, 0, 0, 0, 0, 0, 0, 1]
validation_result_dict = {
    'validation_sample_id': [],
    'Error (e.g. speech section not found or not highlighted)': [],
    'Speaker wrong': [],
    'Speaker_matched wrong': [], 
    'Speaker ambiguity': [],
    'Recipient wrong': [],
    'Recipient_matched wrong': [], 
    'Recipient ambiguity': [],
    'Indirect speech': []
}

In [14]:
def next_example(a, _validation_vector=validation_vector, _v_result_dict=validation_result_dict):
    clear_output(wait=True)
    make_next()
    
    
    next_sample_id = next(validation_iterator)
    _v_result_dict['validation_sample_id'].append(next_sample_id)
    
    if len(_v_result_dict['validation_sample_id']) > 1:
        for i,k in enumerate(_v_result_dict.keys()): 
            if k != 'validation_sample_id':
                _v_result_dict[k].append(_validation_vector[i-1])
    
    with open(f"./results/{VALIDATION_RUN_NAME}.pickle", 'wb') as outfile:
        pickle.dump(_v_result_dict, outfile)
    
    for i in range(len(_validation_vector)):
        _validation_vector[i] = 0
    
    get_gtp_output(df, speakers, next_sample_id)

In [15]:
make_buttons()

GridBox(children=(Button(description='Error (e.g. speech section not found or wrong section highlighted)', lay…

In [16]:
make_next()

Button(description='Next example.', style=ButtonStyle())

StopIteration: 

#### Comments:

* Comment on Everyone mathcing The Reader..
* Note that speaker found but not matched -> missing character.


### TODO: 
* Add button for speech wrong (indriect speech, or not a single section etc).

#### Notes:
* There are a few cases of indirect speech being picked up. Best soultion might be to ask flagging of indirect speech exlicitly and use in-context learning to demonstrate the difference. 
* Most errors are due to ambuguity. Particularly when there is a general audience and it is not clear if this is the reader or some unknown or unnamed group of characters.
* Some sections of speech by multiple characters...i.e. non single section.
* Maybe don't include 'with typos' in the instructions?
* Also two cases of letters/notes being flagged as speech - again need to include these in examples...
* Speech_text is often/sometimes replaced by spoken_words_only (i.e. converted to indirect 'she said' removed) - more examples needed for in-context learning?


In [17]:
make_running_totals()

Button(description='Show running totals.', style=ButtonStyle())

Unnamed: 0,validation_sample_id,Error (e.g. speech section not found or not highlighted),Speaker wrong,Speaker_matched wrong,Speaker ambiguity,Recipient wrong,Recipient_matched wrong,Recipient ambiguity,Indirect speech
0,1047,0,0,0,0,0,0,0,0
1,2394,0,0,0,0,0,0,0,0
2,1127,0,0,0,0,0,0,0,0
3,1392,0,0,0,0,1,1,1,0
4,679,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
194,239,0,0,0,0,0,0,0,0
195,2502,0,0,0,0,0,1,0,0
196,897,0,1,1,0,0,0,0,0
197,1691,0,0,0,0,0,0,0,0


validation_sample_id                                        1484.678392
Error (e.g. speech section not found or not highlighted)       0.080402
Speaker wrong                                                  0.015075
Speaker_matched wrong                                          0.030151
Speaker ambiguity                                              0.020101
Recipient wrong                                                0.035176
Recipient_matched wrong                                        0.055276
Recipient ambiguity                                            0.100503
Indirect speech                                                0.025126
dtype: float64


In [20]:
with open(f"./results/{VALIDATION_RUN_NAME}.pickle", 'rb') as infile:
    validation_result_dict = pickle.load(infile)

In [21]:
validation_result_dict['validation_sample_id'] = validation_result_dict['validation_sample_id'][:-1]
validation_result_dict = pd.DataFrame(validation_result_dict)

In [22]:
validation_result_dict

Unnamed: 0,validation_sample_id,Error (e.g. speech section not found or not highlighted),Speaker wrong,Speaker_matched wrong,Speaker ambiguity,Recipient wrong,Recipient_matched wrong,Recipient ambiguity,Indirect speech
0,1047,0,0,0,0,0,0,0,0
1,2394,0,0,0,0,0,0,0,0
2,1127,0,0,0,0,0,0,0,0
3,1392,0,0,0,0,1,1,1,0
4,679,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
194,239,0,0,0,0,0,0,0,0
195,2502,0,0,0,0,0,1,0,0
196,897,0,1,1,0,0,0,0,0
197,1691,0,0,0,0,0,0,0,0


In [118]:
c = copy(validation_result_dict)
# c['validation_sample_id'] = c['validation_sample_id'][:-1]
# c = pd.DataFrame(c)
    
#     display(c)
#     print(pd.DataFrame(c).sum(axis=0)/len(c))

In [119]:
S = 200

In [120]:
c['Speaker_matched wrong'].sum()# / S

6

In [121]:
c['Recipient_matched wrong'].sum() / S

0.055

In [125]:
c['Speaker ambiguity'].sum() / S

0.02

In [123]:
c['Recipient ambiguity'].sum() / S

0.1

In [117]:
c['Indirect speech'].sum() #/ S

5

In [80]:
c = validation_result_dict[validation_result_dict['Recipient ambiguity']==0]
# c['Recipient_matched wrong'].sum() / len(c)
len(c)

179

In [105]:
c = validation_result_dict[validation_result_dict['Speaker ambiguity']==0]
c['Speaker_matched wrong'].sum() / len(c)
len(c)

195

#### There are several cases where the spoke text cannot be found (for formatted display to the annotator). However, manual inspection reveals that the spoken word extraction is correct (i.e. subtask 2):

In [61]:
c['Error (e.g. speech section not found or not highlighted)'].sum()

16

In [39]:
c['Error (e.g. speech section not found or not highlighted)'].sum() / len(c)

0.08040201005025126

In [45]:
c[c['Error (e.g. speech section not found or not highlighted)']==1].validation_sample_id.values

array([ 912, 2863, 1712, 1739,   51, 2335, 1949, 2860, 2618, 1487, 2351,
       1920,   32, 1503, 1868, 1032])

In [46]:
validation_sample.loc[c[c['Error (e.g. speech section not found or not highlighted)']==1].validation_sample_id.values]

Unnamed: 0,book,speech_section_id,speaker,recipient,speaker_matched,recipient_matched,speech_text,spoken_words_only,spoken_word_count,chunk_titles,...,name_speaker,gender_speaker,human_speaker,alias_count_speaker,is_protagonist_speaker,name_recipient,gender_recipient,human_recipient,alias_count_recipient,is_protagonist_recipient
912,Wide-awake Hedgehog,7,North Wind,Isaac,wind,Isaac,"""I am the one who makes the trees dance. I hel...",I am the one who makes the trees dance. I help...,195,Wide-awake Hedgehog,...,wind,F,NH,0.0,0.0,Isaac,M,NH,2.0,1.0
2863,Cinder the Bubble-Blowing Dragon,6,Blaze,King,Blaze,King,"""Where is the King?"" thundered Blaze. ""I bring...",Where is the King? I bring fire for your Majesty.,49,Cinder the Bubble-Blowing Dragon,...,Blaze,M,NH,0.0,0.0,King,M,H,0.0,0.0
1712,Santa to the Rescue,20,Squirrel,Group,Squirrel,The Reader,More icing! Hurry!,More icing! Hurry!,18,Santa to the Rescue,...,Squirrel,NGS,NH,0.0,0.0,The Reader,NGS,H,0.0,
1739,The Bad-Tempered Ladybird,16,bad-tempered ladybird,sparrow,the bad-tempered ladybird,sparrow,"Hey you,” said the bad-\ntempered ladybird. Wa...",Hey you. Want to fight?,23,The Bad-Tempered Ladybird,...,the bad-tempered ladybird,NGS,NH,1.0,1.0,sparrow,NGS,NH,0.0,0.0
51,The Gruffalo,34,Owl,Mouse,owl,Mouse,"“Oh dear!” he said, “Goodbye, little mouse”","Oh dear! Goodbye, little mouse.",31,The Gruffalo,...,owl,M,NH,0.0,0.0,Mouse,M,NH,0.0,1.0
2335,Where's My Cuddle,9,witch,Jake,witch,Jake,I’ve given it to a wizard\nbecause his spell w...,I’ve given it to a wizard because his spell we...,55,Where's My Cuddle,...,witch,F,H,0.0,0.0,Jake,M,H,0.0,1.0
1949,The Polar Express,5,guard,children,guard,children,There is the North Pole.,There is the North Pole.,24,The Polar Express,...,guard,M,H,0.0,0.0,children,NGS,H,0.0,0.0
2860,Cinder the Bubble-Blowing Dragon,3,King,Cinder,King,Cinder,"""When he comes this afternoon you may be prese...",When he comes this afternoon you may be presen...,100,Cinder the Bubble-Blowing Dragon,...,King,M,H,0.0,0.0,Cinder,M,NH,0.0,1.0
2618,The Gruffalo's Child,8,Gruffalo's Child,Herself,Gruffalo's child,Gruffalo's child,“The Big Bad Mouse — so he does\nexist!”,The Big Bad Mouse — so he does exist!,37,The Gruffalo's Child,...,Gruffalo's child,F,NH,0.0,1.0,Gruffalo's child,F,NH,0.0,1.0
1487,Goldilocks and the Three Bears,2,Goldilocks,herself,Goldilocks,Goldilocks,"""Ha!"" she said-\n""I can’t see any bears.""",Ha! I can’t see any bears.,26,Goldilocks and the Three Bears,...,Goldilocks,F,H,0.0,1.0,Goldilocks,F,H,0.0,1.0


In [24]:
c[c['Recipient ambiguity']==0].sum(axis=0)/len(c)

validation_sample_id                                        1328.115578
Error (e.g. speech section not found or not highlighted)       0.075377
Speaker wrong                                                  0.015075
Speaker_matched wrong                                          0.030151
Speaker ambiguity                                              0.020101
Recipient wrong                                                0.010050
Recipient_matched wrong                                        0.020101
Recipient ambiguity                                            0.000000
Indirect speech                                                0.025126
dtype: float64

In [24]:
c[c['Speaker ambiguity']==0].sum(axis=0)/len(c)

validation_sample_id                                        1458.638191
Error (e.g. speech section not found or not highlighted)       0.080402
Speaker wrong                                                  0.005025
Speaker_matched wrong                                          0.020101
Speaker ambiguity                                              0.000000
Recipient wrong                                                0.035176
Recipient_matched wrong                                        0.055276
Recipient ambiguity                                            0.100503
Indirect speech                                                0.025126
dtype: float64

#### Experimenting with how to handle cases where speech_text cannot be found (due to missing \n characters):

In [131]:
validation_sample.head(30)

Unnamed: 0,book,speech_section_id,speaker,recipient,speaker_matched,recipient_matched,speech_text,spoken_words_only,spoken_word_count,chunk_titles,...,human_recipient,alias_count_recipient,speaker_is_mum,speaker_is_dad,speaker_is_granny,speaker_is_grandpa,recipient_is_mum,recipient_is_dad,recipient_is_granny,recipient_is_grandpa
3504,The Dinosaur That Pooped Christmas,1,Danny,himself,Danny,Danny,An egg?! Santa brought me an egg?!,An egg?! Santa brought me an egg?!,34,The Dinosaur That Pooped Christmas,...,H,1.0,False,False,False,False,False,False,False,False
414,Elmer and Grandpa Eldo,2,Monkey,Elmer,monkey,Elmer,Golden Grandpa Eldo. That’s nice.,Golden Grandpa Eldo. That’s nice.,33,Elmer and Grandpa Eldo,...,NH,0.0,False,False,False,False,False,False,False,False
2998,Elmer and the Stranger,37,Elmer,Lion and Tiger,Elmer,Lion and Tiger,Yes. And now we’re all... aah...,Yes. And now we’re all... aah...,32,Elmer and the Stranger,...,NH,0.0,False,False,False,False,False,False,False,False
402,The Most Wonderful Gift In The World,5,Bear,Esme,Bear,Esme,The TREACHEROUS path! I don’t like the sound o...,The TREACHEROUS path! I don’t like the sound o...,53,The Most Wonderful Gift In The World,...,H,0.0,False,False,False,False,False,False,False,False


In [132]:
tb = 'The Dinosaur That Pooped Christmas'
tid = 3504

In [133]:
df[df.Title == tb].Text.iloc[0]

