# Disambiguation Notebook

- In this notebook, we will use OpenAI's Batch Processing to run three processes over an entire dataset:
  - A pronoun disambiguation prompt
  - A fine-tuned model that identifies incomplete questions
  - A prompt that adds needed context to incomplete questions

# Batch Processing

This code demonstrates how you can submit batch processing jobs to OpenAI. Batch processing is 50% cheaper than synchronous processing, making it ideal for processing large batches of data.

### Batch Processing documentation
- https://platform.openai.com/docs/guides/batch

### API reference
- https://platform.openai.com/docs/api-reference/batch

### Pricing:
- https://openai.com/api/pricing


# Batch Processing Utils

In [None]:
# Batch Processing
import re
import os
import json
import time
import pandas as pd
from typing import List, Optional, Type
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm

class BatchProcessor:
    """Build, submit, monitor and parse OpenAI Batch API chat-completion jobs.

    Typical flow: ``create_tasks`` -> ``write_tasks_to_file`` -> ``upload_file``
    -> ``create_batch_job`` -> ``monitor_batch`` -> ``download_results`` ->
    ``parse_results``.

    Attributes:
        client: OpenAI client used for every API call.
        tasks: Request dicts produced by the last ``create_tasks`` call.
        batch_response: Batch object returned by the last ``create_batch_job``
            call on this instance, or ``None``.
        schema: ``{"type": "json_schema", "json_schema": ...}`` dict attached to
            each request, or ``None`` for free-form output.
        response_format: Model class used to validate each result in
            ``parse_results`` (must offer ``model_validate_json``).
        llm_model: Model name written into every request body.
    """

    # Seconds between polls when monitor_batch is called with interval=None.
    DEFAULT_POLL_INTERVAL = 1000
    # Statuses (besides 'failed') after which a batch will never complete.
    UNRECOVERABLE_STATUSES = ('expired', 'cancelled')

    def __init__(self, api_key: str, response_format: "Optional[Type[BaseModel]]" = None, llm_model: str = 'gpt-4o-mini'):
        """Create the API client and derive the JSON schema when a model is given."""
        self.client = OpenAI(api_key=api_key)
        self.tasks = []
        self.batch_response = None
        # Only build a schema when there is a model to build it from; the
        # default (None) means "no structured output".
        self.schema = generate_schema(response_format) if response_format is not None else None
        self.response_format = response_format
        self.llm_model = llm_model

    def set_schema(self, schema: dict):
        """Replace the schema attached to subsequently created tasks."""
        self.schema = schema

    def create_tasks(self,
                     df: pd.DataFrame,
                     system_prompt: str,
                     input_prompt: str,
                     temperature: float = 0.7
                     ):
        """Build one chat-completion request per row of ``df``.

        Args:
            df: Source rows. The index label becomes ``custom_id`` as
                ``task_<label>`` and every column is available as a
                ``str.format`` field of ``input_prompt``.
            system_prompt: System message sent with every request.
            input_prompt: ``str.format`` template for the user message.
            temperature: Sampling temperature written into every request.
        """
        structured_output = (
            {"response_format": {"type": "json_schema", "json_schema": self.schema["json_schema"]}}
            if self.schema else {}
        )
        self.tasks = [
            {
                "custom_id": f"task_{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": self.llm_model,
                    "temperature": temperature,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": input_prompt.format(**row.to_dict())}
                    ],
                    **structured_output
                }
            }
            for i, row in df.iterrows()
        ]

    def _save_file(self, filename: str, content: str, overwrite: bool = False) -> None:
        """
        Centralized file saving utility with overwrite protection

        Args:
            filename (str): Path to save file
            content (str): Content to write
            overwrite (bool): Allow overwriting existing files

        Raises:
            FileExistsError: If file exists and overwrite=False
            OSError: If the directory or file cannot be written (re-raised
                after printing the error)
        """
        if os.path.exists(filename) and not overwrite:
            raise FileExistsError(f"File {filename} already exists")

        try:
            dir_path = os.path.dirname(filename)
            if dir_path:
                os.makedirs(dir_path, exist_ok=True)

            # Explicit UTF-8: results contain curly quotes and ellipses, and
            # the platform default encoding is not UTF-8 everywhere.
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(content)

        except Exception as e:
            print(f"Error saving file: {e}")
            raise

    def write_tasks_to_file(self, filename="batch_tasks.jsonl", overwrite=False):
        """Serialise ``self.tasks`` as JSON Lines (one request per line)."""
        content = '\n'.join(json.dumps(task) for task in self.tasks)
        self._save_file(filename, content, overwrite)

    def upload_file(self, filename="batch_tasks.jsonl"):
        """Upload a task file with purpose ``batch`` and return its file id."""
        # The context manager closes the handle even if the upload raises.
        with open(filename, 'rb') as task_file:
            file_response = self.client.files.create(
                file=task_file,
                purpose='batch'
            )
        return file_response.id

    def create_batch_job(self, file_id):
        """Start a 24h-window chat-completions batch and return its id."""
        self.batch_response = self.client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
        )
        batch_id = self.batch_response.id
        print(f"Batch created with ID: {batch_id}")
        return batch_id

    def _get_batch_status(self, batch_id: Optional[str] = None):
        """Retrieve a batch by explicit id, else the one this instance created.

        Raises:
            ValueError: If no id is given and no batch was created here.
        """
        if batch_id:
            return self.client.batches.retrieve(batch_id)
        if self.batch_response:
            return self.client.batches.retrieve(self.batch_response.id)
        raise ValueError("No batch_id provided and no active batch found.")

    def monitor_batch(self, batch_id: Optional[str] = None, interval: Optional[int] = 1000):
        """Poll a batch until it completes or can no longer complete.

        Args:
            batch_id: Batch to watch; defaults to the batch created here.
            interval: Seconds between polls; ``None`` uses
                ``DEFAULT_POLL_INTERVAL``.

        Returns:
            The final batch object when the status is ``completed``;
            ``None`` when the job failed, expired or was cancelled.
        """
        poll_seconds = self.DEFAULT_POLL_INTERVAL if interval is None else interval
        batch_status = self._get_batch_status(batch_id)
        print("Monitoring batch job...")
        while True:
            if batch_status.status == 'completed':
                print("Batch job completed.")
                return batch_status
            if batch_status.status == 'failed':
                print(f"Batch job failed: {batch_status.errors}")
                return None
            if batch_status.status in self.UNRECOVERABLE_STATUSES:
                # Without this check an expired or cancelled job would be
                # polled forever.
                print(f"Batch job ended with status '{batch_status.status}'.")
                return None
            print(f"Batch job is still processing... Status: {batch_status.status}")
            time.sleep(poll_seconds)
            batch_status = self.client.batches.retrieve(batch_status.id)

    def download_results(self, output_filename="batch_results.jsonl", overwrite=False, batch_id: Optional[str] = None):
        """Save the output file of a completed batch to ``output_filename``.

        Prints the status (and errors, if any) and returns without writing
        when the batch is not ``completed``. Download and save errors are
        printed rather than raised.
        """
        batch_status = self._get_batch_status(batch_id)

        if batch_status.status != 'completed':
            print(f"Batch status: {batch_status.status}")
            if hasattr(batch_status, 'errors'):
                print(f"Errors: {batch_status.errors}")
            return

        result_file_id = getattr(batch_status, 'output_file_id', None)
        if not result_file_id:
            print(f"Batch completed but no result file generated. Status details: {getattr(batch_status, 'status_details', None)}")
            return

        print(f"Downloading file with ID: {result_file_id}")
        try:
            result_content = self.client.files.content(result_file_id)
            self._save_file(output_filename, result_content.text, overwrite)
            print("Results downloaded to file:", output_filename)
        except Exception as e:
            print(f"Error downloading results: {e}")

    @staticmethod
    def _extract_content(result: dict):
        """Return the first choice's message content, or None if any level is missing or null."""
        response = result.get('response') or {}
        body = response.get('body') or {}
        choices = body.get('choices') or [{}]
        message = choices[0].get('message') or {}
        return message.get('content')

    @staticmethod
    def _parse_custom_id(custom_id):
        """Turn ``task_<n>`` into the integer ``n``; return other values unchanged."""
        if custom_id:
            match = re.search(r'task_(\d+)$', custom_id)
            if match:
                return int(match.group(1))
        return custom_id

    @staticmethod
    def _repair_content(content: str) -> str:
        """Best-effort fix for content that failed validation as-is.

        Replaces typographic quotes with ASCII ones and closes an object that
        was cut off before its final ``}``.
        """
        repaired = content.replace('“', '"').replace('”', '"')
        repaired = repaired.replace('‘', "'").replace('’', "'")

        stripped = repaired.strip()
        if stripped.startswith("{") and not stripped.endswith("}"):
            print("Attempting to repair truncated content...")
            if stripped.endswith('",'):
                # Drop the dangling comma, then close the string and object.
                repaired = stripped.rstrip('",') + '"}'
            elif not stripped.endswith('"'):
                repaired = stripped + '"}'
            else:
                repaired = stripped + '}'

        return repaired.replace('","}', '"}')

    def _validate_content(self, content: str):
        """Validate content with ``response_format``, retrying once after repair."""
        try:
            # Try the untouched text first: replacing curly quotes inside
            # valid JSON string values would break the JSON.
            return self.response_format.model_validate_json(content)
        except Exception:
            repaired = self._repair_content(content)
            if repaired == content:
                raise
            return self.response_format.model_validate_json(repaired)

    def parse_results(self, filename="batch_results.jsonl"):
        """Parse a downloaded results file into a DataFrame.

        Each line is validated with ``response_format``; the model's fields
        become columns and ``custom_id`` (numeric when it matches
        ``task_<n>``) is added. Lines that cannot be parsed are reported and
        skipped.

        Returns:
            DataFrame of parsed rows; an empty DataFrame when the file is
            missing or nothing could be parsed.
        """
        results = []
        model_name = getattr(self.response_format, '__name__', str(self.response_format))
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    if not line.strip():
                        continue  # tolerate blank lines

                    content = None  # keep defined for the error messages below
                    try:
                        result = json.loads(line)
                        content = self._extract_content(result)
                        custom_id = self._parse_custom_id(result.get('custom_id'))

                        if not content:
                            print(f"No message content for custom_id {custom_id}; skipping.")
                            continue

                        # Parse using Pydantic model and add custom_id
                        parsed_result = self._validate_content(content)
                        parsed_data = parsed_result.model_dump()
                        parsed_data['custom_id'] = custom_id  # Add numeric custom_id

                        results.append(parsed_data)

                    except json.JSONDecodeError as e:
                        print(f"Error parsing JSON: {e}")
                        print(f"Raw content: {content}")

                    except Exception as e:
                        print(f"Error parsing result with model {model_name}: {e}")
                        print(f"Raw content: {content}")

        except FileNotFoundError:
            print(f"Results file '{filename}' not found.")

        if results:
            # Normalize into a DataFrame using dynamic keys
            return pd.json_normalize(results)

        print("No valid results found.")
        return pd.DataFrame()

    def cancel_batch(self, batch_id: Optional[str] = None):
        """Cancel a batch unless it has already reached a final status.

        API errors during cancellation are printed rather than raised.
        """
        batch_status = self._get_batch_status(batch_id)

        if batch_status.status in ["completed", "failed", "cancelled", "expired"]:
            print(f"Cannot cancel batch with status '{batch_status.status}'")
            return

        try:
            self.client.batches.cancel(batch_status.id)
            print(f"Batch '{batch_status.id}' has been cancelled.")
        except Exception as e:
            print(f"Error cancelling batch: {e}")

# Schema generator
from openai.lib._parsing._completions import type_to_response_format_param

def generate_schema(model):
    """Wrap the JSON schema derived from ``model`` in a ``json_schema`` response-format dict.

    The conversion itself is delegated to ``type_to_response_format_param``;
    only its ``'json_schema'`` entry is kept.
    """
    response_format_param = type_to_response_format_param(model)
    return dict(type="json_schema", json_schema=response_format_param['json_schema'])


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Return the text of every token the module-level ``nlp`` pipeline finds in ``text``."""
    token_texts = []
    for token in nlp(text):
        token_texts.append(token.text)
    return token_texts

# Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)

# Part 1: Disambiguation

We will use the base model to disambiguate pronouns

## Import Data


In [None]:
import pandas as pd

# Pick the spreadsheet to process; swap which sheet_id line is commented out to change dataset.
#sheet_id = "1E5gKTygkxLXyYsbAjR5trxaDgDx90VfHgsXlZExcnQA" # Boder Part A
sheet_id = "1VUMcV3_sv6GMAprTkNW2-RAsgRWpGDu5dxwvxqxWNrY" # Boder Part B Sheet name: Preprocessing Results- Version B
worksheet_name = "IsQuestion"
# `gc` is the authorised gspread client created in the authentication cell.
sh = gc.open_by_key(sheet_id)
worksheet = sh.worksheet(worksheet_name)
# Load every record of the worksheet into a DataFrame.
full_df = pd.DataFrame(worksheet.get_all_records())

In [None]:
full_df

Unnamed: 0,Unnamed: 1,index,file_part,file_num,Time,Names,Speaker,original_text,BrackInfo,IsKeep,Info,is_interviewee,text_length,needs_edits,text,context,token_count,is_question,is_command
0,4,49911,4,1,0:01:54,Ilmar A.,David Boder,Let me tell you frankly : Do not tell me some...,"[('[', '?', ']'), ('[', 'possibly “we are info...","['O', 'O', 'Y']",[],0,88,1,Let me tell you frankly: Do not tell me some g...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",165,TRUE,FALSE
1,6,49913,6,1,0:02:38,Ilmar A.,David Boder,… until today. But to you personally. Where di...,"[('[', 'Bend down to the microphone?', ']')]",['N'],['Bend down to the microphone?'],0,54,0,… until today. But to you personally. Where di...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",262,TRUE,FALSE
2,8,49915,8,1,0:03:19,Ilmar A.,David Boder,How old were you then? What grade were you in ...,[],[],[],0,11,0,How old were you then? What grade were you in ...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",361,TRUE,FALSE
3,10,49917,10,1,0:03:31,Ilmar A.,David Boder,What means “first grade”? The highest grade?,[],[],[],0,7,0,What means “first grade”? The highest grade?,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",409,TRUE,FALSE
4,12,49919,12,1,0:03:38,Ilmar A.,David Boder,… of the “Volks-”?,[],[],[],0,4,0,… of the “Volks-”?,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",441,TRUE,FALSE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15803,52856,48117,479,119,0:28:21,Janine Binder,David Boder,"Was she Jewish, a Jewish girl friend?",[],[],[],0,7,0,"Was she Jewish, a Jewish girl friend?",INTERVIEWER: Who liberated you in Stuttgart?\n...,500,TRUE,FALSE
15804,52858,48119,481,119,0:28:23,Janine Binder,David Boder,And she too was there as an Aryan woman?,[],[],[],0,9,0,And she too was there as an Aryan woman?,SUBJECT: Yes.\nINTERVIEWER: The French liberat...,493,TRUE,FALSE
15805,52860,48121,483,119,0:28:55,Janine Binder,David Boder,Found courage?,[],[],[],0,2,0,Found courage?,"SUBJECT: No, no, no. But one feels, one can … ...",495,TRUE,FALSE
15806,52862,48123,485,119,0:29:02,Janine Binder,David Boder,So?,[],[],[],0,1,0,So?,SUBJECT: Yes. But these were all in Polish.\nI...,488,TRUE,FALSE


In [None]:
# Only the question text and its preceding context are sent to the model.
input_df = full_df[['text', 'context']]

# Utterances that are never sent for disambiguation.
# NOTE: every entry must be followed by a comma. Adjacent string literals are
# concatenated by Python, which silently merges two entries into one.
blacklist = ['Aha. Nu?','Yes?', 'What does that mean?','Nu?', 'Yes. And?', 'unintelligible ?', 'And?', 'Go on.', 'Yes? Nu?', 'Yes, and then?', 'Yes, go on.', 'OK?', 'Yes. And then?', 'Yes. Well?', 'I see. Well?', 'What?', 'Nu, go on.', 'Well, go on.', 'Yes, nu?', 'Yes, well?', 'Yes. Nu?', 'Yes, and?', 'Well?', 'Hm. Nu?', 'Mhm?']
input_df = input_df[~input_df['text'].isin(blacklist)]
input_df

Unnamed: 0,text,context
0,Let me tell you frankly: Do not tell me some g...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A...."
1,… until today. But to you personally. Where di...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A...."
2,How old were you then? What grade were you in ...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A...."
3,What means “first grade”? The highest grade?,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A...."
4,… of the “Volks-”?,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A...."
...,...,...
15803,"Was she Jewish, a Jewish girl friend?",INTERVIEWER: Who liberated you in Stuttgart?\n...
15804,And she too was there as an Aryan woman?,SUBJECT: Yes.\nINTERVIEWER: The French liberat...
15805,Found courage?,"SUBJECT: No, no, no. But one feels, one can … ..."
15806,So?,SUBJECT: Yes. But these were all in Polish.\nI...


Define Functions for processing data

## Define Schema

In [None]:
# Define Schema and Prompt

from pydantic import BaseModel, Field
from typing import List, Literal

# Structured-output model for the pronoun-disambiguation prompt.
# Documented with comments rather than a class docstring so that the documentation
# does not become part of the JSON schema generated from this model.
class ResponseFormat(BaseModel):
    # Free-text reasoning requested from the model.
    chain_of_thought: str = Field(..., description="Step-by-step reasoning behind the response.")
    # Whether the Question contains a pronoun that still needs resolving.
    contains_unresolved_pronoun: bool = Field(..., description="True if the Question contains at least one ambiguous pronoun that requires resolution; False otherwise, or if the pronoun would refer to the interviewer or interviewee.")
    # Whether the supplied context is enough to resolve it.
    enough_context_provided: bool = Field(..., description="True if there is sufficient context to resolve the pronoun(s) in the Question; False if more context is needed.")
    # The rewritten Question, or an empty string when no rewrite applies.
    disambiguated_text: str = Field(..., description="If pronoun resolution is needed, and sufficient Previous Context is provided, rewrite the Question, referring to the interviewee as 'you'. Otherwise, respond with an empty string.")

# System message sent with every disambiguation request.
system_prompt = "Your role is to process data factually without generating hallucinations or fabrications"

# User-message template; its fields are filled from the DataFrame columns of the
# same name: `context` holds the preceding transcript and `text` holds the
# interviewer's question to be disambiguated.
input_prompt = """Task:
The following text is from an interview.
Your task is to resolve pronouns in the Question with proper nouns or noun phrases based on the Previous Context.
Determine whether there are pronouns representing subjects or objects in the Question that need to be disambiguated.
Focus on pronouns like "he," "him," "she," "her," "they," and "them,", as these likely refer to subjects or objects mentioned earlier. Pronouns such as "you" or "I" refer to the interviewee and interviewer and do not need to be disambiguated. Phrases like "that" also don't need to be disambiguated.
Determine whether there is enough Previous Context to resolve the pronoun(s) in the Question.
If the pronouns can be resolved using the Previous Context, revise the Question.  Refer to the interviewee as 'you'.
If not, respond with an empty string.


Previous Context: {context}

Question: {text}
"""





## Run Batch Process

In [None]:
#@title Run Batch Process
from google.colab import userdata
import os

#set environment
os.environ['OPENAI_API_KEY'] = userdata.get('OPEN_AI_PROJECT_KEY')

In [None]:
# Initialize the processor (uses the class's default model, since llm_model is not passed)
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat)

# Create Batch Process Compatible Schema
# NOTE: the constructor already derives a schema from response_format; this sets it explicitly.
schema = generate_schema(ResponseFormat)
processor.set_schema(schema)

# Define Data to Process
process_df = input_df
#process_df = df.sample(10).copy() # Test on a sample

# Assemble Batch Processing File
# write_tasks_to_file is called without overwrite, so it raises FileExistsError
# if the .jsonl file is already present (e.g. when this cell is re-run).
processor.create_tasks(process_df, system_prompt, input_prompt)
processor.write_tasks_to_file(filename="batch_tasks_disambiguation.jsonl")

# Upload tasks
file_id = processor.upload_file(filename="batch_tasks_disambiguation.jsonl")

# Create batch job (prints and returns the batch id)
processor.create_batch_job(file_id)


Copy the batch id from the output and paste it into the code.

In [None]:
BATCH_ID = 'batch_67e26e414144819099b681d01efc6879' #replace with your batch_id

In [None]:
# Cancel batch job
#processor.cancel_batch(batch_id = BATCH_ID)

In [None]:
# Monitor progress
# A fresh processor is created and the batch is looked up by BATCH_ID.
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat)
processor.monitor_batch(batch_id = BATCH_ID)

In [None]:
# Download results from Batch ID
# download_results writes the file only when the batch status is 'completed';
# overwrite=True lets it replace a file left by an earlier run.
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'),  response_format=ResponseFormat)
processor.download_results(output_filename='batch_results_disambiguation.json', batch_id = BATCH_ID, overwrite=True)

In [None]:
# Parse results
# Each line of the downloaded file is validated with ResponseFormat; the result is a
# DataFrame with one column per model field plus custom_id (empty if nothing parsed).
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat)
results_df = processor.parse_results(filename='batch_results_disambiguation.json')

In [None]:
# join results to original data frame on index
# custom_id is the numeric suffix of "task_<index label>" assigned in create_tasks, so
# indexing the results by it lines them up with full_df's index. With a left join, rows
# that have no result keep NaN in the new columns.
#df = df.reset_index() # You might need to reset index?
disambiguated_df = full_df.join(results_df.set_index('custom_id'), how='left')
disambiguated_df

In [None]:
disambiguated_df.to_csv("disambiguation_results.csv")

from google.colab import files
files.download("disambiguation_results.csv")

NameError: name 'df_combined' is not defined

# Part 2: Categorization

Now, we will utilize a fine-tuned model to identify questions that are incomplete and need rewriting.

## Import Data

In [None]:
# Run if you did the first part in a separate runtime
# Reloads the Part 1 output from a Google Sheet instead of reusing the in-memory frame.

sheet_id = "1fXmcPPn7j5kg4ALdE5P2ZPgfaC_UpceuO9eDqWb_dP8" #Version A/B Results
worksheet_name = "Sheet1"
sh = gc.open_by_key(sheet_id)
worksheet = sh.worksheet(worksheet_name)
disambiguated_df = pd.DataFrame(worksheet.get_all_records())
# Column 'a' becomes the index; presumably it holds the original row id that later
# serves as custom_id — confirm against the sheet.
disambiguated_df = disambiguated_df.set_index('a', drop = True)
disambiguated_df

Unnamed: 0_level_0,index,file_part,file_num,Time,Names,Speaker,original_text,BrackInfo,IsKeep,Info,...,context,token_count,is_question,is_command,chain_of_thought,contains_unresolved_pronoun,enough_context_provided,disambiguated_text,was_disambiguated,text_for_rewriting
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,49911,4,1,0:01:54,Ilmar A.,David Boder,Let me tell you frankly : Do not tell me some...,"[('[', '?', ']'), ('[', 'possibly “we are info...","['O', 'O', 'Y']",[],...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",165,TRUE,FALSE,"In the Question, the pronouns 'you' and 'your'...",FALSE,TRUE,Let me tell you frankly: Do not tell me some g...,TRUE,Let me tell you frankly: Do not tell me some g...
6,49913,6,1,0:02:38,Ilmar A.,David Boder,… until today. But to you personally. Where di...,"[('[', 'Bend down to the microphone?', ']')]",['N'],['Bend down to the microphone?'],...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",262,TRUE,FALSE,"In the provided Previous Context, the subject ...",FALSE,TRUE,"... until today. But to you personally, Mr. Il...",TRUE,… until today. But to you personally. Where di...
8,49915,8,1,0:03:19,Ilmar A.,David Boder,How old were you then? What grade were you in ...,[],[],[],...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",361,TRUE,FALSE,"In the provided context, the interviewee is Mr...",FALSE,TRUE,How old were you then? What grade were you in ...,TRUE,How old were you then? What grade were you in ...
10,49917,10,1,0:03:31,Ilmar A.,David Boder,What means “first grade”? The highest grade?,[],[],[],...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",409,TRUE,FALSE,The Question contains the pronoun 'the highest...,TRUE,FALSE,,FALSE,What means “first grade”? The highest grade?
12,49919,12,1,0:03:38,Ilmar A.,David Boder,… of the “Volks-”?,[],[],[],...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",441,TRUE,FALSE,"The question contains the pronoun ""the"" which ...",TRUE,FALSE,,FALSE,… of the “Volks-”?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52856,48117,479,119,0:28:21,Janine Binder,David Boder,"Was she Jewish, a Jewish girl friend?",[],[],[],...,INTERVIEWER: Who liberated you in Stuttgart?\n...,500,TRUE,FALSE,"In the Previous Context, the interviewee menti...",TRUE,FALSE,,FALSE,"Was she Jewish, a Jewish girl friend?"
52858,48119,481,119,0:28:23,Janine Binder,David Boder,And she too was there as an Aryan woman?,[],[],[],...,SUBJECT: Yes.\nINTERVIEWER: The French liberat...,493,TRUE,FALSE,"In the Question, the pronoun ""she"" likely refe...",TRUE,TRUE,And your girlfriend too was there as an Aryan ...,TRUE,And your girlfriend too was there as an Aryan ...
52860,48121,483,119,0:28:55,Janine Binder,David Boder,Found courage?,[],[],[],...,"SUBJECT: No, no, no. But one feels, one can … ...",495,TRUE,FALSE,The question 'Found courage?' does not contain...,FALSE,TRUE,,FALSE,Found courage?
52862,48123,485,119,0:29:02,Janine Binder,David Boder,So?,[],[],[],...,SUBJECT: Yes. But these were all in Polish.\nI...,488,TRUE,FALSE,The question 'So?' is too vague and does not c...,FALSE,TRUE,,FALSE,So?


In [None]:
# Blacklisted Questions and Long Questions are automatic Non rewrites and should not be inputted into the df

blacklist = ['Aha. Nu?','Yes?', 'What does that mean?','Nu?', 'Yes. And?', 'unintelligible ?', 'And?', 'Go on.', 'Yes? Nu?', 'Yes, and then?', 'Yes, go on.', 'OK?', 'Yes. And then?', 'Yes. Well?', 'I see. Well?', 'What?', 'Nu, go on.', 'Well, go on.', 'Yes, nu?', 'Yes, well?', 'Yes. Nu?', 'Yes, and?', 'Well?', 'Hm. Nu?', 'Mhm?', '?']

# Rule-based decisions are assigned with boolean masks instead of a row-by-row loop.
# Assigning through .loc also creates the 'decision' and 'reason' columns when no row
# matches, so the selection on 'decision' that follows always finds its column.
is_blacklisted = disambiguated_df['text_for_rewriting'].isin(blacklist)
is_too_long = disambiguated_df['text_length'] >= 20

disambiguated_df.loc[is_blacklisted, 'decision'] = False
disambiguated_df.loc[is_blacklisted, 'reason'] = 'Blacklisted'
# Applied second so that 'Too long' takes precedence when both rules match.
disambiguated_df.loc[is_too_long, 'decision'] = False
disambiguated_df.loc[is_too_long, 'reason'] = 'Too long'
disambiguated_df

Unnamed: 0_level_0,index,file_part,file_num,Time,Names,Speaker,original_text,BrackInfo,IsKeep,Info,...,is_question,is_command,chain_of_thought,contains_unresolved_pronoun,enough_context_provided,disambiguated_text,was_disambiguated,text_for_rewriting,decision,reason
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,49911,4,1,0:01:54,Ilmar A.,David Boder,Let me tell you frankly : Do not tell me some...,"[('[', '?', ']'), ('[', 'possibly “we are info...","['O', 'O', 'Y']",[],...,TRUE,FALSE,"In the Question, the pronouns 'you' and 'your'...",FALSE,TRUE,Let me tell you frankly: Do not tell me some g...,TRUE,Let me tell you frankly: Do not tell me some g...,False,Too long
6,49913,6,1,0:02:38,Ilmar A.,David Boder,… until today. But to you personally. Where di...,"[('[', 'Bend down to the microphone?', ']')]",['N'],['Bend down to the microphone?'],...,TRUE,FALSE,"In the provided Previous Context, the subject ...",FALSE,TRUE,"... until today. But to you personally, Mr. Il...",TRUE,… until today. But to you personally. Where di...,False,Too long
8,49915,8,1,0:03:19,Ilmar A.,David Boder,How old were you then? What grade were you in ...,[],[],[],...,TRUE,FALSE,"In the provided context, the interviewee is Mr...",FALSE,TRUE,How old were you then? What grade were you in ...,TRUE,How old were you then? What grade were you in ...,,
10,49917,10,1,0:03:31,Ilmar A.,David Boder,What means “first grade”? The highest grade?,[],[],[],...,TRUE,FALSE,The Question contains the pronoun 'the highest...,TRUE,FALSE,,FALSE,What means “first grade”? The highest grade?,,
12,49919,12,1,0:03:38,Ilmar A.,David Boder,… of the “Volks-”?,[],[],[],...,TRUE,FALSE,"The question contains the pronoun ""the"" which ...",TRUE,FALSE,,FALSE,… of the “Volks-”?,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52856,48117,479,119,0:28:21,Janine Binder,David Boder,"Was she Jewish, a Jewish girl friend?",[],[],[],...,TRUE,FALSE,"In the Previous Context, the interviewee menti...",TRUE,FALSE,,FALSE,"Was she Jewish, a Jewish girl friend?",,
52858,48119,481,119,0:28:23,Janine Binder,David Boder,And she too was there as an Aryan woman?,[],[],[],...,TRUE,FALSE,"In the Question, the pronoun ""she"" likely refe...",TRUE,TRUE,And your girlfriend too was there as an Aryan ...,TRUE,And your girlfriend too was there as an Aryan ...,,
52860,48121,483,119,0:28:55,Janine Binder,David Boder,Found courage?,[],[],[],...,TRUE,FALSE,The question 'Found courage?' does not contain...,FALSE,TRUE,,FALSE,Found courage?,,
52862,48123,485,119,0:29:02,Janine Binder,David Boder,So?,[],[],[],...,TRUE,FALSE,The question 'So?' is too vague and does not c...,FALSE,TRUE,,FALSE,So?,,


In [None]:
# Keep only rows without a rule-based rejection (decision is not False) and pass on
# just the text column, as a one-column DataFrame, to the classifier batch.
input_df = disambiguated_df.loc[disambiguated_df['decision'] != False, 'text_for_rewriting'].to_frame()
input_df

Unnamed: 0_level_0,text_for_rewriting
a,Unnamed: 1_level_1
8,How old were you then? What grade were you in ...
10,What means “first grade”? The highest grade?
12,… of the “Volks-”?
14,Yes. The fourth grade of the “Mittelschule.” S...
16,"Yes. So, and in which city were you?"
...,...
52856,"Was she Jewish, a Jewish girl friend?"
52858,And your girlfriend too was there as an Aryan ...
52860,Found courage?
52862,So?


## Define Schema

In [None]:
from pydantic import BaseModel, Field
from typing import List, Literal

# Structured-output model for the classification step.
# Documented with comments rather than a class docstring so that the documentation
# does not become part of the JSON schema generated from this model.
class ResponseFormat(BaseModel):
  # True when the question needs to be rewritten (per the system prompt defined after this class).
  decision: bool
  # The model's justification for the decision.
  reason: str

input_prompt = "{text_for_rewriting}"

system_prompt = "Your job is to identify questions that are incomplete and need more context to be added to them. Your response should be true if the question needs to be rewritten and false if not. Justify your choice with reasoning. Your response should be in the format of decision. reason: "

## Batch Processes

In [None]:
#@title Run Batch Process
from google.colab import userdata
import os

#set environment
os.environ['OPENAI_API_KEY'] = userdata.get('OPEN_AI_PROJECT_KEY')

In [None]:
# Initialize the processor with the fine-tuned classification model
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat, llm_model = "ft:gpt-4o-2024-08-06:ucla-cultural-heritage-ai-lab::B5h9mAHM")

# Create Batch Process Compatible Schema
# NOTE: the constructor already derives a schema from response_format; this sets it explicitly.
schema = generate_schema(ResponseFormat)
processor.set_schema(schema)

# Define Data to Process
process_df = input_df

# Assemble Batch Processing File
# write_tasks_to_file is called without overwrite, so it raises FileExistsError
# if the .jsonl file is already present (e.g. when this cell is re-run).
processor.create_tasks(process_df, system_prompt, input_prompt)
processor.write_tasks_to_file(filename="batch_tasks_classification.jsonl")

# Upload tasks
file_id = processor.upload_file(filename="batch_tasks_classification.jsonl")

# Create batch job (prints and returns the batch id)
processor.create_batch_job(file_id)


Batch created with ID: batch_67e780b6b9a08190878ba7ebc9a882a1


'batch_67e780b6b9a08190878ba7ebc9a882a1'

In [None]:
BATCH_ID = 'batch_67e780b6b9a08190878ba7ebc9a882a1' #replace with your batch_id

In [None]:
# Cancel batch job
#processor.cancel_batch(batch_id = BATCH_ID)

In [None]:
# Monitor progress
# A fresh processor is created and the batch is looked up by BATCH_ID.
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat)
processor.monitor_batch(batch_id = BATCH_ID)

Monitoring batch job...
Batch job is still processing... Status: in_progress
Batch job is still processing... Status: in_progress
Batch job is still processing... Status: finalizing
Batch job is still processing... Status: finalizing
Batch job completed.


Batch(id='batch_67e780b6b9a08190878ba7ebc9a882a1', completion_window='24h', created_at=1743225014, endpoint='/v1/chat/completions', input_file_id='file-YAEpt4aT83HSSsHofPmF5H', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1743228143, error_file_id=None, errors=None, expired_at=None, expires_at=1743311414, failed_at=None, finalizing_at=1743226999, in_progress_at=1743225018, metadata=None, output_file_id='file-EvHsnPAV5Li5acT3tmzs5T', request_counts=BatchRequestCounts(completed=13224, failed=0, total=13224))

In [None]:
# Download results from Batch ID
# download_results writes the file only when the batch status is 'completed';
# overwrite=True lets it replace a file left by an earlier run.
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'),  response_format=ResponseFormat)
processor.download_results(output_filename='batch_results_classification.json', batch_id = BATCH_ID, overwrite=True)

Downloading file with ID: file-EvHsnPAV5Li5acT3tmzs5T
Results downloaded to file: batch_results_classification.json


In [None]:
# Parse results
# Each line of the downloaded file is validated with ResponseFormat; the result is a
# DataFrame with decision, reason and custom_id columns (empty if nothing parsed).
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat)
results_df = processor.parse_results(filename='batch_results_classification.json')

In [None]:
results_df

Unnamed: 0,decision,reason,custom_id
0,True,The question lacks context regarding the speci...,8
1,True,The question lacks clarity regarding the conte...,10
2,True,The question lacks clarity on what specific as...,12
3,True,The question lacks clarity regarding the educa...,14
4,True,The question lacks clarity on the context of t...,16
...,...,...,...
13219,True,The question lacks specificity about who 'she'...,52856
13220,False,"The question is straightforward, asking about ...",52858
13221,True,The question lacks context regarding what spec...,52860
13222,True,The question 'So?' lacks context regarding wha...,52862


In [None]:
# Copy each classification result onto the source row whose index label
# equals that result's custom_id.
# df = df.reset_index()  # You might need to reset index?
for label, decision, reason in zip(
    results_df['custom_id'], results_df['decision'], results_df['reason']
):
    disambiguated_df.at[label, 'decision'] = decision
    disambiguated_df.at[label, 'reason'] = reason
disambiguated_df

Unnamed: 0_level_0,index,file_part,file_num,Time,Names,Speaker,original_text,BrackInfo,IsKeep,Info,...,is_question,is_command,chain_of_thought,contains_unresolved_pronoun,enough_context_provided,disambiguated_text,was_disambiguated,text_for_rewriting,decision,reason
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,49911,4,1,0:01:54,Ilmar A.,David Boder,Let me tell you frankly : Do not tell me some...,"[('[', '?', ']'), ('[', 'possibly “we are info...","['O', 'O', 'Y']",[],...,TRUE,FALSE,"In the Question, the pronouns 'you' and 'your'...",FALSE,TRUE,Let me tell you frankly: Do not tell me some g...,TRUE,Let me tell you frankly: Do not tell me some g...,False,Too long
6,49913,6,1,0:02:38,Ilmar A.,David Boder,… until today. But to you personally. Where di...,"[('[', 'Bend down to the microphone?', ']')]",['N'],['Bend down to the microphone?'],...,TRUE,FALSE,"In the provided Previous Context, the subject ...",FALSE,TRUE,"... until today. But to you personally, Mr. Il...",TRUE,… until today. But to you personally. Where di...,False,Too long
8,49915,8,1,0:03:19,Ilmar A.,David Boder,How old were you then? What grade were you in ...,[],[],[],...,TRUE,FALSE,"In the provided context, the interviewee is Mr...",FALSE,TRUE,How old were you then? What grade were you in ...,TRUE,How old were you then? What grade were you in ...,True,The question lacks context regarding the speci...
10,49917,10,1,0:03:31,Ilmar A.,David Boder,What means “first grade”? The highest grade?,[],[],[],...,TRUE,FALSE,The Question contains the pronoun 'the highest...,TRUE,FALSE,,FALSE,What means “first grade”? The highest grade?,True,The question lacks clarity regarding the conte...
12,49919,12,1,0:03:38,Ilmar A.,David Boder,… of the “Volks-”?,[],[],[],...,TRUE,FALSE,"The question contains the pronoun ""the"" which ...",TRUE,FALSE,,FALSE,… of the “Volks-”?,True,The question lacks clarity on what specific as...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52856,48117,479,119,0:28:21,Janine Binder,David Boder,"Was she Jewish, a Jewish girl friend?",[],[],[],...,TRUE,FALSE,"In the Previous Context, the interviewee menti...",TRUE,FALSE,,FALSE,"Was she Jewish, a Jewish girl friend?",True,The question lacks specificity about who 'she'...
52858,48119,481,119,0:28:23,Janine Binder,David Boder,And she too was there as an Aryan woman?,[],[],[],...,TRUE,FALSE,"In the Question, the pronoun ""she"" likely refe...",TRUE,TRUE,And your girlfriend too was there as an Aryan ...,TRUE,And your girlfriend too was there as an Aryan ...,False,"The question is straightforward, asking about ..."
52860,48121,483,119,0:28:55,Janine Binder,David Boder,Found courage?,[],[],[],...,TRUE,FALSE,The question 'Found courage?' does not contain...,FALSE,TRUE,,FALSE,Found courage?,True,The question lacks context regarding what spec...
52862,48123,485,119,0:29:02,Janine Binder,David Boder,So?,[],[],[],...,TRUE,FALSE,The question 'So?' is too vague and does not c...,FALSE,TRUE,,FALSE,So?,True,The question 'So?' lacks context regarding wha...


In [None]:
processed_df = disambiguated_df.fillna('')
processed_df.to_csv("classified_results.csv")

from google.colab import files
files.download("classified_results.csv")

  processed_df = disambiguated_df.fillna('')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Part 3: Context Rewriting

## Import Data

In [None]:
sheet_id = "17oL-BD7dERURpVE4yKKo7a0mormQ9SRXvQ3hQP9Iek8"
worksheet_name = "Sheet1"
sh = gc.open_by_key(sheet_id)
worksheet = sh.worksheet(worksheet_name)
processed_df = pd.DataFrame(worksheet.get_all_records())

In [None]:
import pandas as pd
#processed_df = processed_df.set_index('a', drop = True)
processed_df = pd.read_csv('classified_results_boderb.csv')
processed_df = processed_df.set_index('a', drop = True)
processed_df.head()

Unnamed: 0_level_0,index,file_part,file_num,Time,Names,Speaker,original_text,BrackInfo,IsKeep,Info,...,is_question,is_command,chain_of_thought,contains_unresolved_pronoun,enough_context_provided,disambiguated_text,was_disambiguated,text_for_rewriting,decision,reason
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,49911,4,1,0:01:54,Ilmar A.,David Boder,Let me tell you frankly : Do not tell me some...,"[('[', '?', ']'), ('[', 'possibly “we are info...","['O', 'O', 'Y']",[],...,True,False,"In the Question, the pronouns 'you' and 'your'...",False,True,Let me tell you frankly: Do not tell me some g...,True,Let me tell you frankly: Do not tell me some g...,False,Too long
6,49913,6,1,0:02:38,Ilmar A.,David Boder,… until today. But to you personally. Where di...,"[('[', 'Bend down to the microphone?', ']')]",['N'],['Bend down to the microphone?'],...,True,False,"In the provided Previous Context, the subject ...",False,True,"... until today. But to you personally, Mr. Il...",True,… until today. But to you personally. Where di...,False,Too long
8,49915,8,1,0:03:19,Ilmar A.,David Boder,How old were you then? What grade were you in ...,[],[],[],...,True,False,"In the provided context, the interviewee is Mr...",False,True,How old were you then? What grade were you in ...,True,How old were you then? What grade were you in ...,True,The question lacks context regarding the speci...
10,49917,10,1,0:03:31,Ilmar A.,David Boder,What means “first grade”? The highest grade?,[],[],[],...,True,False,The Question contains the pronoun 'the highest...,True,False,,False,What means “first grade”? The highest grade?,True,The question lacks clarity regarding the conte...
12,49919,12,1,0:03:38,Ilmar A.,David Boder,… of the “Volks-”?,[],[],[],...,True,False,"The question contains the pronoun ""the"" which ...",True,False,,False,… of the “Volks-”?,True,The question lacks clarity on what specific as...


In [None]:
input_df = processed_df[processed_df['decision'] == True]
input_df = pd.DataFrame(input_df[['text_for_rewriting', 'context', 'reason']])
input_df.head()

Unnamed: 0_level_0,text_for_rewriting,context,reason
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,How old were you then? What grade were you in ...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",The question lacks context regarding the speci...
10,What means “first grade”? The highest grade?,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",The question lacks clarity regarding the conte...
12,… of the “Volks-”?,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",The question lacks clarity on what specific as...
14,Yes. The fourth grade of the “Mittelschule.” S...,"INTERVIEWER: Spool 152A, Spool 152A. Ilmar A....",The question lacks clarity regarding the educa...
16,"Yes. So, and in which city were you?","INTERVIEWER: Munich, September the 24th, 1946,...",The question lacks clarity on the context of t...


## Define Schema

In [None]:
# Define Schema and Prompt

from pydantic import BaseModel, Field
from typing import List, Literal


system_prompt = "Your role is to process data factually without generating hallucinations or fabrications"

# Structured-output schema for the context-rewriting step. The `description`
# strings below are part of the schema handed to BatchProcessor, so they are
# prompt text and are kept verbatim.
class ResponseFormat(BaseModel):
    # Free-text reasoning the model gives before answering.
    step_by_step_reasoning: str = Field(..., description="Step-by-step reasoning behind the response.")
    # Whether the Previous Context is enough to rewrite the Question.
    sufficient_context_provided: bool = Field(..., description="True if there is sufficient Previous Context to add correct information to the Question. You should be absolutely certain with your interpretation of the text; False if more Previous Context is needed.")
    # The rewritten Question, or an empty string when no rewrite is made.
    question_with_clarified_context: str = Field(..., description="If there is enough Previous Context to clarify the question in the Question, revise the Question with as little added text as possible while still adding the neccessary context. Maintain the original langauge and tone of the historical material as closely as possible.. Only return the updated text, nothing else. Otherwise, respond with an empty string")



# User-prompt template for the context-rewriting batch. It is filled per row
# with the `context`, `text_for_rewriting` and `reason` columns. The prompt
# refers to the text being rewritten only as "the Question", matching the
# label used in the template body.
input_prompt = """Task:
The following text is from an interview.
Your task is to clarify generic and incomplete questions in the Question based on the Previous Context and the Reason for Rewriting.
When rewriting the Question, be as simple as possible. Add just enough context so that someone familiar with the interview can follow along, but do not add unneccessary details or phrases that just add words.
Determine if there is enough Previous Context to add correct context to the Question. If you are not certain of the meaning of the Question based on the Previous Context, there is not enough previous context to add context to the Question.
If there is enough Previous Context to clarify the question in the Question, revise the Question with as little added text as possible while still adding the neccessary context. Maintain the original langauge and tone of the historical material as closely as possible.
If not, respond with an empty string.


Previous Context: {context}

Question: {text_for_rewriting}

Reason for Rewriting: {reason}
"""

## Run Batch Processes

In [None]:
#@title Run Batch Process
from google.colab import userdata
import os

#set environment
os.environ['OPENAI_API_KEY'] = userdata.get('OPEN_AI_PROJECT_KEY')

In [None]:
# Initialize the processor
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat)

#Create Batch Process Compatible Schema
schema = generate_schema(ResponseFormat)
processor.set_schema(schema)

# Define Data to Process
process_df = input_df

# Assemble Batch Processing File
processor.create_tasks(process_df, system_prompt, input_prompt)
processor.write_tasks_to_file(filename="batch_tasks.jsonl")

# Upload tasks
file_id = processor.upload_file(filename="batch_tasks.jsonl")

# Create batch job
processor.create_batch_job(file_id)

Batch created with ID: batch_67e7a2b343648190bf95b2ae63788d34


'batch_67e7a2b343648190bf95b2ae63788d34'

In [None]:
BATCH_ID = 'batch_67e7a2b343648190bf95b2ae63788d34' #replace with your batch_id

In [None]:
# Cancel batch job
#processor.cancel_batch(batch_id = BATCH_ID)

In [None]:
# Monitor progress
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat)
processor.monitor_batch(batch_id = BATCH_ID)

Monitoring batch job...
Batch job completed.


Batch(id='batch_67e7a2b343648190bf95b2ae63788d34', completion_window='24h', created_at=1743233715, endpoint='/v1/chat/completions', input_file_id='file-6C2t6z4rMqedNyzyVcr8es', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1743236508, error_file_id=None, errors=None, expired_at=None, expires_at=1743320115, failed_at=None, finalizing_at=1743235701, in_progress_at=1743233718, metadata=None, output_file_id='file-BLqFCufhHaSQJEgyQ3G8NN', request_counts=BatchRequestCounts(completed=11430, failed=0, total=11430))

In [None]:
# Download results from Batch ID
# NOTE(review): this saves the Part 3 (context rewriting) output under
# 'batch_results_classification.json', the same filename used for the
# classification download earlier in the notebook; with overwrite=True this
# presumably replaces that earlier file. Confirm a distinct filename is not
# wanted here.
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'),  response_format=ResponseFormat)
processor.download_results(output_filename='batch_results_classification.json', batch_id = BATCH_ID, overwrite=True)

Downloading file with ID: file-BLqFCufhHaSQJEgyQ3G8NN
Results downloaded to file: batch_results_classification.json


In [None]:
# Parse results
processor = BatchProcessor(api_key=os.getenv('OPENAI_API_KEY'), response_format=ResponseFormat)
results_df = processor.parse_results(filename='batch_results_classification.json')
results_df

Error parsing result with model ResponseFormat: 1 validation error for ResponseFormat
  Invalid JSON: expected `,` or `}` at line 1 column 552 [type=json_invalid, input_value='{"step_by_step_reasoning...er basic necessities?"}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/json_invalid
Raw content: {"step_by_step_reasoning":"1. The original question uses the term 'commodity' which is vague without context. 2. The Previous Context indicates that the Subject mentioned a lack of basic needs and facilities after arriving in Birkenau, implying that 'commodity' refers to essential facilities. 3. The rewritten question needs to clarify that the interviewer is specifically asking about toilet facilities in relation to the absence of other necessities mentioned by the Subject.","sufficient_context_provided":true,"question_with_clarified_context":""Commodity," you mean no toilet facilities and other basic necessities?"}
Error parsing result with model Resp

Unnamed: 0,step_by_step_reasoning,sufficient_context_provided,question_with_clarified_context,custom_id
0,1. The Previous Context discusses Mr. Ilmar's ...,True,How old were you in 1940 when the Soviets came...,8
1,1. The original question asks for clarificatio...,True,What does 'first grade' mean in the context of...,10
2,1. The original question is incomplete and unc...,True,What did you learn about the concept of 'Volks...,12
3,The question was unclear because it used speci...,True,Yes. The fourth grade of the 'Mittelschule.' S...,14
4,The original question asks for the city but do...,True,"Yes. So, and in which city were you living whe...",16
...,...,...,...,...
11412,"1. The original question 'Now tell me, how wer...",True,"Now tell me, how were you liberated in Stuttgart?",52852
11413,1. The question refers to 'she' without specif...,True,"Was your girlfriend, who was deported with you...",52856
11414,The original question 'Found courage?' is vagu...,True,Did you find courage during your imprisonment ...,52860
11415,1. The previous context discusses the subject'...,True,"So, what happened next after you were liberated?",52862


In [None]:
# join results to original data frame on index
#df = df.reset_index() # You might need to reset index?
context_df = processed_df.join(results_df.set_index('custom_id'), how='left')
context_df

Unnamed: 0_level_0,index,file_part,file_num,Time,Names,Speaker,original_text,BrackInfo,IsKeep,Info,...,contains_unresolved_pronoun,enough_context_provided,disambiguated_text,was_disambiguated,text_for_rewriting,decision,reason,step_by_step_reasoning,sufficient_context_provided,question_with_clarified_context
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,49911,4,1,0:01:54,Ilmar A.,David Boder,Let me tell you frankly : Do not tell me some...,"[('[', '?', ']'), ('[', 'possibly “we are info...","['O', 'O', 'Y']",[],...,False,True,Let me tell you frankly: Do not tell me some g...,True,Let me tell you frankly: Do not tell me some g...,False,Too long,,,
6,49913,6,1,0:02:38,Ilmar A.,David Boder,… until today. But to you personally. Where di...,"[('[', 'Bend down to the microphone?', ']')]",['N'],['Bend down to the microphone?'],...,False,True,"... until today. But to you personally, Mr. Il...",True,… until today. But to you personally. Where di...,False,Too long,,,
8,49915,8,1,0:03:19,Ilmar A.,David Boder,How old were you then? What grade were you in ...,[],[],[],...,False,True,How old were you then? What grade were you in ...,True,How old were you then? What grade were you in ...,True,The question lacks context regarding the speci...,1. The Previous Context discusses Mr. Ilmar's ...,True,How old were you in 1940 when the Soviets came...
10,49917,10,1,0:03:31,Ilmar A.,David Boder,What means “first grade”? The highest grade?,[],[],[],...,True,False,,False,What means “first grade”? The highest grade?,True,The question lacks clarity regarding the conte...,1. The original question asks for clarificatio...,True,What does 'first grade' mean in the context of...
12,49919,12,1,0:03:38,Ilmar A.,David Boder,… of the “Volks-”?,[],[],[],...,True,False,,False,… of the “Volks-”?,True,The question lacks clarity on what specific as...,1. The original question is incomplete and unc...,True,What did you learn about the concept of 'Volks...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52856,48117,479,119,0:28:21,Janine Binder,David Boder,"Was she Jewish, a Jewish girl friend?",[],[],[],...,True,False,,False,"Was she Jewish, a Jewish girl friend?",True,The question lacks specificity about who 'she'...,1. The question refers to 'she' without specif...,True,"Was your girlfriend, who was deported with you..."
52858,48119,481,119,0:28:23,Janine Binder,David Boder,And she too was there as an Aryan woman?,[],[],[],...,True,True,And your girlfriend too was there as an Aryan ...,True,And your girlfriend too was there as an Aryan ...,False,"The question is straightforward, asking about ...",,,
52860,48121,483,119,0:28:55,Janine Binder,David Boder,Found courage?,[],[],[],...,False,True,,False,Found courage?,True,The question lacks context regarding what spec...,The original question 'Found courage?' is vagu...,True,Did you find courage during your imprisonment ...
52862,48123,485,119,0:29:02,Janine Binder,David Boder,So?,[],[],[],...,False,True,,False,So?,True,The question 'So?' lacks context regarding wha...,1. The previous context discusses the subject'...,True,"So, what happened next after you were liberated?"


In [None]:
context_df = context_df.fillna('')


In [None]:
context_df.to_csv("rewritten_results.csv")

from google.colab import files
files.download("rewritten_results.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Generate Stats and Flag Questions

In [None]:
#@title import data
#context_df = pd.read_csv("final_results.csv")

In [None]:
import re

def remove_special_entries(lst):
    """Return the entries of *lst* that contain at least one ASCII letter or digit.

    An entry is dropped only when it is non-empty and every character in it
    falls outside ``a-z``, ``A-Z`` and ``0-9``. Empty strings do not match
    the pattern and are kept.
    """
    kept = []
    for entry in lst:
        if re.fullmatch(r'[^a-zA-Z0-9]+', entry) is None:
            kept.append(entry)
    return kept

def contains_only_numbers(word_list):
    """Return True if at least one entry of *word_list* consists solely of digits.

    Despite the name, the list as a whole need not be numeric: a single
    all-digit entry is enough. An empty list yields False.
    """
    return any(word.isdigit() for word in word_list)

In [None]:
def process_row(row):
    """Pick the final question text for one row and compute diff stats and review flags.

    The final text is ``row['question_with_clarified_context']`` when that is
    not an empty string, otherwise ``row['text_for_rewriting']``. Speaker
    labels are stripped from it, and it is compared token-by-token with
    ``row['text']`` (tokenised by the ``tokenize`` helper defined elsewhere
    in the notebook).

    Args:
        row: A mapping/Series providing ``question_with_clarified_context``,
            ``text_for_rewriting`` and ``text``.

    Returns:
        dict with keys ``context_added``, ``final_text``, ``tokenized_text``,
        ``tokenized_final_text``, ``added_words``, ``removed_words``,
        ``added_length``, ``removed_length``, ``final_length``,
        ``is_flagged`` and ``flag_reasoning``. ``flag_reasoning`` is the
        space-prefixed concatenation of every flag that fired.
    """
    # Prefer the model's rewrite; fall back to the pre-rewrite text.
    clarified = row['question_with_clarified_context']
    context_added = clarified != ""
    text = clarified if context_added else row['text_for_rewriting']

    # Strip speaker labels; str.replace is a no-op when the label is absent.
    for label in ("Interviewer:", "Interviewee:", "INTERVIEWER:"):
        text = text.replace(label, "")

    text_tokenized = remove_special_entries(tokenize(row['text']))
    final_tokenized = remove_special_entries(tokenize(text))

    # The diff is taken on case-sensitive tokens and lowercased afterwards,
    # so a word that differs only in case can show up in both lists.
    original_vocab = set(text_tokenized)
    final_vocab = set(final_tokenized)
    removed_words = [word.lower() for word in original_vocab - final_vocab]
    added_words = [word.lower() for word in final_vocab - original_vocab]

    added_length = len(added_words)
    removed_length = len(removed_words)

    text_length = len(text_tokenized)
    final_length = len(final_tokenized)

    # Flag questions: collect one reason per rule that fires.
    flagged_words = {
        'yes', 'nu', 'experience', 'considering', 'especially', 'given',
        'particularly', 'relation', 'regarding', 'circumstances',
        'suggesting', 'mentioned',
    }
    reasons = []

    # Only the first flagged word found is recorded.
    for word in added_words:
        if word in flagged_words:
            reasons.append(word)
            break

    if text_length <= 1 and added_length >= 10:
        reasons.append("One word elaboration")

    if added_length >= 15:
        reasons.append("Added " + str(added_length) + " words")

    # '[' is the opening bracket and ']' the closing one.
    if '[' in text:
        reasons.append("has opening bracket")

    if ']' in text:
        reasons.append("has closing bracket")

    if contains_only_numbers(added_words):
        reasons.append("contains only numbers")

    return {
        'context_added': context_added,
        'final_text': text,
        'tokenized_text': text_tokenized,
        'tokenized_final_text': final_tokenized,
        'added_words': added_words,
        'removed_words': removed_words,
        'added_length': added_length,
        'removed_length': removed_length,
        'final_length': final_length,
        'is_flagged': bool(reasons),
        'flag_reasoning': "".join(" " + reason for reason in reasons),
    }


In [None]:
from tqdm import tqdm

def process_dataframe(df):
    """
    Apply process_row to each row of *df* and return *df* left-joined with the
    per-row result columns, aligned on *df*'s index.
    """
    total = len(df)
    collected = []

    # tqdm wraps the row iterator to show a progress bar.
    progress = tqdm(df.iterrows())
    for position, (_, row) in enumerate(progress, start=1):
        collected.append(process_row(row))
        progress.set_description(f"Processing row {position} of {total}")

    # One result dict per row, re-keyed with the original index for the join.
    per_row = pd.DataFrame(collected, index=df.index)
    return df.join(per_row, how='left')

In [None]:
final_df = process_dataframe(context_df)

Processing row 15808 of 15808: : 15808it [05:07, 51.35it/s]


In [None]:
final_df.to_csv("final_results.csv")

from google.colab import files
files.download("final_results.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Processing Flagged Questions

In [None]:
# Run if you did the first part in a separate runtime

sheet_id = "1fXmcPPn7j5kg4ALdE5P2ZPgfaC_UpceuO9eDqWb_dP8" #Version A/B Results
worksheet_name = "Results Before Manual Revision"
sh = gc.open_by_key(sheet_id)
worksheet = sh.worksheet(worksheet_name)
final_df = pd.DataFrame(worksheet.get_all_records())
final_df = final_df.set_index('a', drop = True)
final_df

Unnamed: 0_level_0,index,file_part,file_num,Time,Names,Speaker,original_text,BrackInfo,IsKeep,Info,...,tokenized_final_text,added_words,removed_words,added_length,removed_length,final_length,is_flagged,flag_reasoning,Overwritten,Rewritten Flagged Question
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,49911,4,1,0:01:54,Ilmar A.,David Boder,Let me tell you frankly : Do not tell me some...,"[('[', '?', ']'), ('[', 'possibly “we are info...","['O', 'O', 'Y']",[],...,"['Let', 'me', 'tell', 'you', 'frankly', 'Do', ...",[],[],0,0,76,FALSE,,,
6,49913,6,1,0:02:38,Ilmar A.,David Boder,… until today. But to you personally. Where di...,"[('[', 'Bend down to the microphone?', ']')]",['N'],['Bend down to the microphone?'],...,"['until', 'today', 'But', 'to', 'you', 'person...",[],[],0,0,52,FALSE,,,
8,49915,8,1,0:03:19,Ilmar A.,David Boder,How old were you then? What grade were you in ...,[],[],[],...,"['How', 'old', 'were', 'you', 'in', '1940', 'w...","['when', 'soviets', 'to', 'the', 'estonia', '1...",[],7,0,19,TRUE,contains only numbers,,
10,49917,10,1,0:03:31,Ilmar A.,David Boder,What means “first grade”? The highest grade?,[],[],[],...,"['What', 'does', 'first', 'grade', 'mean', 'in...","['it', 'mean', 'the', 'of', 'gymnasium', 'is',...","['means', 'the']",10,2,17,FALSE,,,
12,49919,12,1,0:03:38,Ilmar A.,David Boder,… of the “Volks-”?,[],[],[],...,"['What', 'did', 'you', 'learn', 'about', 'the'...","['your', 'what', 'did', 'about', 'learn', 'you...",[],9,0,12,FALSE,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52856,48117,479,119,0:28:21,Janine Binder,David Boder,"Was she Jewish, a Jewish girl friend?",[],[],[],...,"['Was', 'your', 'girlfriend', 'who', 'was', 'd...","['deported', 'your', 'girlfriend', 'you', 'wit...","['she', 'girl', 'a', 'friend']",7,4,9,FALSE,,,
52858,48119,481,119,0:28:23,Janine Binder,David Boder,And she too was there as an Aryan woman?,[],[],[],...,"['And', 'your', 'girlfriend', 'too', 'was', 't...","['girlfriend', 'your']",['she'],2,1,10,FALSE,,,
52860,48121,483,119,0:28:55,Janine Binder,David Boder,Found courage?,[],[],[],...,"['Did', 'you', 'find', 'courage', 'during', 'y...","['imprisonment', 'during', 'your', 'or', 'libe...",['found'],8,1,9,FALSE,,,
52862,48123,485,119,0:29:02,Janine Binder,David Boder,So?,[],[],[],...,"['So', 'what', 'happened', 'next', 'after', 'y...","['next', 'after', 'what', 'you', 'happened', '...",[],7,0,8,FALSE,,,


In [None]:
#@title Define Generic Prompting Function.
from openai import OpenAI
from pydantic import BaseModel, Field
from pydantic import TypeAdapter
from typing import List
from textwrap import dedent
import json

# Pass the API key to OpenAI. OPEN_AI_KEY is read first (the name this cell
# has always used), then OPENAI_API_KEY, which is the variable the batch
# cells of this notebook set.
client = OpenAI(api_key=os.environ.get("OPEN_AI_KEY") or os.environ.get("OPENAI_API_KEY"))


# Default structured-output schema for prompt_gpt: a single free-text field.
class ResponseTemplate(BaseModel):
    response: str = Field(..., description="the response")


# Define a wrapper function to call Open AI
def prompt_gpt(prompt,
               model = "gpt-4o-mini",
               max_tokens = 4096,
               temperature = 0.9,
               top_p = 0.9,
               response_format= ResponseTemplate,
               frequency_penalty = 0.2,
               presence_penalty = 0.0,
               system="Your role is to process data factually without generating hallucinations or fabrications"):
    '''
    Send one prompt to the chat-completions parse endpoint and return the structured reply.

    Uses the module-level ``client``.

    Args:
        prompt (str): The user message; it is passed through ``textwrap.dedent`` before sending.
        model (str, optional): The model to query. Defaults to "gpt-4o-mini".
        max_tokens (int, optional): The maximum number of tokens to generate. Defaults to 4096.
        temperature (float, optional): The sampling temperature to use. Defaults to 0.9.
        top_p (float, optional): The nucleus sampling parameter. Defaults to 0.9.
        response_format (type, optional): Pydantic model class describing the expected
            reply. Defaults to ResponseTemplate.
        frequency_penalty (float, optional): The penalty for token frequency. Defaults to 0.2.
        presence_penalty (float, optional): The penalty for token presence. Defaults to 0.0.
        system (str, optional): The system-level prompt that defines the model's role.

    Returns:
        dict: The parsed reply dumped to a JSON-compatible dictionary whose keys
        are the fields of ``response_format``.

    Raises:
        ValueError: If the reply carries no parsed object.
    '''
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": dedent(prompt)}
        ],
        response_format= response_format,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
    )

    message = response.choices[0].message
    output = message.parsed

    # Without a parsed object (e.g. when the model refuses) there is nothing
    # to dump; fail with a message that includes the refusal text if present.
    if output is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"Model returned no parsed output (refusal: {refusal!r})")

    # Dump the model to a dictionary with JSON-compatible values
    return output.model_dump(mode='json')



In [None]:
# Schema for the flagged-question rewrite: a single string field.
# This rebinds the name ResponseFormat, which the context-rewriting step
# earlier in the notebook used for a different schema.
class ResponseFormat(BaseModel):
  rewritten_question: str = Field(..., description="The rewritten question")

# Template for the rewrite request; process_flags_row fills `{}` with the
# row's final_text.
prompt = "rewrite the following question to remove uneccessary words. question: {}"

In [None]:
def process_flags_row(index, row, df, prompt, response_format):
    """Decide how one row's flagged question is handled and record it in *df*.

    Writes ``df.at[index, 'Overwritten']`` and, when the question is replaced,
    ``df.at[index, 'Rewritten Flagged Question']``. The first matching rule wins:

    1. "One word elaboration" flag: keep the rewrite (``Overwritten = False``)
       if the original text contains one of the question words, otherwise
       revert to the original text (``Overwritten = True``).
    2. The flag reasoning contains a filler word: ask the model for a tighter
       rewrite of ``final_text`` (``Overwritten = True``).
    3. The row is not flagged: ``Overwritten = False``.
    4. Anything else: ``Overwritten = "needs manual revision"``.

    Args:
        index: Index label of the row in *df*.
        row: The row; reads ``flag_reasoning``, ``text``, ``final_text``
            and ``is_flagged``.
        df: DataFrame updated in place.
        prompt: Template with one ``{}`` placeholder for the question.
        response_format: Pydantic model class passed to ``prompt_gpt``; its
            dumped reply must contain a ``rewritten_question`` key.
    """
    questions = ['who', 'what', 'where', 'when', 'why', 'how', 'here']
    flagged_words = ['experience', 'considering', 'especially', 'given', 'particularly', 'relation', 'regarding', 'circumstances', 'suggesting', 'mentioned']

    # A missing reasoning cell (e.g. NaN after a CSV round-trip) is treated
    # as "no reasoning" instead of failing on the substring tests below.
    flag_reasoning = row['flag_reasoning']
    if not isinstance(flag_reasoning, str):
        flag_reasoning = ""

    if "One word elaboration" in flag_reasoning:
        # NOTE(review): this is a substring test, so e.g. 'here' also matches
        # inside 'there' -- confirm that is intended.
        if any(word in row['text'].lower() for word in questions):
            df.at[index, 'Overwritten'] = False
        else:
            df.at[index, 'Overwritten'] = True
            df.at[index, 'Rewritten Flagged Question'] = row['text']
    elif any(word in flag_reasoning.lower() for word in flagged_words):
        filled_prompt = prompt.format(row['final_text'])
        # Use the caller-supplied schema rather than a global.
        rewritten_question = prompt_gpt(filled_prompt, response_format=response_format, model="gpt-4o-mini")
        df.at[index, 'Overwritten'] = True
        df.at[index, 'Rewritten Flagged Question'] = rewritten_question['rewritten_question']
    elif str(row['is_flagged']).strip().upper() == "FALSE":
        # Accepts both boolean False (in-memory frames) and the "FALSE"
        # string form that spreadsheet exports use.
        df.at[index, 'Overwritten'] = False
    else:
        df.at[index, 'Overwritten'] = "needs manual revision"




In [None]:
def process_flags(df, prompt, response_format):
    """Run process_flags_row over every row of *df*, updating *df* in place.

    Shows a tqdm progress bar whose description is the 1-based row position.
    """
    total = len(df)
    progress = tqdm(df.iterrows())
    for position, (row_label, row) in enumerate(progress, start=1):
        process_flags_row(row_label, row, df, prompt, response_format)
        progress.set_description(f"Processing row {position} of {total}")

In [None]:
process_flags(final_df, prompt, ResponseFormat)

Processing row 15808 of 15808: : 15808it [09:41, 27.16it/s]


In [None]:
final_df.to_csv("final_results.csv")

from google.colab import files
files.download("final_results.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>