# Pre-Processing Data, Version A

Using Large Language Models to add much-needed context to interviews.

This is a pre-processing document that prepares the Boder 2020 testimonies for pronoun disambiguation and context rewriting.

This follows a format similar to the 2022 clustering run that used SBERT clustering.

Authors: Billy Peir, based on code from Michelle Lee


# Import data

In [None]:
#@title Import libraries
import pandas as pd
import glob
from nltk import sent_tokenize
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive so files stored under it can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#@title Import data
# Read every testimony line of the Boder archive, give two columns the names
# used in the rest of the notebook, and reset the row index to 0..n-1.
boder_df = (
    pd.read_csv('/content/drive/MyDrive/Holocaust and Genocide Studies Digital Research Lab/SBERT Clustering Documentation-20220830T233256Z-001/SBERT Clustering Documentation/Data/Boder_transcripts_clean_manipulated.csv')
    .rename(columns={'id_new': 'file_num', 'words': 'text'})
    .reset_index(drop=True)
)

# Keep the positional row number as an ordinary column so a line can still be
# identified after later filtering (used later for answer extraction).
boder_df['manual_index'] = list(range(len(boder_df)))

# Preprocessing Data

Some additional preprocessing needs to be done for this corpus before we tokenize sentences. This includes removing ellipses and certain bracketed comments from each line.

In [None]:
# Strip the spaced ellipsis marker ('. . .') from every line so the text is
# cleaner for sentence tokenizing.
# Cells that are not strings (e.g. NaN) are converted with str(), so a missing
# value becomes the literal text 'nan' instead of staying null.
no_ellipses = [
    cell.replace('. . .', '') if isinstance(cell, str) else str(cell)
    for cell in boder_df['text']
]
boder_df['text'] = no_ellipses

In [None]:
# remove bracketed statements except for '[unintelligible]'
import re

bracketless_texts_list = []  # never populated; kept only so the name stays defined

# Matches one innermost "[...]" group, i.e. one with no brackets inside it.
_BRACKETED_COMMENT = re.compile(r'\[([^\[\]]*)\]')


def _strip_bracketed_comments(text):
    """Return *text* with bracketed comments removed, keeping '[unintelligible]'.

    Every "[...]" group is deleted unless its content is exactly
    'unintelligible'. Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    def _keep_unintelligible(match):
        return match.group(0) if match.group(1) == 'unintelligible' else ''

    # Repeat until nothing changes so nested comments such as "[a [b] c]" are
    # removed completely. Each pass that changes the text shortens it, so the
    # loop always terminates.
    previous = None
    while previous != text:
        previous = text
        text = _BRACKETED_COMMENT.sub(_keep_unintelligible, text)
    return text


# One pass over the column: each row is cleaned on its own, so rows with
# identical text cannot be mixed up and the cost stays linear in the row count.
boder_df['text'] = boder_df['text'].apply(_strip_bracketed_comments)

# Assemble Context

Now, to prepare for prompting OpenAI, assemble the context window necessary to run the prompts.

In [None]:
# Set token limit and model
token_limit = 500
model = 'gpt-4o-mini'

# Keep only rows whose 'text' is not null.
# NOTE(review): the ellipsis-removal cell earlier converts non-string cells with
# str(), so missing values reach this point as the text 'nan' and are not
# dropped by notna() -- confirm whether those rows should be excluded.
# .copy() makes the filtered frame independent of boder_df, so adding columns to
# it in later cells does not trigger pandas' SettingWithCopyWarning.
boder_df_filtered = boder_df[boder_df['text'].notna()].copy()

In [None]:
#@title Install Libraries
!pip install tiktoken #allows for token estimation

SyntaxError: invalid syntax (<ipython-input-10-4f786d061ea9>, line 2)

In [None]:
#@title Create is_interviewee column
# Flag each line: 0 when the speaker is David Boder, 1 for any other speaker value.
boder_df_filtered['is_interviewee'] = boder_df_filtered['speaker'].map(
    lambda name: int(name != "David Boder")
)
boder_df_filtered.head()

In [None]:
#@title Define a function to preprocess the dataset by assembling the necessary context
import pandas as pd
import tiktoken

def generate_context(df, model, token_limit, encoding=None):
    """
    Generate context for each row with a rolling window of the previous lines of
    the same interview, constrained by a token limit. The context includes text
    up to, but not including, the current row.

    Each earlier line is rendered as "<SPEAKER>: <text>\\n", where the speaker label
    is 'SUBJECT' for is_interviewee == 1, 'INTERVIEWER' for 0 and 'CREW' otherwise.
    The window is emptied whenever 'file_num' changes. After a line is added, the
    oldest lines are dropped until the window fits the limit, but the most recent
    line is always kept, so a single line longer than the limit can still appear
    as context on its own.

    Args:
    df (pd.DataFrame): The DataFrame containing the text data. Must provide the
        'is_interviewee', 'file_num' and 'text' columns. It is modified in place.
    model (str): The name of the LLM model for tokenization (only used when
        `encoding` is not given).
    token_limit (int): The maximum number of tokens allowed for the context.
    encoding: Optional tokenizer object exposing `encode(str)` that returns a
        sized sequence of tokens. Defaults to `tiktoken.encoding_for_model(model)`.

    Returns:
    pd.DataFrame: The same DataFrame object, with a 'context' column and a
        'token_count' column (token count of that context) added.
    """
    from collections import deque

    # Initialize the tokenizer based on the provided model unless one was supplied
    if encoding is None:
        encoding = tiktoken.encoding_for_model(model)

    contexts = []
    tokens = []
    window = deque()   # (rendered line, its token count), oldest first
    window_tokens = 0  # total tokens currently held in `window`
    current_file = None

    # Only three columns are needed, so walk them directly instead of building
    # a Series per row with iterrows().
    for flag, file_num, text in zip(df['is_interviewee'], df['file_num'], df['text']):
        if flag == 1:
            speaker = 'SUBJECT'
        elif flag == 0:
            speaker = 'INTERVIEWER'
        else:
            speaker = 'CREW'

        # If it's a new interview file, reset the context
        if current_file != file_num:
            window.clear()
            window_tokens = 0
            current_file = file_num

        # Save the context BEFORE adding the current line, so a row never
        # appears in its own context.
        contexts.append(''.join(line for line, _ in window))
        tokens.append(window_tokens)

        current_line = f"{speaker}: {text}\n"
        line_tokens = len(encoding.encode(current_line))
        window.append((current_line, line_tokens))
        window_tokens += line_tokens

        # Trim from the oldest end; the stored counts avoid re-encoding the
        # dropped lines, and the newest line is never dropped.
        while window_tokens > token_limit and len(window) > 1:
            _, dropped_tokens = window.popleft()
            window_tokens -= dropped_tokens

    # Add the context to the dataframe
    df['context'] = contexts
    df['token_count'] = tokens
    return df

In [None]:
# Build the rolling context for every remaining transcript line.
# A copy is passed because generate_context adds its columns to the frame it is given.
context_df = generate_context(boder_df_filtered.copy(), model, token_limit)

# Extract Interviewer Questions

In [None]:
# Boder is the only interviewer in this corpus, so the interviewer's lines --
# the ones kept for the BERT clustering step -- are exactly the rows whose
# speaker is him.
boder_qs = context_df[context_df['speaker'] == 'David Boder']

boder_qs.shape[0]

In [None]:
boder_qs.head()

In [None]:
# divide words column with multi-sentences into a list
# using sentence tokenizer
# NOTE(review): this rewrites boder_df.text only. boder_qs was derived earlier
# from a copy of the filtered frame (via context_df), so its text column is not
# changed by this cell and still holds plain strings -- confirm this is intended.
boder_df.text = boder_df.text.apply(lambda x: sent_tokenize(str(x)))

In [None]:
# Now, keep only the questions/interrogative statements spoken by the interviewer.
# A sentence is kept when it contains at least one of the key words/symbols in
# `keys` (plain, case-sensitive substring match). Each kept sentence is stored
# with the context that precedes it: the row's saved context plus the earlier
# sentences of the same turn.
from collections import deque

encoding = tiktoken.encoding_for_model(model)
keys = ['?', 'tell', 'describe', 'share', 'sing', 'message', 'photograph', 'ask', 'did you', 'were you']

id_list = []
sent_num_list = []
que_ans_list = []
manual_index_list = []
texts_list = []
context_list = []

for file_num, sent_num, que_ans, manual_index, text, context, token_count in zip(
        boder_qs.file_num, boder_qs.sent_num, boder_qs.que_ans,
        boder_qs.manual_index, boder_qs.text, boder_qs.context,
        boder_qs.token_count):
    # boder_qs is built before the sentence-tokenizing cell touches boder_df, so
    # its text is normally one raw string. Split it here so the loop walks
    # sentences rather than single characters; an already-tokenized list is
    # used as is.
    sentences = sent_tokenize(text) if isinstance(text, str) else list(text)

    # Rolling window of context lines (each keeps its trailing newline), seeded
    # with the context and token count saved for this row.
    window = deque(context.splitlines(keepends=True))
    window_tokens = token_count

    for sentence in sentences:
        if any(key in sentence for key in keys):
            id_list.append(file_num)
            sent_num_list.append(sent_num)
            que_ans_list.append(que_ans)
            manual_index_list.append(manual_index)
            texts_list.append(sentence)
            context_list.append(''.join(window))

        # Later sentences of the same turn see this one as context.
        current_line = "INTERVIEWER: " + sentence + "\n"
        window.append(current_line)
        window_tokens += len(encoding.encode(current_line))

        # Trim from the oldest end. Lines are encoded with their newline, the
        # same way they were counted when added, and the newest line is always
        # kept (matching generate_context).
        while window_tokens > token_limit and len(window) > 1:
            window_tokens -= len(encoding.encode(window.popleft()))


# append all questions and meta data into a data frame
boder_intqs = pd.DataFrame({
    "file_num": id_list,
    "sent_num": sent_num_list,
    "manual_index": manual_index_list,
    "text": texts_list,
    "que_ans": que_ans_list,
    "context": context_list,
})

In [None]:
# Drop rows whose entire text is only an '[unintelligible]' marker (with or
# without a trailing period) or a lone '?' / '.'.
boder_intqs = boder_intqs[~boder_intqs['text'].isin(
    ['[unintelligible]', '[unintelligible].', '[unintelligible] .', '?', '.']
)]

In [None]:
boder_intqs

In [None]:
#boder_intqs2.to_csv('Boder_questions_spe6.csv')

In [None]:
# get question word count (whitespace-separated tokens) and save as a column in the dataframe
# NOTE(review): qs_df is not defined in any visible cell of this notebook;
# presumably it is the question table (boder_intqs) -- confirm before running.
# The column values are iterated directly instead of looking each row up with
# qs_df.loc[label, 'text'], so there is exactly one count per row even when
# index labels repeat.
qword_count = [len(str(text).split()) for text in qs_df['text']]

qs_df['question_length'] = qword_count

In [None]:
qs_df

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError in Python, so this
# cell always fails; presumably it is here to halt "Run all" before the export
# cell that follows -- confirm.
break

In [None]:
# Write the question table to a CSV file at the given relative path.
# The row index is written as the first column because index=False is not passed.
qs_df.to_csv('Version_A_Preprocessing_Results.csv')