# Cleaning the dataset using gpt

The purpose of this notebook is to create an easily reproducible method of cleaning and organising the podcast text files using the GPT API.

In [1]:
import pandas as pd #all functions need to be pre-pended with 'pd.' e.g. the DataFrame function must be written as 'pd.DataFrame'
import numpy as np
from openai import OpenAI
import tiktoken
import time
import os 


from helper_file import chunk_text_with_overlap, RateLimiter, get_model_response, find_overlap, merge_chunks

#from helper_file import * #
#Using the star with import means all functions are imported under their normal names.
#This is convenient as you don't need to prepend anything. However, if two libraries define a function with the same
#name, one of them will be overwritten, as Python cannot distinguish between them. It is especially risky if you import an entire library instead of
#just specific functions. When a helper module is specific to the project, this usually doesn't matter.
#However, you will also import everything the helper file itself imports, which can make it confusing as to what is
#actually needed in the script or notebook, so explicit imports are probably the best choice.


# Set up the OpenAI API key from the '.env' file; this keeps your key secret and stops it being exposed on GitHub.
# You need to create a '.env' file containing the API key in the form shown below.
#OPENAI_API_KEY = "my api key"

In [2]:
# Load the raw, automatically transcribed podcast text into a single string.
# The encoding is given explicitly so the result does not depend on the platform's
# default locale encoding; UTF-8 matches the encoding used when the cleaned file is written.
# NOTE(review): assumes the raw transcript is UTF-8 encoded — confirm for files from other sources.
with open('./data/raw_podcasts/ab_and_jv_oct2023_mapping_sustainability_txt.txt', 'r', encoding='utf-8') as file:
    # Read the entire contents of the file into a string
    file_contents = file.read()

In [3]:
# Tokeniser matching the model used later, so the counts reflect what the API will see.
enc = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Token count of the transcript exactly as read from disk.
tokens0 = enc.encode(file_contents)

# Remove all line breaks and collapse every run of whitespace into a single space.
# This is because the line breaks are basically random and related to timing or
# something similar in the transcription algorithm.
# str.replace() only matches literal text, so replace("\s+", " ") never collapsed
# anything; split() with no arguments splits on any whitespace run (including
# newlines), and joining with a single space gives the intended result.
new_string = " ".join(file_contents.split())

# Token count after normalisation, printed next to the original for comparison.
tokens = enc.encode(new_string)
print(len(tokens0))
print(len(tokens))

10841
10045


In [4]:
# Split the flattened transcript into overlapping chunks using the project helper.
# The second and third arguments are the chunk size and the overlap between
# consecutive chunks (units are presumably tokens — TODO confirm against helper_file).
chunk_list = chunk_text_with_overlap(
    new_string,
    1500,
    100,
)

In [6]:

# Send every chunk to the model and collect the cleaned text, in chunk order.
response_list = []

# Project helper that throttles requests; 50000 is presumably a per-minute token
# budget — TODO confirm against helper_file.
rate_limiter = RateLimiter(50000)

# Loop-invariant, so computed once for the progress message.
total_chunks = len(chunk_list)

# enumerate() supplies the 1-based progress counter instead of a manually incremented variable.
for chunk_num, chunk in enumerate(chunk_list, start=1):
    start_time = time.time()  # Start timing

    # Prompt wording is unchanged apart from correcting "test" -> "text" in the
    # sentence explaining that the transcript has been split into chunks.
    prompt_text = f"""The below text comes from an automatically transcribed podcast in which an academic interviews 
    another academic on their research in food systems. 
    The text has no punctuation and no linebreaks. I need you to add in appropriate punctuation, lines, paragraphs as appropriate, to make the
    transcription easier to read. the transcript contains numerous filler words such as 'um', 'uh', 'like', and 'you know.' 
    Please remove these filler words to make the transcript more concise and easier to read, while ensuring the original meaning and tone of the speaker are preserved. 
    It's important that the edited transcript remains true to the speaker's intended message and style.
    Also note that due to the length of the podcast the text has been broken up into smaller chunks and the text may start or end mid sentence. 
    The podcast transcription is surrounded by triple colons ':::'
    ::: {chunk} :::
    """

    # Only the text of the first completion choice is kept.
    response = get_model_response(prompt_text, 'You are an expert in repairing transcription errors', 
                                  rate_limiter, engine="gpt-3.5-turbo").choices[0].message.content
    
    response_list.append(response)

    end_time = time.time()  # End timing
    chunk_time = end_time - start_time  # Calculate the time taken for this chunk

    print(f"text chunk: {chunk_num}/{total_chunks} complete, time taken {chunk_time:.2f} seconds")


text chunk: 1/7 complete, time taken 34.93 seconds
text chunk: 2/7 complete, time taken 17.59 seconds
text chunk: 3/7 complete, time taken 33.77 seconds
text chunk: 4/7 complete, time taken 28.67 seconds
text chunk: 5/7 complete, time taken 31.46 seconds
text chunk: 6/7 complete, time taken 34.67 seconds
text chunk: 7/7 complete, time taken 26.54 seconds


In [7]:

# Example usage: merge the first two cleaned responses to check that the
# overlap detection works on model output.

# Overlap size handed to find_overlap (units are defined by the helper — TODO confirm).
overlap_size = 100

# The two neighbouring cleaned chunks to join.
prev_chunk, curr_chunk = response_list[0], response_list[1]

# Locate the shared region, then stitch the two chunks together around it.
overlap = find_overlap(prev_chunk, curr_chunk, overlap_size)
merged_text = merge_chunks(prev_chunk, curr_chunk, overlap)


match found maerging text


In [8]:
# Merge the cleaned model responses back into one continuous transcript.
# The merge must run over `response_list` (the cleaned text returned by the model).
# Merging `chunk_list` would only rebuild the raw, uncleaned transcript and discard
# all of the model's work.
overlap_size = 100  # Adjust this based on your expected overlap size

# Start from the first cleaned chunk; an empty response list yields an empty transcript.
merged_text = response_list[0] if response_list else ""

# Fold each remaining cleaned chunk into the running transcript.
for next_chunk in response_list[1:]:
    # Find the overlap between the current state of merged_text and the next chunk
    overlap = find_overlap(merged_text, next_chunk, overlap_size)

    # Merge the current state of merged_text with the next chunk using the found overlap
    merged_text = merge_chunks(merged_text, next_chunk, overlap)

# At the end of the loop, merged_text contains all the cleaned chunks merged together


match found maerging text
match found maerging text
match found maerging text
match found maerging text
match found maerging text
match found maerging text


In [9]:
# Save the final combined text held in merged_text to the cleaned-podcasts folder.
output_dir = os.path.join('./data', 'cleaned_podcasts')
file_path = os.path.join(output_dir, "ab_and_jv_oct2023_mapping_sustainability_txt.txt")  # You can change the file path and name as needed

# Create the output folder if it is missing, so a fresh checkout does not fail
# with FileNotFoundError; exist_ok=True makes this a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

# Writing to a file
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(merged_text)

print(f"The merged text has been saved to {file_path}")


The merged text has been saved to ./data/cleaned_podcasts/ab_and_jv_oct2023_mapping_sustainability_txt.txt
