# test api 

In [14]:
import pandas as pd #all functions need to be pre-pended with 'pd.' e.g. the DataFrame function must be written as 'pd.DataFrame'
import numpy as np
from openai import OpenAI
import tiktoken
import time
import os 
import re
import json
import openai

from helper_file import chunk_text_with_overlap, RateLimiter, get_model_response, find_overlap, merge_chunks

enc = tiktoken.encoding_for_model('gpt-3.5-turbo')


data_path = "./data"

In [9]:
podcast_files = os.listdir(os.path.join(data_path,'raw_podcasts'))

In [54]:

#open_file = 
file_dict = []

for open_file in podcast_files:

    open_file_path = os.path.join(data_path,'raw_podcasts', open_file)

    with open(open_file_path, 'r', encoding='utf-8', errors='replace')as file:
        # Read the file and split into a list at each line break
        lines = file.read().replace("\n", "")
        lines = re.sub(r"\d+:\d+", " ", lines)

        tokens = len(enc.encode(lines))

        file_dict.append({'file':open_file, 'tokens':tokens, 'unknown_chars': lines.count('\uFFFD')})


df = pd.DataFrame(file_dict)

In [56]:
df

Unnamed: 0,file,tokens,unknown_chars
0,swn_jul2021_food_taxes_and_subsidies.txt,14855,0
1,cv_mar2023_equitable_food_policy.txt,16783,0
2,jc_oct2021_agricultural_inputs_sector.txt,9043,0
3,cr_may2023_food_system_silos.txt,15073,1
4,cc_jan2019_brexit_and_farming.txt,12925,0
5,ch_jun2021_fisheries.txt,12474,0
6,ab_and_jv_oct2023_mapping_sustainability.txt,10258,0
7,rm_jul2017_nutrition_security_in_brazil.txt,13932,0
8,af_nov2017_hunger_in_america.txt,9598,0
9,aw_and_gh_dec2018_global_nutrition.txt,22605,1


In [74]:
prompt_template = f"""The below text comes from an automatically transcribed podcast in which an academic interviews 
another academic on their research in food systems. 
Please summarise the please summarise the podcast to a 500 word abstract, which explains the main themes, 
do not mention the reasearchers/academics involved but the content of the research itself.
do not give the title of the podcast
do not say who presents/hosts the podcast
The podcast transcription is surrounded by triple colons ':::'
::: {lines} :::
"""

In [75]:
rate_limiter = RateLimiter(50000)

# Ensure the output directory exists
output_directory = os.path.join(data_path,"gpt_summary")
os.makedirs(output_directory, exist_ok=True)

chunk_num = 1

for open_file in podcast_files[0:6]:
    # Construct the expected summary filename
    summary_filename = os.path.join(output_directory, os.path.basename(open_file) + "_summary.txt")

    # Check if the summary file already exists
    if os.path.exists(summary_filename):
        print(f"Summary for {os.path.basename(open_file)} already exists. Skipping.")
        continue  # Skip this file and go to the next iteration

    start_time = time.time()  # Start timing

    # Assuming open_file_path is correctly defined based on open_file
    with open(open_file_path, 'r', encoding='utf-8', errors='replace') as file:
        lines = file.read().replace("\n", "")
        lines = re.sub(r"\d+:\d+", " ", lines)

    prompt_text = prompt_template.format(lines=lines)

    # Assuming get_model_response is correctly defined and returns the expected output
    response = get_model_response(prompt_text, 'You are an expert in summarisation of podcasts', 
                                  rate_limiter, engine="gpt-4-turbo-preview").choices[0].message.content
    
    # Save the response to the summary file
    with open(summary_filename, 'w', encoding='utf-8') as summary_file:
        summary_file.write(response)

    end_time = time.time()  # End timing
    chunk_time = end_time - start_time  # Calculate the time taken for this chunk

    print(f"text chunk: {chunk_num}/{len(podcast_files)} complete, time taken {chunk_time:.2f} seconds")
    chunk_num += 1


text chunk: 1/48 complete, time taken 18.75 seconds
text chunk: 2/48 complete, time taken 17.35 seconds
text chunk: 3/48 complete, time taken 23.81 seconds
text chunk: 4/48 complete, time taken 17.20 seconds
text chunk: 5/48 complete, time taken 12.90 seconds
text chunk: 6/48 complete, time taken 31.23 seconds


In [77]:
summary_texts = []

for filename in os.listdir(output_directory):
    # Construct the full path to the file
    file_path = os.path.join(output_directory, filename)
    
    # Check if it's a file and not a directory
    if os.path.isfile(file_path):
        # Open and read the file's contents
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
            summary_texts.append(file_content)


summary_texts

["The podcast delves into the critical importance of understanding food supply chains for the purpose of redesigning food systems. It highlights the work of Professor Lisa Jack, who specializes in uncovering the hidden aspects of food supply chains, particularly the everyday practices that are often overlooked but are crucial for initiating change within the conventional food system. Her research emphasizes the need for deep engagement and understanding of all aspects of the food system to facilitate sustainable change.\n\nThe discussion begins by acknowledging the complexity of food supply chains and the various assumptions made about them. Professor Jack's research aims to bring to light the practices within these supply chains, especially those in the middle segments that are less frequently researched. These include the companies involved in processing, packing, and distributing food products, which play a significant role in the overall functioning of the food system.\n\nOne of th

In [2]:
import openai

In [3]:
response = openai.embeddings.create(
    input="Your text here",
    model="text-embedding-3-large",
    dimensions=256
)

embedding =response.data[0].embedding

In [15]:


def generate_and_save_embeddings(summary_texts, summary_files, embeddings_directory):
    os.makedirs(embeddings_directory, exist_ok=True)
    
    for summary_text, summary_file in zip(summary_texts, summary_files):
        # Construct the filename for the embedding file
        embedding_filename = os.path.splitext(summary_file)[0] + "_embedding.json"
        embedding_file_path = os.path.join(embeddings_directory, embedding_filename)
        
        # Check if the embedding already exists to avoid reprocessing
        if not os.path.exists(embedding_file_path):
            # Generate the embedding
            response = openai.embeddings.create(
                input=summary_text,
                model="text-embedding-3-large",
                dimensions=256
            )
            embedding = response.data[0].embedding
            
            # Save the embedding to a file
            with open(embedding_file_path, 'w', encoding='utf-8') as file:
                json.dump(embedding, file)
                
            print(f"Saved embedding for {summary_file} to {embedding_file_path}")
        else:
            print(f"Embedding for {summary_file} already exists. Skipping.")

# Load the summaries into a list of strings and filenames
def load_summaries_and_filenames(directory):
    summary_texts = []
    summary_files = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                summary_texts.append(file.read())
                summary_files.append(filename)
    return summary_texts, summary_files


In [18]:
# Directory paths
summary_directory = './data/gpt_summary'
embeddings_directory = './data/gpt_embeddings'

# Load summaries and their filenames
summary_texts, summary_files = load_summaries_and_filenames(summary_directory)

# Generate and save embeddings
generate_and_save_embeddings(summary_texts, summary_files, embeddings_directory)

Saved embedding for jc_oct2021_agricultural_inputs_sector.txt_summary.txt to ./data/gpt_embeddings/jc_oct2021_agricultural_inputs_sector.txt_summary_embedding.json
Saved embedding for cr_may2023_food_system_silos.txt_summary.txt to ./data/gpt_embeddings/cr_may2023_food_system_silos.txt_summary_embedding.json
Saved embedding for cc_jan2019_brexit_and_farming.txt_summary.txt to ./data/gpt_embeddings/cc_jan2019_brexit_and_farming.txt_summary_embedding.json
Saved embedding for cv_mar2023_equitable_food_policy.txt_summary.txt to ./data/gpt_embeddings/cv_mar2023_equitable_food_policy.txt_summary_embedding.json
Saved embedding for ch_jun2021_fisheries.txt_summary.txt to ./data/gpt_embeddings/ch_jun2021_fisheries.txt_summary_embedding.json
Saved embedding for swn_jul2021_food_taxes_and_subsidies.txt_summary.txt to ./data/gpt_embeddings/swn_jul2021_food_taxes_and_subsidies.txt_summary_embedding.json
