# Extract Raw Transcript from existing company pitches

In [None]:
# Processes the transcripts JSON to output a clean Shark Tank pitch per company.
# The example below shows the input shape: one key per company name, whose value
# is that company's full transcript text.

"""
Example transcript.json

{
  "GarmaGuard": "hi sharks today... [FULL TRANSCRIPT]"
}

"""



In [None]:
!pip install openai requests



# Task
Extract the "full_pitch" section from each startup transcript in the "existing_transcripts_(119).json" file using the OpenAI API, handling errors by printing the company name, and write the extracted pitches to a new JSON file.

## Load the JSON data

### Subtask:
Read the `existing_transcripts_(119).json` file into a Python dictionary.


In [None]:
import json

# Location of the uploaded transcripts file in the Colab workspace.
TRANSCRIPTS_PATH = '/content/existing_transcripts_(119).json'

# Pass the encoding explicitly so decoding does not depend on the runtime's
# locale default (JSON text is UTF-8).
with open(TRANSCRIPTS_PATH, 'r', encoding='utf-8') as f:
    transcripts_data = json.load(f)

# Later cells iterate with .items(), so fail here with a clear message if the
# file is not a JSON object of {company name: transcript}.
if not isinstance(transcripts_data, dict):
    raise TypeError(
        "Expected a JSON object mapping company names to transcripts, "
        f"got {type(transcripts_data).__name__}"
    )

print(f"Successfully loaded {len(transcripts_data)} transcripts.")

Successfully loaded 119 transcripts.


## Iterate and process

### Subtask:
Loop through each startup's transcript in the loaded data and prepare the user input for the API call.


## Call the API

### Subtask:
For each transcript, call the OpenAI API with the defined system context and instructions to extract the pitch.


In [None]:
from openai import OpenAI
import json
from google.colab import userdata
import time # Import the time module
from pydantic import BaseModel # Import BaseModel

# Fetch the API key through google.colab's userdata.get so the key itself is
# never written into the notebook.
OPENAI_API_KEY=userdata.get("OPENAI_API_KEY")

# Pydantic model for the desired output structure. call_api passes it as
# `text_format`, so each API response is parsed into one instance of this class.
# Documented with `#` comments only, so the class definition itself is unchanged.
class PitchExtraction(BaseModel):
    Product: str  # name of the product being pitched
    Full_Pitch: str  # text of the extracted pitch

# System message sent with every request: tells the model to leave the
# transcript text as is instead of summarizing it.
sys_context="""
You are a helpful transcript bot.

Do Not Summarize.

leave the text as is.
"""

# Instruction text for the user message; the processing loop concatenates each
# transcript directly after it.
# NOTE(review): this prompt asks for a JSON *list* of objects "for each startup",
# while call_api requests a single PitchExtraction object via text_format.
# Confirm which is intended and align the wording.
instructions = """
Please read this transcript and extract the "full_pitch" portion for each startup. The "full_pitch" typically starts with a greeting like "Hi Sharks" and ends before the final question posed by the entrepreneur (the line after the pitch is usually a question from the sharks or the entrepreneur).

For each startup, identify:
1. The name of the product being pitched.
2. The entire text of the full pitch.

Return the information in a readable JSON format as a list of objects, where each object has the following keys:
"Product": <Product Name>
"Full_Pitch": <Entire text of the full pitch>

Return only the list in JSON format.
"""

# When True, call_api prints progress messages for every request.
verbose = True

def call_api(sys_context, user_input, temperature=0.2):
    """Send one request to the OpenAI Responses API and return the parsed pitch.

    Args:
        sys_context: Content of the system message.
        user_input: Content of the user message (instructions plus transcript).
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The response's ``output_parsed`` value, i.e. a ``PitchExtraction``
        instance. It may be ``None`` if the response carries no parsed output
        (for example a refusal), so callers should check before using it.

    Raises:
        Any exception raised by the OpenAI client; nothing is caught here.
    """
    if verbose:
        print("Calling OpenAI API")

    # Use the client as a context manager so its HTTP connections are closed
    # as soon as the request finishes instead of waiting for garbage collection.
    with OpenAI(api_key=OPENAI_API_KEY) as client:
        # client.responses.parse validates the model output against PitchExtraction.
        response = client.responses.parse(
            model="gpt-4o-mini-2024-07-18",
            input=[
                {"role": "system", "content": sys_context},
                {"role": "user", "content": user_input},
            ],
            text_format=PitchExtraction,
            temperature=temperature,
        )

    if verbose:
        # Report the real token usage when the response provides it; getattr
        # keeps this safe on SDK versions that do not expose the attribute.
        usage = getattr(response, "usage", None)
        if usage is not None:
            print(f"Token usage: {usage}")
        else:
            print("No usage information returned for this response.")

    return response.output_parsed


# Seconds to pause between consecutive API calls.
DELAY_BETWEEN_CALLS_SECONDS = 5

extracted_pitches = []  # successfully parsed PitchExtraction objects
failed_startups = []    # company names whose extraction failed, kept for retry
count = 0               # number of successful extractions so far
total = len(transcripts_data)

for position, (startup_name, transcript) in enumerate(transcripts_data.items(), start=1):
    try:
        user_input = instructions + transcript
        api_response = call_api(sys_context, user_input)

        # A missing parsed result would otherwise be stored as None and crash
        # the export step at `.model_dump()`, so treat it as a failure here.
        if api_response is None:
            raise ValueError("API returned no parsed output")

        extracted_pitches.append(api_response)
        count += 1

        # Print the first response and every 10th one for spot-checking.
        if count == 1 or count % 10 == 0:
            print(api_response)
            print(f"Successfully processed {startup_name} (response {count})")

    except Exception as e:
        # Best-effort batch: record the failing company and keep going.
        failed_startups.append(startup_name)
        print(f"Error processing {startup_name}: {e}")

    # Pause between calls only; there is nothing to wait for after the last one.
    if position < total:
        time.sleep(DELAY_BETWEEN_CALLS_SECONDS)

if failed_startups:
    print(f"Failed to process {len(failed_startups)} of {total} startups: {failed_startups}")

# The extracted_pitches list now contains the extracted pitch data.
# We will process this list further in the next step.

Calling OpenAI API
Usage information might not be directly available with client.responses.parse in the same format.
Product='Garment Guard' Full_Pitch="Hi Sharks my name is pete badaway and this is my beautiful white bianca we're seeking 100 000 for 10 equity and our company sharks we want to introduce you to joe joe here he's just like all of us gets up gets himself ready puts his mask on and heads off to work the second joe leaves his house these nasty little germs which are filled with odor dirt grind and even bacteria are flying at them from all different directions [Music] i love it whether he's grabbing a quick latte yes at a crowded cafe walking past a sneezing co-worker and let's not forget all those nasties he picks up during his commute to and from work he's washing and cleaning his hands throughout the day but what's he doing about his clothes nothing sharks he does nothing that crazy nasty dirt that he's collecting throughout the day is still stuck on his clothes let's fac

In [None]:
# Write the extracted pitches to a JSON file
output_file = 'extracted_pitches.json'

# Convert the Pydantic objects to plain dicts *before* opening the file:
# opening in 'w' mode truncates it, so a conversion error after that point
# would leave an empty output file behind.
# None entries (responses without parsed output) have no model_dump() and are skipped.
json_compatible_pitches = [
    pitch.model_dump() for pitch in extracted_pitches if pitch is not None
]

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(json_compatible_pitches, f, indent=4)

# Report the number of records actually written.
print(f"Successfully wrote {len(json_compatible_pitches)} extracted pitches to {output_file}")

Successfully wrote 119 extracted pitches to extracted_pitches.json
