In [None]:
# Processes transcripts.json to extract a clean Shark Tank pitch for each startup

"""
Example transcript.json

{
  "GarmaGuard": "hi sharks today... [FULL TRANSCRIPT]"
}

"""



In [None]:
!pip install openai requests



# Task
Extract the "Full_Pitch" section from each startup transcript in the "transcripts_(126).json" file using the OpenAI API, handling errors by printing the company name, and write the extracted pitches to a new JSON file.

## Load the JSON data

### Subtask:
Read the `transcripts_(126).json` file into a Python dictionary.


In [None]:
import json

# Load the raw transcripts: a JSON object mapping startup name -> full transcript text.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read does not
# depend on the platform's locale default.
with open('/content/transcripts_(126).json', 'r', encoding='utf-8') as f:
    transcripts_data = json.load(f)

print(f"Successfully loaded {len(transcripts_data)} transcripts.")

Successfully loaded 126 transcripts.


## Iterate and process

### Subtask:
Loop through each startup's transcript in the loaded data and prepare the user input for the API call.


## Call the API

### Subtask:
For each transcript, call the OpenAI API with the defined system context and instructions to extract the pitch.


In [None]:
from openai import OpenAI
import json
from google.colab import userdata
import time # Import the time module
from pydantic import BaseModel # Import BaseModel

# Read the API key from Colab's `userdata` store instead of hardcoding it in the notebook.
OPENAI_API_KEY=userdata.get("OPENAI_API_KEY")

# Define the Pydantic model for the desired output structure.
# It is handed to the API call as `text_format`, so these two field names are the
# keys of every parsed response and of the dictionaries written to the output JSON.
# NOTE(review): documented with `#` comments only; a class docstring would presumably
# be picked up by Pydantic as the schema description — confirm before adding one.
class PitchExtraction(BaseModel):
    # Product name; the processing loop replaces this value with the transcript's key.
    Product: str
    # The extracted pitch text.
    Full_Pitch: str

# System message for every request: `call_api` sends this text with role "system".
# It restricts the model to verbatim extraction and JSON-only output.
sys_context="""
You are a precise transcript extraction bot.
Do NOT summarize or paraphrase. Preserve original punctuation, casing, and line breaks.
Return only strict JSON that matches the schema. No extra text.
"""

# Task prompt that is prepended to each transcript to form the user message.
# The schema section must name the same fields as the `PitchExtraction` model
# (`Product`, `Full_Pitch`). `Product` is a required string in that model, so the
# prompt asks for an empty string (not null) when the product name is unclear.
instructions = """
You will receive a Shark Tank transcript for a single startup (pitch + Q&A/negotiation).

Extract the **verbatim** "Full_Pitch" span only.

Definition — Full_Pitch:
- START: the entrepreneur’s greeting/intro (e.g., “Hi Sharks”, “Hello Sharks”, “Hey Sharks”, “My name is…”, “I’m/We’re seeking …”).
- END: **immediately before** the first line spoken by anyone other than the entrepreneur (typically a Shark) OR the first explicit question/interjection. Stop as soon as a Shark speaks.

Common Shark labels to stop on (as line prefixes; case-insensitive, match whole word + colon):
Mark:, Lori:, Kevin:, Barbara:, Daymond:, Robert:, Guest:, Shark:, Narrator:, Voiceover:, Host:, [Investor Name]:

Heuristic guardrails:
- Typical Full_Pitch length is 1,000–2,000 characters (can be more but rarely).
- If the pitch exceeds 2,200 characters, re-check that you didn’t include Shark lines.
- The Full_Pitch must NOT contain a line starting with any of the Shark labels above.

Also extract a best-effort Product from the first ~3 entrepreneur lines (use an empty string if genuinely unclear).

Schema (return ONLY this JSON object):
{
  "Product": "<string>",
  "Full_Pitch": "<verbatim text of the pitch only>"
}
"""

verbose = True

# Shared API client, created lazily on first use so that all requests reuse one
# client (and its connection pool) instead of constructing a new one per transcript.
_client = None


def _get_client():
    """Return the shared OpenAI client, creating it on the first call."""
    global _client
    if _client is None:
        _client = OpenAI(api_key=OPENAI_API_KEY)
    return _client


def call_api(sys_context, user_input, temperature=0.2, model="gpt-4o-mini-2024-07-18"):
    """Send one structured-extraction request and return the parsed result.

    Args:
        sys_context: Text sent as the "system" message.
        user_input: Text sent as the "user" message.
        temperature: Sampling temperature forwarded to the API.
        model: Model identifier forwarded to the API; the default is the
            snapshot this function previously hard-coded.

    Returns:
        The response's `output_parsed` value, parsed against `PitchExtraction`.

    Raises:
        ValueError: If the response carries no parsed output, so callers get a
            clear message instead of an AttributeError on `None` later on.
    """
    # `responses.parse` validates the model output against the Pydantic schema.
    response = _get_client().responses.parse(
        model=model,
        input=[
            {"role": "system", "content": sys_context},
            {"role": "user", "content": user_input},
        ],
        text_format=PitchExtraction,
        temperature=temperature,
    )

    parsed = response.output_parsed
    if parsed is None:
        raise ValueError("OpenAI response contained no parsed output")
    return parsed


# Seconds to pause after every API call (successful or not).
REQUEST_DELAY_SECONDS = 2

extracted_pitches = []
failed_startups = []  # names whose extraction raised, kept so they can be retried or inspected
count = 0  # number of transcripts processed successfully
for startup_name, transcript in transcripts_data.items():
    try:
        # The user message is the task instructions followed directly by the transcript.
        user_input = instructions + transcript
        api_response = call_api(sys_context, user_input)

        # Convert the Pydantic model to a dictionary and override the LLM-provided
        # product name with the known startup name (the key in the transcripts file).
        api_response_dict = api_response.model_dump()
        api_response_dict["Product"] = startup_name

        extracted_pitches.append(api_response_dict)
        count += 1

        # Print the first result and every 10th one as a progress/sanity check.
        if count == 1 or count % 10 == 0:
            print(api_response_dict)
            print(f"Successfully processed {startup_name} (response {count})")

    except Exception as e:
        # Best-effort run: report the failing company and continue with the rest.
        failed_startups.append(startup_name)
        print(f"Error processing {startup_name}: {e}")

    time.sleep(REQUEST_DELAY_SECONDS)

if failed_startups:
    print(f"Failed to process {len(failed_startups)} of {len(transcripts_data)} transcripts: {failed_startups}")

# The extracted_pitches list now contains the extracted pitch data.
# We will process this list further in the next step.

{'Product': 'Uprising Food', 'Full_Pitch': "next up is a healthier version of an indulgence [Music] hi sharks i'm william and i'm kristen we're a husband wife dynamic duo co-founders of uprising food from cincinnati ohio we're here today seeking 500 000 for three percent equity in our company hello sharks we want to revolutionize health and it starts with bread we teamed up with the best artisan bakers in the industry to bring you the healthiest tastiest most powerful bread in the game and did we mention our bread also known as the uprising cube is keto paleo and boosts gut health that's right and it's dairy and gluten free some say it even tastes like it's straight out of heaven we've truly done the impossible our cubes are packed with fiber and created with clean and boosted superfood ingredients now who wants to join our health revolution and bring this mouth-watering taste like heaven bread to the masses now who wants to taste heaven here so what we have today is for you to experie

In [None]:
# Persist the extracted pitches (a list of plain dictionaries) as indented JSON.
output_file = 'extracted_pitches.json'
with open(output_file, 'w') as out_handle:
    out_handle.write(json.dumps(extracted_pitches, indent=4))

print(f"Successfully wrote {len(extracted_pitches)} extracted pitches to {output_file}")

Successfully wrote 126 extracted pitches to extracted_pitches.json


In [None]:
import json

# Read the extracted pitches back from disk for the length analysis below.
file_path = "extracted_pitches.json"
with open(file_path) as pitches_file:
    pitches_data = json.loads(pitches_file.read())

print(f"Loaded {len(pitches_data)} pitch entries.")

Loaded 126 pitch entries.


In [None]:
# Character count of every entry whose "Full_Pitch" is a string; other entries are skipped.
pitch_texts = (entry.get("Full_Pitch") for entry in pitches_data)
pitch_lengths = [len(text) for text in pitch_texts if isinstance(text, str)]

print(f"Extracted lengths for {len(pitch_lengths)} pitches.")

Extracted lengths for 126 pitches.


In [None]:
import numpy as np

# Median and quartiles of the pitch lengths (in characters).
q1_pitch_length, q3_pitch_length = np.percentile(pitch_lengths, [25, 75])
median_pitch_length = np.median(pitch_lengths)

for stat_label, stat_value in (
    ("Median pitch length", median_pitch_length),
    ("First quartile (25th percentile)", q1_pitch_length),
    ("Third quartile (75th percentile)", q3_pitch_length),
):
    print(f"{stat_label}: {stat_value}")

Median pitch length: 1550.0
First quartile (25th percentile): 1248.0
Third quartile (75th percentile): 2800.5


In [None]:
# Pair each product with its pitch length, skipping entries without a string pitch.
measured_pitches = [
    (entry.get("Product"), len(entry.get("Full_Pitch")))
    for entry in pitches_data
    if isinstance(entry.get("Full_Pitch"), str)
]

# Pitches below the first quartile and above the third quartile, in file order.
significantly_shorter_pitches = [
    {"Product": product, "Length": length}
    for product, length in measured_pitches
    if length < q1_pitch_length
]
exceptionally_longer_pitches = [
    {"Product": product, "Length": length}
    for product, length in measured_pitches
    if length > q3_pitch_length
]

print(f"Found {len(significantly_shorter_pitches)} pitches significantly shorter than the first quartile ({q1_pitch_length:.2f} characters):")
for flagged in significantly_shorter_pitches:
    print(f"- {flagged['Product']}: {flagged['Length']} characters")

print(f"\nFound {len(exceptionally_longer_pitches)} pitches exceptionally longer than the third quartile ({q3_pitch_length:.2f} characters):")
for flagged in exceptionally_longer_pitches:
    print(f"- {flagged['Product']}: {flagged['Length']} characters")

Found 32 pitches significantly shorter than the first quartile (1248.00 characters):
- Pavlok: 1120 characters
- KidsLuv: 1208 characters
- OverEZ Chicken Coops: 1111 characters
- Surfset: 1222 characters
- Larq: 1245 characters
- Tog Samphel: 1177 characters
- Buggy Beds: 996 characters
- Pinole Blue: 1142 characters
- ZUUM: 1007 characters
- Plated: 1059 characters
- Slice of Sauce: 849 characters
- Doorbot: 1043 characters
- Minus Cal: 1208 characters
- Peanut Butter Pump: 1107 characters
- Sparketh: 1072 characters
- Wicked Good Cupcakes: 1176 characters
- Plop Star: 1088 characters
- ROQ Innovation: 990 characters
- Bertello Pizza Oven: 1091 characters
- Zuum: 1206 characters
- Bad Birdie: 1205 characters
- ChompShop: 1028 characters
- Chill Soda: 759 characters
- First Defense Nasal Screeners: 462 characters
- gourmet pretzel business: 1228 characters
- Knife Aid: 1048 characters
- The Cereal Killerz Kitchen: 1073 characters
- Incredible Eats: 1076 characters
- Genius Juice: 689 

# After Data Cleaning

In [None]:
import json

# Load the cleaned pitches. This file is produced outside the notebook, so the
# encoding is pinned to UTF-8 instead of relying on the platform's locale default.
file_path = "refined_pitches_(126).json"
with open(file_path, 'r', encoding='utf-8') as f:
    pitches_data = json.load(f)

print(f"Loaded {len(pitches_data)} pitch entries.")

Loaded 126 pitch entries.


In [None]:
# Recompute the character counts for the currently loaded pitches, keeping only
# entries whose "Full_Pitch" is a string.
candidate_texts = (record.get("Full_Pitch") for record in pitches_data)
pitch_lengths = [len(text) for text in candidate_texts if isinstance(text, str)]

print(f"Extracted lengths for {len(pitch_lengths)} pitches.")

Extracted lengths for 126 pitches.


In [None]:
import numpy as np

# Median and quartiles of the current pitch lengths (in characters).
q1_pitch_length, q3_pitch_length = np.percentile(pitch_lengths, [25, 75])
median_pitch_length = np.median(pitch_lengths)

summary_rows = (
    ("Median pitch length", median_pitch_length),
    ("First quartile (25th percentile)", q1_pitch_length),
    ("Third quartile (75th percentile)", q3_pitch_length),
)
for row_label, row_value in summary_rows:
    print(f"{row_label}: {row_value}")

Median pitch length: 1345.0
First quartile (25th percentile): 1113.25
Third quartile (75th percentile): 1668.25


In [None]:
# (product, length) pairs for every entry that has a string pitch.
product_lengths = [
    (record.get("Product"), len(record.get("Full_Pitch")))
    for record in pitches_data
    if isinstance(record.get("Full_Pitch"), str)
]

# Split out the pitches below Q1 and above Q3, preserving file order.
significantly_shorter_pitches = [
    {"Product": name, "Length": size}
    for name, size in product_lengths
    if size < q1_pitch_length
]
exceptionally_longer_pitches = [
    {"Product": name, "Length": size}
    for name, size in product_lengths
    if size > q3_pitch_length
]

print(f"Found {len(significantly_shorter_pitches)} pitches significantly shorter than the first quartile ({q1_pitch_length:.2f} characters):")
for item in significantly_shorter_pitches:
    print(f"- {item['Product']}: {item['Length']} characters")

print(f"\nFound {len(exceptionally_longer_pitches)} pitches exceptionally longer than the third quartile ({q3_pitch_length:.2f} characters):")
for item in exceptionally_longer_pitches:
    print(f"- {item['Product']}: {item['Length']} characters")

Found 32 pitches significantly shorter than the first quartile (1113.25 characters):
- Uprising Food: 822 characters
- OverEZ Chicken Coops: 1111 characters
- CUPBOP: 657 characters
- Hampton Adams: 943 characters
- Buggy Beds: 996 characters
- ZUUM: 1007 characters
- Bouquet Bar: 1049 characters
- Plated: 992 characters
- Slice of Sauce: 849 characters
- Back 9 Dips: 809 characters
- Doorbot: 1043 characters
- Kin Apparel: 1107 characters
- Peanut Butter Pump: 1107 characters
- Plunge: 998 characters
- Sparketh: 1072 characters
- LovePop Cards: 884 characters
- Plop Star: 1088 characters
- Fried Green Tomatoes: 711 characters
- Soupergirl: 978 characters
- ROQ Innovation: 990 characters
- Bertello Pizza Oven: 1011 characters
- ChompShop: 1028 characters
- First Defense Nasal Screeners: 462 characters
- Bee D'Vine Honey Wine: 887 characters
- Knife Aid: 1048 characters
- The Cereal Killerz Kitchen: 1073 characters
- Ta Ta Towels: 1024 characters
- Incredible Eats: 979 characters
- Geni