# This notebook is meant to turn the refined pitches into a dataset with pitch facts and the refined pitch as a ground truth

In [None]:
!pip install -q google-genai pydantic tqdm

In [None]:
from google.colab import userdata


In [None]:
example_full_pitch="Hi Sharks my name is pete badaway and this is my beautiful white bianca we're seeking 100 000 for 10 equity and our company sharks we want to introduce you to joe joe here he's just like all of us gets up gets himself ready puts his mask on and heads off to work the second joe leaves his house these nasty little germs which are filled with odor dirt grind and even bacteria are flying at them from all different directions [Music] i love it whether he's grabbing a quick latte yes at a crowded cafe walking past a sneezing co-worker and let's not forget all those nasties he picks up during his commute to and from work he's washing and cleaning his hands throughout the day but what's he doing about his clothes nothing sharks he does nothing that crazy nasty dirt that he's collecting throughout the day is still stuck on his clothes let's face the sharks we can't all walk around with these things on all day to keep us protected there are products to wash and clean our hands on the go but why not for our clothes which are even more exposed and that is why we created garment guard garment is the first natural garment and fabric cleanser of its kind which uses natural propellants to keep you safe and your clothes fresh and clean applying our garment spray it's easy you simply just spray it onto your clothes helps eliminate odor freshen your fabric and keep that dirt and grime under control there you go joe much better so no matter who you are or what you do garment guard gives you the freedom to live your life without having to wear one of these things on all day to keep you protected so which one of you sharks wants to team up with us to help garma guard the world?"

In [None]:
"""
item in refined_pitches_(126).json
{
        "Product": "GarmaGuard",
        "Full_Pitch": "Hi Sharks my name is pete badaway and this is my beautiful white bianca we're seeking 100 000 for 10 equity and our company sharks we want to introduce you to joe joe here he's just like all of us gets up gets himself ready puts his mask on and heads off to work the second joe leaves his house these nasty little germs which are filled with odor dirt grind and even bacteria are flying at them from all different directions [Music] i love it whether he's grabbing a quick latte yes at a crowded cafe walking past a sneezing co-worker and let's not forget all those nasties he picks up during his commute to and from work he's washing and cleaning his hands throughout the day but what's he doing about his clothes nothing sharks he does nothing that crazy nasty dirt that he's collecting throughout the day is still stuck on his clothes let's face the sharks we can't all walk around with these things on all day to keep us protected there are products to wash and clean our hands on the go but why not for our clothes which are even more exposed and that is why we created garment guard garment is the first natural garment and fabric cleanser of its kind which uses natural propellants to keep you safe and your clothes fresh and clean applying our garment spray it's easy you simply just spray it onto your clothes helps eliminate odor freshen your fabric and keep that dirt and grime under control there you go joe much better so no matter who you are or what you do garment guard gives you the freedom to live your life without having to wear one of these things on all day to keep you protected so which one of you sharks wants to team up with us to help garma guard the world?"
    }
"""

## Load the data

### Subtask:
Ensure the `combined_refined_pitches_(245).json` file (the file the code cell below actually loads) is loaded into a pandas DataFrame or a list of dictionaries.


## Load the data

### Subtask:
Retry loading the pitches file into a list of dictionaries, this time from `combined_refined_pitches_(245).json`. The previous attempt failed because the file was not found. Ensure the file path is correct and the file exists before attempting to load it.


In [None]:
import os
import json

# Combined refined-pitch export loaded by this cell (presumably a JSON array of
# {"Product", "Full_Pitch"} objects, as in the sample item shown above).
file_path = "combined_refined_pitches_(245).json"

if os.path.exists(file_path):
    # Read as UTF-8 explicitly: the platform default encoding is not guaranteed
    # to be UTF-8, and every other reader/writer in this notebook uses UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        pitches_data = json.load(f)
    print(f"Loaded {len(pitches_data)} pitch entries.")
else:
    # Keep the name defined (as an empty list) so later cells can reference it.
    pitches_data = []
    print(f"File not found: {file_path}")

Loaded 245 pitch entries.


## Initialize the gemini model

### Subtask:
Set up the Gemini 2.5 Flash model with the API key.


# This was our first attempt at preparing the dataset. It made the task too easy for the models because the input was too rich.

In [None]:
# --- Setup & imports ---
import os, json, time, typing as t
from pathlib import Path
import uuid

from google import genai
from google.genai import types
from pydantic import BaseModel, ValidationError

# NEW: try tqdm for progress bar
try:
    from tqdm import tqdm  # type: ignore
    _HAS_TQDM = True
except Exception:
    _HAS_TQDM = False

# Optional for Colab: read key from the notebook's Secrets
try:
    from google.colab import userdata  # type: ignore
    os.environ.setdefault("GEMINI_API_KEY", userdata.get("GEMINI_API_KEY") or "")
except Exception:
    pass


# --- Pydantic schema (lean, aligned to your example) ---
# The founders' opening ask. Both values are kept as free-form strings exactly as
# the model returns them; nothing here parses them into numbers.
class InitialOffer(BaseModel):
    amount: str  # requested investment amount
    equity: str  # equity offered in exchange

# Problem narrative extracted from a pitch transcript.
# NOTE(review): `hygiene_gap` looks tailored to the example pitch this schema was
# modelled on — confirm it is meaningful for non-hygiene products.
class ProblemStory(BaseModel):
    persona: str
    routine: list[str]
    core_problem: str
    hygiene_gap: str
    problem_keywords: list[str]

# Description of the pitched product: identity, positioning, usage, and keyword
# lists for its features and benefits.
class ProductSolution(BaseModel):
    name: str
    product_category: str
    key_differentiator: str
    application: str
    features_keywords: list[str]
    benefits_keywords: list[str]

# Closing portion of the pitch: the ask to the investors, the stated mission, and
# the intended audience (all free-form strings).
class ClosingTheme(BaseModel):
    call_to_action: str
    mission: str
    target_audience: str

# All structured facts extracted from one pitch. This object becomes the "input"
# field of each dataset record written by process_refined_pitches.
class PitchFacts(BaseModel):
    founders: list[str]
    company_name: str  # overwritten with the canonical name when the source item provides one
    initial_offer: InitialOffer
    problem_story: ProblemStory
    product_solution: ProductSolution
    closing_theme: ClosingTheme

# Top-level response schema handed to the model; wraps the facts under the
# "pitch_facts" key, matching the JSON shape spelled out in PROMPT_TEMPLATE.
class PitchData(BaseModel):
    pitch_facts: PitchFacts


# --- Prompt template ---
# Prompt sent to the model for the first (rich) extraction pass.
# It is rendered with str.format, so literal JSON braces are doubled ({{ }}) and
# only two placeholders are substituted: {company_name_constraint} and {pitch_text}.
# The schema text mirrors the PitchData pydantic model; keep the two in sync.
PROMPT_TEMPLATE = """You are given a startup pitch transcript.

Return ONLY JSON matching this schema (no markdown, no commentary):
PitchData = {{
  "pitch_facts": {{
    "founders": [str],
    "company_name": str,
    "initial_offer": {{"amount": str, "equity": str}},
    "problem_story": {{
      "persona": str,
      "routine": [str],
      "core_problem": str,
      "hygiene_gap": str,
      "problem_keywords": [str]
    }},
    "product_solution": {{
      "name": str,
      "product_category": str,
      "key_differentiator": str,
      "application": str,
      "features_keywords": [str],
      "benefits_keywords": [str]
    }},
    "closing_theme": {{
      "call_to_action": str,
      "mission": str,
      "target_audience": str
    }}
  }}
}}

Constraints:
{company_name_constraint}

Transcript:
{pitch_text}
"""

def _build_prompt(pitch_text: str, exact_company_name: t.Optional[str]) -> str:
    """Render PROMPT_TEMPLATE for a single transcript.

    Args:
        pitch_text: Transcript inserted under "Transcript:".
        exact_company_name: Canonical company name, if known. When it is None,
            empty, or whitespace-only, the prompt instead tells the model to
            infer the name from the transcript.

    Returns:
        The fully formatted prompt string.
    """
    if not (exact_company_name and exact_company_name.strip()):
        # No usable canonical name: let the model infer it.
        company_rule = "- If company_name is unclear, infer from the transcript."
    else:
        company_rule = (
            f'- Use EXACTLY this string for "company_name": "{exact_company_name}". '
            'Do not change casing, spacing, or punctuation.'
        )

    return PROMPT_TEMPLATE.format(
        company_name_constraint=company_rule,
        pitch_text=pitch_text,
    )


def _safe_write_jsonl(path: t.Union[str, Path], obj: dict) -> None:
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")


def _call_model_with_retries(
    client: genai.Client,
    model: str,
    contents: str,
    schema_type: t.Type[BaseModel],
    max_retries: int = 5,
    base_delay: float = 1.0,
    thinking_budget: int = 0,
) -> str:
    """Call `client.models.generate_content` and return the response text.

    The request asks for JSON output constrained to `schema_type`. Failed
    attempts are retried with exponential backoff
    (base_delay, 2*base_delay, 4*base_delay, ...).

    Args:
        client: google-genai client used to issue the request.
        model: Model name passed through to the API.
        contents: Prompt text.
        schema_type: Pydantic model class used as the response schema.
        max_retries: Number of retries after the first failed attempt.
        base_delay: Sleep (seconds) before the first retry; doubles each retry.
        thinking_budget: Value forwarded to `ThinkingConfig(thinking_budget=...)`.

    Returns:
        The non-empty response text (expected to be a JSON document).

    Raises:
        Exception: The last error once `max_retries` retries are exhausted,
            including a RuntimeError when the model keeps returning no text.
            Errors raised while building the request config are not retried.
    """
    # The request config does not change between attempts; build it once so a
    # deterministic configuration error surfaces immediately instead of being
    # retried (and slept on) max_retries times.
    config = types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=schema_type,
        thinking_config=types.ThinkingConfig(thinking_budget=thinking_budget),
    )

    attempt = 0
    while True:
        try:
            resp = client.models.generate_content(
                model=model,
                contents=contents,
                config=config,
            )
            text = resp.text
            if not text:
                # `resp.text` may be None/empty; honour the `-> str` contract
                # by treating that as a failed attempt.
                raise RuntimeError("Model returned an empty response (no text).")
            return text
        except Exception:
            attempt += 1
            if attempt > max_retries:
                raise
            time.sleep(base_delay * (2 ** (attempt - 1)))


def process_refined_pitches(
    input_json_path: t.Union[str, Path] = "refined_pitches.json",
    output_jsonl_path: t.Union[str, Path] = "parsed_pitches.jsonl",
    errors_jsonl_path: t.Union[str, Path] = "errors.jsonl",
    pitch_text_key: str = "Full_Pitch",
    company_name_key: str = "Product",
    model: str = "gemini-2.5-flash",
    start_index: int = 0,
    limit: t.Optional[int] = None,
    thinking_budget: int = 0,
    use_tqdm: bool = True,            # NEW: control loading bar
) -> dict:
    """
    Extract structured pitch facts for each item of a JSON list and write a JSONL dataset.

    Each successfully processed item produces one line in `output_jsonl_path`:
    {
      "id": "<random uuid4>",
      "_meta_index": idx,
      "input": { ... pitch_facts ... },
      "output": "<original Full_Pitch>"
    }
    Items that cannot be processed produce one line in `errors_jsonl_path` instead.

    Args:
        input_json_path: JSON file containing a list of objects.
        output_jsonl_path: Destination JSONL for successful records.
        errors_jsonl_path: Destination JSONL for per-item error records.
        pitch_text_key: Key of the transcript text inside each item.
        company_name_key: Key of the canonical company name inside each item;
            when present it overrides whatever the model returned.
        model: Model name passed to the API.
        start_index: Index of the first item to process.
        limit: Maximum number of items to process (None = through the end).
        thinking_budget: Forwarded to the model call.
        use_tqdm: Show a tqdm progress bar when tqdm is importable; otherwise
            one progress line is printed per item.

    Returns:
        Summary dict with the input path, processed/success/failure counts and
        the resolved output and error file paths.

    Raises:
        FileNotFoundError: `input_json_path` does not exist.
        ValueError: The input JSON is not a list.
        EnvironmentError: Neither GEMINI_API_KEY nor GOOGLE_API_KEY is set.

    Note:
        Both output files are truncated at the start of every call, even when
        `start_index` > 0, so a partial re-run overwrites earlier results.
    """
    input_path = Path(input_json_path)
    if not input_path.exists():
        raise FileNotFoundError(f"Input not found: {input_path.resolve()}")

    with input_path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("Input JSON must be a list of objects.")

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY") or ""
    if not api_key:
        raise EnvironmentError("No GEMINI_API_KEY (or GOOGLE_API_KEY) found in environment.")
    # Hand the validated key to the client explicitly so the key that was
    # checked above is the key that is actually used.
    client = genai.Client(api_key=api_key)

    processed = successes = failures = 0
    end_index = len(data) if limit is None else min(len(data), start_index + limit)
    total = end_index - start_index

    # truncate output files (create parent directories first so this cannot
    # fail where the later appends in _safe_write_jsonl would have succeeded)
    for out_path in (Path(output_jsonl_path), Path(errors_jsonl_path)):
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text("", encoding="utf-8")

    # choose iterator (with or without tqdm)
    show_bar = use_tqdm and _HAS_TQDM
    if show_bar:
        iterator = tqdm(range(start_index, end_index), desc="Processing pitches", unit="pitch")
    else:
        iterator = range(start_index, end_index)

    for idx in iterator:
        item = data[idx]
        processed += 1
        # Reset per item so the error handlers below never see an unbound name
        # or the previous item's response.
        raw = None

        if not isinstance(item, dict):
            # A non-object entry would otherwise crash on `.get` outside the
            # try block and abort the whole run.
            failures += 1
            _safe_write_jsonl(errors_jsonl_path, {
                "index": idx,
                "reason": "Item is not a JSON object",
                "item_type": type(item).__name__,
            })
            if not show_bar:
                print(f"[{processed}/{total}] pitch {idx} failed (not an object)")
            continue

        pitch_text = item.get(pitch_text_key) or ""
        if not isinstance(pitch_text, str) or not pitch_text.strip():
            failures += 1
            _safe_write_jsonl(errors_jsonl_path, {
                "index": idx,
                "reason": f"Missing or empty '{pitch_text_key}'",
                "item_keys": list(item.keys())
            })
            # fallback progress print if no tqdm
            if not show_bar:
                print(f"[{processed}/{total}] pitch {idx} failed (empty pitch)")
            continue

        # pull canonical company name if present
        exact_company_name = item.get(company_name_key)
        if isinstance(exact_company_name, str):
            exact_company_name = exact_company_name.strip()
        else:
            exact_company_name = None

        try:
            prompt = _build_prompt(pitch_text, exact_company_name)
            raw = _call_model_with_retries(
                client=client,
                model=model,
                contents=prompt,
                schema_type=PitchData,
                thinking_budget=thinking_budget,
            )

            parsed: PitchData = PitchData.model_validate_json(raw)
            as_dict = parsed.model_dump()

            # Hard-enforce exact company name if provided
            if exact_company_name:
                as_dict["pitch_facts"]["company_name"] = exact_company_name

            record = {
                "id": str(uuid.uuid4()),
                "_meta_index": idx,
                "input": as_dict["pitch_facts"],
                "output": pitch_text
            }

            _safe_write_jsonl(output_jsonl_path, record)
            successes += 1

        except (json.JSONDecodeError, ValidationError) as e:
            failures += 1
            _safe_write_jsonl(errors_jsonl_path, {
                "index": idx,
                "error_type": "SchemaOrJSON",
                "message": str(e),
                "raw_response_excerpt": (raw[:500] if isinstance(raw, str) else None),
            })
        except Exception as e:
            failures += 1
            _safe_write_jsonl(errors_jsonl_path, {
                "index": idx,
                "error_type": "Unhandled",
                "message": str(e),
            })

        # fallback progress print if tqdm not available
        if not show_bar:
            print(f"[{processed}/{total}] processed pitch {idx} (ok={successes}, fail={failures})")

    summary = {
        "input": str(input_path),
        "processed": processed,
        "successes": successes,
        "failures": failures,
        "output_jsonl": str(Path(output_jsonl_path).resolve()),
        "errors_jsonl": str(Path(errors_jsonl_path).resolve()),
    }
    print("Done.\n", json.dumps(summary, indent=2))
    return summary

In [None]:
# --- Example call ---
# Run the first-attempt extraction over the full combined pitch file.
# `company_name_key` is left at its default ("Product"), so each record's
# company_name is forced to the item's "Product" value.
summary = process_refined_pitches(
    input_json_path="combined_refined_pitches_(245).json",
    output_jsonl_path="parsed_pitches.jsonl",
    errors_jsonl_path="errors.jsonl",
    pitch_text_key="Full_Pitch",
    model="gemini-2.5-flash",   # model name forwarded to the API; change here to switch models
    start_index=0,
    limit=None,                  # None = process every item; an int processes only that many from start_index
    use_tqdm=True,           # show a tqdm progress bar when tqdm is available
    thinking_budget=0            # forwarded to ThinkingConfig; set > 0 for a model that requires thinking
)


Processing pitches: 100%|██████████| 245/245 [10:35<00:00,  2.59s/pitch]

Done.
 {
  "input": "combined_refined_pitches_(245).json",
  "processed": 245,
  "successes": 245,
  "failures": 0,
  "output_jsonl": "/content/parsed_pitches.jsonl",
  "errors_jsonl": "/content/errors.jsonl"
}





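# Split the dataset into a train set and a test set (80/20)

Note: this cell reads `transformed.jsonl`, which is written by the final cells of this notebook, so those cells must be run first.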

In [None]:
import json
import random

# --- Configuration ---
input_filename = "transformed.jsonl"
train_filename = "train.jsonl"
test_filename = "test.jsonl"
train_percentage = 0.8
# ---------------------

all_data = []

# 1. Read all data from the input file
try:
    with open(input_filename, 'r', encoding='utf-8') as infile:
        for line in infile:
            # Skip any empty lines
            if line.strip():
                try:
                    # Check that it's valid JSON
                    json.loads(line)
                    all_data.append(line)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping malformed JSON line: {line.strip()}")

    if not all_data:
        print(f"❌ Error: No data found in '{input_filename}'.")
        print("Please make sure the file is not empty.")
    else:
        # 2. Shuffle the data
        print(f"Read {len(all_data)} total records. Shuffling...")
        random.shuffle(all_data)

        # 3. Calculate the split point
        split_index = int(len(all_data) * train_percentage)

        # 4. Create the train and test lists
        train_data = all_data[:split_index]
        test_data = all_data[split_index:]

        # 5. Write the train.jsonl file
        with open(train_filename, 'w', encoding='utf-8') as outfile:
            for line in train_data:
                outfile.write(line) # line already includes a newline

        print(f"\n✅ Wrote {len(train_data)} records to {train_filename}")

        # 6. Write the test.jsonl file
        with open(test_filename, 'w', encoding='utf-8') as outfile:
            for line in test_data:
                outfile.write(line) # line already includes a newline

        print(f"✅ Wrote {len(test_data)} records to {test_filename}")
        print("\nAll done!")

except FileNotFoundError:
    print(f"❌ Error: File not found: '{input_filename}'")
    print("Please make sure you have uploaded the file to your Colab session.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Read 245 total records. Shuffling...

✅ Wrote 196 records to train.jsonl
✅ Wrote 49 records to test.jsonl

All done!


# We realised that the founders' names should be removed from the input fields of the dataset (PII cleaning)

# Task
Replace the "founders" list in each record of the "sharktank_dataset.jsonl" file with a list of strings "Founder 1", "Founder 2", etc., based on the original number of founders, and save the modified data to a new file named "main_modified.jsonl".

## Read data

### Subtask:
Read the data from the `sharktank_dataset.jsonl` file into a list of dictionaries.


In [None]:
import json

# Source JSONL for the anonymisation step; each valid line becomes one dict.
train_filename = "sharktank_dataset.jsonl"
train_data = []

try:
    with open(train_filename, 'r', encoding='utf-8') as infile:
        for raw_line in infile:
            if not raw_line.strip():
                # blank line: nothing to parse
                continue
            try:
                parsed_record = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"Warning: Skipping malformed JSON line: {raw_line.strip()}")
            else:
                train_data.append(parsed_record)

    print(f"Read {len(train_data)} records from {train_filename}")

except FileNotFoundError:
    print(f"❌ Error: File not found: '{train_filename}'")
    print("Please make sure the file exists.")
except Exception as e:
    # Report rather than raise so the notebook keeps running.
    print(f"An unexpected error occurred: {e}")

Read 245 records from sharktank_dataset.jsonl


In [None]:
# Anonymise founder names: replace each record's input["founders"] list with
# placeholders "Founder 1", "Founder 2", ... of the same length.
# Records are copied (shallowly, at the two levels that change) rather than
# mutated, so `train_data` keeps the original names and this cell can be
# re-run safely. Only the "input" side is touched; the "output" transcript is
# left as is.
modified_train_data = []
anonymized_count = 0

for record in train_data:
    record_input = record.get('input') if isinstance(record, dict) else None
    if isinstance(record_input, dict) and isinstance(record_input.get('founders'), list):
        num_founders = len(record_input['founders'])
        placeholders = [f"Founder {i+1}" for i in range(num_founders)]
        record = {**record, 'input': {**record_input, 'founders': placeholders}}
        anonymized_count += 1
    # Records without a usable founders list are passed through unchanged.
    modified_train_data.append(record)

# Report how many records were actually rewritten, not just how many were seen.
print(f"Modified 'founders' for {anonymized_count} records.")

Modified 'founders' for 245 records.


In [None]:
# Destination for the founder-anonymised records (one JSON object per line).
output_filename = "main_modified.jsonl"

try:
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        for record in modified_train_data:
            serialized = json.dumps(record)
            outfile.write(f"{serialized}\n")

    print(f"Successfully wrote modified data to {output_filename}")

except Exception as e:
    # Report the failure instead of raising so the notebook keeps running.
    print(f"An error occurred while writing the file: {e}")

Successfully wrote modified data to main_modified.jsonl


# Task
Transform the data from the "sharktank_anon_dataset.jsonl" file to match the "To-Be" schema provided in the user message, using the `gemini-2.5-flash` model.

## Load data

### Subtask:
Load the data from the `sharktank_anon_dataset.jsonl` file into a list of dictionaries.


In [None]:
import json

# Load the anonymised dataset: every non-blank, parseable line becomes one dict.
input_filename = "sharktank_anon_dataset.jsonl"
all_data = []

try:
    with open(input_filename, 'r', encoding='utf-8') as infile:
        for jsonl_line in infile:
            stripped = jsonl_line.strip()
            if not stripped:
                continue
            try:
                all_data.append(json.loads(jsonl_line))
            except json.JSONDecodeError:
                print(f"Warning: Skipping malformed JSON line: {stripped}")

    print(f"Read {len(all_data)} records from {input_filename}")

except FileNotFoundError:
    print(f"❌ Error: File not found: '{input_filename}'")
except Exception as e:
    # Report rather than raise so the notebook keeps running.
    print(f"An unexpected error occurred: {e}")

Read 245 records from sharktank_anon_dataset.jsonl


# Task
Process the "sharktank_anon_dataset.jsonl" file by extracting the "output" field (pitch), and using the Gemini 2.5 Flash model with thinking set to 0, extract information from the pitch to match the following JSON schema:

```json
{
  "company": "Frobert",
  "founder": ["Founder 1", "Founder 2"],
  "offer": "125000 for 20%",
  "problem_summary": "Billions of pounds of cosmetically imperfect produce are wasted every year, hurting farm profitability and sustainability.",
  "solution_summary": "Frobert turns these misfit fruits and vegetables into premium frozen desserts that reduce waste and create new revenue streams."
}
```

Save the results to a new JSONL file.

## Define target schema

### Subtask:
Define the Pydantic schema for the desired output format.


In [None]:
from pydantic import BaseModel
import typing as t

# Leaner target schema for the second extraction pass; used both as the
# response schema for the model and to validate its JSON reply.
class SharkTankFacts(BaseModel):
    company: str  # later overwritten with the canonical company_name from the source dataset
    founder: t.List[str]  # anonymised placeholders ("Founder 1", ...) per the prompt instructions
    offer: str  # free-form "Amount for Equity" string, as requested in the prompt
    problem_summary: str
    solution_summary: str

## Define prompt template

### Subtask:
Create a prompt template for the Gemini model that includes instructions and the target schema.


In [None]:
# Prompt for the second (lean, anonymised) extraction pass.
# Rendered with str.format: literal JSON braces are doubled ({{ }}) and the only
# placeholder is {pitch_text}. The listed keys mirror the SharkTankFacts model.
PROMPT_TEMPLATE_ANON = """You are given a startup pitch transcript.

Extract the following information from the transcript and return it as a JSON object matching the schema below.

Return ONLY JSON matching this schema (no markdown, no commentary):
{{
  "company": str,
  "founder": [str],
  "offer": str,
  "problem_summary": str,
  "solution_summary": str
}}

Instructions:
- For "company", extract the name of the company being pitched.
- For "founder", create a list of strings representing the founders. Replace the actual names with "Founder 1", "Founder 2", etc., based on the number of founders mentioned.
- For "offer", extract the initial investment amount and equity offered in the format "Amount for Equity" (e.g., "100,000 for 10%").
- For "problem_summary", provide a concise summary of the problem the startup is trying to solve, based on the pitch.
- For "solution_summary", provide a concise summary of the startup's product or service that addresses the problem, based on the pitch.

Transcript:
{pitch_text}
"""

## Process data

### Subtask:
Iterate through the loaded data, extract the pitch text from the "output" field, call the Gemini model with the prompt and pitch text, and parse the model's JSON response according to the target schema.


In [None]:
import os
import json
from google import genai
from pydantic import ValidationError
import typing as t
from google.colab import userdata
import uuid
from tqdm import tqdm  # <-- progress bar

# Second extraction pass: for every record in `all_data`, send its "output"
# transcript to the model and collect {"id", "input", "output"} records whose
# "input" follows the SharkTankFacts schema.
processed_data = []
# Ids of records that were skipped or failed. A later cell merges
# `processed_data` back against `all_data`, so dropped records must be visible.
failed_record_ids = []


def _get_colab_secret(name: str) -> str:
    """Return the Colab secret `name`, or "" when it cannot be read.

    NOTE(review): `userdata.get` is understood to raise (rather than return
    None) when the secret is missing or access was not granted — confirm
    against the Colab docs. Swallowing that here is deliberate so the
    environment-variable fallbacks below still get a chance.
    """
    try:
        return userdata.get(name) or ""
    except Exception:
        return ""


api_key = (
    _get_colab_secret("GEMINI_API_KEY")
    or os.environ.get("GEMINI_API_KEY")
    or os.environ.get("GOOGLE_API_KEY")
    or ""
)

if not api_key:
    print("❌ Error: No GEMINI_API_KEY (or GOOGLE_API_KEY) found in environment.")
else:
    client = genai.Client(api_key=api_key)
    model_name = "gemini-2.5-flash"  # Using the specified model

    # tqdm progress bar for all_data
    for record in tqdm(all_data, desc="Processing pitches", unit="pitch"):
        record_id = record.get('id', 'N/A') if isinstance(record, dict) else 'N/A'
        try:
            pitch_text = record.get("output", "")
            if not pitch_text:
                tqdm.write(f"⚠️ Skipping record with no pitch text: {record_id}")
                failed_record_ids.append(record_id)
                continue

            prompt = PROMPT_TEMPLATE_ANON.format(pitch_text=pitch_text)

            # Call the Gemini model through the shared retry helper defined in
            # the first-attempt cell, so transient API errors are retried with
            # backoff instead of silently dropping the record.
            raw_json = _call_model_with_retries(
                client=client,
                model=model_name,
                contents=prompt,
                schema_type=SharkTankFacts,
                thinking_budget=0,
            )

            # Parse the model's JSON response
            parsed_data = SharkTankFacts.model_validate_json(raw_json)

            # Create the new dictionary
            processed_record = {
                "id": record.get("id", str(uuid.uuid4())),  # Use existing id or generate new one
                "input": parsed_data.model_dump(),
                "output": pitch_text
            }
            processed_data.append(processed_record)

        except (json.JSONDecodeError, ValidationError) as e:
            failed_record_ids.append(record_id)
            tqdm.write(f"❌ Error processing record {record_id}: JSON/validation error - {e}")
        except Exception as e:
            failed_record_ids.append(record_id)
            tqdm.write(f"❌ Unexpected error for record {record_id}: {e}")

    print(f"\n✅ Successfully processed {len(processed_data)} records out of {len(all_data)}.")
    if failed_record_ids:
        print(f"⚠️ {len(failed_record_ids)} records were skipped or failed: {failed_record_ids}")

Processing pitches: 100%|██████████| 245/245 [05:28<00:00,  1.34s/pitch]


✅ Successfully processed 245 records out of 245.





In [None]:
import json

# Reload the anonymised source dataset so its canonical company names can be
# copied onto the freshly extracted records in the next cell.
all_data = []
input_filename = "sharktank_anon_dataset.jsonl"

try:
    with open(input_filename, 'r', encoding='utf-8') as infile:
        for line_text in infile:
            if line_text.strip() == "":
                continue
            try:
                source_record = json.loads(line_text)
            except json.JSONDecodeError:
                print(f"Warning: Skipping malformed JSON line: {line_text.strip()}")
                continue
            all_data.append(source_record)

    print(f"Read {len(all_data)} records from {input_filename}")

except FileNotFoundError:
    print(f"❌ Error: File not found: '{input_filename}'")
except Exception as e:
    # Report rather than raise so the notebook keeps running.
    print(f"An unexpected error occurred: {e}")

Read 245 records from sharktank_anon_dataset.jsonl


In [None]:
# Overwrite the model-extracted "company" with the canonical "company_name"
# from the source dataset.
# Records are matched on "id" rather than list position: `processed_data` can be
# shorter than `all_data` when records were skipped or failed upstream, and a
# positional join would then attach the wrong company to every later record.
canonical_company_by_id = {}
for source_record in all_data:
    source_input = source_record.get("input")
    if "id" in source_record and isinstance(source_input, dict) and "company_name" in source_input:
        canonical_company_by_id[source_record["id"]] = source_input["company_name"]

# Positional matching is only trustworthy when nothing was dropped.
positions_aligned = len(processed_data) == len(all_data)
unmatched_ids = []

for idx, processed_record in enumerate(processed_data):
    record_id = processed_record.get("id")
    if record_id in canonical_company_by_id:
        company_name = canonical_company_by_id[record_id]
    elif positions_aligned:
        # Source records without an id get a fresh uuid upstream, so fall back
        # to the original positional lookup when the lists line up one-to-one.
        company_name = all_data[idx]["input"]["company_name"]
    else:
        unmatched_ids.append(record_id)
        continue
    processed_record["input"]["company"] = company_name

if unmatched_ids:
    print(f"Warning: no canonical company name found for {len(unmatched_ids)} records: {unmatched_ids}")


In [None]:
# Final dataset file: one {"id", "input", "output"} JSON object per line.
output_filename = "transformed.jsonl"

try:
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        for record in processed_data:
            json_line = json.dumps(record)
            outfile.write(json_line + '\n')

    print(f"Successfully wrote transformed data to {output_filename}")

except Exception as e:
    # Report the failure instead of raising so the notebook keeps running.
    print(f"An error occurred while writing the file: {e}")

Successfully wrote transformed data to transformed.jsonl


## Summary:

### Data Analysis Key Findings

*   Successfully read 245 records from the input file "sharktank_anon_dataset.jsonl".
*   No records were skipped during transformation in this run: all 245 records were processed successfully (see the cell output above).
*   Successfully transformed the data according to the specified structure.
*   Successfully wrote the transformed data to the output file "transformed.jsonl".

### Insights or Next Steps

*   If a future run skips or fails any records, investigate them to understand why (for example, missing 'input' or 'output' keys) and determine if they should be handled differently or excluded.
*   Verify the content of the transformed data in "transformed.jsonl" to ensure the canonical `company_name` from the source dataset was correctly written to the `company` key of the new `input` field for all processed records.
