# Shark Tank Pitch Analyzer — Runnable Notebook

This notebook includes audio feature extraction, a transcript-based content scorer, and a simple persona feedback generator. Run cells top-to-bottom. See instructions inside for optional Whisper/LLM steps (Colab required for model downloads).

In [None]:
# Imports (install packages in Colab if needed)
import os, json, math
import numpy as np
from pathlib import Path
print('Basic imports ready.')

## 1) Upload or point to an audio file
Set `AUDIO_PATH` to your audio file path (WAV/MP3).

In [None]:
AUDIO_PATH = '/mnt/data/sample_pitch.wav'  # change as needed
# Report whether the configured file is present; nothing is loaded in this cell.
print('Found audio:' if os.path.exists(AUDIO_PATH) else 'No audio found at', AUDIO_PATH)

## 2) Audio loading & preprocessing
This cell loads the audio as 16 kHz mono and prints its duration (requires librosa; soundfile and matplotlib are imported as well, but no waveform plot is produced here).

In [None]:
try:
    import librosa, soundfile as sf, matplotlib.pyplot as plt

    def load_audio(path, sr=16000):
        """Load ``path`` through ``librosa.load`` with ``sr=sr`` and ``mono=True``.

        Returns a ``(samples, sr, duration_seconds)`` tuple, where the
        duration is the number of samples divided by ``sr``.
        """
        samples, _ = librosa.load(path, sr=sr, mono=True)
        return samples, sr, len(samples) / sr

    if not os.path.exists(AUDIO_PATH):
        print('No audio to load.')
    else:
        y, sr, duration = load_audio(AUDIO_PATH)
        print(f'Loaded: duration={duration:.2f}s, sr={sr}')
except Exception as err:
    # Any failure inside this cell (import error, unreadable file, ...) is
    # reported as a message instead of being raised.
    print('librosa not available or failed to load:', err)

## 3) Feature extraction
Extract RMS, simple pitch (librosa.yin), and pauses using an energy threshold.

In [None]:
def extract_audio_features(y, sr, hop_length=512, frame_length=2048, min_pause_sec=0.0):
    """Summarise loudness, pitch and pauses of an audio signal.

    Parameters
    ----------
    y : audio samples, passed unchanged to librosa's rms / yin / stft.
    sr : sample rate in Hz; used for pitch estimation and to convert frame
        counts to seconds (``hop_length / sr`` per frame).
    hop_length, frame_length : framing parameters shared by all three analyses.
    min_pause_sec : silent runs shorter than this many seconds are ignored.
        The default of 0.0 keeps every run, including single-frame dips.

    Returns
    -------
    dict with keys ``rms_mean``, ``f0_mean``, ``f0_std``, ``num_pauses`` and
    ``mean_pause`` (seconds; 0.0 when no pause was found).
    """
    import warnings
    import numpy as np, librosa

    rms = librosa.feature.rms(y=y, hop_length=hop_length, frame_length=frame_length)[0]
    rms_mean = float(np.mean(rms))

    try:
        f0 = librosa.yin(y, fmin=50, fmax=500, sr=sr, frame_length=frame_length, hop_length=hop_length)
        f0_valid = f0[~np.isnan(f0)]
        f0_mean = float(np.mean(f0_valid)) if len(f0_valid) > 0 else 0.0
        f0_std = float(np.std(f0_valid)) if len(f0_valid) > 0 else 0.0
    except Exception as exc:
        # Pitch is best-effort: fall back to zeros, but say so, and let
        # KeyboardInterrupt / SystemExit propagate (a bare except would not).
        warnings.warn(f'Pitch estimation failed; f0 statistics set to 0.0 ({exc})')
        f0_mean = f0_std = 0.0

    # energy frames for pause detection
    S = np.abs(librosa.stft(y, n_fft=frame_length, hop_length=hop_length))
    energies = np.mean(S**2, axis=0)
    frame_dur = hop_length / sr
    pauses = []
    if energies.size:
        # A frame is "silent" when its mean power is below 2% of the loudest frame.
        silent = energies < 0.02 * np.max(energies)
        # Pad with zeros so every silent run has a rising (+1) and falling (-1) edge.
        edges = np.diff(np.concatenate(([0], silent.astype(np.int8), [0])))
        run_starts = np.flatnonzero(edges == 1)
        run_ends = np.flatnonzero(edges == -1)
        pauses = [float(n_frames * frame_dur) for n_frames in (run_ends - run_starts)]
        pauses = [p for p in pauses if p >= min_pause_sec]

    return {
        'rms_mean': rms_mean,
        'f0_mean': f0_mean,
        'f0_std': f0_std,
        'num_pauses': len(pauses),
        'mean_pause': float(np.mean(pauses)) if pauses else 0.0,
    }

# Run if audio exists
if not os.path.exists(AUDIO_PATH):
    print('No audio available for feature extraction.')
elif 'load_audio' not in globals():
    # load_audio is only defined when the loading cell's imports succeeded;
    # without this guard the cell would die with a NameError instead of a message.
    print('Audio loader unavailable (librosa failed to import) — skipping feature extraction.')
else:
    y, sr, duration = load_audio(AUDIO_PATH)
    feats = extract_audio_features(y, sr)
    print('Features:', feats)

## 4) Transcription (optional)
You can run Whisper in Colab by uncommenting the cell. Otherwise, paste your transcript in the next cell.

In [None]:
# Optional Whisper pseudocode (uncomment in Colab with internet):
# import whisper
# model = whisper.load_model('small')
# result = model.transcribe(AUDIO_PATH)
# transcript = result['text']

# Fallback: paste transcript here.
# The literal must be opened AND closed with triple quotes; a lone `""""`
# starts a string that never terminates and makes the cell a SyntaxError.
transcript = """
"""  # <-- paste your transcript between the triple quotes
if transcript.strip() == "":
    print('No transcript provided. Paste it into the "transcript" variable.')
else:
    print('Transcript provided — length:', len(transcript.split()), 'words')

## 5) Filler detection & WPM

In [None]:
import re
# Words and phrases counted as verbal fillers (matched case-insensitively).
FILLERS = ['um','uh','you know','like','so','actually','basically','right','i mean']

def detect_fillers(transcript):
    """Count whole-word occurrences of each FILLERS entry in ``transcript``.

    Returns ``(counts, total)``: ``counts`` maps every filler that occurs at
    least once to its number of occurrences (in FILLERS order), and ``total``
    is the sum of those counts.
    """
    lowered = transcript.lower()
    tallies = (
        (phrase, len(re.findall(r'\b' + re.escape(phrase) + r'\b', lowered)))
        for phrase in FILLERS
    )
    counts = {phrase: n for phrase, n in tallies if n > 0}
    return counts, sum(counts.values())

def speech_rate_wpm(transcript, seconds):
    """Return the speaking rate of ``transcript`` in words per minute.

    A word is a run of word characters, optionally joined by apostrophes, so
    contractions such as "don't" count once instead of being split in two.
    Returns 0.0 when ``seconds`` is None, zero or negative.
    """
    if seconds is None or seconds <= 0:
        return 0.0
    # Accept both the ASCII and the typographic apostrophe inside a word.
    words = re.findall(r"\w+(?:['’]\w+)*", transcript)
    return (len(words)/seconds)*60.0

# Both a non-empty transcript and a known audio duration are required here.
if transcript.strip() == "" or 'duration' not in globals():
    print('No transcript or audio to compute WPM/fillers.')
else:
    fillers, total_fillers = detect_fillers(transcript)
    wpm = speech_rate_wpm(transcript, duration)
    print('WPM:', round(wpm,1))
    print('Fillers total:', total_fillers, fillers)

## 6) Content scoring (simple heuristic or embeddings if installed)

In [None]:
# Keyword lists per business category; each category is scored 0..1 from
# how many of its keywords appear in the transcript.
categories = dict(
    problem_clarity=['problem', 'pain', 'need', 'users', 'customers', 'issue'],
    product_differentiation=['different', 'unique', 'novel', 'patent', 'proprietary'],
    business_model=['revenue', 'price', 'pricing', 'subscribe', 'subscription', 'sell', 'monetize'],
    market_opportunity=['market', 'tam', 'sam', 'som', 'segment', 'growth', 'size'],
    competition_awareness=['competitor', 'competition', 'alternative', 'vs', 'existing'],
    revenue_logic=['price', 'margin', 'cogs', 'cost', 'lifetime value', 'cac', 'ltv'],
)

def keyword_score(transcript, keywords):
    """Return the fraction (0..1) of ``keywords`` found in ``transcript``.

    Matching is case-insensitive and on whole words, with an optional plural
    "s"/"es" suffix, so 'som' no longer matches "some" and 'vs' no longer
    matches "tvs", while 'competitor' still matches "competitors".
    Multi-word keywords tolerate any whitespace between their words.
    Returns 0.0 for a blank transcript or an empty keyword list.
    """
    if transcript.strip() == "" or not keywords:
        return 0.0
    text = transcript.lower()
    hits = 0
    for kw in keywords:
        tokens = kw.lower().split()
        if not tokens:
            # A blank keyword can never be a hit.
            continue
        pattern = r'\b' + r'\s+'.join(re.escape(tok) for tok in tokens) + r'(?:s|es)?\b'
        if re.search(pattern, text):
            hits += 1
    return min(1.0, hits/len(keywords))

def compute_business_scores(transcript):
    """Score ``transcript`` against every entry of ``categories``.

    Returns a dict mapping each category name to the ``keyword_score`` of the
    transcript for that category's keyword list.
    """
    scores = {}
    for category, keywords in categories.items():
        scores[category] = keyword_score(transcript, keywords)
    return scores

# Business scoring only makes sense when a transcript was supplied.
if transcript.strip() == "":
    print('No transcript for business scoring.')
else:
    biz_scores = compute_business_scores(transcript)
    print('Business component scores:')
    for component, score in biz_scores.items():
        print(f'  {component}: {score:.2f}')

## 7) Aggregate Delivery & Business Score

In [None]:
def compute_delivery_score(feats, wpm=None, filler_count=0):
    """Combine audio features, pace and filler usage into a 0-100 delivery score.

    ``feats`` is a dict read via ``.get`` for 'num_pauses', 'rms_mean' and
    'f0_std' (missing keys count as zero). ``wpm`` of None yields a neutral
    clarity of 0.5; otherwise clarity falls off linearly with the distance
    from 135 words per minute. The result is the weighted sum
    0.4*clarity + 0.35*confidence + 0.15*energy + 0.10*pitch_var, times 100.
    """
    def unit(value):
        # Clamp to the closed interval [0, 1].
        return max(0.0, min(1.0, value))

    if wpm is None:
        clarity = 0.5
    else:
        clarity = unit(1 - abs(wpm-135)/100)

    # Pauses and fillers each saturate at 10 occurrences.
    pause_penalty = min(1.0, feats.get('num_pauses', 0) / 10.0)
    filler_penalty = min(1.0, filler_count / 10.0)
    confidence = max(0.0, 1.0 - 0.6*pause_penalty - 0.4*filler_penalty)

    energy = unit((feats.get('rms_mean', 0.0) - 0.001) / 0.05)
    pitch_var = unit(feats.get('f0_std', 0.0) / 50.0)

    weighted = 0.4*clarity + 0.35*confidence + 0.15*energy + 0.10*pitch_var
    return float(weighted*100)

def compute_business_score(biz_scores):
    """Collapse per-category scores into one weighted 0-100 business score.

    Categories absent from ``biz_scores`` contribute 0; keys without a weight
    are ignored.
    """
    weights = {
        'problem_clarity': 0.15,
        'product_differentiation': 0.15,
        'business_model': 0.20,
        'market_opportunity': 0.20,
        'competition_awareness': 0.10,
        'revenue_logic': 0.20,
    }
    weighted_total = sum(
        (biz_scores.get(category, 0.0) * weight for category, weight in weights.items()),
        0.0,
    )
    return float(weighted_total*100)

# compute
if 'feats' not in globals():
    print('Audio features not present — cannot compute delivery score.')
else:
    # Filler count stays at zero when no transcript was supplied.
    filler_total = 0
    if transcript.strip() != "":
        _, filler_total = detect_fillers(transcript)

    # WPM needs both a transcript and the audio duration; otherwise pass None.
    if transcript.strip()!="" and 'duration' in globals():
        wpm_val = speech_rate_wpm(transcript, duration)
    else:
        wpm_val = None

    delivery_score = compute_delivery_score(feats, wpm=wpm_val, filler_count=filler_total)

    # Business score defaults to 0.0 if the content-scoring cell produced nothing.
    if 'biz_scores' in globals():
        business_score = compute_business_score(biz_scores)
    else:
        business_score = 0.0

    print('Delivery Score:', round(delivery_score,1))
    print('Business Score:', round(business_score,1))

## 8) Shark Panel (template) & Save Report

In [None]:
def persona_feedback(name, delivery_score, business_score, strengths, weaknesses):
    """Render one persona's verdict and bullet lists as a text block.

    Verdict rules: 'Invest' needs business >= 75 and delivery >= 70;
    'Need More Info' needs business >= 60 and delivery >= 50; anything else
    is 'Not Invest'. ``strengths`` and ``weaknesses`` are iterables of
    strings, each rendered as a '- ' bullet.
    """
    def bullets(items):
        return '\n'.join(['- ' + item for item in items])

    verdict = 'Not Invest'
    if business_score >= 60 and delivery_score >= 50:
        verdict = 'Need More Info'
    if business_score >= 75 and delivery_score >= 70:
        verdict = 'Invest'

    header = f'Persona: {name}\nVerdict: {verdict}\nDelivery={delivery_score:.1f}, Business={business_score:.1f}\n'
    strengths_part = 'Strengths:\n' + bullets(strengths)
    weaknesses_part = '\nWeaknesses:\n' + bullets(weaknesses)
    return header + strengths_part + weaknesses_part

if 'delivery_score' in globals():
    # Every persona receives the same strengths/weaknesses, derived from a
    # single 65-point threshold on each score.
    strengths=[]; weaknesses=[]
    if delivery_score>65: strengths.append('Good delivery')
    else: weaknesses.append('Delivery needs improvement')
    if business_score>65: strengths.append('Strong business content')
    else: weaknesses.append('Business content needs more detail')
    panel = {p: persona_feedback(p, delivery_score, business_score, strengths, weaknesses) for p in ['Visionary','Finance','Customer Advocate','Skeptic']}
    report = {'delivery_score':delivery_score, 'business_score':business_score, 'features':feats, 'biz_components': biz_scores if 'biz_scores' in globals() else {}, 'panel':panel}
    out_path='/mnt/data/pitch_report.json'
    try:
        # The output directory may not exist (e.g. AUDIO_PATH was pointed elsewhere).
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path,'w', encoding='utf-8') as f:
            json.dump(report,f,indent=2)
        print('Report saved to', out_path)
    except OSError as err:
        # A failed save must not prevent the panel feedback from being shown.
        print('Could not save report to', out_path, '-', err)
    for p,txt in panel.items():
        print('---',p,'---')
        print(txt)
else:
    print('Scores not computed; run previous cells first.')