In [1]:
import pandas as pd

In [2]:
!pip install git+https://github.com/openai/whisper.git


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-m0b3212h
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-m0b3212h
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install ffmpeg



In [5]:
import whisper

def _format_srt_time(seconds):
  """Render an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``)."""
  # Work in whole, rounded milliseconds. Taking int() of the float fraction
  # truncates representation error, e.g. 2.8 s became ",799" instead of ",800".
  total_ms = int(round(seconds * 1000))
  hours, remainder = divmod(total_ms, 3_600_000)
  minutes, remainder = divmod(remainder, 60_000)
  secs, millis = divmod(remainder, 1000)
  return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def subs(index, model_name="base"):
  """Transcribe ``Atul{index}.mp4`` with Whisper and write ``subtitles{index}.srt``.

  Args:
    index: Number substituted into the input video and output subtitle names.
    model_name: Name passed to ``whisper.load_model`` (default ``"base"``).

  Each segment returned by the model becomes one SRT block: a 1-based
  sequence number, a ``start --> end`` timestamp line, the stripped text and
  a blank separator line. Returns ``None``.
  """
  # Load the Whisper model
  model = whisper.load_model(model_name)

  # Transcribe the video file; verbose=True echoes segments while decoding
  result = model.transcribe(f"Atul{index}.mp4", verbose=True)

  # Save subtitles to an SRT file
  with open(f"subtitles{index}.srt", "w", encoding="utf-8") as srt_file:
    for number, segment in enumerate(result["segments"], start=1):
      start = _format_srt_time(segment["start"])
      end = _format_srt_time(segment["end"])
      text = segment["text"].strip()

      srt_file.write(f"{number}\n")
      srt_file.write(f"{start} --> {end}\n")
      srt_file.write(f"{text}\n\n")


In [6]:
import re
from datetime import datetime

def parse_timestamp(ts):
    """Parse an ``HH:MM:SS,mmm`` (or ``HH:MM:SS.mmm``) timestamp into a ``datetime``.

    Surrounding whitespace (including a stray ``\\r``) is ignored, because the
    timestamps come from free-form model output. The date part of the result
    is ``strptime``'s default; only the time-of-day is meaningful.

    Raises:
        ValueError: If the text does not match the expected format.
    """
    normalized = ts.strip().replace('.', ',')
    return datetime.strptime(normalized, "%H:%M:%S,%f")

def extract_joy_cause_pairs(text_generated):
    """Extract cause/emotion pairs from the model's formatted text response.

    Args:
        text_generated: Response text containing blocks of the form
            ``Cause Timestamp: a --> b`` / ``Cause Text: ...`` /
            ``Emotion Timestamp: c --> d`` / ``Emotion Text: ...``.

    Returns:
        A list of dicts with keys ``timestamp_range`` (earliest start to
        latest end of the two subtitles, as ``HH:MM:SS,mmm --> HH:MM:SS,mmm``),
        ``cause_text`` and ``emotion_text``. Blocks whose timestamps cannot be
        parsed are reported and skipped, so one malformed block does not
        discard everything else.
    """
    pattern = r"Cause Timestamp: (.*?) --> (.*?)\nCause Text: (.*?)\nEmotion Timestamp: (.*?) --> (.*?)\nEmotion Text: (.*?)(?:\n|$)"

    pairs = []

    for match in re.findall(pattern, text_generated):
        # Strip every captured field: `.` also matches "\r" and trailing
        # spaces, which would otherwise make timestamp parsing fail.
        cause_start, cause_end, cause_text, emotion_start, emotion_end, emotion_text = (
            field.strip() for field in match
        )

        try:
            min_start = min(parse_timestamp(cause_start), parse_timestamp(emotion_start))
            max_end = max(parse_timestamp(cause_end), parse_timestamp(emotion_end))
        except ValueError:
            print(f"Skipping pair with unparseable timestamp: {match}")
            continue

        pairs.append({
            # strftime's %f yields microseconds; drop the last 3 digits for milliseconds.
            "timestamp_range": f"{min_start.strftime('%H:%M:%S,%f')[:-3]} --> {max_end.strftime('%H:%M:%S,%f')[:-3]}",
            "cause_text": cause_text,
            "emotion_text": emotion_text
        })

    return pairs


In [7]:
import requests
import time

def get_all_subtitles(srt_file):
    """Read an SRT file and return its subtitle blocks as a list of strings.

    Blocks are separated by a blank line. Empty or whitespace-only blocks are
    dropped: the file ends with a blank line, which would otherwise produce a
    trailing empty "subtitle" that inflates the count and can form a chunk
    with no content.
    """
    with open(srt_file, 'r', encoding='utf-8') as f:
        content = f.read()
    return [block for block in content.split('\n\n') if block.strip()]

def join_srt_subs(sub_blocks):
    """Re-assemble subtitle blocks into one string, with a blank line between blocks."""
    blank_line = '\n\n'
    return blank_line.join(sub_blocks)

def create_prompt(srt_chunk):
    """Build the prompt asking the model for cause/emotion subtitle pairs.

    The fixed instructions come first and ``srt_chunk`` (subtitle text in
    ``.srt`` format) is appended after them, followed by a newline. The
    labels in the requested output format are the ones the regular expression
    in ``extract_joy_cause_pairs`` matches on.
    """
    instructions = """
You are an expert at analyzing subtitles and emotions in conversations.

Given subtitles in `.srt` format, extract all pairs of dialogue segments where:
- One subtitle expresses **joy** or **happiness** or **surprise**
- Another (previous or nearby) subtitle contains the **cause** or **reason** for that joy
- Time difference between the pairs should be 5 seconds max

For each pair, return:
1. Start and end timestamps of both subtitles
2. The text of both subtitles
3. Clearly mention which line is the "Emotion" and which one is the "Cause"

Only output pairs (Cause first then Emotion) where you can confidently identify joy and its cause. Ignore all other subtitles.

Follow the following Output Format:
Pair:
Cause Timestamp:
Cause Text:
Emotion Timestamp:
Emotion Text:

Now, analyze the following `.srt` content:

"""
    return f"{instructions}{srt_chunk}\n"

def send_to_gemini(prompt, API_KEY, timeout=120):
    """Send ``prompt`` to the Gemini ``generateContent`` endpoint and return the reply text.

    Args:
        prompt: Text sent as the single content part of the request.
        API_KEY: Gemini API key.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the run indefinitely.

    Returns:
        The text of the first part of the first candidate, or ``None`` when
        the request fails, the status is not 200, or the response holds no
        generated text (for example when no candidates are returned). Every
        failure is reported with ``print``.
    """
    endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    headers = {
        "Content-Type": "application/json",
        # Pass the key as a header instead of a query parameter so it does not
        # appear in URLs that end up in logs or exception messages.
        "x-goog-api-key": API_KEY,
    }
    body = {"contents": [{"parts": [{"text": prompt}]}]}

    try:
        response = requests.post(endpoint, headers=headers, json=body, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}\n{response.text}")
        return None

    try:
        output = response.json()
        return output["candidates"][0]["content"]["parts"][0]["text"]
    except (ValueError, KeyError, IndexError, TypeError):
        # A 200 response is not guaranteed to carry generated text.
        print(f"Error: no generated text in response\n{response.text}")
        return None

def process_subtitles(index, api_key=None, step=30, window=50):
    """Extract cause/emotion pairs from ``subtitles{index}.srt`` via Gemini.

    The subtitle blocks are sent in overlapping windows: a new window of up
    to ``window`` blocks starts every ``step`` blocks, so pairs that straddle
    a window boundary are still seen together.

    Args:
        index: Number substituted into the subtitle file name.
        api_key: Gemini API key. When omitted, the ``GEMINI_API_KEY``
            environment variable is used; the key is never stored in the
            notebook.
        step: Number of blocks between consecutive window starts.
        window: Maximum number of blocks per request. Should be at least
            ``step``, otherwise some blocks are never sent.

    Returns:
        A list of pair dicts as produced by ``extract_joy_cause_pairs``, in
        first-seen order with exact duplicates removed.

    Raises:
        RuntimeError: If no API key is available.
    """
    import os  # local import: only needed here, to read the key from the environment

    API_KEY = api_key or os.environ.get("GEMINI_API_KEY")
    if not API_KEY:
        raise RuntimeError(
            "No Gemini API key: set the GEMINI_API_KEY environment variable "
            "or pass api_key=..."
        )

    srt_file = f"subtitles{index}.srt"
    all_subs = get_all_subtitles(srt_file)
    total = len(all_subs)

    results = []
    seen = set()

    for i, start in enumerate(range(0, total, step)):
        end = min(start + window, total)
        chunk = join_srt_subs(all_subs[start:end])
        prompt = create_prompt(chunk)

        print(f"Processing subtitles {start} to {end} (chunk {i+1})...")
        response_text = send_to_gemini(prompt, API_KEY)
        if response_text:
            for pair in extract_joy_cause_pairs(response_text):
                # Overlapping windows can report the same pair twice.
                key = tuple(sorted(pair.items()))
                if key not in seen:
                    seen.add(key)
                    results.append(pair)

        if end >= total:
            # Any later window would only repeat blocks already sent.
            break

        if (i + 1) % 12 == 0:
            print("Sleeping for 60 seconds to avoid rate limiting...")
            time.sleep(60)

    return results


In [None]:
# For each video Atul2.mp4 .. Atul8.mp4: transcribe it, extract the
# cause/emotion pairs and save them to Atul{i}.csv.
for i in range(2,9):
  # Pause before each video — presumably to stay under the API rate limit
  # after the previous video's requests; TODO confirm it is still needed.
  time.sleep(60)
  subs(i)
  results = process_subtitles(i)
  # A bare `len(results)` inside a loop displays nothing; report it explicitly.
  print(f"Atul{i}: {len(results)} pairs found")
  # Fix the columns so a video with no pairs still yields a CSV with a header.
  df = pd.DataFrame(results, columns=["timestamp_range", "cause_text", "emotion_text"])
  df.to_csv(f'Atul{i}.csv',index = False)



Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:02.000]  I'm gonna be honest.
[00:02.800 --> 00:03.800]  I nearly threw up.
[00:03.800 --> 00:05.800]  LAUGHTER
[00:06.800 --> 00:07.800]  I'm gonna do it.
[00:18.800 --> 00:19.800]  Pops, hop, pop!
[00:20.400 --> 00:21.400]  Ho-ee!
[00:22.200 --> 00:23.000]  Pop!
[00:23.000 --> 00:24.000]  Yeah, very good.
[00:24.000 --> 00:25.600]  It's fair, good, fair to us.
[00:25.600 --> 00:27.600]  Payton, how about now?
[00:27.600 --> 00:29.800]  I'm good, how you're?
[00:29.800 --> 00:32.600]  I am thirsty.
[00:32.600 --> 00:35.400]  I'm thirsty too, but not as thirsty as you always.
[00:35.400 --> 00:37.400]  What is that supposed to mean?
[00:37.400 --> 00:39.000]  I can't find out.
[00:39.000 --> 00:41.000]  That's trying to hide his erection.
[00:45.000 --> 00:48.000]  OK, should we...
[00:48.000 --> 00:49.000]  ...do it as his...
[00:49.000 --> 00:50.0



Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:02.000]  I think you're gonna use mouth and get a pump and down on this.
[00:02.000 --> 00:03.000]  You're gonna actually fix it.
[00:03.000 --> 00:04.000]  You gotta erase that.
[00:04.000 --> 00:05.000]  You mean?
[00:05.000 --> 00:07.000]  You're gonna erase that.
[00:21.000 --> 00:22.000]  Oh!
[00:22.000 --> 00:23.000]  Hey buddy.
[00:25.000 --> 00:26.000]  How are you?
[00:26.000 --> 00:27.000]  It's been a while.
[00:27.000 --> 00:28.000]  I'm good.
[00:28.000 --> 00:29.000]  Yeah?
[00:29.000 --> 00:31.000]  Is that how usual way of saying hello, isn't it?
[00:31.000 --> 00:32.000]  Hello.
[00:32.000 --> 00:33.000]  It's good to see you, man.
[00:33.000 --> 00:34.000]  Likewise.
[00:34.000 --> 00:35.000]  What's been going on?
[00:35.000 --> 00:37.000]  Um, I don't know.
[00:37.000 --> 00:38.000]  Not much.
[00:38.000 --> 00:39.000]  I'm runni