In [2]:
import cv2
import numpy as np

def get_valid_time_intervals(video_path, threshold=10, min_duration=0.1):
    """Find the time spans of a video whose frames are not black.

    A frame is treated as non-black when the mean of its grayscale pixels is
    strictly greater than ``threshold``. Runs of consecutive non-black frames
    form a segment; a segment is kept only if it lasts at least
    ``min_duration`` seconds. This filter is applied to every segment,
    including one that is still open when the video ends.

    Timestamps are computed as ``frame_index / fps`` using the frame rate
    reported by the capture, so a constant frame rate is assumed.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.
        threshold: Mean grayscale intensity a frame must exceed to count as
            non-black.
        min_duration: Minimum length, in seconds, of a segment to keep.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples in playback order,
        e.g. ``[(2.5, 8.3), (12.1, 15.7)]``. ``end_sec`` is the timestamp of
        the first black frame after the segment (or the end of the video).

    Raises:
        IOError: If the video cannot be opened.
        ValueError: If the capture does not report a positive frame rate.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Fail loudly instead of returning an empty list that would
            # later be turned into an empty audio file.
            raise IOError(f"Could not open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        if not (fps > 0):  # also rejects NaN
            raise ValueError(f"Invalid frame rate {fps!r} reported for: {video_path}")

        valid_intervals = []
        start_time = None
        frame_index = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Derive the time from the frame index rather than repeatedly
            # adding 1/fps, which accumulates floating-point error.
            timestamp = frame_index / fps

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            mean_intensity = np.mean(gray)

            if mean_intensity > threshold:  # Not black
                if start_time is None:
                    start_time = timestamp
            elif start_time is not None:  # Black frame closes the open segment
                if timestamp - start_time >= min_duration:  # Ignore very short flashes
                    valid_intervals.append((start_time, timestamp))
                start_time = None

            frame_index += 1

        # Video ended while a segment was still open: close it at the end of
        # the last frame and apply the same minimum-duration filter.
        end_time = frame_index / fps
        if start_time is not None and end_time - start_time >= min_duration:
            valid_intervals.append((start_time, end_time))

        return valid_intervals
    finally:
        # Always free the decoder, even when an error interrupts processing.
        cap.release()

In [3]:
from pydub import AudioSegment

# Load full audio (same timeline as video)
audio = AudioSegment.from_wav("DatasetCercetare/Audio/Daemahni_Gianna.wav")

# Time spans (in seconds) during which the target person's video is not black
valid_intervals = get_valid_time_intervals("DatasetCercetare/Videos/Daemahni_on_DaemahniGianna.mov")

# Start from a zero-length slice of the source so the accumulator has the
# same frame rate, channel count and sample width as `audio`.
# AudioSegment.silent() defaults to 11025 Hz mono 16-bit, and pydub converts
# both operands of `+` to the higher of the two formats, which would silently
# resample or widen low-rate / 8-bit input.
target_audio = audio[:0]

# Concatenate the audio that falls inside each non-black interval.
for start_sec, end_sec in valid_intervals:
    # pydub slices are indexed in whole milliseconds.
    start_ms = int(start_sec * 1000)
    end_ms = int(end_sec * 1000)
    target_audio += audio[start_ms:end_ms]

target_audio.export("target_person_audio.wav", format="wav")

KeyboardInterrupt: 

In [10]:
from faster_whisper import WhisperModel

# Whisper checkpoint to load; larger checkpoints ("medium", "large-v3", ...)
# can be substituted here, as can the smaller "tiny" / "base".
model_size = "small"
# CPU inference with int8 weights; change device/compute_type for other hardware.
model = WhisperModel(model_size, device="cpu", compute_type="int8")

# `segments` is consumed lazily by the loop below.
segments, info = model.transcribe("target_person_audio.wav", beam_size=5)

print("Detected language:", info.language)

# Print each timed segment and keep its text (plus a separating space)
# so the pieces can be joined into one transcript afterwards.
segment_texts = []
for segment in segments:
    timed_line = "[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)
    print(timed_line)
    segment_texts.append(segment.text + " ")
full_text = "".join(segment_texts)

print("\nFull Transcription:", full_text.strip())

  from .autonotebook import tqdm as notebook_tqdm


Detected language: en
[0.00s -> 0.90s]  I'm nervous as hell.
[0.90s -> 1.40s]  Really?
[1.40s -> 3.04s]  No, I'm not.
[3.04s -> 4.32s]  So you work at a gym.
[4.32s -> 5.72s]  Do you like working out?
[5.72s -> 6.22s]  I do.
[6.22s -> 7.12s]  I love working out.
[7.12s -> 8.60s]  I want to become a Pilates instructor.
[8.60s -> 9.60s]  That's like my goal.
[9.60s -> 10.10s]  Yeah.
[10.10s -> 10.60s]  Yeah.
[10.60s -> 11.10s]  OK.
[11.10s -> 12.08s]  Do you like working out?
[12.08s -> 12.48s]  Yes.
[12.48s -> 15.84s]  I was actually a personal trainer for a short period of time.
[15.84s -> 17.36s]  I kind of just did it as a second job.
[17.36s -> 18.04s]  OK.
[18.04s -> 20.84s]  Do you have any facts about yourself
[20.84s -> 23.84s]  that might be interesting?
[23.84s -> 25.08s]  Mm, yeah.
[25.08s -> 26.36s]  I haven't been doing it recently,
[26.36s -> 27.56s]  but I want to get back into it.
[27.56s -> 28.68s]  OK.
[28.68s -> 29.64s]  I literally have a camera.
[29.68s -> 30.60s]  

In [None]:
from torch import nn
from torch.nn import MultiheadAttention

class AttractionModelWithAttention(nn.Module):
    """GRU encoder followed by attention over the most recent time steps.

    The input sequence is encoded by a GRU. The last ``window_size`` GRU
    states form the attention context, the final state attends over that
    context, and a linear head maps the attended vector to one scalar per
    sequence.

    Args:
        input_dim: Feature size ``D`` of each input time step.
        hidden_dim: GRU hidden size ``H``; also the attention embedding size.
            ``nn.MultiheadAttention`` requires it to be divisible by
            ``num_heads``.
        window_size: Number of trailing time steps used as attention context.
            Must be at least 1.
        num_heads: Number of attention heads (default 4, the previously
            hard-coded value).

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim, window_size=20, num_heads=4):
        super().__init__()
        if window_size < 1:
            # A non-positive window would produce an empty attention context
            # and only fail later, inside forward().
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        self.rnn = nn.GRU(input_dim, hidden_dim, batch_first=True)
        self.attn = MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads, batch_first=True)
        self.window = window_size
        self.head = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Compute one score per sequence.

        Args:
            x: Input tensor of shape ``(B, T, D)``.

        Returns:
            Tensor of shape ``(B, 1)``.
        """
        # x: (B, T, D)
        rnn_out, _ = self.rnn(x)  # (B, T, H)

        # Local attention: only attend to last `window` steps
        T = rnn_out.size(1)
        context = rnn_out[:, max(0, T - self.window):, :]  # (B, W', H)

        # Only the last position's attended state is used, and without a mask
        # each query row is independent of the others, so querying with just
        # the last step gives the same result as attending from every step.
        query = context[:, -1:, :]  # (B, 1, H)
        # The attention weights are not used, so skip computing them.
        attn_out, _ = self.attn(query, context, context, need_weights=False)  # (B, 1, H)

        # Use last attended state
        final_rep = attn_out[:, -1, :]  # (B, H)
        attraction = self.head(final_rep)  # (B, 1)
        return attraction