In [1]:
from pathlib import Path
from dotenv import load_dotenv

try:
    start = Path(__file__).resolve()
except NameError:
    start = Path.cwd()

supporting_files = next(p / "00-supporting-files" for p in start.parents if (p / "00-supporting-files").exists())

In [2]:
project_parent = next(p for p in (start.resolve(), *start.resolve().parents) if (p / "00-supporting-files").exists()).parent
project_parent

PosixPath('/home/bedhedd/Documents/development_projects/bedhedd_projects/dougpt')

In [3]:
video_dir = project_parent / "large-files"
video_dir

PosixPath('/home/bedhedd/Documents/development_projects/bedhedd_projects/dougpt/large-files')

In [None]:
frames_dir = supporting_files / "data"
data_dir

PosixPath('/home/bedhedd/Documents/development_projects/bedhedd_projects/dougpt/dougpt/00-supporting-files/data')

In [5]:
example_frame = supporting_files / "data" / "example.png"
example_frame

PosixPath('/home/bedhedd/Documents/development_projects/bedhedd_projects/dougpt/dougpt/00-supporting-files/data/example.png')

In [12]:
import os, time

EXTRACTIONS_OUT = data_dir / "extractions" / "extractions.jsonl"
METRICS_OUT = data_dir / "extractions" / "metrics.jsonl"
FAILED_EXTRACTIONS_OUT =  data_dir / "extractions" / "failed_extractions.jsonl"

In [13]:
import pandas as pd
df = pd.read_csv(EXTRACTIONS_OUT)

ParserError: Error tokenizing data. C error: Expected 2 fields in line 21, saw 3


In [None]:
import json
with EXTRACTIONS_OUT.open("r", encoding="utf-8") as f:
    frames = json.load(f)

In [None]:
frames

[{'filename': 'frame_000000_t000000.000.png',
  'chat_messages': [{'user_name': 'ToaSTy_T0aST',
    'message': 'nvm its on a cooldown :pepehands:',
    'emotes': [{'emote_name': 'peepehands',
      'description': 'pepe the frong crying'}]},
   {'user_name': 'frickelodeon',
    'message': '@iamkaalhode believe in the doubt',
    'emotes': []},
   {'user_name': 'sour_appel',
    'message': "So when Doug won't open the oxygen gates, we have to rely on the fart barons",
    'emotes': []},
   {'user_name': 'whamer100',
    'message': 'yeah why is the prediction open for so long',
    'emotes': [{'emote_name': 'owl', 'description': 'owl emoji'}]},
   {'user_name': 'iamkaalhode',
    'message': 'I have betted too much to lose D.',
    'emotes': [{'emote_name': 'owl', 'description': 'owl emoji'}]},
   {'user_name': 'Gavyn_J', 'message': '', 'emotes': []},
   {'user_name': 'TheHolyPangolin', 'message': '', 'emotes': []},
   {'user_name': 'iamkaalhode',
    'message': '@whamer100 Because RIGGED'

In [None]:
from collections import Counter

counts = Counter(f["filename"] for f in frames)
dupes = [fn for fn, n in counts.items() if n > 1]
print(dupes)  # empty == all unique


In [22]:
import re
from difflib import SequenceMatcher
from collections import defaultdict, deque

EMOTE_TOKEN = re.compile(r":[A-Za-z0-9_]+:")

def norm_for_match(msg: str) -> str:
    msg = msg or ""
    # remove :emote: tokens so "Oh no :sweat:" == "Oh no"
    s = EMOTE_TOKEN.sub(" ", msg)
    s = " ".join(s.split()).lower()
    # if it was only emotes, fall back to raw-ish
    return s if s else " ".join(msg.split()).lower()

def content_score(msg: str) -> int:
    """Prefer messages with more non-emote text."""
    msg = msg or ""
    s = EMOTE_TOKEN.sub("", msg)
    s = re.sub(r"\s+", "", s)
    if s:
        return len(s)
    return len(re.sub(r"\s+", "", msg))

def similar(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()

def merge_prefer_longer(old: dict, new: dict) -> dict:
    # pick longer text
    old_msg = old.get("message", "") or ""
    new_msg = new.get("message", "") or ""
    pick_new = content_score(new_msg) > content_score(old_msg)

    merged = dict(old)
    merged["message"] = new_msg if pick_new else old_msg

    # keep emotes if either has them; prefer the list with more entries
    old_em = old.get("emotes") or []
    new_em = new.get("emotes") or []
    if len(new_em) > len(old_em):
        merged["emotes"] = new_em
    else:
        merged["emotes"] = old_em

    return merged

def dedupe_frames_prefer_longer(frames, window_frames=1, fuzzy=True, threshold=0.92):
    """
    Removes overlap duplicates vs the previous N frames.
    If a duplicate is found, updates the earlier kept message to the longer/better version.
    """
    out = []
    history = deque(maxlen=window_frames)  # holds indices into `out`

    for frame in frames:
        # build memory from the last N output frames
        mem_exact = {}  # (user, norm) -> (frame_idx, msg_idx)
        mem_by_user = defaultdict(list)  # user -> [(norm, frame_idx, msg_idx)]

        for fi in history:
            for mi, m in enumerate(out[fi].get("chat_messages", [])):
                u = m.get("user_name", "")
                t = norm_for_match(m.get("message", ""))
                mem_exact[(u, t)] = (fi, mi)
                mem_by_user[u].append((t, fi, mi))

        kept = []
        for m in frame.get("chat_messages", []):
            u = m.get("user_name", "")
            t = norm_for_match(m.get("message", ""))

            match_ptr = mem_exact.get((u, t))

            # fuzzy/prefix match (handles "‚Ä¶people will just" vs "‚Ä¶people will just wait to doubt")
            if match_ptr is None and fuzzy and mem_by_user.get(u):
                best = None
                best_score = -1.0
                for prev_t, fi, mi in mem_by_user[u]:
                    prefixish = (t and (t in prev_t or prev_t in t))
                    sim = similar(t, prev_t)
                    ok = prefixish or (sim >= threshold)
                    if ok and sim > best_score:
                        best_score = sim
                        best = (fi, mi, prev_t)
                if best:
                    match_ptr = (best[0], best[1])

            if match_ptr is not None:
                fi, mi = match_ptr
                out[fi]["chat_messages"][mi] = merge_prefer_longer(out[fi]["chat_messages"][mi], m)
                continue  # don‚Äôt keep duplicate in current frame

            kept.append(m)

        out.append({**frame, "chat_messages": kept})
        history.append(len(out) - 1)

    return out


In [None]:
deduped_frames = dedupe_frames_prefer_longer(frames, window_frames=1, fuzzy=True)
deduped_frames

[{'filename': 'frame_000000_t000000.000.png',
  'chat_messages': [{'user_name': 'ToaSTy_T0aST',
    'message': 'nvm its on a cooldown :pepehands:',
    'emotes': [{'emote_name': 'peepehands',
      'description': 'pepe the frong crying'}]},
   {'user_name': 'frickelodeon',
    'message': '@iamkaalhode believe in the doubt',
    'emotes': []},
   {'user_name': 'sour_appel',
    'message': "So when Doug won't open the oxygen gates, we have to rely on the fart barons",
    'emotes': []},
   {'user_name': 'whamer100',
    'message': 'yeah why is the prediction open for so long',
    'emotes': [{'emote_name': 'owl', 'description': 'owl emoji'}]},
   {'user_name': 'iamkaalhode',
    'message': 'I have betted too much to lose D.',
    'emotes': [{'emote_name': 'owl', 'description': 'owl emoji'}]},
   {'user_name': 'Gavyn_J', 'message': '', 'emotes': []},
   {'user_name': 'TheHolyPangolin',
    'message': 'Because RIGGED :mario:',
    'emotes': [{'emote_name': 'mario', 'description': 'mario ru