In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from audio2text import AudioTranscriber  # 16sec
from utils import extract_audio, extract_frames, create_chunk_dataframe, combine_text
import os
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import tempfile
import re
import zipfile

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Initialize transcriber
transcriber = AudioTranscriber(chunk_length=30)
VIDEO_SUFFIXES = {'.mp4', '.avi', '.mkv'}

def idx_from_name(name: str) -> int:
    """Extract trailing integer from names like 'hate_video_12' or 'non_hate_video_3'."""
    m = re.search(r'_(\d+)$', name)
    return int(m.group(1)) if m else 0

def list_all_videos(video_zip_dir: Path):
    """
    For each .zip in the directory, collect (zip_path, member_name, vid_name)
    for every video file inside, sorted hate_ before non_hate_, then numerically.
    """
    entries = []
    for zip_path in sorted(video_zip_dir.glob('*.zip'), key=lambda p: idx_from_name(p.stem)):
        with zipfile.ZipFile(zip_path, 'r') as z:
            for member in z.namelist():
                if Path(member).suffix.lower() in VIDEO_SUFFIXES:
                    vid_name = Path(member).stem
                    entries.append((zip_path, member, vid_name))
    entries.sort(key=lambda e: (
        0 if e[2].startswith('hate_video_') else 1,
        idx_from_name(e[2])
    ))
    return entries

def process_video_entry(zip_path: Path, member: str, vid_name: str,
                        frames_root: Path, audio_root: Path,
                        transcriber, frequency: int = 30):
    """
    Unzip the specified member, extract frames & audio into
    frames_root/vid_name/ and audio_root/vid_name/, transcribe, and return two DataFrames.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # 1) extract this single file
        with zipfile.ZipFile(zip_path, 'r') as z:
            z.extract(member, tmpdir)
        video_file = Path(tmpdir) / member

        # 2) prepare output folders
        out_frames = frames_root / vid_name
        out_frames.mkdir(parents=True, exist_ok=True)
        out_audio  = audio_root  / vid_name
        out_audio.mkdir(parents=True, exist_ok=True)

        # 3) extract frames
        _, frame_data = extract_frames(
            str(video_file), str(out_frames),
            video_index=idx_from_name(vid_name),
            frequency=frequency
        )
        frame_data['video'] = frame_data['frame_number'] \
            .apply(lambda f: f"{vid_name}_frame{f}.jpg")
        frame_df = frame_data[['timestamp_seconds', 'video', 'frame_number']]

        # 4 & 5) extract audio and transcribe with unified fallback
        chunk_df = pd.DataFrame(columns=["timestamp", "text", "video"])
        try:
            # extract audio
            extract_audio(input_path=str(video_file), output_dir=str(out_audio))
            audio_file = out_audio / f"{vid_name}.mp3"
            # transcribe audio
            result     = transcriber.transcribe_audio(
                              input_audio=str(audio_file),
                              return_timestamps=True
                          )
            chunk_info = transcriber.extract_segments_with_timestamps(result)
            chunk_df   = create_chunk_dataframe(chunk_info)
            chunk_df['video'] = vid_name
        except Exception as e:
            print(f"⚠️ Audio processing failed for {vid_name}: {e}")
            # chunk_df remains empty

    return frame_df, chunk_df

# ─── MAIN ────────────────────────────────────────────────────────────────

if __name__ == '__main__':
    cwd           = Path.cwd()
    video_zip_dir = cwd / 'input' / 'video'
    frames_root   = cwd / 'input' / 'frames'
    audio_root    = cwd / 'input' / 'audio'

    # ensure output dirs exist
    frames_root.mkdir(parents=True, exist_ok=True)
    audio_root.mkdir(parents=True, exist_ok=True)

    # build the list of all videos inside all zips
    video_entries = list_all_videos(video_zip_dir)
    # optionally limit number of videos:
    video_entries = video_entries[:3]

    all_frame_dfs = []
    all_chunk_dfs = []

    # process them sequentially
    for zip_path, member, vid_name in tqdm(video_entries, desc="Processing videos", unit="video"):
        fdf, cdf = process_video_entry(
            zip_path, member, vid_name,
            frames_root, audio_root,
            transcriber
        )
        all_frame_dfs.append(fdf)
        all_chunk_dfs.append(cdf)

    # combine and save results
    final_frames_df = pd.concat(all_frame_dfs, ignore_index=True)
    final_chunks_df = pd.concat(all_chunk_dfs, ignore_index=True)
    # write CSVs
    final_frames_df.to_csv('frames.csv', index=False)
    final_chunks_df.to_csv('chunks.csv', index=False)

    # annotation
    def annotate_frames_within_interval(frames_df: pd.DataFrame, chunks_df: pd.DataFrame) -> pd.DataFrame:
        out = chunks_df.copy()
        out[['start', 'end']] = pd.DataFrame(
            out['timestamp']
                .apply(lambda ts: (float(ts[0]), float(ts[1])))
                .tolist(),
            index=out.index
        )
        if 'video_id' not in frames_df:
            frames_df = frames_df.copy()
            frames_df['video_id'] = frames_df['video'].str.replace(
                r'_frame\d+\.jpg$', '', regex=True
            )
        def find_frames(row):
            vid, s, e = row['video'], row['start'], row['end']
            mask = (
                (frames_df['video_id'] == vid) &
                (frames_df['timestamp_seconds'] >= s) &
                (frames_df['timestamp_seconds'] <= e)
            )
            return frames_df.loc[mask, 'video'].tolist()
        out['frames_within_interval'] = out.apply(find_frames, axis=1)
        return out.drop(columns=['start', 'end'])

    annotated_chunks = annotate_frames_within_interval(final_frames_df, final_chunks_df)
    annotated_chunks.to_csv('annotated_chunks.csv', index=False)


Device set to use cuda:0
Processing videos:   0%|          | 0/3 [00:00<?, ?video/s]

Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\data_proc\input\audio\hate_video_1\hate_video_1.mp3".


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Processing videos:  33%|███▎      | 1/3 [00:34<01:09, 34.55s/video]

Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\data_proc\input\audio\hate_video_2\hate_video_2.mp3".


Processing videos:  67%|██████▋   | 2/3 [01:16<00:38, 38.86s/video]

Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\data_proc\input\audio\hate_video_3\hate_video_3.mp3".


Processing videos: 100%|██████████| 3/3 [02:37<00:00, 52.49s/video]


In [4]:
def annotate_frames_within_interval(frames_df: pd.DataFrame, chunks_df: pd.DataFrame) -> pd.DataFrame:
    out = chunks_df.copy()
    out[['start', 'end']] = pd.DataFrame(
        out['timestamp']
            .apply(lambda ts: (float(ts[0]), float(ts[1])))
            .tolist(),
        index=out.index
    )
    if 'video_id' not in frames_df:
        frames_df = frames_df.copy()
        frames_df['video_id'] = frames_df['video'].str.replace(
            r'_frame\d+\.jpg$', '', regex=True
        )
    def find_frames(row):
        vid, s, e = row['video'], row['start'], row['end']
        mask = (
            (frames_df['video_id'] == vid) &
            (frames_df['timestamp_seconds'] >= s) &
            (frames_df['timestamp_seconds'] <= e)
        )
        return frames_df.loc[mask, 'video'].tolist()
    out['frames_within_interval'] = out.apply(find_frames, axis=1)
    return out.drop(columns=['start', 'end'])



In [11]:
#Read the frames and chunks CSV files
frames_df = pd.read_csv('frames.csv')
chunks_df = pd.read_csv('chunks.csv')

#### **frames_df**

In [36]:
frames_df.head()

Unnamed: 0,timestamp_seconds,video,frame_number
0,0.0,hate_video_1_frame0.jpg,0
1,1.001,hate_video_1_frame30.jpg,30
2,2.002,hate_video_1_frame60.jpg,60
3,3.003,hate_video_1_frame90.jpg,90
4,4.004,hate_video_1_frame120.jpg,120


In [None]:
frames_df["timestamp_seconds"] = frames_df["timestamp_seconds"].apply(
    lambda x: round(x, 3) if pd.notnull(x) else x
)

print(frames_df.dtypes, "\n")
print(frames_df.isna().sum(), "\n")
print(frames_df.shape)

timestamp_seconds    float64
video                 object
frame_number           int64
dtype: object 

timestamp_seconds    0
video                0
frame_number         0
dtype: int64 

(163506, 3)


In [35]:
# rows that begin with "non_hate_video"
mask_non_hate = frames_df["video"].str.startswith("non_hate_video", na=False)

# rows that begin with "hate_video" BUT are *not* already classed as non‑hate
mask_hate = frames_df["video"].str.startswith("hate_video", na=False) & ~mask_non_hate

non_hate_count = mask_non_hate.sum()
hate_count     = mask_hate.sum()

print("Rows whose `video` contains:")
print(f"  • hate_video       : {hate_count}")
print(f"  • non_hate_video   : {non_hate_count}")

#Check there the counts sum is equal to dataframes length
assert (non_hate_count + hate_count) == len(frames_df)

Rows whose `video` contains:
  • hate_video       : 77052
  • non_hate_video   : 86454


In [None]:
# ────────────────────────────────────────────────
# 1) Grab the integer that follows each prefix
#    ^hate_video_(\d+)_      → captures 126 in  hate_video_126_frame0.jpg
# ────────────────────────────────────────────────
hate_ids = (
    frames_df["video"]
    .str.extract(r"^hate_video_(\d+)_", expand=False)  # returns NaN when the pattern doesn't match
    .dropna()
    .astype(int)
)

non_hate_ids = (
    frames_df["video"]
    .str.extract(r"^non_hate_video_(\d+)_", expand=False)
    .dropna()
    .astype(int)
)

# ────────────────────────────────────────────────
# 2) Take the maximum in each Series
# ────────────────────────────────────────────────
max_hate_id      = hate_ids.max()
max_non_hate_id  = non_hate_ids.max()

print("Largest *video ID* seen in the filenames:")
print(f"  • hate_video       : {max_hate_id}")
print(f"  • non_hate_video   : {max_non_hate_id}")

#Make sure that these indices add up to the total number of videos (1083)
assert (max_hate_id + max_non_hate_id) == 1083, \
    f"Sum of hate and non-hate video IDs ({max_hate_id} + {max_non_hate_id}) " \
    f"does not equal the total number of videos (1083)."

Largest *video ID* seen in the filenames:
  • hate_video       : 431
  • non_hate_video   : 652


#### **chunks_df**

In [10]:
chunks_df.head()

NameError: name 'chunks_df' is not defined

In [41]:
print(chunks_df.dtypes, "\n")
print(chunks_df.isna().sum(), "\n")
print(chunks_df.shape)

timestamp    object
text         object
video        object
dtype: object 

timestamp      0
text         138
video          0
dtype: int64 

(11804, 3)


In [43]:
# Do not filter out, I want to see which videos have no text

mask_na = chunks_df["text"].isna()

videos_with_na = chunks_df.loc[mask_na, "video"].unique()
print("Videos that have ≥1 missing text entry:")
print(videos_with_na)


Videos that have ≥1 missing text entry:
['hate_video_3' 'hate_video_9' 'hate_video_13' 'hate_video_18'
 'hate_video_20' 'hate_video_29' 'hate_video_41' 'hate_video_44'
 'hate_video_46' 'hate_video_60' 'hate_video_71' 'hate_video_76'
 'hate_video_93' 'hate_video_94' 'hate_video_95' 'hate_video_98'
 'hate_video_107' 'hate_video_121' 'hate_video_132' 'hate_video_138'
 'hate_video_141' 'hate_video_144' 'hate_video_147' 'hate_video_148'
 'hate_video_151' 'hate_video_159' 'hate_video_163' 'hate_video_165'
 'hate_video_169' 'hate_video_178' 'hate_video_193' 'hate_video_202'
 'hate_video_204' 'hate_video_208' 'hate_video_233' 'hate_video_237'
 'hate_video_283' 'hate_video_293' 'hate_video_295' 'hate_video_298'
 'hate_video_337' 'hate_video_339' 'hate_video_351' 'hate_video_361'
 'hate_video_363' 'hate_video_382' 'hate_video_383' 'hate_video_398'
 'hate_video_406' 'hate_video_414' 'hate_video_415' 'hate_video_422'
 'hate_video_425' 'non_hate_video_12' 'non_hate_video_14'
 'non_hate_video_20' 'n

In [8]:
import ast            # safely turns "[1.23, 4.56]" → [1.23, 4.56]
import logging
import math
import pandas as pd

# --------------------------------------------------------------------
# Configure a simple logger that prints to stdout
# --------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s | row %(row)d | video=%(video)s | %(message)s"
)

def check_time_intervals(df: pd.DataFrame,
                         timestamp_col: str = "timestamp",
                         video_col: str = "video") -> pd.Series:
    """
    Validate the time intervals stored in *timestamp_col*.
    A valid interval is a two‑element list/tuple where:
        • both elements are real numbers (not NaN)
        • end  >  start
    Logs an INFO line for every row that fails either check.

    Returns
    -------
    pandas.Series[bool]
        True  → row is valid
        False → row has NaNs or end ≤ start
    """
    valid_mask = pd.Series(True, index=df.index)

    for i, raw in df[timestamp_col].items():
        video    = df.at[i, video_col]

        # 1) parse the string "[start, end]" safely
        try:
            start, end = map(float, ast.literal_eval(str(raw)))
        except Exception as exc:
            logging.info("un‑parsable timestamp (%s)", raw,
                         extra={"row": i, "video": video})
            valid_mask.at[i] = False
            continue

        # 2) check for NaNs
        if any(math.isnan(x) for x in (start, end)):
            logging.info("NaN in timestamp -> %s", (start, end),
                         extra={"row": i, "video": video})
            valid_mask.at[i] = False
            continue

        # 3) logical order
        if end <= start:
            logging.info("invalid interval -> start=%s, end=%s", start, end,
                         extra={"row": i, "video": video})
            valid_mask.at[i] = False

    return valid_mask


In [9]:
# df is your DataFrame (e.g. chunks_df)
bad_rows = ~check_time_intervals(chunks_df)     # rows that failed
print(f"✓ checked {len(chunks_df)} rows — {bad_rows.sum()} invalid")

NameError: name 'chunks_df' is not defined

In [None]:
import re

_sentence_boundary = re.compile(
    r"""                     # verbose regex
    (?<!\w\.\w\.)            # negative look‑behind: skip e.g. "U.S."
    (?<!                # skip titles like "Dr.", "Mrs." etc.
        \b               # word boundary
        (?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|Mt)
        \.
    )
    (?<= [.!?] )             # positive look‑behind: a real sentence‑ender
    \s+                      # the whitespace we’ll split on
    """,
    flags=re.VERBOSE,
)

def split_into_sentences(text: str) -> list[str]:
    """
    Split *text* into a list of sentences.

    Parameters
    ----------
    text : str
        A chunk that may contain multiple sentences.

    Returns
    -------
    list[str]
        The individual sentences with surrounding whitespace trimmed.
    """
    return [s.strip() for s in _sentence_boundary.split(text) if s.strip()]


In [1]:
chunks_df

NameError: name 'chunks_df' is not defined

In [None]:
chunk = "Dr. Smith arrived at 9 a.m. He said hello. Isn't that great? Yes!"
split_into_sentences(chunk)

In [None]:
hate_ids = (
    chunks_df["video"]
    .str.extract(r"^hate_video_(\d+)$", expand=False)      # match the entire string
    .dropna()
    .astype(int)
)

non_hate_ids = (
    chunks_df["video"]
    .str.extract(r"^non_hate_video_(\d+)$", expand=False)
    .dropna()
    .astype(int)
)

# ── largest ID in each class ─────────────────────────────────────────
max_hate_id     = hate_ids.max()        if not hate_ids.empty     else None
max_non_hate_id = non_hate_ids.max()    if not non_hate_ids.empty else None

print("Largest *video ID* seen:")
print(f"  • hate_video       : {max_hate_id}")
print(f"  • non_hate_video   : {max_non_hate_id}")

In [6]:
annotated_chunks = annotate_frames_within_interval(frames_df, chunks_df)
annotated_chunks.to_csv('annotated_chunks.csv', index=False)

ValueError: could not convert string to float: '['

In [12]:
#Test with texts on the images
# add a column called text_on_image which is a list ["Hello","","World","","","","","LOL,that guy is crazy!",...] fill the rest with empty strings
combined_frame_df["text_on_image"] = [
    "Hello", "", "World", "", "", "", "", "LOL,that guy is crazy!"
] + [""] * (len(combined_frame_df) - 8)


combined_frame_df.head(10)


Unnamed: 0,timestamp_seconds,video,frame_number,text_on_image
0,0.0,video_1_frame0.jpg,0,Hello
1,1.001,video_1_frame30.jpg,30,
2,2.002,video_1_frame60.jpg,60,World
3,3.003,video_1_frame90.jpg,90,
4,4.004,video_1_frame120.jpg,120,
5,5.005,video_1_frame150.jpg,150,
6,6.006,video_1_frame180.jpg,180,
7,7.007,video_1_frame210.jpg,210,"LOL,that guy is crazy!"
8,8.008,video_1_frame240.jpg,240,
9,9.009,video_1_frame270.jpg,270,


In [13]:
combined_chunk_df

Unnamed: 0,timestamp,text,video
0,"[0.0, 94.96]",The End Thank you. The End,video_1
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to...,video_2
2,"[27.42, 32.12]",Roses are red and violets are blue and nigger...,video_2
3,"[32.42, 37.8]","But they don't mind cause what the heck, you ...",video_2
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and ...",video_2
...,...,...,...
98,"[198.68, 202.76]","Bitch, clean my car while I drink lemonade.",video_3
99,"[202.76, 203.96]",Ya fuckin' slave.,video_3
100,"[203.96, 205.12]",Melvin on the beat,video_3
101,"[205.12, 208.56]","Buy me two pistols man, fuck all those nigger...",video_3


In [96]:
final_df = combined_chunk_df.copy()
final_df.head()

Unnamed: 0,timestamp,text,video
0,"[0.0, 94.96]",The End Thank you. The End,video_1
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to...,video_2
2,"[27.42, 32.12]",Roses are red and violets are blue and nigger...,video_2
3,"[32.42, 37.8]","But they don't mind cause what the heck, you ...",video_2
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and ...",video_2


In [97]:

# --- Main Logic ---

# 1) Unpack the "timestamp" column from final_df into "start" and "end" columns
final_df[['start', 'end']] = pd.DataFrame(final_df['timestamp'].tolist(), index=final_df.index)

# 2) For each row in combined_frame_df, find matching rows in final_df and update the text
for _, frame in combined_frame_df.iterrows():
    # Extract the numeric timestamp and text from the frame row
    t = frame["timestamp_seconds"]
    text_on_image = str(frame["text_on_image"]).strip()
    
    # We assume the "video" column in combined_frame_df is something like "video_1_frame2342.jpg"
    video_id = frame["video"].split("_frame")[0]
    
    # Build a mask to find the matching rows in final_df
    mask = (
        (final_df["video"] == video_id) &
        (final_df["start"] <= t) &
        (final_df["end"] >= t)
    )
    
    # Apply the combine_text function only on the masked rows
    final_df.loc[mask, "text"] = final_df.loc[mask, "text"].apply(
        lambda current_text: combine_text(current_text, text_on_image)
    )

# 3) (Optional) Drop the columns "start" and "end" if not needed anymore
final_df.drop(columns=["start", "end"], inplace=True)

final_df.head(10)


Unnamed: 0,timestamp,text,video
0,"[0.0, 94.96]",The End Thank you. The End | Hello | World | L...,video_1
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to ...,video_2
2,"[27.42, 32.12]",Roses are red and violets are blue and niggers...,video_2
3,"[32.42, 37.8]","But they don't mind cause what the heck, you g...",video_2
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and I...",video_2
5,"[44.6, 47.8]",So Uncle Sam won't help poor nigger hatin' me.,video_2
6,"[50.04, 52.04]","Jigaboo, jigaboo, where are you?",video_2
7,"[52.24, 54.68]",I was here in the woodpile watching you.,video_2
8,"[55.04, 57.04]","Jigaboo, jigaboo, come out, no.",video_2
9,"[57.14, 60.44]","So I scared all the white men way down south, ...",video_2


In [98]:
annot_data = pd.read_csv("./HateMM_annotations/HateMM_annotation.csv", delimiter=";")
annot_data

Unnamed: 0,video_file_name,label,hate_snippet,target,Hate_Label,Image_hate,Audio_hate,Text_hate,Frame_hate
0,hate_video_1.mp4,Hate,"[['00:00:34', '00:01:34']]",Blacks,1.0,0.0,0.0,1.0,1.0
1,hate_video_2.mp4,Hate,"[['00:00:06', '00:02:06']]",Blacks,1.0,1.0,1.0,0.0,1.0
2,non_hate_video_1.mp4,Non Hate,,Others,0.0,0.0,0.0,0.0,0.0
3,hate_video_3.mp4,Hate,"[['00:00:03', '00:01:40'], ['00:01:41', '00:03...",Blacks,1.0,1.0,1.0,0.0,1.0
4,non_hate_video_2.mp4,Non Hate,,Blacks,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1078,non_hate_video_650.mp4,Non Hate,,['Whites'],,,,,
1079,hate_video_430.mp4,Hate,"[['00:00:09', '00:04:04']]",['Jews'],,,,,
1080,non_hate_video_651.mp4,Non Hate,,['Others'],,,,,
1081,non_hate_video_652.mp4,Non Hate,,['Others'],,,,,


In [99]:
import ast
import pandas as pd

def hhmmss_to_seconds(time_str):
    """
    Convert a time string in hh:mm:ss (or mm:ss) format to seconds.
    If already numeric, returns it as a float.
    """
    try:
        time_str = time_str.strip()
        parts = time_str.split(":")
        if len(parts) == 3:
            h, m, s = parts
            return float(h) * 3600 + float(m) * 60 + float(s)
        elif len(parts) == 2:
            m, s = parts
            return float(m) * 60 + float(s)
        else:
            return float(time_str)
    except Exception as e:
        print(f"Error converting '{time_str}' to seconds: {e}")
        return None

def parse_and_convert_snippet(snippet):
    """
    Parse a snippet entry (which might be a string like "[['00:00:34','00:01:34']]" or "Missing value")
    and convert the intervals to seconds. Supports either a single interval [start,end] or multiple intervals.
    Returns a list of intervals in seconds or None if invalid.
    """
    # 1) Handle the literal string "Missing value"
    if isinstance(snippet, str):
        snippet = snippet.strip()
        if snippet == "Missing value":
            return None
        
        # 2) If it's not "Missing value", try to parse it as a Python literal.
        try:
            snippet = ast.literal_eval(snippet)
        except Exception as e:
            print(f"Error parsing snippet: {snippet}. Error: {e}")
            return None
    
    # 3) If snippet is still not a list, return None
    if not isinstance(snippet, list):
        return None
    
    # 4) Check if it's a list of intervals (multiple) or a single interval
    def convert_interval_to_seconds(interval):
        if isinstance(interval, list) and len(interval) == 2:
            start = hhmmss_to_seconds(interval[0])
            end   = hhmmss_to_seconds(interval[1])
            return [start, end]
        return None
    
    # If snippet’s first element is itself a list => multiple intervals
    if snippet and isinstance(snippet[0], list):
        converted = []
        for inter in snippet:
            conv = convert_interval_to_seconds(inter)
            if conv is not None:
                converted.append(conv)
        return converted
    else:
        # Single interval e.g. ['00:00:34','00:01:34']
        conv = convert_interval_to_seconds(snippet)
        return [conv] if conv else None

# ----------------- Main Logic -----------------
# Suppose annot_data is your DataFrame with "hate_snippet" column
# We'll create a new column called "converted_hate_snippet"

annot_data["converted_hate_snippet"] = annot_data["hate_snippet"].apply(parse_and_convert_snippet)

# Display a few rows to verify
print(annot_data[["hate_snippet", "converted_hate_snippet"]].head(10))


                                        hate_snippet  \
0                         [['00:00:34', '00:01:34']]   
1                         [['00:00:06', '00:02:06']]   
2                                                NaN   
3  [['00:00:03', '00:01:40'], ['00:01:41', '00:03...   
4                                                NaN   
5                         [['00:00:00', '00:00:13']]   
6                                                NaN   
7                                                NaN   
8                         [['00:00:00', '00:00:39']]   
9                                                NaN   

           converted_hate_snippet  
0                  [[34.0, 94.0]]  
1                  [[6.0, 126.0]]  
2                            None  
3  [[3.0, 100.0], [101.0, 207.0]]  
4                            None  
5                   [[0.0, 13.0]]  
6                            None  
7                            None  
8                   [[0.0, 39.0]]  
9                      

In [100]:
import re
import pandas as pd

# ------------------ Helper Functions ------------------
def get_vid_num_from_final(video_str):
    """
    Extract the numeric ID from a string like "video_2_frame2342.jpg".
    Returns 2 for "video_2_frame2342.jpg".
    """
    match = re.search(r"video_(\d+)", video_str)
    if match:
        return int(match.group(1))
    return None

def get_vid_num_from_annot(fname):
    """
    Extract the numeric ID from a filename like "hate_video_2.mp4" or "non_hate_video_2.mp4".
    Returns 2 for "hate_video_2.mp4".
    """
    match = re.search(r"_video_(\d+)\.mp4", fname)
    if match:
        return int(match.group(1))
    return None

def intervals_intersect(intervalA, intervalB):
    """
    Check if two intervals [startA, endA] and [startB, endB] intersect.
    They intersect if startA < endB AND startB < endA.
    """
    return intervalA[0] < intervalB[1] and intervalB[0] < intervalA[1]

# ------------------ Process final_df ------------------
# final_df is assumed to have a "timestamp" column (e.g. [0.0, 94.96]) and
# "video_file_name" column (e.g. "video_2_frame2342.jpg").
final_df["vid_num"] = final_df["video"].apply(get_vid_num_from_final)
# Unpack the "timestamp" column into "start" and "end" for easy interval comparison.
final_df[['start', 'end']] = pd.DataFrame(final_df["timestamp"].tolist(), index=final_df.index)

# ------------------ Process annot_data ------------------
# annot_data is assumed to have:
#   - "video_file_name": e.g. "hate_video_2.mp4" or "non_hate_video_2.mp4"
#   - "label": "Hate" or "Non Hate"
#   - "converted_hate_snippet": intervals in seconds (e.g. [[34, 94]] for a single interval)
annot_data["vid_num"] = annot_data["video_file_name"].apply(get_vid_num_from_annot)

# Build a dictionary mapping vid_num -> list of hate intervals (only for rows where label is "Hate")
hate_info = {}
for _, row in annot_data.iterrows():
    if row["Audio_hate"] == 1:
        vn = row["vid_num"]
        intervals = row["converted_hate_snippet"]
        # Ensure intervals is a list of intervals:
        if isinstance(intervals, list):
            # Check if this is a single interval (e.g. [34, 94]) or a list of intervals ([[34,94]])
            if intervals and not isinstance(intervals[0], list):
                intervals = [intervals]
        else:
            intervals = []
        if vn in hate_info:
            hate_info[vn].extend(intervals)
        else:
            hate_info[vn] = intervals

# ------------------ Label Each Row in final_df ------------------
labels = []
for _, row in final_df.iterrows():
    current_interval = [row["start"], row["end"]]
    vn = row["vid_num"]
    row_label = "Non Hate"  # default
    # Look up hate intervals for this video, if any.
    if vn in hate_info:
        for h_interval in hate_info[vn]:
            if intervals_intersect(current_interval, h_interval):
                row_label = "Hate"
                break
    labels.append(row_label)
final_df["audio_hate"] = labels

# ------------------ Cleanup Temporary Columns ------------------
final_df.drop(columns=["start", "end", "vid_num"], inplace=True, errors="ignore")

# Display the first few rows of the updated final_df with the new label.
print(final_df.head())


        timestamp                                               text    video  \
0    [0.0, 94.96]  The End Thank you. The End | Hello | World | L...  video_1   
1    [0.0, 25.44]  I Like Sugar And the NAACP would sure like to ...  video_2   
2  [27.42, 32.12]  Roses are red and violets are blue and niggers...  video_2   
3   [32.42, 37.8]  But they don't mind cause what the heck, you g...  video_2   
4  [38.34, 44.08]  No joke, I ain't got a nickel for a coke and I...  video_2   

  audio_hate  
0   Non Hate  
1       Hate  
2       Hate  
3       Hate  
4       Hate  


In [101]:
final_df

Unnamed: 0,timestamp,text,video,audio_hate
0,"[0.0, 94.96]",The End Thank you. The End | Hello | World | L...,video_1,Non Hate
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to ...,video_2,Hate
2,"[27.42, 32.12]",Roses are red and violets are blue and niggers...,video_2,Hate
3,"[32.42, 37.8]","But they don't mind cause what the heck, you g...",video_2,Hate
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and I...",video_2,Hate
...,...,...,...,...
98,"[198.68, 202.76]","Bitch, clean my car while I drink lemonade.",video_3,Hate
99,"[202.76, 203.96]",Ya fuckin' slave.,video_3,Hate
100,"[203.96, 205.12]",Melvin on the beat,video_3,Hate
101,"[205.12, 208.56]","Buy me two pistols man, fuck all those niggers...",video_3,Hate
