In [5]:
# Enable IPython's autoreload extension in mode 2 for this kernel session.
# Re-running this cell in a kernel where the extension is already loaded only
# prints an "already loaded" notice; `%reload_ext autoreload` is the variant
# that notice suggests for re-runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from audio2text import AudioTranscriber # 16sec
from utils import extract_audio, extract_frames, create_chunk_dataframe, combine_text
import os
import pandas as pd
from tqdm import tqdm

In [7]:
# Build the transcriber once; the processing loop further down reuses this
# single instance for every video.
transcriber = AudioTranscriber() # author's timing note: ~9 s to construct

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Prerequisites from earlier cells:
#   - extract_frames, extract_audio, create_chunk_dataframe (imported from utils)
#   - transcriber, an AudioTranscriber instance

# Directories
video_dir = "./input/video"
frames_output_dir = "./input/frames"
audio_output_dir = "./input/audio"

# Create output directories if they do not exist
os.makedirs(frames_output_dir, exist_ok=True)
os.makedirs(audio_output_dir, exist_ok=True)

# List video files (adjust extensions as needed).
# os.listdir() returns entries in arbitrary order, so sort them to make the
# idx -> video mapping reproducible between runs. The sort is lexicographic
# ("hate_video_10.mp4" comes before "hate_video_2.mp4").
video_extensions = ('.mp4', '.avi', '.mkv')
video_files = sorted(
    f for f in os.listdir(video_dir) if f.lower().endswith(video_extensions)
)

# Lists to hold the per-video DataFrames for frames and audio chunks
all_frame_dfs = []
all_chunk_dfs = []

# NOTE(review): idx is the 1-based position in the sorted listing, not the
# number embedded in the file name. Downstream cells match "video_{idx}"
# against annotation file numbers - confirm the two numberings agree.
for idx, video_file in tqdm(enumerate(video_files, start=1), total=len(video_files), desc="Processing videos"):
    input_path = os.path.join(video_dir, video_file)

    # Create subfolder for frames of this video
    video_frames_dir = os.path.join(frames_output_dir, f"video_{idx}")
    os.makedirs(video_frames_dir, exist_ok=True)

    # --- Extract Frames ---
    frame_count, frame_data = extract_frames(
        input_path,
        video_frames_dir,
        video_index=idx,
        frequency=30  # Adjust extraction frequency if needed
    )
    print(f"Extracted {frame_count} frames from the video '{video_file}'")

    # Build a 'video' column in the format "video_1_frame2342.jpg".
    # Assumes frame_data has "frame_number" and "timestamp_seconds" columns.
    frame_data["video"] = frame_data["frame_number"].apply(
        lambda f: f"video_{idx}_frame{f}.jpg"
    )

    frame_data = frame_data[["timestamp_seconds", "video", "frame_number"]]
    all_frame_dfs.append(frame_data)

    # --- Extract Audio ---
    extract_audio(
        input_path=input_path,
        output_dir=audio_output_dir
    )
    print(f"Extracted audio from the video '{video_file}'")

    # Locate the audio that belongs to THIS video. The name is derived from the
    # video's own file name instead of a hard-coded "hate_video_{idx}" pattern,
    # which only worked when file names happened to match the loop index.
    # Assumes extract_audio writes "<video stem>.mp3" into output_dir (this
    # matches the "saved to ...hate_video_1.mp3" messages it prints) - TODO confirm.
    video_stem = os.path.splitext(video_file)[0]
    audio_file = os.path.join(audio_output_dir, f"{video_stem}.mp3")
    if not os.path.isfile(audio_file):
        raise FileNotFoundError(
            f"Expected extracted audio at '{audio_file}' for video '{video_file}'"
        )

    # --- Transcribe Audio & Process Chunks ---
    sentence_result = transcriber.transcribe_audio(
        input_audio=audio_file,    # Audio file extracted from this video
        return_timestamps=True      # Request timestamps with the transcription
    )
    # Extract segmentation info (expected: list of dicts with text and timestamps)
    chunk_info = transcriber.extract_segments_with_timestamps(sentence_result)
    # Create a DataFrame for these chunks
    chunk_df = create_chunk_dataframe(chunk_info)
    # Tag every chunk with its video id, e.g. "video_1"
    chunk_df["video"] = f"video_{idx}"

    all_chunk_dfs.append(chunk_df)


Processing videos:   0%|          | 0/3 [00:00<?, ?it/s]

Extracted 95 frames from the video 'hate_video_1.mp4'
Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\input\audio\hate_video_1.mp3".
Extracted audio from the video 'hate_video_1.mp4'


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Processing videos:  33%|███▎      | 1/3 [00:42<01:24, 42.07s/it]

Extracted 108 frames from the video 'hate_video_2.mp4'
Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\input\audio\hate_video_2.mp3".
Extracted audio from the video 'hate_video_2.mp4'


Processing videos:  67%|██████▋   | 2/3 [01:59<01:02, 62.81s/it]

Extracted 193 frames from the video 'hate_video_3.mp4'
Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\input\audio\hate_video_3.mp3".
Extracted audio from the video 'hate_video_3.mp4'


Processing videos: 100%|██████████| 3/3 [04:36<00:00, 92.17s/it] 


In [9]:
# Stack the per-video tables into one frame table and one transcript-chunk
# table; ignore_index=True renumbers the rows from 0 across all videos.
combined_frame_df = pd.concat(objs=all_frame_dfs, ignore_index=True)
combined_chunk_df = pd.concat(objs=all_chunk_dfs, ignore_index=True)

# Heading and preview are emitted by one print call each, separated by a newline.
print("Combined Frames DataFrame:", combined_frame_df.head(), sep="\n")
print("\nCombined Audio Chunk DataFrame:", combined_chunk_df.head(), sep="\n")

Combined Frames DataFrame:
   timestamp_seconds                 video  frame_number
0              0.000    video_1_frame0.jpg             0
1              1.001   video_1_frame30.jpg            30
2              2.002   video_1_frame60.jpg            60
3              3.003   video_1_frame90.jpg            90
4              4.004  video_1_frame120.jpg           120

Combined Audio Chunk DataFrame:
        timestamp                                               text    video
0    [0.0, 94.96]                         The End Thank you. The End  video_1
1    [0.0, 25.44]   I Like Sugar And the NAACP would sure like to...  video_2
2  [27.42, 32.12]   Roses are red and violets are blue and nigger...  video_2
3   [32.42, 37.8]   But they don't mind cause what the heck, you ...  video_2
4  [38.34, 44.08]   No joke, I ain't got a nickel for a coke and ...  video_2


In [10]:
combined_chunk_df

Unnamed: 0,timestamp,text,video
0,"[0.0, 94.96]",The End Thank you. The End,video_1
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to...,video_2
2,"[27.42, 32.12]",Roses are red and violets are blue and nigger...,video_2
3,"[32.42, 37.8]","But they don't mind cause what the heck, you ...",video_2
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and ...",video_2
...,...,...,...
98,"[198.68, 202.76]","Bitch, clean my car while I drink lemonade.",video_3
99,"[202.76, 203.96]",Ya fuckin' slave.,video_3
100,"[203.96, 205.12]",Melvin on the beat,video_3
101,"[205.12, 208.56]","Buy me two pistols man, fuck all those nigger...",video_3


In [11]:
combined_frame_df

Unnamed: 0,timestamp_seconds,video,frame_number
0,0.000,video_1_frame0.jpg,0
1,1.001,video_1_frame30.jpg,30
2,2.002,video_1_frame60.jpg,60
3,3.003,video_1_frame90.jpg,90
4,4.004,video_1_frame120.jpg,120
...,...,...,...
391,225.600,video_3_frame5640.jpg,5640
392,226.800,video_3_frame5670.jpg,5670
393,228.000,video_3_frame5700.jpg,5700
394,229.200,video_3_frame5730.jpg,5730


In [12]:
# Test data: attach placeholder on-image text to the frames.
# The first few frames get the sample strings below; every remaining frame
# gets an empty string.
sample_texts = ["Hello", "", "World", "", "", "", "", "LOL,that guy is crazy!"]
n_frames = len(combined_frame_df)

# Pad with "" or truncate so the list always has exactly one entry per frame.
# The previous hard-coded "- 8" padding raised a length-mismatch error whenever
# fewer than 8 frames had been extracted.
padding = [""] * max(n_frames - len(sample_texts), 0)
combined_frame_df["text_on_image"] = (sample_texts + padding)[:n_frames]

combined_frame_df.head(10)


Unnamed: 0,timestamp_seconds,video,frame_number,text_on_image
0,0.0,video_1_frame0.jpg,0,Hello
1,1.001,video_1_frame30.jpg,30,
2,2.002,video_1_frame60.jpg,60,World
3,3.003,video_1_frame90.jpg,90,
4,4.004,video_1_frame120.jpg,120,
5,5.005,video_1_frame150.jpg,150,
6,6.006,video_1_frame180.jpg,180,
7,7.007,video_1_frame210.jpg,210,"LOL,that guy is crazy!"
8,8.008,video_1_frame240.jpg,240,
9,9.009,video_1_frame270.jpg,270,


In [13]:
combined_chunk_df

Unnamed: 0,timestamp,text,video
0,"[0.0, 94.96]",The End Thank you. The End,video_1
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to...,video_2
2,"[27.42, 32.12]",Roses are red and violets are blue and nigger...,video_2
3,"[32.42, 37.8]","But they don't mind cause what the heck, you ...",video_2
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and ...",video_2
...,...,...,...
98,"[198.68, 202.76]","Bitch, clean my car while I drink lemonade.",video_3
99,"[202.76, 203.96]",Ya fuckin' slave.,video_3
100,"[203.96, 205.12]",Melvin on the beat,video_3
101,"[205.12, 208.56]","Buy me two pistols man, fuck all those nigger...",video_3


In [96]:
# Work on a copy: the following cells add/drop columns on final_df and rewrite
# its "text" column, and combined_chunk_df should keep the raw transcription.
final_df = combined_chunk_df.copy()
final_df.head()

Unnamed: 0,timestamp,text,video
0,"[0.0, 94.96]",The End Thank you. The End,video_1
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to...,video_2
2,"[27.42, 32.12]",Roses are red and violets are blue and nigger...,video_2
3,"[32.42, 37.8]","But they don't mind cause what the heck, you ...",video_2
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and ...",video_2


In [97]:

# --- Merge on-image text into the transcript chunks ---

# Step 1: split each [start, end] pair in "timestamp" into two scalar columns
# so they can be compared against frame times.
final_df[['start', 'end']] = pd.DataFrame(final_df['timestamp'].tolist(), index=final_df.index)

# Step 2: walk over the frames; every chunk of the same video whose time span
# contains the frame's timestamp gets the frame's text merged in.
for _, frame_row in combined_frame_df.iterrows():
    overlay_text = str(frame_row["text_on_image"]).strip()
    frame_time = frame_row["timestamp_seconds"]

    # "video_1_frame2342.jpg" -> "video_1"
    clip_id = frame_row["video"].split("_frame")[0]

    # Chunks belonging to this video that cover the frame time (bounds inclusive)
    same_clip = final_df["video"] == clip_id
    spans_frame = (final_df["start"] <= frame_time) & (final_df["end"] >= frame_time)
    hits = same_clip & spans_frame

    # combine_text is applied to the selected rows only
    final_df.loc[hits, "text"] = final_df.loc[hits, "text"].apply(
        lambda existing: combine_text(existing, overlay_text)
    )

# Step 3: the helper columns are no longer needed
final_df.drop(columns=["start", "end"], inplace=True)

final_df.head(10)


Unnamed: 0,timestamp,text,video
0,"[0.0, 94.96]",The End Thank you. The End | Hello | World | L...,video_1
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to ...,video_2
2,"[27.42, 32.12]",Roses are red and violets are blue and niggers...,video_2
3,"[32.42, 37.8]","But they don't mind cause what the heck, you g...",video_2
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and I...",video_2
5,"[44.6, 47.8]",So Uncle Sam won't help poor nigger hatin' me.,video_2
6,"[50.04, 52.04]","Jigaboo, jigaboo, where are you?",video_2
7,"[52.24, 54.68]",I was here in the woodpile watching you.,video_2
8,"[55.04, 57.04]","Jigaboo, jigaboo, come out, no.",video_2
9,"[57.14, 60.44]","So I scared all the white men way down south, ...",video_2


In [98]:
# Load the HateMM annotation table (semicolon-delimited CSV).
# Later cells read its "video_file_name", "hate_snippet" and "Audio_hate" columns.
annot_data = pd.read_csv("./HateMM_annotations/HateMM_annotation.csv", delimiter=";")
annot_data

Unnamed: 0,video_file_name,label,hate_snippet,target,Hate_Label,Image_hate,Audio_hate,Text_hate,Frame_hate
0,hate_video_1.mp4,Hate,"[['00:00:34', '00:01:34']]",Blacks,1.0,0.0,0.0,1.0,1.0
1,hate_video_2.mp4,Hate,"[['00:00:06', '00:02:06']]",Blacks,1.0,1.0,1.0,0.0,1.0
2,non_hate_video_1.mp4,Non Hate,,Others,0.0,0.0,0.0,0.0,0.0
3,hate_video_3.mp4,Hate,"[['00:00:03', '00:01:40'], ['00:01:41', '00:03...",Blacks,1.0,1.0,1.0,0.0,1.0
4,non_hate_video_2.mp4,Non Hate,,Blacks,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1078,non_hate_video_650.mp4,Non Hate,,['Whites'],,,,,
1079,hate_video_430.mp4,Hate,"[['00:00:09', '00:04:04']]",['Jews'],,,,,
1080,non_hate_video_651.mp4,Non Hate,,['Others'],,,,,
1081,non_hate_video_652.mp4,Non Hate,,['Others'],,,,,


In [99]:
import ast
import pandas as pd

def hhmmss_to_seconds(time_str):
    """
    Convert a time value to seconds.

    Parameters
    ----------
    time_str : str or number
        Either text in "hh:mm:ss" or "mm:ss" form, text holding a plain number
        of seconds (surrounding whitespace is ignored), or a numeric value
        that is already in seconds.

    Returns
    -------
    float or None
        The time in seconds. None is returned when the value cannot be
        converted (an error message is printed) or when it is NaN.
    """
    # Numeric input has no .strip(); convert it directly, as the original
    # docstring promised, instead of falling into the error path.
    if not isinstance(time_str, str):
        try:
            seconds = float(time_str)
        except (TypeError, ValueError) as e:
            print(f"Error converting '{time_str}' to seconds: {e}")
            return None
        # NaN is the only float that differs from itself; treat it as missing.
        return None if seconds != seconds else seconds

    time_str = time_str.strip()
    try:
        parts = time_str.split(":")
        if len(parts) == 3:
            h, m, s = parts
            return float(h) * 3600 + float(m) * 60 + float(s)
        elif len(parts) == 2:
            m, s = parts
            return float(m) * 60 + float(s)
        else:
            # No colon: a bare number. More than two colons: float() rejects it.
            return float(time_str)
    except ValueError as e:
        print(f"Error converting '{time_str}' to seconds: {e}")
        return None

def parse_and_convert_snippet(snippet):
    """
    Parse one "hate_snippet" entry and convert its time intervals to seconds.

    Parameters
    ----------
    snippet : str, list or other
        Typically a string such as "[['00:00:34','00:01:34']]", the literal
        string "Missing value", an already-parsed list, or a missing value
        (e.g. NaN). A list may hold one interval ([start, end]) or several
        ([[start, end], ...]).

    Returns
    -------
    list of [float, float] or None
        The intervals in seconds. Intervals that are malformed or whose start
        or end cannot be converted are dropped. None is returned when the
        entry is missing, unparsable, or contains no usable interval.
    """
    # 1) Strings: handle the "Missing value" marker, otherwise parse as a literal.
    if isinstance(snippet, str):
        snippet = snippet.strip()
        if snippet == "Missing value":
            return None
        try:
            snippet = ast.literal_eval(snippet)
        except Exception as e:
            print(f"Error parsing snippet: {snippet}. Error: {e}")
            return None

    # 2) Anything that is not a list by now (NaN, numbers, ...) is unusable.
    if not isinstance(snippet, list):
        return None

    def convert_interval_to_seconds(interval):
        """Return [start, end] in seconds, or None if the interval is unusable."""
        if isinstance(interval, list) and len(interval) == 2:
            start = hhmmss_to_seconds(interval[0])
            end = hhmmss_to_seconds(interval[1])
            # A failed conversion yields None; such an interval cannot be
            # compared numerically later on, so it is rejected here.
            if start is not None and end is not None:
                return [start, end]
        return None

    # 3) A first element that is itself a list means "list of intervals";
    #    otherwise the snippet is treated as one single interval.
    if snippet and isinstance(snippet[0], list):
        candidates = snippet
    else:
        candidates = [snippet]

    converted = []
    for candidate in candidates:
        interval = convert_interval_to_seconds(candidate)
        if interval is not None:
            converted.append(interval)

    # Both shapes return None (not an empty list) when nothing usable remains.
    return converted or None

# ----------------- Convert the annotation snippets -----------------
# Run every "hate_snippet" entry of annot_data through parse_and_convert_snippet
# and keep the result in a new "converted_hate_snippet" column.
converted_snippets = annot_data["hate_snippet"].apply(parse_and_convert_snippet)
annot_data["converted_hate_snippet"] = converted_snippets

# Show raw and converted values side by side for the first rows as a sanity check
preview_columns = ["hate_snippet", "converted_hate_snippet"]
print(annot_data[preview_columns].head(10))


                                        hate_snippet  \
0                         [['00:00:34', '00:01:34']]   
1                         [['00:00:06', '00:02:06']]   
2                                                NaN   
3  [['00:00:03', '00:01:40'], ['00:01:41', '00:03...   
4                                                NaN   
5                         [['00:00:00', '00:00:13']]   
6                                                NaN   
7                                                NaN   
8                         [['00:00:00', '00:00:39']]   
9                                                NaN   

           converted_hate_snippet  
0                  [[34.0, 94.0]]  
1                  [[6.0, 126.0]]  
2                            None  
3  [[3.0, 100.0], [101.0, 207.0]]  
4                            None  
5                   [[0.0, 13.0]]  
6                            None  
7                            None  
8                   [[0.0, 39.0]]  
9                      

In [100]:
import re
import pandas as pd

# ------------------ Helper Functions ------------------
def get_vid_num_from_final(video_str):
    """
    Return the number that follows the first "video_" in `video_str` as an int.

    Examples: "video_2" -> 2, "video_2_frame2342.jpg" -> 2.
    Returns None when the string contains no "video_<digits>" part.
    """
    numbers = re.findall(r"video_(\d+)", video_str)
    return int(numbers[0]) if numbers else None

def get_vid_num_from_annot(fname):
    """
    Return the number in an annotation file name of the form "..._video_<N>.mp4".

    Examples: "hate_video_2.mp4" -> 2, "non_hate_video_2.mp4" -> 2 (the
    "hate" / "non_hate" prefix does not influence the result).
    Returns None when the name does not contain "_video_<digits>.mp4".
    """
    hit = re.search(r"_video_(\d+)\.mp4", fname)
    if hit is None:
        return None
    return int(hit.group(1))

def intervals_intersect(intervalA, intervalB):
    """
    Tell whether intervals [startA, endA] and [startB, endB] overlap.

    Both comparisons are strict, so intervals that merely touch at an
    endpoint (e.g. [0, 5] and [5, 8]) do not count as intersecting.
    """
    a_starts_before_b_ends = intervalA[0] < intervalB[1]
    return a_starts_before_b_ends and intervalB[0] < intervalA[1]

# ------------------ Process final_df ------------------
# final_df is expected to have a "timestamp" column of [start, end] pairs
# (e.g. [0.0, 94.96]) and a "video" column (e.g. "video_2").
final_df["vid_num"] = final_df["video"].apply(get_vid_num_from_final)
# Unpack the "timestamp" column into "start" and "end" for easy interval comparison.
final_df[['start', 'end']] = pd.DataFrame(final_df["timestamp"].tolist(), index=final_df.index)

# ------------------ Process annot_data ------------------
# annot_data is expected to have:
#   - "video_file_name": e.g. "hate_video_2.mp4" or "non_hate_video_2.mp4"
#   - "Audio_hate": rows equal to 1 contribute their intervals below
#   - "converted_hate_snippet": intervals in seconds (e.g. [[34, 94]]) or None
annot_data["vid_num"] = annot_data["video_file_name"].apply(get_vid_num_from_annot)

# Build a dictionary mapping vid_num -> list of hate intervals, using only the
# rows whose "Audio_hate" flag equals 1.
# NOTE(review): "hate_video_N.mp4" and "non_hate_video_N.mp4" share the same
# vid_num, and final_df rows are matched on that number alone - confirm that the
# "video_N" ids in final_df really refer to the hate_video_N files.
hate_info = {}
for _, row in annot_data.iterrows():
    if row["Audio_hate"] == 1:
        vn = row["vid_num"]
        intervals = row["converted_hate_snippet"]
        # Normalise to a list of intervals:
        if isinstance(intervals, list):
            # A single interval ([34, 94]) is wrapped into [[34, 94]]
            if intervals and not isinstance(intervals[0], list):
                intervals = [intervals]
        else:
            intervals = []
        # Keep only complete [start, end] pairs; a None endpoint would make the
        # numeric comparison in intervals_intersect fail.
        usable = [
            iv for iv in intervals
            if isinstance(iv, list) and len(iv) == 2 and None not in iv
        ]
        # Always extend a list owned by hate_info. Storing `intervals` itself
        # would alias the list object held in annot_data, and a later extend()
        # for the same vid_num would then silently modify the annotations.
        hate_info.setdefault(vn, []).extend(usable)

# ------------------ Label Each Row in final_df ------------------
# A chunk is labelled "Hate" when its time span overlaps at least one hate
# interval recorded for its video number; otherwise it stays "Non Hate".
labels = []
for _, row in final_df.iterrows():
    current_interval = [row["start"], row["end"]]
    vn = row["vid_num"]
    is_hate = any(
        intervals_intersect(current_interval, h_interval)
        for h_interval in hate_info.get(vn, [])
    )
    labels.append("Hate" if is_hate else "Non Hate")
final_df["audio_hate"] = labels

# ------------------ Cleanup Temporary Columns ------------------
# errors="ignore" keeps this safe if a helper column is already gone.
final_df.drop(columns=["start", "end", "vid_num"], inplace=True, errors="ignore")

# Display the first few rows of the updated final_df with the new label.
print(final_df.head())


        timestamp                                               text    video  \
0    [0.0, 94.96]  The End Thank you. The End | Hello | World | L...  video_1   
1    [0.0, 25.44]  I Like Sugar And the NAACP would sure like to ...  video_2   
2  [27.42, 32.12]  Roses are red and violets are blue and niggers...  video_2   
3   [32.42, 37.8]  But they don't mind cause what the heck, you g...  video_2   
4  [38.34, 44.08]  No joke, I ain't got a nickel for a coke and I...  video_2   

  audio_hate  
0   Non Hate  
1       Hate  
2       Hate  
3       Hate  
4       Hate  


In [101]:
final_df

Unnamed: 0,timestamp,text,video,audio_hate
0,"[0.0, 94.96]",The End Thank you. The End | Hello | World | L...,video_1,Non Hate
1,"[0.0, 25.44]",I Like Sugar And the NAACP would sure like to ...,video_2,Hate
2,"[27.42, 32.12]",Roses are red and violets are blue and niggers...,video_2,Hate
3,"[32.42, 37.8]","But they don't mind cause what the heck, you g...",video_2,Hate
4,"[38.34, 44.08]","No joke, I ain't got a nickel for a coke and I...",video_2,Hate
...,...,...,...,...
98,"[198.68, 202.76]","Bitch, clean my car while I drink lemonade.",video_3,Hate
99,"[202.76, 203.96]",Ya fuckin' slave.,video_3,Hate
100,"[203.96, 205.12]",Melvin on the beat,video_3,Hate
101,"[205.12, 208.56]","Buy me two pistols man, fuck all those niggers...",video_3,Hate
