In [2]:
%load_ext autoreload
%autoreload 2

In [45]:
from audio2text import AudioTranscriber # 16sec
from utils import extract_audio, extract_frames, create_chunk_dataframe, combine_text
import os
import pandas as pd
from tqdm import tqdm

In [None]:
transcriber = AudioTranscriber() # 9sec

  from pandas.core import (
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
extract_audio(input_path="./input/video/hate.mp4", output_dir="./input/audio") 

Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\input\audio\hate.mp3".


In [5]:
#Getting the audio transcription 
sentence_result = transcriber.transcribe_audio(
    input_audio="./input/audio/hate.mp3",
    return_timestamps=True  # Instead of "word"
)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


In [None]:
# Example functions assumed to be defined:
# from your_module import extract_frames, extract_audio, create_chunk_dataframe
# And an instance of AudioTranscriber is created as follows:
# transcriber = AudioTranscriber(chunk_length=30)

# Directories
video_dir = "./input/video"
frames_output_dir = "./input/frames"
audio_output_dir = "./input/audio"

# Create output directories if they do not exist
os.makedirs(frames_output_dir, exist_ok=True)
os.makedirs(audio_output_dir, exist_ok=True)

# List video files (adjust extensions as needed)
video_extensions = ('.mp4', '.avi', '.mkv')
video_files = [f for f in os.listdir(video_dir) if f.lower().endswith(video_extensions)]

# Lists to hold DataFrames for frames and audio chunks
all_frame_dfs = []
all_chunk_dfs = []

for idx, video_file in tqdm(enumerate(video_files, start=1), total=len(video_files), desc="Processing videos"):
    input_path = os.path.join(video_dir, video_file)
    
    # Create subfolder for frames of this video
    video_frames_dir = os.path.join(frames_output_dir, f"video_{idx}")
    os.makedirs(video_frames_dir, exist_ok=True)
    
    # --- Extract Frames ---
    frame_count, frame_data = extract_frames(
        input_path,
        video_frames_dir,
        video_index=idx,
        frequency=30  # Adjust extraction frequency if needed
    )
    print(f"Extracted {frame_count} frames from the video '{video_file}'")
    
    # Modify frame_data: create a 'video' column in the format "video_1_frame2342.jpg"
    # Assumes that frame_data has a column "frame_number" (the frame id)
    frame_data["video"] = frame_data["frame_number"].apply(
        lambda f: f"video_{idx}_frame{f}.jpg"
    )

    frame_data = frame_data[["timestamp_seconds", "video","frame_number"]]
    all_frame_dfs.append(frame_data)
    
    # --- Extract Audio ---
    # Here we assume extract_audio writes an audio file for each video to audio_output_dir.
    # For example, it might create a file named "hate_video_{idx}.mp3".
    extract_audio(
        input_path=input_path,
        output_dir=audio_output_dir
    )
    print(f"Extracted audio from the video '{video_file}'")
    
    # Adjust the input_audio to reflect the actual extracted audio filename.
    # Here, if extract_audio creates "hate_video_{idx}.mp3":
    audio_file = os.path.join(audio_output_dir, f"hate_video_{idx}.mp3")
    
    # --- Transcribe Audio & Process Chunks ---
    sentence_result = transcriber.transcribe_audio(
        input_audio=audio_file,    # Use the actual audio file path for this video
        return_timestamps=True      # Use chunk-level timestamps
    )
    # Extract segmentation info (list of dicts with keys: "text", "start", "end")
    chunk_info = transcriber.extract_segments_with_timestamps(sentence_result)
    # Create a DataFrame for these chunks
    chunk_df = create_chunk_dataframe(chunk_info)
    # Add video ID column to chunk_df, e.g., "video_1"
    chunk_df["video"] = f"video_{idx}"

    all_chunk_dfs.append(chunk_df)


Processing videos:   0%|          | 0/2 [00:00<?, ?it/s]

Extracted 193 frames from the video 'hate_video_1.mp4'
Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\input\audio\hate_video_1.mp3".
Extracted audio from the video 'hate_video_1.mp4'


Processing videos:  50%|█████     | 1/2 [02:41<02:41, 161.59s/it]

Extracted 33 frames from the video 'hate_video_2.mp4'
Success : audio file has been saved to "c:\Users\Mert\OneDrive\Desktop\Deep_Learning_Project\NoHateZone\input\audio\hate_video_2.mp3".
Extracted audio from the video 'hate_video_2.mp4'


Processing videos: 100%|██████████| 2/2 [03:06<00:00, 93.04s/it] 


In [21]:
# Concatenate all DataFrames for frames and for audio chunks respectively
combined_frame_df = pd.concat(all_frame_dfs, ignore_index=True)
combined_chunk_df = pd.concat(all_chunk_dfs, ignore_index=True)

print("Combined Frames DataFrame:")
print(combined_frame_df.head())

print("\nCombined Audio Chunk DataFrame:")
print(combined_chunk_df.head())

Combined Frames DataFrame:
   timestamp_seconds                 video  frame_number
0                0.0    video_1_frame0.jpg             0
1                1.2   video_1_frame30.jpg            30
2                2.4   video_1_frame60.jpg            60
3                3.6   video_1_frame90.jpg            90
4                4.8  video_1_frame120.jpg           120

Combined Audio Chunk DataFrame:
    timestamp                              text    video
0  [0.0, 2.0]        I simply must get through!  video_1
1  [2.0, 4.0]       Sorry, you're much too big!  video_1
2  [4.0, 5.0]                         Yeah man!  video_1
3  [5.0, 7.0]   Bobby two pistols in his bitch!  video_1
4  [7.0, 8.0]                         Yeah man!  video_1


In [22]:
combined_chunk_df

Unnamed: 0,timestamp,text,video
0,"[0.0, 2.0]",I simply must get through!,video_1
1,"[2.0, 4.0]","Sorry, you're much too big!",video_1
2,"[4.0, 5.0]",Yeah man!,video_1
3,"[5.0, 7.0]",Bobby two pistols in his bitch!,video_1
4,"[7.0, 8.0]",Yeah man!,video_1
...,...,...,...
87,"[20.0, 24.0]","So, pretty much protected, because none of us...",video_2
88,"[24.0, 25.32]",Hate speech.,video_2
89,"[25.96, 27.26]","Well, I love this.",video_2
90,"[28.78, 29.52]",Thank you.,video_2


In [23]:
combined_frame_df

Unnamed: 0,timestamp_seconds,video,frame_number
0,0.000,video_1_frame0.jpg,0
1,1.200,video_1_frame30.jpg,30
2,2.400,video_1_frame60.jpg,60
3,3.600,video_1_frame90.jpg,90
4,4.800,video_1_frame120.jpg,120
...,...,...,...
221,28.028,video_2_frame840.jpg,840
222,29.029,video_2_frame870.jpg,870
223,30.030,video_2_frame900.jpg,900
224,31.031,video_2_frame930.jpg,930


In [28]:
#Test with texts on the images
# add a column called text_on_image which is a list ["Hello","","World","","","","","LOL,that guy is crazy!",...] fill the rest with empty strings
combined_frame_df["text_on_image"] = [
    "Hello", "", "World", "", "", "", "", "LOL,that guy is crazy!"
] + [""] * (len(combined_frame_df) - 8)


combined_frame_df.head(10)


Unnamed: 0,timestamp_seconds,video,frame_number,text_on_image
0,0.0,video_1_frame0.jpg,0,Hello
1,1.2,video_1_frame30.jpg,30,
2,2.4,video_1_frame60.jpg,60,World
3,3.6,video_1_frame90.jpg,90,
4,4.8,video_1_frame120.jpg,120,
5,6.0,video_1_frame150.jpg,150,
6,7.2,video_1_frame180.jpg,180,
7,8.4,video_1_frame210.jpg,210,"LOL,that guy is crazy!"
8,9.6,video_1_frame240.jpg,240,
9,10.8,video_1_frame270.jpg,270,


In [36]:
combined_chunk_df

Unnamed: 0,timestamp,text,video
0,"[0.0, 2.0]",I simply must get through!,video_1
1,"[2.0, 4.0]","Sorry, you're much too big!",video_1
2,"[4.0, 5.0]",Yeah man!,video_1
3,"[5.0, 7.0]",Bobby two pistols in his bitch!,video_1
4,"[7.0, 8.0]",Yeah man!,video_1
...,...,...,...
87,"[20.0, 24.0]","So, pretty much protected, because none of us...",video_2
88,"[24.0, 25.32]",Hate speech.,video_2
89,"[25.96, 27.26]","Well, I love this.",video_2
90,"[28.78, 29.52]",Thank you.,video_2


In [71]:
final_df = combined_chunk_df.copy()
final_df.head()

Unnamed: 0,timestamp,text,video
0,"[0.0, 2.0]",I simply must get through!,video_1
1,"[2.0, 4.0]","Sorry, you're much too big!",video_1
2,"[4.0, 5.0]",Yeah man!,video_1
3,"[5.0, 7.0]",Bobby two pistols in his bitch!,video_1
4,"[7.0, 8.0]",Yeah man!,video_1


In [72]:

# --- Main Logic ---

# 1) Unpack the "timestamp" column from final_df into "start" and "end" columns
final_df[['start', 'end']] = pd.DataFrame(final_df['timestamp'].tolist(), index=final_df.index)

# 2) For each row in combined_frame_df, find matching rows in final_df and update the text
for _, frame in combined_frame_df.iterrows():
    # Extract the numeric timestamp and text from the frame row
    t = frame["timestamp_seconds"]
    text_on_image = str(frame["text_on_image"]).strip()
    
    # We assume the "video" column in combined_frame_df is something like "video_1_frame2342.jpg"
    video_id = frame["video"].split("_frame")[0]
    
    # Build a mask to find the matching rows in final_df
    mask = (
        (final_df["video"] == video_id) &
        (final_df["start"] <= t) &
        (final_df["end"] >= t)
    )
    
    # Apply the combine_text function only on the masked rows
    final_df.loc[mask, "text"] = final_df.loc[mask, "text"].apply(
        lambda current_text: combine_text(current_text, text_on_image)
    )

# 3) (Optional) Drop the columns "start" and "end" if not needed anymore
final_df.drop(columns=["start", "end"], inplace=True)

final_df.head(10)


Unnamed: 0,timestamp,text,video
0,"[0.0, 2.0]",I simply must get through! | Hello,video_1
1,"[2.0, 4.0]","Sorry, you're much too big! | World",video_1
2,"[4.0, 5.0]",Yeah man!,video_1
3,"[5.0, 7.0]",Bobby two pistols in his bitch!,video_1
4,"[7.0, 8.0]",Yeah man!,video_1
5,"[8.0, 10.0]","Fuck all those niggers man! | LOL,that guy is ...",video_1
6,"[10.0, 12.0]",Martin Luther King deserved to be killed!,video_1
7,"[12.0, 14.0]",And fuck your nigger!,video_1
8,"[14.0, 16.0]",Just grab and steal your shit man!,video_1
9,"[16.0, 18.0]",Fuck those niggers man!,video_1


In [73]:
annot_data = pd.read_csv("./HateMM_annotations/HateMM_annotation.csv", delimiter=";")
annot_data = annot_data[annot_data["video_file_name"] == "hate_video_3.mp4"]
annot_data

Unnamed: 0,video_file_name,label,hate_snippet,target,Hate_Label,Image_hate,Audio_hate,Text_hate,Frame_hate
3,hate_video_3.mp4,Hate,"[['00:00:03', '00:01:40'], ['00:01:41', '00:03...",Blacks,1.0,1.0,1.0,0.0,1.0


In [74]:
import pandas as pd

def intervals_intersect(intervalA, intervalB):
    """
    Check if two intervals [a_start, a_end] and [b_start, b_end] intersect.
    They intersect if a_start < b_end AND b_start < a_end.
    """
    return intervalA[0] < intervalB[1] and intervalB[0] < intervalA[1]

# --- STEP 1: FILTER ANNOT_DATA FOR "Hate" ROWS & FLATTEN THE INTERVALS ---
hate_intervals = []

# We'll only collect intervals from rows whose "label" is "Hate".
hate_rows = annot_data[annot_data["label"] == "Hate"]

for _, row in hate_rows.iterrows():
    snippet = row["hate_snippet"]
    # snippet could be a single interval [start, end] or a list of intervals [[s1, e1], [s2, e2], ...]
    if isinstance(snippet, list):
        # If snippet[0] is itself a list, it means multiple intervals
        if snippet and isinstance(snippet[0], list):
            # multiple intervals
            for inter in snippet:
                hate_intervals.append(inter)
        else:
            # single interval
            hate_intervals.append(snippet)
    # If not a list, you might skip or handle differently

# --- STEP 2: ADD "start" AND "end" COLUMNS TO final_df FOR EASIER COMPARISON ---
final_df[["start", "end"]] = pd.DataFrame(final_df["timestamp"].tolist(), index=final_df.index)

# --- STEP 3: LABEL THE ROWS IN final_df ---
labels = []
for _, row in final_df.iterrows():
    interval = [row["start"], row["end"]]
    # By default assume "Non Hate"
    row_label = "Non Hate"
    
    # Check if this interval intersects with any hate interval
    for hate_interval in hate_intervals:
        if intervals_intersect(interval, hate_interval):
            row_label = "Hate"
            break  # No need to check further
    
    labels.append(row_label)

final_df["label"] = labels

# (Optional) Remove "start" and "end" if not needed anymore
final_df.drop(columns=["start", "end"], inplace=True)

# Now final_df["label"] is "Hate" if the interval intersects with any snippet, otherwise "Non Hate".
print(final_df.head())


    timestamp                                 text    video     label
0  [0.0, 2.0]   I simply must get through! | Hello  video_1  Non Hate
1  [2.0, 4.0]  Sorry, you're much too big! | World  video_1  Non Hate
2  [4.0, 5.0]                            Yeah man!  video_1  Non Hate
3  [5.0, 7.0]      Bobby two pistols in his bitch!  video_1  Non Hate
4  [7.0, 8.0]                            Yeah man!  video_1  Non Hate


In [76]:
final_df.head(10)

Unnamed: 0,timestamp,text,video,label
0,"[0.0, 2.0]",I simply must get through! | Hello,video_1,Non Hate
1,"[2.0, 4.0]","Sorry, you're much too big! | World",video_1,Non Hate
2,"[4.0, 5.0]",Yeah man!,video_1,Non Hate
3,"[5.0, 7.0]",Bobby two pistols in his bitch!,video_1,Non Hate
4,"[7.0, 8.0]",Yeah man!,video_1,Non Hate
5,"[8.0, 10.0]","Fuck all those niggers man! | LOL,that guy is ...",video_1,Non Hate
6,"[10.0, 12.0]",Martin Luther King deserved to be killed!,video_1,Non Hate
7,"[12.0, 14.0]",And fuck your nigger!,video_1,Non Hate
8,"[14.0, 16.0]",Just grab and steal your shit man!,video_1,Non Hate
9,"[16.0, 18.0]",Fuck those niggers man!,video_1,Non Hate
