In [1]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from audio2text import AudioTranscriber # 16sec
from utils import extract_audio
import pandas as pd
from pydub import AudioSegment
from pydub.generators import Sine
import torch.nn.functional as F
import re
import subprocess
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Speech-to-text helper from the project's audio2text module.
transcriber = AudioTranscriber()  # 9sec (original author's timing note)

# The fine-tuned classifier and its tokenizer are stored in the same checkpoint directory.
model_path = "checkpoints_v1/distilbert_hatespeech"

distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_path)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
distilbert_model.eval()

Device set to use cuda:0


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Quick sanity check of the classifier on one hand-written sentence.
text = "they are not good people"
inputs = distilbert_tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = distilbert_model(**inputs)
    logits = outputs.logits
    print(logits)
    # Index of the largest logit; a later cell treats label 1 as content to beep.
    predicted_class = logits.argmax(dim=-1).item()

print("Predicted class:", predicted_class)

tensor([[ 1.1747, -1.2425]])
Predicted class: 0


In [4]:
# Extract the audio track of the source video into ./input/audio
# (the recorded run reports an .mp3 written there).
extract_audio(
    input_path="./input/video/hate_video_418.mp4",
    output_dir="./input/audio",
)

Success : audio file has been saved to "c:\Users\amene\Documents\deep learning\NoHateZone\input\audio\hate_video_418.mp3".


In [5]:
# Transcribe the extracted audio and request word-level timestamps; a later
# cell reads them from transcribtion["chunks"] to locate sentences in time.
transcribtion = transcriber.transcribe_audio(
    input_audio="./input/audio/hate_video_418.mp3",
    return_timestamps="word",
)  # 1m50 for 30sec audio

print(transcribtion["text"])

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


 The Jews clearly control the media and the banks, investment banks, not the commercial ones. But the point is, they carry out in those realms the exact same principles that they display in sexuality. They undermine traditional life, and they deracinate society. Deracinate. Tear out the roots. Real people derives this genius from the land, from the sun, from the sea, from the soil, you know? This is how they know know themselves like jews don't even have soil israel yeah those aren't jews of course they're jews notice the israelis it's a fundamentally secular society they no longer need judaism because they have soil because the real jew is a wanderer he's a nomad he's got no roots and no attachments, so he universalizes everything. He can't hammer a nail or plow a field. All he can do is buy and sell and invest capital and manipulate markets, and, you know, it's like all mental. He takes a life of a people that's rooted in soil, and then he turns it into this cosmopolitan culture base

In [8]:
# Cut the transcript into sentences at every '.', dropping blank pieces.
sentences = []
for piece in transcribtion["text"].split('.'):
    cleaned = piece.strip()
    if cleaned:
        sentences.append(cleaned)
print(sentences)

# One record per sentence: predicted label plus both class probabilities.
labels = []
for sentence in sentences:
    inputs = distilbert_tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
        logits = outputs.logits
        # Softmax over the class dimension, then drop the batch dimension.
        probs = F.softmax(logits, dim=-1).squeeze()
        predicted_class = torch.argmax(probs).item()
        prob_class_0 = round(probs[0].item(), 3)
        prob_class_1 = round(probs[1].item(), 3)

    record = {
        "sentence": sentence,
        "label": predicted_class,
        "prob_class_0": prob_class_0,
        "prob_class_1": prob_class_1,
    }
    labels.append(record)

df = pd.DataFrame(labels)
df.head()

['The Jews clearly control the media and the banks, investment banks, not the commercial ones', 'But the point is, they carry out in those realms the exact same principles that they display in sexuality', 'They undermine traditional life, and they deracinate society', 'Deracinate', 'Tear out the roots', 'Real people derives this genius from the land, from the sun, from the sea, from the soil', "You know, this is how they know know themselves like jews don't even have soil israel yeah those aren't jews of course they're jews notice the israelis it's a fundamentally secular society they no longer need judaism because they have soil because the real jew is a wanderer he's a nomad he's got no roots and no attachments, so he universalizes everything", "He can't hammer a nail or plow a field", "All he can do is buy and sell and invest capital and manipulate markets, and, you know, it's like all mental", "He takes a life of a people that's rooted in soil, and then he turns it into this cosmop

Unnamed: 0,sentence,label,prob_class_0,prob_class_1
0,The Jews clearly control the media and the ban...,1,0.008,0.992
1,"But the point is, they carry out in those real...",0,1.0,0.0
2,"They undermine traditional life, and they dera...",1,0.005,0.995
3,Deracinate,0,1.0,0.0
4,Tear out the roots,0,0.917,0.083


In [9]:
def normalize(word):
    """Return `word` lower-cased with every non-word character removed.

    Characters matched by the regex class ``[^\\w]`` (punctuation, whitespace,
    apostrophes, ...) are deleted; letters, digits and underscores are kept.
    """
    stripped = re.sub(r"[^\w]", "", word)
    return stripped.lower()

# (start_seconds, end_seconds) spans of audio to be replaced by a beep.
beep_intervals = []

# Word-level transcript: one {"word", "start", "end"} record per timestamped chunk.
transcript_words = []
for chunk in transcribtion.get("chunks", []):
    word = normalize(chunk["text"])
    if not word:
        # Chunks that normalize to nothing (e.g. pure punctuation) are dropped,
        # mirroring the filtering applied to sentence words below; keeping an
        # empty entry here would make the exact window comparison fail.
        continue

    start = chunk["timestamp"][0]
    end = chunk["timestamp"][1]
    if end is None:
        # The transcriber can leave the end time unset (see the "did not predict
        # an ending timestamp" warning in the transcription output). Fall back to
        # the start so the formatting and millisecond arithmetic downstream do
        # not fail on None; that word is then treated as zero-length.
        end = start

    transcript_words.append({"word": word, "start": start, "end": end})

transcript_word_list = [w["word"] for w in transcript_words]

# For each distinct flagged sentence, the transcript index at which the next
# search should start. This lets a sentence that is flagged more than once map
# to successive occurrences instead of hitting the first occurrence again.
resume_at = {}

for _, row in df.iterrows():
    # Only sentences classified as label 1 are censored.
    if row["label"] != 1:
        continue

    sentence = row["sentence"]
    sentence_words = [w for w in map(normalize, sentence.split()) if w]

    if not sentence_words:
        continue

    key = tuple(sentence_words)
    n_words = len(sentence_words)

    # Sliding window to find an exact word-for-word match in the transcript.
    for i in range(resume_at.get(key, 0), len(transcript_word_list) - n_words + 1):
        if transcript_word_list[i:i + n_words] == sentence_words:
            start_time = transcript_words[i]["start"]
            end_time = transcript_words[i + n_words - 1]["end"]
            beep_intervals.append((start_time, end_time))
            resume_at[key] = i + n_words
            print(f"[MATCH] '{sentence}' → BEEP from {start_time:.2f}s to {end_time:.2f}s")
            break
    else:
        # Surface flagged sentences that could not be aligned, rather than
        # silently leaving them uncensored.
        print(f"[NO MATCH] '{sentence}' could not be located in the word timestamps")

[MATCH] 'The Jews clearly control the media and the banks, investment banks, not the commercial ones' → BEEP from 0.10s to 4.30s
[MATCH] 'They undermine traditional life, and they deracinate society' → BEEP from 9.18s to 13.90s
[MATCH] 'All he can do is buy and sell and invest capital and manipulate markets, and, you know, it's like all mental' → BEEP from 50.04s to 55.94s
[MATCH] 'Marx, Freud, Einstein, what have they given us? Communism, infantile sexuality, and the atom bomb' → BEEP from 69.32s to 76.16s
[MATCH] 'They've ripped us out of a world of order and reason' → BEEP from 81.72s to 83.84s
[MATCH] 'Why? Because it's the deepest impulse of a Jewish soul to pull at the very fabric of life till there's nothing left but a thread' → BEEP from 94.10s to 101.44s


In [None]:
BEEP_FREQ_HZ = 1000   # frequency of the censor tone
BEEP_GAIN_DB = -3.0   # gain applied to the generated tone

audio = AudioSegment.from_mp3("./input/audio/hate_video_418.mp3")
audio_len_ms = len(audio)

# Convert the intervals to milliseconds, clamp them to the audio bounds and merge
# any that overlap or repeat. Without merging, a duplicate or overlapping interval
# would contribute an empty audio slice plus a full-length beep, making the
# censored track longer than the source.
merged_ms = []
for start_sec, end_sec in sorted(beep_intervals):
    start_ms = max(0, min(int(start_sec * 1000), audio_len_ms))
    end_ms = max(start_ms, min(int(end_sec * 1000), audio_len_ms))
    if merged_ms and start_ms <= merged_ms[-1][1]:
        merged_ms[-1][1] = max(merged_ms[-1][1], end_ms)
    else:
        merged_ms.append([start_ms, end_ms])

# Rebuild the track: original audio between intervals, a tone inside each interval.
censored_audio = AudioSegment.empty()
current_pos = 0

for start_ms, end_ms in merged_ms:
    censored_audio += audio[current_pos:start_ms]
    duration = end_ms - start_ms
    if duration > 0:
        beep = Sine(BEEP_FREQ_HZ).to_audio_segment(duration=duration).apply_gain(BEEP_GAIN_DB)
        censored_audio += beep
    current_pos = end_ms

# Whatever follows the last censored interval is kept unchanged.
censored_audio += audio[current_pos:]

output_path = "./output/audio/censored_hate_video_418.wav"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
censored_audio.export(output_path, format="wav")

print("Censored audio saved as", output_path)


Censored file saved as ./output/audio/censored_hate_video_418.wav


In [19]:
input_video = "./input/video/hate_video_418.mp4"
input_audio = "./output/audio/censored_hate_video_418.wav"
output_video = "./output/video/censored_hate_video_418.mp4"
os.makedirs(os.path.dirname(output_video), exist_ok=True)

# Combine the original video stream with the censored audio track.
cmd = [
    "ffmpeg",
    "-y",               # overwrite an existing output; global options go before the files
    "-i", input_video,
    "-i", input_audio,
    "-c:v", "copy",     # copy the video stream without re-encoding
    "-c:a", "aac",      # encode the audio track as AAC
    "-map", "0:v:0",    # video: first video stream of the first input
    "-map", "1:a:0",    # audio: first audio stream of the second input
    output_video,
]

result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Do not report success when ffmpeg failed; raise with its diagnostics instead.
if result.returncode != 0:
    stderr_text = result.stderr.decode(errors="replace")
    raise RuntimeError(f"ffmpeg failed with exit code {result.returncode}:\n{stderr_text}")

print("Censored video saved as:", output_video)

Censored video saved as: ./output/video/censored_hate_video_418.mp4
