In [9]:

# Essential imports from both notebooks
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import librosa
import soundfile as sf
from scipy.io import wavfile
from IPython.display import Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, pipeline



# Integrated Speech-to-Text and Hate Speech Detection Notebook

This notebook combines functionalities to:
1. Convert audio input into text using a Speech-to-Text model.
2. Analyze the transcribed text for hate speech using a text classification model.


In [10]:

# Load Wav2Vec2 model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

def speech_to_text(audio_path):
    # Load audio file
    audio_input, _ = librosa.load(audio_path, sr=16000)
    
    # Tokenize and predict
    input_values = tokenizer(audio_input, return_tensors="pt", padding="longest").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    
    # Decode transcription
    transcription = tokenizer.decode(predicted_ids[0])
    print(f"Transcription: {transcription}")
    return transcription


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:

# Load hate speech detection model (Hugging Face pipeline)
def load_hate_speech_model():
    print("Loading hate speech detection model...")
    model = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-offensive")
    return model

# Analyze text for hate speech
def analyze_text_for_hate_speech(model, text):
    result = model(text)
    print(f"Analysis: {result}")
    return result


In [12]:

# Main workflow to combine speech-to-text and hate speech detection
def main(audio_path):
    # Step 1: Speech-to-text transcription
    transcription = speech_to_text(audio_path)
    if transcription:
        # Step 2: Load hate speech detection model
        model = load_hate_speech_model()
        
        # Step 3: Analyze transcription for hate speech
        result = analyze_text_for_hate_speech(model, transcription)
        return result
    else:
        print("No transcription available for analysis.")
        return None

# Run the workflow (provide the path to an audio file)
if __name__ == "__main__":
    audio_file_path = "C:/Users/eshaa/Downloads/Myaudio.wav"
    output = main(audio_file_path)
    print(f"Final Output: {output}")


Transcription: HELLO MY NAME IS ESHA
Loading hate speech detection model...


config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cpu


Analysis: [{'label': 'non-offensive', 'score': 0.8386955857276917}]
Final Output: [{'label': 'non-offensive', 'score': 0.8386955857276917}]


Error while downloading from https://cdn-lfs.hf.co/cardiffnlp/twitter-roberta-base-offensive/35ebad0cb76d64c2ce454fc8514c69408ebf8cba6909f454aacbe28e07a0393d?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1737556288&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNzU1NjI4OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9jYXJkaWZmbmxwL3R3aXR0ZXItcm9iZXJ0YS1iYXNlLW9mZmVuc2l2ZS8zNWViYWQwY2I3NmQ2NGMyY2U0NTRmYzg1MTRjNjk0MDhlYmY4Y2JhNjkwOWY0NTRhYWNiZTI4ZTA3YTAzOTNkP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=tRNZJBfUwLJcxEeaUyKPJrmYz7MPQSBlVpQlsjjtVG660e6vBWUSYrVcylNJNcw9eyasCEWLkTrhq3y%7EpEuIwVPiVT48CrbLfmx6gRCUuC%7EZPoOZlxHI8pyMdcuIUjRAkGo-G0S04how1vqsyM-RY68%7ENSw87WHdRM3olnnFnJf8vUrgB14cjhkF-ZUvDfk8grkbFdNO9xoAwD08L5PLNRoZD3kYl2F-agrMrzVWxp2ENhtlZg9B6EQYV-ejutPzfywT6ecwFQN-YRgJgFiCzY90JPs3Q6i%7EmNFY-vzapUC8cjHArRy2iGoN8qgS%7ER6rFRzJrgtruPPUYNVlQ69JMQ

model.safetensors:  74%|#######3  | 367M/499M [00:00<?, ?B/s]