In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [3]:
pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-1.0.2-py3-none-any.whl.metadata (23 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9->speechbrain)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.9->speechbrain)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3

In [4]:
!pip install SpeechRecognition pydub

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub, SpeechRecognition
Successfully installed SpeechRecognition-3.14.1 pydub-0.25.1


In [27]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Optional audio dependencies: the notebook keeps working without them, and
# AUDIO_PROCESSING_AVAILABLE records whether both imports succeeded.
try:
    import speech_recognition as sr
    from pydub import AudioSegment
except ImportError:
    AUDIO_PROCESSING_AVAILABLE = False
    print("Audio processing libraries not available - run the installation code first")
else:
    AUDIO_PROCESSING_AVAILABLE = True
    print("Audio processing libraries successfully imported")

import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

Audio processing libraries successfully imported


In [7]:
# Mount Google Drive at /content/drive so files stored there (such as the saved
# model) can be read and written like local files. Colab-specific: relies on
# the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# Location of the pickled model on the mounted Google Drive.
# train_model() writes this file and load_model() reads it back.
MODEL_PATH = "/content/drive/My Drive/hate_speech_model.pkl"

In [29]:
# Function to load and prepare the dataset
def load_dataset(dataset_path):
    """Load a CSV and normalise it to ``text`` / ``label`` / ``language`` columns.

    Column discovery:
      * text: ``text`` if present; otherwise the first of ``content``,
        ``sentence``, ``comment``, ``tweet`` that exists; otherwise the first
        string-typed column that is not the label column.
      * label: ``label`` if present; otherwise the first of ``is_hate``,
        ``hate_speech``, ``class``, ``toxic``, ``label_column`` that exists.
      * language: kept if present; otherwise set per row to ``"ur"`` when the
        text contains any non-ASCII character and ``"en"`` otherwise.

    Labels whose dtype is not ``int64`` are cast to ``int``. If the cast fails,
    each distinct value is mapped to an integer code in order of first
    appearance and the mapping is printed.

    Args:
        dataset_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame containing exactly the columns ``text``, ``label`` and
        ``language``.

    Raises:
        ValueError: If no text column or no label column can be identified.
    """
    print(f"Loading dataset from {dataset_path}")
    df = pd.read_csv(dataset_path)

    # Print column names to help debugging
    print(f"Dataset columns: {df.columns.tolist()}")

    text_candidates = ["content", "sentence", "comment", "tweet"]
    label_candidates = ["is_hate", "hate_speech", "class", "toxic", "label_column"]

    # Decide up front which column supplies the labels, so the text fallback
    # below can never select that same column as the text.
    if "label" in df.columns:
        label_source = "label"
    else:
        label_source = next((col for col in label_candidates if col in df.columns), None)

    if "text" not in df.columns:
        text_source = next((col for col in text_candidates if col in df.columns), None)
        if text_source is None:
            # No well-known name matched: take the first string-typed column
            # (object dtype or a dedicated string dtype), skipping the labels.
            text_source = next(
                (
                    col for col in df.columns
                    if col != label_source
                    and pd.api.types.is_string_dtype(df[col].dtype)
                ),
                None,
            )
        if text_source is not None:
            df["text"] = df[text_source]
            print(f"Using '{text_source}' as the text column")

    if "label" not in df.columns and label_source is not None:
        df["label"] = df[label_source]
        print(f"Using '{label_source}' as the label column")

    # Handle dataset without specified text/label columns
    if "text" not in df.columns:
        raise ValueError("Could not identify a text column in the dataset")
    if "label" not in df.columns:
        raise ValueError("Could not identify a label column in the dataset")

    # Ensure labels are integers
    if df["label"].dtype != 'int64':
        try:
            df["label"] = df["label"].astype(int)
        except (ValueError, TypeError, OverflowError):
            # Non-numeric (or missing) labels: assign integer codes in order of
            # first appearance. NOTE(review): which value becomes 1 therefore
            # depends on row order - confirm this matches the "1 == hate
            # speech" assumption made by detect_hate_speech().
            unique_labels = df["label"].unique()
            label_map = {val: i for i, val in enumerate(unique_labels)}
            df["label"] = df["label"].map(label_map)
            print(f"Mapped labels: {label_map}")

    # Add a language column if not present.
    if "language" not in df.columns:
        # Very simplified heuristic: any non-ASCII character marks the row as
        # Urdu, so Roman (Latin-script) Urdu is tagged "en" here.
        df["language"] = df["text"].apply(
            lambda x: "ur" if any(ord(c) > 127 for c in str(x)) else "en"
        )

    # Select only the columns we need
    df = df[["text", "label", "language"]]

    return df


In [30]:
# Merge datasets function
def merge_datasets(english_path, urdu_path, output_path=None):
    """Load the English and Roman Urdu datasets and stack them into one frame.

    Each dataset is read with load_dataset() and its ``language`` column is
    overwritten with ``"en"`` or ``"ur"`` respectively before concatenation.

    Args:
        english_path: CSV path of the English dataset.
        urdu_path: CSV path of the Roman Urdu dataset.
        output_path: Optional CSV path; when truthy the merged frame is
            written there without the index.

    Returns:
        The merged DataFrame with a fresh 0..n-1 index, or ``None`` if anything
        failed (the error is printed, not raised).
    """
    try:
        tagged_frames = []
        for csv_path, language_code in ((english_path, "en"), (urdu_path, "ur")):
            frame = load_dataset(csv_path)
            # The source file decides the language, regardless of detection.
            frame["language"] = language_code
            tagged_frames.append(frame)

        merged_df = pd.concat(tagged_frames, ignore_index=True)

        if output_path:
            merged_df.to_csv(output_path, index=False)
            print(f"Merged dataset saved to {output_path}")

        return merged_df

    except Exception as e:
        print(f"Error merging datasets: {str(e)}")
        return None

In [31]:
# Create and train the model
def train_model(dataset_path=None, merged_df=None):
    """Train the hate speech detection model using scikit-learn.

    The data is split 80/20 (stratified on the label, ``random_state=42``), a
    TF-IDF + logistic-regression pipeline is fitted on the training part, and
    accuracy plus a classification report for the held-out part are printed.
    The fitted pipeline is then pickled to ``MODEL_PATH``.

    Saving is best-effort: if the file cannot be written, a warning is printed
    and the trained pipeline is still returned, so a storage problem does not
    throw away the training work.

    Args:
        dataset_path: CSV path loaded with load_dataset(); used only when
            ``merged_df`` is None.
        merged_df: Already-loaded DataFrame with ``text``, ``label`` and
            ``language`` columns. Takes precedence over ``dataset_path``.

    Returns:
        The fitted pipeline, or ``None`` if loading or training failed (the
        error is printed, not raised). Passing neither argument is reported
        the same way.
    """
    try:
        # Load dataset if not provided directly
        if merged_df is None:
            if dataset_path:
                df = load_dataset(dataset_path)
            else:
                raise ValueError("Either dataset_path or merged_df must be provided")
        else:
            df = merged_df

        print(f"Training on dataset with {len(df)} records")
        print(f"Languages present: {df['language'].value_counts().to_dict()}")

        # Split the dataset
        X_train, X_test, y_train, y_test = train_test_split(
            df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
        )

        # Create a pipeline with TF-IDF and Logistic Regression
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=15000, ngram_range=(1, 3))),
            ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
        ])

        # Train the model
        print("Starting model training...")
        pipeline.fit(X_train, y_train)

        # Evaluate the model
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        print(f"Model accuracy: {accuracy:.4f}")
        print("Classification report:")
        print(report)

    except Exception as e:
        print(f"Error during model training: {str(e)}")
        return None

    # Persist separately from training: a failure here (for example Drive not
    # mounted) must not discard the pipeline that was just fitted.
    try:
        with open(MODEL_PATH, "wb") as f:
            pickle.dump(pipeline, f)
        print(f"Model saved to Google Drive at {MODEL_PATH}")
    except Exception as e:
        print(f"Warning: model trained but could not be saved to {MODEL_PATH}: {str(e)}")

    return pipeline


In [32]:
# Load a previously saved model from Google Drive
def load_model():
    """Return the model pickled at ``MODEL_PATH``, or ``None`` if unavailable.

    ``None`` is returned both when the file does not exist and when reading or
    unpickling it fails; in either case a message is printed instead of an
    exception being raised.
    """
    try:
        if not os.path.exists(MODEL_PATH):
            print("No saved model found in Google Drive.")
            return None

        # NOTE: unpickling can execute arbitrary code; only point MODEL_PATH at
        # files this notebook (or another trusted source) produced.
        with open(MODEL_PATH, "rb") as model_file:
            loaded_model = pickle.load(model_file)
        print(f"Model loaded from Google Drive at {MODEL_PATH}")
        return loaded_model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None

In [33]:
# Audio transcription function - only available if the libraries are installed
def transcribe_audio(audio_path):
    """Transcribe an audio file to text using speech recognition.

    Files whose extension is not ``.wav`` are first converted with pydub into a
    uniquely named temporary WAV file, which is always deleted before this
    function returns, whether transcription succeeded or not.

    Recognition is attempted in English (``en-US``) first:
      * if the English result has fewer than three words, an Urdu (``ur-PK``)
        pass is also attempted and, when it yields text, appended to the
        English text;
      * if English recognition cannot understand the audio, Urdu alone is
        tried.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcription, or ``""`` when the audio libraries are unavailable,
        nothing could be recognised, the recognition request failed, or any
        other error occurred (errors are printed, not raised).
    """
    if not AUDIO_PROCESSING_AVAILABLE:
        print("Audio processing is not available. Please run the installation code first.")
        return ""

    # Imported here so this cell works regardless of what the imports cell contains.
    import tempfile

    temp_wav_path = None
    try:
        recognizer = sr.Recognizer()

        # Convert various audio formats to WAV if needed
        file_ext = os.path.splitext(audio_path)[1].lower()
        if file_ext != '.wav':
            print(f"Converting {file_ext} to WAV format...")
            # A unique temp file avoids clobbering (or colliding on) a fixed
            # name in the current working directory.
            fd, temp_wav_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            AudioSegment.from_file(audio_path).export(temp_wav_path, format="wav")
            audio_path_to_use = temp_wav_path
        else:
            audio_path_to_use = audio_path

        # Read the whole file into memory; recognition below works on that
        # in-memory data, so the file can be closed first.
        with sr.AudioFile(audio_path_to_use) as source:
            # NOTE(review): for a file source this call reads from the start of
            # the stream before record() does, so the opening of the clip may
            # be left out of the transcription - confirm it is wanted here.
            recognizer.adjust_for_ambient_noise(source)
            audio_data = recognizer.record(source)

        try:
            print("Transcribing audio...")
            text = recognizer.recognize_google(audio_data, language="en-US")
            print(f"Transcription: {text}")
        except sr.UnknownValueError:
            print("Speech Recognition could not understand audio")
            # Try with Urdu if English fails
            try:
                text = recognizer.recognize_google(audio_data, language="ur-PK")
                print(f"Urdu transcription: {text}")
                return text
            except Exception:
                print("Could not recognize speech in either English or Urdu")
                return ""
        except sr.RequestError as e:
            print(f"Could not request results; {e}")
            return ""

        # If transcription seems too short, try to recognize with Urdu as well
        if len(text.split()) < 3:
            try:
                urdu_text = recognizer.recognize_google(audio_data, language="ur-PK")
                print(f"Urdu transcription: {urdu_text}")
                # Combine both transcriptions if we got both
                if urdu_text:
                    text = f"{text} {urdu_text}"
            except Exception as urdu_error:
                # Best-effort extra pass: keep the English text, but say why
                # the Urdu attempt contributed nothing.
                print(f"Urdu fallback transcription failed: {urdu_error!r}")

        return text

    except Exception as e:
        print(f"Error transcribing audio: {str(e)}")
        return ""

    finally:
        # Single cleanup point, reached on every exit path above.
        if temp_wav_path is not None:
            try:
                os.remove(temp_wav_path)
            except OSError:
                # Already gone or not removable; nothing useful left to do.
                pass

In [34]:
# Function to detect hate speech in text
def detect_hate_speech(text, model):
    """Detect if text contains hate speech using the trained model.

    Args:
        text: The text to classify. ``None``, empty and whitespace-only input
            is reported as "No text to analyze" without calling the model.
        model: A fitted classifier exposing ``predict`` and ``predict_proba``
            (and, normally, ``classes_``).

    Returns:
        dict with the keys
          * ``is_hate_speech`` - True when the predicted label equals 1,
          * ``confidence`` - probability the model assigns to the predicted label,
          * ``prediction`` - the predicted label as an int,
          * ``transcribed_text`` - the input text,
        plus ``message`` when there was nothing to analyse, or ``error`` when
        prediction failed (in both cases the result is a non-hate verdict with
        confidence 0.0; errors are printed, not raised).
    """
    try:
        if not text or len(text.strip()) == 0:
            return {
                "is_hate_speech": False,
                "confidence": 0.0,
                "prediction": 0,
                "transcribed_text": text,
                "message": "No text to analyze"
            }

        # Predict probability
        proba = model.predict_proba([text])[0]
        prediction = model.predict([text])[0]

        # The probability columns are ordered like model.classes_, so the
        # predicted label has to be translated into its position there; using
        # the label itself as an index is only right for labels 0..n-1.
        classes = getattr(model, "classes_", None)
        if classes is None:
            class_index = prediction
        else:
            class_index = list(classes).index(prediction)
        confidence = proba[class_index]

        return {
            "is_hate_speech": bool(prediction == 1),
            "confidence": float(confidence),
            "prediction": int(prediction),
            "transcribed_text": text
        }

    except Exception as e:
        print(f"Error detecting hate speech: {str(e)}")
        return {
            "is_hate_speech": False,
            "confidence": 0.0,
            "prediction": 0,
            "transcribed_text": text,
            "error": str(e)
        }

In [35]:
# Function to process audio for hate speech
def detect_hate_speech_in_audio(audio_path, model):
    """Transcribe an audio file and classify the resulting text.

    Args:
        audio_path: Path of the audio file, handed to transcribe_audio().
        model: Classifier handed to detect_hate_speech().

    Returns:
        The dict produced by detect_hate_speech() for the transcription. When
        the audio libraries are unavailable or the transcription is empty, a
        non-hate result with confidence 0.0, an empty ``transcribed_text`` and
        an explanatory ``message`` is returned instead.
    """
    def _nothing_analysed(reason):
        # Shared shape of the "could not analyse" result.
        return {
            "is_hate_speech": False,
            "confidence": 0.0,
            "prediction": 0,
            "transcribed_text": "",
            "message": reason
        }

    if not AUDIO_PROCESSING_AVAILABLE:
        return _nothing_analysed(
            "Audio processing libraries not available. Please install them first."
        )

    transcript = transcribe_audio(audio_path)
    if transcript:
        return detect_hate_speech(transcript, model)

    return _nothing_analysed("Failed to transcribe audio")


In [36]:
# Alternative function for manual text input when audio processing isn't available
def manual_text_input():
    """Explain that audio support is missing and prompt for text instead.

    Returns:
        Whatever the user typed at the prompt (possibly an empty string).
    """
    notice_lines = (
        "\nAudio processing libraries are not installed.",
        "As an alternative, you can enter text manually to detect hate speech.",
    )
    for line in notice_lines:
        print(line)
    return input("\nEnter text to analyze for hate speech: ")


In [40]:
# Demo function
def demo(model):
    """Classify two built-in example sentences and print each verdict.

    For every example the text, the verdict ("Hate Speech" / "Not Hate Speech")
    and the confidence reported by detect_hate_speech() are printed, followed
    by a separator line.

    Args:
        model: Classifier handed to detect_hate_speech().
    """
    samples = (
        "I really enjoyed this movie, the actors were great!",
        "The service was very shit and waiter was an asshole, I'll never go back.",
    )
    separator = "-" * 50

    print("\n--- Text Hate Speech Detection Demo ---\n")
    for sample in samples:
        print(f"Text: {sample}")
        outcome = detect_hate_speech(sample, model)
        verdict = 'Hate Speech' if outcome['is_hate_speech'] else 'Not Hate Speech'
        print(f"Result: {verdict}")
        print(f"Confidence: {outcome['confidence']:.4f}")
        print(separator)

# Audio demo function
def audio_demo(model, audio_path):
    """Run hate speech detection on one audio file and print the outcome.

    Args:
        model: Classifier handed to detect_hate_speech_in_audio().
        audio_path: Path of the audio file to analyse.

    Returns:
        The result dict from detect_hate_speech_in_audio().
    """
    print("\n--- Audio Hate Speech Detection Demo ---\n")
    print(f"Analyzing audio file: {audio_path}")

    outcome = detect_hate_speech_in_audio(audio_path, model)

    transcript = outcome.get('transcribed_text', '')
    print(f"Transcribed text: {transcript}")
    verdict = 'Hate Speech' if outcome['is_hate_speech'] else 'Not Hate Speech'
    print(f"Result: {verdict}")
    print(f"Confidence: {outcome['confidence']:.4f}")

    return outcome


In [43]:
# Main function
def main():
    """Obtain a model (load it, or train one) and run an interactive check.

    Flow:
      1. Try load_model(). If there is none, train: from the merged CSV when
         it already exists, otherwise by merging the English and Roman Urdu
         CSVs first. A failed merge ends the run.
      2. Without a model, report that the demo is skipped.
      3. With audio support, ask for an audio file path and analyse it (an
         empty answer skips; a missing file is reported). Without audio
         support, ask for text and print the verdict for it.
    """
    # Paths to your datasets
    english_path = "english_hate_speech_dataset.csv"
    urdu_path = "roman_urdu_hate_speech_dataset.csv"
    merged_path = "merged_hate_speech_dataset.csv"

    # Reuse a saved model when there is one; otherwise train from scratch.
    model = load_model()
    if model is None:
        print("No saved model found. Training a new model...")

        if os.path.exists(merged_path):
            model = train_model(dataset_path=merged_path)
        else:
            # Merge and train on the fly
            merged_df = merge_datasets(english_path, urdu_path, merged_path)
            if merged_df is None:
                print("Failed to merge datasets. Exiting.")
                return
            model = train_model(merged_df=merged_df)

    if model is None:
        print("Skipping demo due to missing model")
        return

    if not AUDIO_PROCESSING_AVAILABLE:
        # No audio stack: fall back to typed text.
        text = manual_text_input()
        if text:
            result = detect_hate_speech(text, model)
            print(f"Result: {'⚠️ Hate Speech' if result['is_hate_speech'] else '✅ Not Hate Speech'}")
            print(f"Confidence: {result['confidence']:.4f}")
        return

    audio_path = input("\nEnter path to audio file for hate speech detection (or press Enter to skip): ")
    if not audio_path:
        return
    if os.path.exists(audio_path):
        audio_demo(model, audio_path)
    else:
        print(f"Audio file not found: {audio_path}")

In [44]:
# Entry point: run the load-or-train + interactive detection flow when this
# code is executed directly (as it is when the cell runs in the notebook).
if __name__ == "__main__":
    main()

Model loaded from Google Drive at /content/drive/My Drive/hate_speech_model.pkl

Enter path to audio file for hate speech detection (or press Enter to skip): /content/Urdu Hindi Gaaliyan.mp3

--- Audio Hate Speech Detection Demo ---

Analyzing audio file: /content/Urdu Hindi Gaaliyan.mp3
Converting .mp3 to WAV format...
Transcribing audio...
Transcription: Buffalo Indian motorcycles
Transcribed text: Buffalo Indian motorcycles
Result: Hate Speech
Confidence: 0.5476
