In [2]:
!pip install faster-whisper
!apt-get install ffmpeg

Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-14.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting flatbuffers (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB

In [45]:
from faster_whisper import WhisperModel
import json

def transcribe_audio_with_timestamps(audio_path, output_json_path=None):
    """
    Transcribes an audio file and saves the transcription with word-level timestamps to a JSON file.

    Args:
        audio_path (str): Path to the input audio file.
        output_json_path (str, optional): Path to save the transcription as JSON.
                                          If None, prints to the console. Defaults to None.

    Returns:
        list: A list of dictionaries, each with 'text', 'start', and 'end' keys.
              Returns None if transcription fails.
    """
    try:
        model = WhisperModel("large-v2")  # You can change model size here
        segments, info = model.transcribe(audio_path, word_timestamps=True)

        transcription_with_timestamps = []
        for segment in segments:
            for word_info in segment.words:
                transcription_with_timestamps.append({
                    'text': word_info.word.strip(),
                    'start': word_info.start,
                    'end': word_info.end
                })

        if output_json_path:
            with open(output_json_path, 'w') as f:
                json.dump(transcription_with_timestamps, f, indent=4)
            print(f"Transcription with timestamps saved to: {output_json_path}")
        else:
            for item in transcription_with_timestamps:
                print(f"[{item['start']:.3f} -> {item['end']:.3f}] {item['text']}")

        return transcription_with_timestamps

    except Exception as e:
        print(f"Error during transcription: {e}")
        return None


if __name__ == "__main__":
    audio_file = '/content/batch_audios/PII_detection3.wav'  # Your audio file path
    output_json = 'transcription_with_timestamps.json'  # Save as JSON now

    transcription_data = transcribe_audio_with_timestamps(audio_file, output_json)

    if transcription_data:
        print("\nSample from transcription data:")
        print(transcription_data)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config.json:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Transcription with timestamps saved to: transcription_with_timestamps.json

Sample from transcription data:
[{'text': 'Hi,', 'start': np.float64(0.0), 'end': np.float64(0.32)}, {'text': "I'm", 'start': np.float64(0.66), 'end': np.float64(0.92)}, {'text': 'Michael', 'start': np.float64(0.92), 'end': np.float64(1.18)}, {'text': 'Davis.', 'start': np.float64(1.18), 'end': np.float64(1.66)}, {'text': "I've", 'start': np.float64(1.86), 'end': np.float64(2.06)}, {'text': 'noticed', 'start': np.float64(2.06), 'end': np.float64(2.34)}, {'text': 'a', 'start': np.float64(2.34), 'end': np.float64(2.58)}, {'text': 'suspicious', 'start': np.float64(2.58), 'end': np.float64(2.98)}, {'text': 'and', 'start': np.float64(2.98), 'end': np.float64(3.28)}, {'text': 'unauthorized', 'start': np.float64(3.28), 'end': np.float64(4.02)}, {'text': 'charge', 'start': np.float64(4.02), 'end': np.float64(4.46)}, {'text': 'on', 'start': np.float64(4.46), 'end': np.float64(4.94)}, {'text': 'my', 'start': np.float64(4

In [46]:
import json

with open("transcription_with_timestamps.json") as f:
    word_segments = json.load(f)

In [52]:
def clean_transcription(text):
    text = text.replace(" -", "-")  # Remove space before hyphen
    text = text.replace("- ", "-")  # Remove space after hyphen
    # Add additional cleaning logic if needed
    return text

In [53]:
# 2. Create full_text for PII detection
full_text = " ".join([w['text'] for w in word_segments])

    # 3. Clean transcription (optional)
cleaned_text = clean_transcription(full_text)

In [54]:
full_text = " ".join([w['text'] for w in word_segments])

In [55]:
print(cleaned_text)

Hi, I'm Michael Davis. I've noticed a suspicious and unauthorized charge on my credit card 370-3763-5608-4596. I'm not sure how this transaction occurred, as I've never made any purchases at the store in question, and I haven't lost my card. I've already contacted the merchant, but they directed me to the bank to resolve the dispute. Could you please initiate an investigation and possibly freeze my card to prevent further unauthorized activity? Please contact me at 708-926-9979 as soon as possible. Thank you for your help.


In [38]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name = "AI-Enthusiast11/pii-entity-extractor"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
# Post processing logic to combine the subword tokens
def merge_tokens(ner_results):
    entities = {}
    for entity in ner_results:
        entity_type = entity["entity_group"]
        entity_value = entity["word"].replace("##", "")

        # Handle token merging
        if entity_type not in entities:
            entities[entity_type] = []
        if entities[entity_type] and not entity_value.startswith(" "):
            # If the previous token exists and this one isn't a new word, merge it
            entities[entity_type][-1] += entity_value
        else:
            entities[entity_type].append(entity_value)

    return entities

In [60]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Reconstruct full text
full_text = " ".join([w['text'] for w in word_segments])

# Step 1: Run token-level NER
token_level_results = ner_pipeline(cleaned_text)

# Step 2: Merge subword tokens to get clean entities
pii_entities = merge_tokens(token_level_results)

Device set to use cpu


In [61]:
pii_entities

{'NAME': ['Michael Davis'],
 'CREDIT-CARD-NO': ['370-3763-5608-4596'],
 'PHONE-NO': ['708-926-9979']}

In [62]:
import re

def normalize(text):
    return re.sub(r'\W+', '', text.lower())  # Removes non-alphanumeric characters

def match_pii_to_timestamps(pii_dict, word_segments):
    redaction_segments = []

    words = [w['text'] for w in word_segments]
    start_times = [w['start'] for w in word_segments]
    end_times = [w['end'] for w in word_segments]

    for entity_type, pii_list in pii_dict.items():
        for pii in pii_list:
            norm_pii = normalize(pii)

            for i in range(len(words)):
                combined = ""
                j = i

                while j < len(words) and len(normalize(combined)) < len(norm_pii):
                    combined += words[j]
                    if normalize(combined) == norm_pii:
                        redaction_segments.append({
                            'entity_type': entity_type,
                            'pii': pii,
                            'start': start_times[i],
                            'end': end_times[j]
                        })
                        break
                    j += 1

    return redaction_segments

In [63]:
redaction_segments = match_pii_to_timestamps(pii_entities, transcription_data)

In [64]:
redaction_segments

[{'entity_type': 'NAME',
  'pii': 'Michael Davis',
  'start': np.float64(0.92),
  'end': np.float64(1.66)},
 {'entity_type': 'CREDIT-CARD-NO',
  'pii': '370-3763-5608-4596',
  'start': np.float64(5.68),
  'end': np.float64(11.28)},
 {'entity_type': 'PHONE-NO',
  'pii': '708-926-9979',
  'start': np.float64(30.8),
  'end': np.float64(34.56)}]

In [19]:
!pip install ffmpeg-python

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting future (from ffmpeg-python)
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Downloading future-1.0.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.3/491.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: future, ffmpeg-python
Successfully installed ffmpeg-python-0.2.0 future-1.0.0


In [65]:
import ffmpeg

def redact_audio(input_path, output_path, segments_to_mute, padding=0.2):
    """
    Redacts PII segments in an audio file by muting specific timestamp ranges.

    Args:
        input_path (str): Path to the input audio file.
        output_path (str): Path to save the redacted audio.
        segments_to_mute (list): List of dicts with 'start' and 'end' keys.
        padding (float): Time in seconds to subtract from start (to catch early syllables).
    """
    input_audio = ffmpeg.input(input_path)
    audio = input_audio.audio

    filtered_audio = audio

    for segment in segments_to_mute:
        start = max(segment['start'] - padding, 0)  # Pad backwards but stay >= 0
        end = segment['end']
        filtered_audio = filtered_audio.filter_(
            "volume", enable=f"between(t,{start},{end})", volume=0
        )

    out = ffmpeg.output(filtered_audio, output_path)
    ffmpeg.run(out, overwrite_output=True)


In [67]:
redact_audio("/content/batch_audios/PII_detection3.wav", "redacted_output.wav", redaction_segments)

### Just to see if detection is working fine

In [58]:
import os
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from faster_whisper import WhisperModel
import ffmpeg
import json

# === Load your PII detection model ===
model_name = "AI-Enthusiast11/pii-entity-extractor"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# === Merge tokens function ===
def merge_tokens(ner_results):
    entities = {}
    for entity in ner_results:
        entity_type = entity["entity_group"]
        entity_value = entity["word"].replace("##", "")

        # Handle token merging
        if entity_type not in entities:
            entities[entity_type] = []
        if entities[entity_type] and not entity_value.startswith(" "):
            # If the previous token exists and this one isn't a new word, merge it
            entities[entity_type][-1] += entity_value
        else:
            entities[entity_type].append(entity_value)

    return entities


# === Transcription function ===
def transcribe_audio(audio_path):
    model = WhisperModel("medium")
    segments, _ = model.transcribe(audio_path, word_timestamps=True)

    transcription = []
    for segment in segments:
        for word_info in segment.words:
            transcription.append({
                'text': word_info.word.strip(),
                'start': word_info.start,
                'end': word_info.end
            })
    return transcription

# === Redaction function ===
def redact_audio(input_path, output_path, segments_to_mute, padding=0.2):
    input_audio = ffmpeg.input(input_path)
    audio = input_audio.audio
    filtered_audio = audio

    for segment in segments_to_mute:
        start = max(segment['start'] - padding, 0)
        end = segment['end']
        filtered_audio = filtered_audio.filter_("volume", enable=f"between(t,{start},{end})", volume=0)

    out = ffmpeg.output(filtered_audio, output_path)
    ffmpeg.run(out, overwrite_output=True)

# === Match PII to timestamps ===
def match_pii_to_segments(pii_dict, transcription):
    pii_words = [val.lower() for values in pii_dict.values() for val in values]
    redaction_segments = []

    for word_info in transcription:
        clean_text = word_info['text'].replace(",", "").replace(".", "").replace("-", "").lower()
        for pii_value in pii_words:
            if clean_text in pii_value.replace("-", "").replace(" ", ""):
                redaction_segments.append({
                    "start": word_info['start'],
                    "end": word_info['end']
                })
    return redaction_segments

# === Batch processing ===
def batch_redact_audio(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    audio_files = list(Path(input_folder).glob("*.wav"))

    for audio_file in audio_files:
        print(f"Processing: {audio_file.name}")

        # 1. Transcribe
        transcription = transcribe_audio(str(audio_file))

        # 2. Join all words for detection
        full_text = " ".join([w["text"] for w in transcription])

        # 3. Clean transcription
        clean_text = clean_transcription(full_text)
        print(f"[DEBUG] Cleaned transcription: {clean_text}")

        # 3. Detect PII
        ner_results = ner_pipeline(clean_text)
        pii_entities = merge_tokens(ner_results)
        print(f"Detected PII: {pii_entities}")

        # 4. Match to timestamps
        segments_to_mute = match_pii_to_segments(pii_entities, transcription)

        # 5. Redact audio
        redacted_path = Path(output_folder) / audio_file.name
        redact_audio(str(audio_file), str(redacted_path), segments_to_mute)

        print(f"Redacted file saved to: {redacted_path}\n")


# === Run it ===
input_folder = "/content/batch_audios"
output_folder = "/content/redacted_audio"


batch_redact_audio(input_folder, output_folder)

Device set to use cpu


Processing: PII_detection1.wav


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[DEBUG] Cleaned transcription: Hello, my name is Benjamin Carter. I'm contacting you about an issue with my tax return from last year. There seems to be a problem with my bank account number for 873153717, and I believe my social security number 589904308 is incorrect in your records. I've already attempted to resolve this issue online, but didn't receive a response. Additionally, this delay has caused me to miss the filing deadline, which could result in penalties. Please verify the information and reach out to me at 416-557-3342. Thank you for your help in resolving this matter quickly.
Detected PII: {'NAME': ['Benjamin Carter'], 'BANK-ACCOUNT-NO': ['873153717'], 'SSN': ['589904308'], 'PHONE-NO': ['416-557-3342.']}
Processing: PII_detection4.wav
[DEBUG] Cleaned transcription: Hi, this is Kiana Turcotte. I recently received a notice regarding a mismatch with my social security number 798606271 in relation to my tax records. I'm also concerned because I have direct deposit set up with 

### Batch Processing

In [59]:
import os
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from faster_whisper import WhisperModel
import ffmpeg
import json

# === Load your PII detection model ===
model_name = "AI-Enthusiast11/pii-entity-extractor"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# === Merge tokens function ===
def merge_tokens(ner_results):
    entities = {}
    for entity in ner_results:
        entity_type = entity["entity_group"]
        entity_value = entity["word"].replace("##", "")

        # Handle token merging
        if entity_type not in entities:
            entities[entity_type] = []
        if entities[entity_type] and not entity_value.startswith(" "):
            # If the previous token exists and this one isn't a new word, merge it
            entities[entity_type][-1] += entity_value
        else:
            entities[entity_type].append(entity_value)

    return entities


# === Transcription function ===
def transcribe_audio(audio_path):
    model = WhisperModel("medium")
    segments, _ = model.transcribe(audio_path, word_timestamps=True)

    transcription = []
    for segment in segments:
        for word_info in segment.words:
            transcription.append({
                'text': word_info.word.strip(),
                'start': word_info.start,
                'end': word_info.end
            })
    return transcription

def clean_transcription(text):
    text = text.replace(" -", "-")  # Remove space before hyphen
    text = text.replace("- ", "-")  # Remove space after hyphen
    # Add additional cleaning logic if needed
    return text

# === Redaction function ===
def redact_audio(input_path, output_path, segments_to_mute, padding=0.2):
    input_audio = ffmpeg.input(input_path)
    audio = input_audio.audio
    filtered_audio = audio

    for segment in segments_to_mute:
        start = max(segment['start'] - padding, 0)
        end = segment['end']
        filtered_audio = filtered_audio.filter_("volume", enable=f"between(t,{start},{end})", volume=0)

    out = ffmpeg.output(filtered_audio, output_path)
    ffmpeg.run(out, overwrite_output=True)

# === Match PII to timestamps ===
def match_pii_to_segments(pii_dict, transcription):
    pii_words = [val.lower() for values in pii_dict.values() for val in values]
    redaction_segments = []

    for word_info in transcription:
        clean_text = word_info['text'].replace(",", "").replace(".", "").replace("-", "").lower()
        for pii_value in pii_words:
            if clean_text in pii_value.replace("-", "").replace(" ", ""):
                redaction_segments.append({
                    "start": word_info['start'],
                    "end": word_info['end']
                })
    return redaction_segments

# === Batch processing ===
def batch_redact_audio(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    audio_files = list(Path(input_folder).glob("*.wav"))

    for audio_file in audio_files:
        print(f"Processing: {audio_file.name}")

        # 1. Transcribe
        transcription = transcribe_audio(str(audio_file))

        # 2. Join all words for detection
        full_text = " ".join([w["text"] for w in transcription])

        # 3. Clean transcription
        clean_text = clean_transcription(full_text)
        print(f"[DEBUG] Cleaned transcription: {clean_text}")

        # 3. Detect PII
        ner_results = ner_pipeline(clean_text)
        pii_entities = merge_tokens(ner_results)
        print(f"Detected PII: {pii_entities}")

        # 4. Match to timestamps
        segments_to_mute = match_pii_to_segments(pii_entities, transcription)

        # 5. Redact audio
        redacted_path = Path(output_folder) / audio_file.name
        redact_audio(str(audio_file), str(redacted_path), segments_to_mute)

        print(f"Redacted file saved to: {redacted_path}\n")


# === Run it ===
input_folder = "/content/batch_audios"
output_folder = "/content/red_audio"


batch_redact_audio(input_folder, output_folder)

Device set to use cpu


Processing: PII_detection1.wav


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[DEBUG] Cleaned transcription: Hello, my name is Benjamin Carter. I'm contacting you about an issue with my tax return from last year. There seems to be a problem with my bank account number for 873153717, and I believe my social security number 589904308 is incorrect in your records. I've already attempted to resolve this issue online, but didn't receive a response. Additionally, this delay has caused me to miss the filing deadline, which could result in penalties. Please verify the information and reach out to me at 416-557-3342. Thank you for your help in resolving this matter quickly.
Detected PII: {'NAME': ['Benjamin Carter'], 'BANK-ACCOUNT-NO': ['873153717'], 'SSN': ['589904308'], 'PHONE-NO': ['416-557-3342.']}
Redacted file saved to: /content/red_audio/PII_detection1.wav

Processing: PII_detection4.wav
[DEBUG] Cleaned transcription: Hi, this is Kiana Turcotte. I recently received a notice regarding a mismatch with my social security number 798606271 in relation to my tax records