In [None]:
import torch
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import Trainer, TrainingArguments
import librosa

# Raw strings keep the Windows path separators literal instead of letting
# Python read them as escape sequences.
# NOTE(review): the model and processor are loaded from what looks like the
# recordings directory rather than a Whisper checkpoint — confirm this path.
model_name = r"Dataset\FineRecordings"
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)

# A raw string is required here: in a normal literal "\U" starts a
# \UXXXXXXXX unicode escape, so "C:\Users..." does not even parse.
# NOTE(review): machine-specific absolute path; consider a configurable data dir.
dataset = load_dataset(r"C:\Users\girip\OneDrive\Desktop\Education\Autodidactic\Springboard\S2A\Dataset\FineRecordings")

def preprocess_function(examples):
    """Convert one dataset example into model inputs and labels.

    The function is applied with ``Dataset.map`` without ``batched=True``,
    so ``examples`` is a single example.

    Args:
        examples: Mapping with an ``"audio"`` entry that is handed to
            ``librosa.load`` (presumably a file path — TODO confirm against
            the dataset schema) and a ``"text"`` entry holding the reference
            transcription.

    Returns:
        dict with ``"input_features"`` (the processor's features for this
        clip, without a leading batch axis) and ``"labels"`` (token ids of
        the transcription).
    """
    audio = librosa.load(examples["audio"], sr=16000)[0]
    # return_tensors="pt" adds a batch axis of size 1; strip it so every
    # stored example is a single item and batching happens at collation time.
    input_features = processor(
        audio, return_tensors="pt", sampling_rate=16000
    ).input_features[0]
    # Go through the tokenizer explicitly: the processor interprets its first
    # positional argument as audio, so passing the text positionally would
    # send the transcription to the feature extractor.
    labels = processor.tokenizer(examples["text"]).input_ids
    # NOTE(review): labels differ in length between examples; batching them
    # needs a padding data collator — confirm one is supplied to the trainer.
    return {"input_features": input_features, "labels": labels}

# Columns consumed by preprocess_function; they are dropped once features
# and labels have been produced.
raw_columns = ["audio", "text"]
train_dataset = dataset["train"].map(preprocess_function, remove_columns=raw_columns)
# The evaluation split needs the same preprocessing as the training split;
# passing the raw split would hand the trainer untransformed audio/text rows.
eval_dataset = dataset["test"].map(preprocess_function, remove_columns=raw_columns)

training_args = TrainingArguments(
    output_dir="./whisper_finetuned",
    evaluation_strategy="steps",
    # Without an explicit value, eval_steps falls back to logging_steps (10),
    # i.e. a full evaluation pass every 10 steps. Align it with save_steps.
    eval_steps=500,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # cell still runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# NOTE(review): no data_collator is passed, so the Trainer uses its default
# collation — confirm it can batch the variable-length labels produced by
# preprocess_function, otherwise supply a padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

# Persist the fine-tuned weights together with the processor so both can be
# reloaded from the same directory.
model.save_pretrained("./whisper_finetuned")
processor.save_pretrained("./whisper_finetuned")




In [None]:
import csv
import torch
from datasets import load_metric
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa

# Raw string: "\C" and "\w" are not valid escape sequences, and a raw literal
# yields the same path without the invalid-escape warning.
# NOTE(review): the training cell saves to "./whisper_finetuned" (a
# directory), while this points at a ".pkl" path — confirm which location
# actually holds the fine-tuned checkpoint.
model_name = r"S2A\Codes\whisper_finetuned.pkl"
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)

# NOTE(review): load_metric has been deprecated in newer `datasets` releases
# in favour of the `evaluate` package — confirm against the installed version.
wer_metric = load_metric("wer")

# CSV listing the evaluation clips; read by load_csv below.
csv_file_path = "audio_files.csv"

def load_csv(file_path):
    """Read audio paths and reference transcriptions from a CSV file.

    The file must start with a header row containing the columns
    ``audio_path`` and ``transcription``.

    Args:
        file_path: Path of the UTF-8 encoded CSV file.

    Returns:
        Tuple ``(audio_paths, ground_truths)``: two lists in row order.

    Raises:
        KeyError: If the header lacks one of the two required columns.
    """
    audio_paths = []
    ground_truths = []
    # newline="" leaves newline handling to the csv module, so line breaks
    # inside quoted transcriptions are returned exactly as written.
    with open(file_path, mode="r", encoding="utf-8", newline="") as file:
        for row in csv.DictReader(file):
            audio_paths.append(row["audio_path"])
            ground_truths.append(row["transcription"])
    return audio_paths, ground_truths

def compute_wer_from_csv(audio_paths, ground_truths):
    """Transcribe each audio file and score the results against references.

    Uses the notebook-level ``processor``, ``model`` and ``wer_metric``.

    Args:
        audio_paths: Iterable of audio file paths, loaded at 16 kHz.
        ground_truths: Reference transcriptions, in the same order.

    Returns:
        Tuple ``(wer, predictions)``: the value returned by
        ``wer_metric.compute`` and the list of decoded transcriptions.
    """
    def _transcribe(path):
        # Load the clip, turn it into model features, decode the first result.
        waveform = librosa.load(path, sr=16000)[0]
        features = processor(waveform, return_tensors="pt", sampling_rate=16000).input_features
        with torch.no_grad():
            token_ids = model.generate(features)
        return processor.batch_decode(token_ids, skip_special_tokens=True)[0]

    predictions = [_transcribe(path) for path in audio_paths]
    score = wer_metric.compute(predictions=predictions, references=ground_truths)
    return score, predictions

# Read the evaluation manifest, transcribe every clip, and report the score.
audio_paths, ground_truths = load_csv(file_path=csv_file_path)
wer, predictions = compute_wer_from_csv(
    audio_paths=audio_paths,
    ground_truths=ground_truths,
)

print(f"Word Error Rate (WER) after fine-tuning: {wer}")


Word Error Rate (WER) after fine-tuning: 0.9390909090909091


In [None]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import T5ForConditionalGeneration, T5Tokenizer
from diffusers import StableDiffusionPipeline
from PIL import Image

# --- Speech to text -------------------------------------------------------
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

# Raw string so the Windows path separators are never read as escape sequences.
audio_path = r"S2A\Codes\dog.mp3"
audio, original_rate = torchaudio.load(audio_path)

# Bring the recording to the 16 kHz rate declared to the processor below.
resampler = torchaudio.transforms.Resample(orig_freq=original_rate, new_freq=16000)
audio = resampler(audio)

# torchaudio.load returns a (channels, frames) tensor by default. Averaging
# over the channel axis yields a 1-D waveform for mono and multi-channel
# files alike; squeeze() only did so for mono input.
mono_audio = audio.mean(dim=0)

input_features = processor(mono_audio.numpy(), sampling_rate=16000, return_tensors="pt").input_features

predicted_ids = model.generate(input_features)
transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"Transcribed Text: {transcribed_text}")

# --- Text clean-up --------------------------------------------------------
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")

# NOTE(review): the instruction prefix is free-form text; confirm that the
# t5-base checkpoint actually performs correction for this prompt.
input_text = f"Correct the following text: {transcribed_text}"
inputs = t5_tokenizer(input_text, return_tensors="pt")
outputs = t5_model.generate(**inputs, max_length=100)
corrected_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Corrected Text: {corrected_text}")

# --- Text to image --------------------------------------------------------
enhanced_prompt = f"Generate a detailed image of: {corrected_text}"
print(f"Enhanced Prompt for Image Generation: {enhanced_prompt}")

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Use the GPU when one is available, otherwise fall back to CPU.
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

image = pipe(enhanced_prompt).images[0]
image.show()
image.save("output_image.png")


Transcribed Text: a dog in a ferest
Corrected Text: a dog in a forest
Enhanced Prompt for Image Generation: Generate a detailed image : a dog in a forest


In [None]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import T5ForConditionalGeneration, T5Tokenizer
from diffusers import StableDiffusionPipeline
from PIL import Image

# --- Speech to text -------------------------------------------------------
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

# Raw string so the Windows path separators are never read as escape sequences.
audio_path = r"S2A\Codes\vaudio.mp3"
audio, original_rate = torchaudio.load(audio_path)
# Bring the recording to the 16 kHz rate declared to the processor below.
resampler = torchaudio.transforms.Resample(orig_freq=original_rate, new_freq=16000)
audio = resampler(audio)

# torchaudio.load returns a (channels, frames) tensor by default. Averaging
# over the channel axis yields a 1-D waveform for mono and multi-channel
# files alike; squeeze() only did so for mono input.
mono_audio = audio.mean(dim=0)

input_features = processor(mono_audio.numpy(), sampling_rate=16000, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"Transcribed Text: {transcribed_text}")

# --- Text clean-up --------------------------------------------------------
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")

# NOTE(review): the instruction prefix is free-form text; confirm that the
# t5-base checkpoint actually performs correction for this prompt.
input_text = f"Correct the following text: {transcribed_text}"
inputs = t5_tokenizer(input_text, return_tensors="pt")
outputs = t5_model.generate(**inputs, max_length=100)
corrected_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Corrected Text: {corrected_text}")

# --- Sensitive-content check ----------------------------------------------
# Keywords that trigger the warning below (duplicate "abusive" entry removed).
volatile_words = ["nude", "violent", "blood", "sexual", "pornographic", "erotic", "sensual", "suggestive", "seductive", "abusive", "vulgar", "immoral", "distasteful", "killing"]
enhanced_prompt = f"Generate a detailed image of: {corrected_text}"

# Case-insensitive substring match against the full prompt.
# NOTE(review): this only prints a warning — the image is still generated
# and saved below even when a keyword matches. Confirm whether flagged
# prompts should be blocked instead.
if any(word in enhanced_prompt.lower() for word in volatile_words):
    print("Warning: The generated prompt may contain sensitive content.")

print(f"Enhanced Prompt for Image Generation: {enhanced_prompt}")

# --- Text to image --------------------------------------------------------
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Use the GPU when one is available, otherwise fall back to CPU.
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

image = pipe(enhanced_prompt).images[0]
image.show()
image.save("output_image.png")


Transcribed Text: a man killing child
Corrected Text: a  man killing child
