In [None]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import T5ForConditionalGeneration, T5Tokenizer
from diffusers import StableDiffusionPipeline
from PIL import Image

# --- Step 1: speech-to-text with Whisper ------------------------------------
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

# Forward slashes keep backslash escape sequences out of the string literal
# ("\C" and "\d" are invalid escapes that recent Python versions warn about)
# and are accepted as path separators on Windows as well as POSIX systems.
audio_path = "S2A/Codes/dog.mp3"
audio, original_rate = torchaudio.load(audio_path)

# The processor call below is told the audio is 16 kHz, so resample to match.
resampler = torchaudio.transforms.Resample(orig_freq=original_rate, new_freq=16000)
audio = resampler(audio)

# torchaudio.load returns a (channels, samples) tensor. Averaging over the
# channel axis gives a 1-D mono waveform for mono and stereo files alike,
# whereas squeeze() would leave a stereo clip two-dimensional.
mono_audio = audio.mean(dim=0)

input_features = processor(mono_audio.numpy(), sampling_rate=16000, return_tensors="pt").input_features

predicted_ids = model.generate(input_features)
transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"Transcribed Text: {transcribed_text}")

# --- Step 2: text clean-up with T5 -------------------------------------------
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")

# NOTE(review): "Correct the following text:" is presumably not a task prefix
# the stock t5-base checkpoint was trained on -- confirm the output really is a
# corrected sentence before relying on it.
input_text = f"Correct the following text: {transcribed_text}"
inputs = t5_tokenizer(input_text, return_tensors="pt")
outputs = t5_model.generate(**inputs, max_length=100)
corrected_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Corrected Text: {corrected_text}")

# --- Step 3: build the image prompt -------------------------------------------
enhanced_prompt = f"Generate a detailed image of: {corrected_text}"
print(f"Enhanced Prompt for Image Generation: {enhanced_prompt}")

# --- Step 4: text-to-image with Stable Diffusion ------------------------------
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Run on the GPU when one is available, otherwise fall back to the CPU.
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

image = pipe(enhanced_prompt).images[0]
image.show()
image.save("output_image.png")


Transcribed Text: a dog in a ferest
Corrected Text: a dog in a forest
Enhanced Prompt for Image Generation: Generate a detailed image of: a dog in a forest


In [None]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import T5ForConditionalGeneration, T5Tokenizer
from diffusers import StableDiffusionPipeline
from PIL import Image

# --- Step 1: speech-to-text with Whisper ------------------------------------
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

# In a normal string literal "\v" is the vertical-tab escape, so the previous
# backslash-separated path did not name "vaudio.mp3" at all. Forward slashes
# avoid every escape sequence and work on Windows as well as POSIX systems.
audio_path = "S2A/Codes/vaudio.mp3"
audio, original_rate = torchaudio.load(audio_path)
# The processor call below is told the audio is 16 kHz, so resample to match.
resampler = torchaudio.transforms.Resample(orig_freq=original_rate, new_freq=16000)
audio = resampler(audio)

# torchaudio.load returns a (channels, samples) tensor. Averaging over the
# channel axis gives a 1-D mono waveform for mono and stereo files alike,
# whereas squeeze() would leave a stereo clip two-dimensional.
mono_audio = audio.mean(dim=0)

input_features = processor(mono_audio.numpy(), sampling_rate=16000, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"Transcribed Text: {transcribed_text}")

# --- Step 2: text clean-up with T5 -------------------------------------------
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")

# NOTE(review): "Correct the following text:" is presumably not a task prefix
# the stock t5-base checkpoint was trained on -- confirm the output really is a
# corrected sentence before relying on it.
input_text = f"Correct the following text: {transcribed_text}"
inputs = t5_tokenizer(input_text, return_tensors="pt")
outputs = t5_model.generate(**inputs, max_length=100)
corrected_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Corrected Text: {corrected_text}")

# --- Step 3: keyword screen on the prompt -------------------------------------
# Each entry is listed once; a repeated entry cannot change the any() result.
volatile_words = [
    "nude", "violent", "blood", "sexual", "pornographic", "erotic", "sensual",
    "suggestive", "seductive", "abusive", "vulgar", "immoral", "distasteful",
    "killing",
]
enhanced_prompt = f"Generate a detailed image of: {corrected_text}"

# Case-insensitive substring test, so a listed word embedded in a longer word
# also triggers the warning.
# NOTE(review): a match only prints a warning; the image below is still
# generated. Confirm whether flagged prompts are meant to be blocked instead.
if any(word in enhanced_prompt.lower() for word in volatile_words):
    print("Warning: The generated prompt may contain sensitive content.")

print(f"Enhanced Prompt for Image Generation: {enhanced_prompt}")

# --- Step 4: text-to-image with Stable Diffusion ------------------------------
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Run on the GPU when one is available, otherwise fall back to the CPU.
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

image = pipe(enhanced_prompt).images[0]
image.show()
image.save("output_image.png")


Transcribed Text: a man killing child
Corrected Text: a  man killing child


In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

# Load flair's 'en-sentiment' text classifier once at cell level; the function
# defined below reads this module-level name on every call.
classifier = TextClassifier.load('en-sentiment')

def analyze_sentiment_with_flair(text):
    """Run the module-level flair ``classifier`` on ``text``.

    The text is wrapped in a flair ``Sentence``, the classifier annotates it,
    and the first label attached to the sentence is reported.

    Returns:
        dict: ``{"sentiment": <label value>, "confidence": <label score>}``.
    """
    scored = Sentence(text)
    classifier.predict(scored)

    # Only the first label on the sentence is used.
    top_label = scored.labels[0]
    return dict(sentiment=top_label.value, confidence=top_label.score)

# Demo: classify one hard-coded sentence and print the resulting dict.
sample_text = "A man walking in a road"

result = analyze_sentiment_with_flair(sample_text)
print(result)


{'sentiment': 'POSITIVE', 'confidence': 0.9993454818725586}


In [None]:
from flair.models import TextClassifier
from flair.data import Sentence
from better_profanity import profanity

# Load flair's 'en-sentiment' text classifier once at cell level.
classifier = TextClassifier.load('en-sentiment')
# Load better_profanity's censor word list before contains_profanity() is used below.
profanity.load_censor_words()

def analyze_sentiment_with_flair_and_check_violence(text):
    """Classify ``text`` with flair and run a profanity check on it.

    Note that the ``"violence_detected"`` entry is taken directly from
    ``profanity.contains_profanity(text)``; no separate violence check is run.

    Returns:
        dict: ``"sentiment"`` and ``"confidence"`` hold the value and score of
        the first label flair attaches to the sentence; ``"violence_detected"``
        is ``True`` when the profanity check matched, otherwise ``False``.
    """
    flagged = bool(profanity.contains_profanity(text))

    scored = Sentence(text)
    classifier.predict(scored)
    top_label = scored.labels[0]

    return {
        "sentiment": top_label.value,
        "confidence": top_label.score,
        "violence_detected": flagged,
    }

# Demo: run the combined sentiment + profanity check on one hard-coded sentence.
sample_text = "A man killing a dog"
result = analyze_sentiment_with_flair_and_check_violence(sample_text)
print(result)


{'sentiment': 'NEGATIVE','confidence': 0.998,'violence_detected': True}


In [None]:
import sounddevice as sd
import numpy as np
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from diffusers import StableDiffusionPipeline
from flair.models import TextClassifier
from flair.data import Sentence
from better_profanity import profanity
import io

# Forward slashes keep backslash escape sequences out of the string literal
# ("\C" and "\w" are invalid escapes that recent Python versions warn about)
# and are accepted as path separators on Windows as well as POSIX systems.
# NOTE(review): from_pretrained() normally takes a model directory or hub id;
# confirm that a path ending in ".pkl" is really what was saved here.
whisper_model_path = "S2A/Codes/whisper_finetuned.pkl"
processor = WhisperProcessor.from_pretrained(whisper_model_path)
model = WhisperForConditionalGeneration.from_pretrained(whisper_model_path)
# This cell assumes a CUDA device: the helpers below also move their inputs to "cuda".
model = model.to("cuda")

# Stable Diffusion pipeline loaded with float16 weights and placed on the GPU.
stable_diffusion_model_id = "CompVis/stable-diffusion-v1-4"
pipe = StableDiffusionPipeline.from_pretrained(stable_diffusion_model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Sentiment classifier and profanity word list used by
# analyze_sentiment_and_check_profanity().
classifier = TextClassifier.load('en-sentiment')
profanity.load_censor_words()

def record_audio(duration=10, sample_rate=16000):
    """Record a single-channel clip through ``sounddevice`` and return it flattened.

    Args:
        duration: Multiplied by ``sample_rate`` to get the number of frames to record.
        sample_rate: Sampling rate passed to ``sd.rec``.

    Returns:
        The recorded buffer (requested as ``int16``, one channel) after ``flatten()``.
    """
    print("Recording...")
    frame_count = int(duration * sample_rate)
    recording = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype='int16')
    # Wait for the recording started by sd.rec() to finish before using the buffer.
    sd.wait()
    print("Recording finished.")
    return recording.flatten()

def transcribe_audio(audio_data):
    """Transcribe a waveform with the module-level Whisper ``processor`` and ``model``.

    Args:
        audio_data: Object exposing ``.numpy()``; the resulting array is passed
            to the processor with a sampling rate of 16000.

    Returns:
        The first decoded string from the generated token ids, with special
        tokens skipped.
    """
    waveform = audio_data.numpy()
    encoded = processor(waveform, sampling_rate=16000, return_tensors="pt")
    # Move the features to "cuda" before calling model.generate().
    gpu_features = encoded.input_features.to("cuda")
    token_ids = model.generate(gpu_features)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

def generate_image(prompt, output_path):
    """Generate one image for ``prompt`` with the module-level ``pipe`` and save it.

    Args:
        prompt: Text passed to the pipeline.
        output_path: Destination passed to the image's ``save`` method.
    """
    autocast_ctx = torch.autocast("cuda")
    with autocast_ctx:
        result = pipe(prompt)
        picture = result.images[0]
    # Saving happens after the autocast context has been exited.
    picture.save(output_path)

def analyze_sentiment_and_check_profanity(text):
    """Classify ``text`` with flair and run a profanity check on it.

    Note that the ``"violence_detected"`` entry is taken directly from
    ``profanity.contains_profanity(text)``; no separate violence check is run.

    Returns:
        dict: ``"sentiment"`` and ``"confidence"`` hold the value and score of
        the first label flair attaches to the sentence; ``"violence_detected"``
        is ``True`` when the profanity check matched, otherwise ``False``.
    """
    flagged = bool(profanity.contains_profanity(text))

    scored = Sentence(text)
    classifier.predict(scored)
    top_label = scored.labels[0]

    return {
        "sentiment": top_label.value,
        "confidence": top_label.score,
        "violence_detected": flagged,
    }

if __name__ == "__main__":
    # Record -> transcribe -> sentiment/profanity report -> image generation.
    audio_data = record_audio(duration=10)

    # record_audio() requests int16 PCM from sounddevice. Whisper is trained on
    # float waveforms scaled to [-1.0, 1.0), so integer samples are divided by
    # their full-scale value (32768 for int16) instead of only being cast to
    # float32. Buffers that are already floating point pass through unchanged.
    if np.issubdtype(audio_data.dtype, np.integer):
        full_scale = np.iinfo(audio_data.dtype).max + 1
        audio_data = audio_data.astype(np.float32) / full_scale
    audio_data = torch.tensor(audio_data, dtype=torch.float32)

    transcribed_text = transcribe_audio(audio_data)
    print(f"Transcribed Text: {transcribed_text}")

    sentiment_result = analyze_sentiment_and_check_profanity(transcribed_text)
    print(f"Sentiment: {sentiment_result['sentiment']}, Confidence: {sentiment_result['confidence']}, Violence Detected: {sentiment_result['violence_detected']}")

    # NOTE(review): the image is generated regardless of the sentiment and
    # profanity result printed above -- confirm that this is intended.
    generate_image(transcribed_text, "otman_img.png")


Transcribed Text: A dog in a forest
Sentiment: POSITIVE, Confidence: 0.9993454818725678, Violence Detected: False
