## Text to Speech


In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the environment so the
# API key does not have to be written into the notebook.
load_dotenv()

# None when GROQ_API_KEY is not defined in the environment.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")




In [9]:
from groq import Groq

# One Groq client, shared by the speech, transcription and vision cells.
client = Groq(api_key=GROQ_API_KEY)

# Text-to-speech request settings.
model = 'playai-tts'
voice = "Fritz-PlayAI"
response_format = 'wav'
text = 'I love building and shipping new features for our users!'

# `response` is kept around: the next cell writes it to disk.
response = client.audio.speech.create(
    input=text,
    model=model,
    voice=voice,
    response_format=response_format,
)


In [10]:
# Save the synthesized audio; the speech-to-text cell reads this same file back.
speech_file_path = "speech_test.wav"
response.write_to_file(speech_file_path)

## Speech to Text


In [11]:
# Audio file produced by the text-to-speech cell.
filename = 'speech_test.wav'

with open(filename, 'rb') as audio_file:
    transcript = client.audio.transcriptions.create(
        model='whisper-large-v3-turbo',
        file=audio_file,
        # NOTE(review): for Whisper-style models the prompt is normally used as
        # context / spelling guidance rather than as an instruction - confirm
        # this text is actually useful.
        prompt='Transcribe the audio in the file.',
        # Optional. timestamp_granularities needs response_format="verbose_json";
        # "word", "segment" (the default) or both may be requested.
        response_format='verbose_json',
        timestamp_granularities=["word", "segment"],
        language="en",  # Optional
        temperature=0.0,  # Optional
    )

In [12]:
# Show the transcribed text so it can be compared with the sentence that was
# sent to the text-to-speech cell.
print(transcript.text)

 I love building and shipping new features for our users.


# Image Vision

In [13]:
import google.generativeai as genai

# Read the Gemini key from the environment (None if it is not set) and hand it
# to the SDK before any model object is created.
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
genai.configure(api_key=GEMINI_API_KEY)



  from .autonotebook import tqdm as notebook_tqdm


In [15]:
from PIL import Image
# Sample picture; the resulting PIL image object is passed to Gemini as-is.
image = Image.open('images/sakamoto.jpg')


In [18]:
# NOTE: this rebinds `model` (a plain model-name string in the text-to-speech
# cell) and `response` (the speech response there) to Gemini objects, so
# re-running the earlier cells after this one changes what later cells see.
model = genai.GenerativeModel(model_name='gemini-2.0-flash')
# Multimodal request: the PIL image and the question are passed together in one list.
response = model.generate_content([image,"Whats this in the image?"])

In [19]:
# Print the text of the Gemini response produced in the previous cell.
print(response.text)

The image shows a scene inside a train carriage. There are several characters:

*   A couple embracing on the left.

*   A tall man with white hair and glasses wearing a yellow shirt and a green apron in the foreground.

*   A man reading a newspaper in the background.

*   There is a Netflix logo in the top left corner.


In [21]:
# Upload the image file to Gemini; the returned handle is used in place of the
# in-memory image in a later cell. The record displayed below shows an
# expiration_time about two days after create_time, so the handle is temporary.
my_file = genai.upload_file('images/sakamoto.jpg')

In [22]:
# Bare expression: the notebook renders the uploaded file's metadata record.
my_file

genai.File({
    'name': 'files/s04ye4atgrxs',
    'display_name': 'sakamoto.jpg',
    'download_uri': '',
    'mime_type': 'image/jpeg',
    'sha256_hash': 'YTRhN2I1MmQ0YjA5YjZlZDEyYjQ1MjYwODU0MTQzZTU3MWY3MDBjM2IxOTIxMzAyNDU0MGE4YTQxYjE2OTNlMA==',
    'size_bytes': '76103',
    'source': 'UPLOADED',
    'state': 'ACTIVE',
    'uri': 'https://generativelanguage.googleapis.com/v1beta/files/s04ye4atgrxs',
    'create_time': '2025-06-19T08:01:47.276760Z',
    'expiration_time': '2025-06-21T08:01:47.150041326Z',
    'update_time': '2025-06-19T08:01:47.276760Z'})

In [23]:
# Same question as before, but the image is referenced through the uploaded
# file handle instead of the in-memory PIL image.
# `model` must still be the Gemini GenerativeModel here; re-running the
# text-to-speech cell rebinds it to a string and this call would then fail.
res = model.generate_content([my_file,"Whats this in the image?"])
print(res.text)

The image is an animation still from the Netflix series "Jujutsu Kaisen." The scene takes place inside a train car. The main character in the foreground is Nanami Kento. Other people are in the background, including a man reading a newspaper, and a couple who are hugging each other.


## Using Groq

In [20]:
import base64
import os

# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is opened in binary mode and read in one go; the bytes are
    base64-encoded and the encoded bytes are decoded to ``str`` as UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

# Image to describe; it is embedded in the request as a base64 data URL.
image_path = "images/sakamoto.jpg"
base64_image = encode_image(image_path)

# One user turn made of two content parts: the question and the image.
question_part = {"type": "text", "text": "What's in this image?"}
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
}

chat_completion = client.chat.completions.create(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    messages=[{"role": "user", "content": [question_part, image_part]}],
)

# Print the message content of the first returned choice.
print(chat_completion.choices[0].message.content)

The image depicts a cartoon scene of an older man standing on a crowded train, with the Netflix logo in the top-left corner. The man is dressed in a yellow shirt and green overalls, and he appears to be looking at something or someone with a sense of concern or unease.

In the background, there are several other people on the train, including a man and a girl sitting down, and another man standing up reading a newspaper. The overall atmosphere of the image suggests that something unexpected or unusual is happening on the train, and the man in the foreground is reacting to it.

**Key Elements:**

* Older man in yellow shirt and green overalls
* Netflix logo in top-left corner
* Crowded train with several passengers
* Man and girl sitting down
* Man standing up reading a newspaper
* Atmosphere of concern or unease

**Possible Interpretation:**

Based on the image, it appears that the scene is from an animated series or film available on Netflix. The man's expression and body language sug