## Top Level Understanding

In [1]:
import os
import openai

In [2]:
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# api_key = os.environ.get("OPENAI_API_KEY")

client = openai.OpenAI(api_key = api_key)

In [3]:
from openai import OpenAI

client = OpenAI()
# path = os.path.expanduser("~/Downloads/Amazon_V3_treated.wav")
path = os.path.expanduser("~/Downloads/The Slightly Curious Studio 3 1-25m_x2.m4a")
audio_file = open(path, "rb")

transcription = client.audio.transcriptions.create(
    # model="gpt-4o-mini-transcribe", 
    model = "whisper-1",
    file=audio_file, 
    # response_format="json",
    response_format = "srt"
    # stream = True
)

# compiled_stream = ""
# for event in transcription:
#     if event.type == 'transcript.text.delta':
#         compiled_stream += event.delta
#         print(compiled_stream)


# client.audio.transcriptions.create(
#     model="gpt-4o-mini-transcribe", 
#     file=audio_file, 
#     # response_format="text",
#     stream = True
# )

In [11]:
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
import re
import io

def srt_to_docx(srt_string):

    doc = Document()

    style = doc.styles['Normal']
    style.font.name = 'Arial'
    style.font.size = Pt(11)

    entries = re.split(r'\n\n', srt_string.strip())

    for entry in entries:
        lines = entry.split('\n')
        # if len(lines) >= 3:  # Ensure we have at least timestamp and text?
        timestamp = lines[1]
        text = ' '.join(lines[2:])

        p = doc.add_paragraph()

        timestamp_run = p.add_run(timestamp + " ")
        timestamp_run.font.color.rgb = RGBColor(192, 192, 192)  # Light grey

        # wrapped_text = textwrap.wrap(text, width=60)  # Adjust width as needed
        p.add_run(lines[2])
    
    byte_stream = io.BytesIO()
    doc.save(byte_stream)
    byte_stream.seek(0)  

        # for line in wrapped_text[1:]:
            # p.add_run('\n' + ' ' * 8 + line)

        # p.paragraph_format.space_after = Pt(6)  # Space after paragraph

    return byte_stream


In [12]:
transcription_doc = srt_to_docx(transcription)

transcription_doc

<_io.BytesIO at 0x11c8f1e40>

In [13]:
with open(os.path.expanduser("~/Downloads/document.docx"), 'wb') as f:
    f.write(transcription_doc.getvalue())

While this is all a nice idea, timestamps are more important and so maybe there is no place for this in the app? Or should I implement streaming where the user does not request timestamps?

In [50]:
import openai
from openai import OpenAI

client = OpenAI()

def stream_and_get_full_response(messages):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        stream=True
    )
    full_response_text = ""
    for chunk in response:
        if chunk.choices:
            delta = chunk.choices[0].delta
            if delta.content:
                full_response_text += delta.content
            if chunk.choices[0].finish_reason == "stop":
                # End of stream.  full_response_text now contains the full response.
                break
    return full_response_text

messages = [{"role": "user", "content": "Write a short story about a talking dog."}]
full_story = stream_and_get_full_response(messages)
print(full_story)

Once upon a sunny afternoon in the small town of Willowbrook, a curious little dog named Benny unearthed a peculiar object in his owner's backyard. Benny was a scruffy terrier with a heart full of adventure and a nose for trouble. As he scratched at the ground, he found an old, dusty lamp, half-buried beneath the roots of a gnarled oak tree.

With a wag of his tail, Benny pawed at the lamp until it rolled over, revealing a shiny side. Intrigued, he nudged it with his nose, and to his surprise, a soft, swirling mist poured out. It twisted and turned until it formed into a small, fluffy cloud. The cloud gently floated down and materialized into a tiny, talking genie, complete with a feathery blue hat and a long white beard.

“Greetings, noble pup!” the genie exclaimed, his voice melodious and booming. “I am Azar, the Genie of the Whimsical Wish! You have freed me from my slumber. In gratitude, I shall grant you one wish!”

Benny, absolutely baffled, tilted his head to the side. After a m

This doesn't seem to stream anything?

In [11]:
1400.501333/60

23.341688883333333

## Getting Langcodes

In [51]:
import pycountry 

countries = [country.alpha_2 for country in pycountry.countries]
# country_dict ={"Code" : country.alpha_2, "Country" : country.name for country in pycountry.countries}
country_dict ={country.alpha_2 : country.name for country in pycountry.countries}
country_dict_named = [
    {"label": name, "value": code}
    for code, name in country_dict.items()
]

country_dict_named

[{'label': 'Aruba', 'value': 'AW'},
 {'label': 'Afghanistan', 'value': 'AF'},
 {'label': 'Angola', 'value': 'AO'},
 {'label': 'Anguilla', 'value': 'AI'},
 {'label': 'Åland Islands', 'value': 'AX'},
 {'label': 'Albania', 'value': 'AL'},
 {'label': 'Andorra', 'value': 'AD'},
 {'label': 'United Arab Emirates', 'value': 'AE'},
 {'label': 'Argentina', 'value': 'AR'},
 {'label': 'Armenia', 'value': 'AM'},
 {'label': 'American Samoa', 'value': 'AS'},
 {'label': 'Antarctica', 'value': 'AQ'},
 {'label': 'French Southern Territories', 'value': 'TF'},
 {'label': 'Antigua and Barbuda', 'value': 'AG'},
 {'label': 'Australia', 'value': 'AU'},
 {'label': 'Austria', 'value': 'AT'},
 {'label': 'Azerbaijan', 'value': 'AZ'},
 {'label': 'Burundi', 'value': 'BI'},
 {'label': 'Belgium', 'value': 'BE'},
 {'label': 'Benin', 'value': 'BJ'},
 {'label': 'Bonaire, Sint Eustatius and Saba', 'value': 'BQ'},
 {'label': 'Burkina Faso', 'value': 'BF'},
 {'label': 'Bangladesh', 'value': 'BD'},
 {'label': 'Bulgaria', 'v

In [52]:
len(pycountry.countries)
list(pycountry.countries)[0]

Country(alpha_2='AW', alpha_3='ABW', flag='🇦🇼', name='Aruba', numeric='533')

In [23]:
import langcodes

dir(langcodes)

['ALL_SCRIPTS',
 'Any',
 'DEFAULT_LANGUAGE',
 'DEFAULT_SCRIPTS',
 'Dict',
 'Iterable',
 'LANGUAGE_ALPHA3',
 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC',
 'LANGUAGE_NAME_IMPORT_MESSAGE',
 'LANGUAGE_REPLACEMENTS',
 'LIKELY_SUBTAGS',
 'Language',
 'LanguageData',
 'LanguageTagError',
 'List',
 'Mapping',
 'NORMALIZED_MACROLANGUAGES',
 'Optional',
 'Sequence',
 'TERRITORY_REPLACEMENTS',
 'Tuple',
 'Union',
 'VALIDITY',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'best_match',
 'closest_match',
 'closest_supported_match',
 'data_dicts',
 'find',
 'find_name',
 'get',
 'itemgetter',
 'language_distance',
 'normalize_characters',
 'parse_tag',
 'standardize_tag',
 'sys',
 'tag_distance',
 'tag_is_valid',
 'tag_match_score',
 'tag_parser',
 'tuple_distance_cached',

# what about adding a prompt?

In [None]:
transcription = client.audio.transcriptions.create(
    model="gpt-4o-mini-transcribe", 
    # model = "whisper-1",
    file=audio_file, 
    response_format="json",
    stream = True
)


## Trying chunking text

In [10]:
from pydub import AudioSegment

song = AudioSegment.from_mp3("good_morning.mp3")

# PyDub handles time in milliseconds
ten_minutes = 10 * 60 * 1000

first_10_minutes = song[:ten_minutes]

first_10_minutes.export("good_morning_10.mp3", format="mp3")

ModuleNotFoundError: No module named 'pydub'

In [54]:
import dash
import dash_bootstrap_components as dbc
from dash import html

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

card_content = [
    dbc.CardHeader("Card header"),
    dbc.CardBody(
        [
            html.H5("Card title", className="card-title"),
            html.P(
                "This is some card content that we'll reuse",
                className="card-text",
            ),
        ]
    ),
]

cards = html.Div(
    [
        dbc.Row(
            [
                dbc.Col(dbc.Card(card_content, color="primary", inverse=True)),
                dbc.Col(dbc.Card(card_content, color="secondary", inverse=True)),
                dbc.Col(dbc.Card(card_content, color="info", inverse=True)),
            ],
            className="mb-4",
        ),
        dbc.Row(
            [
                dbc.Col(dbc.Card(card_content, color="success", inverse=True)),
                dbc.Col(dbc.Card(card_content, color="warning", inverse=True)),
                dbc.Col(dbc.Card(card_content, color="danger", inverse=True)),
            ],
            className="mb-4",
        ),
        dbc.Row(
            [
                dbc.Col(dbc.Card(card_content, color="light")),
                dbc.Col(dbc.Card(card_content, color="dark", inverse=True)),
            ]
        ),
    ]
)

app.layout = html.Div(cards)

if __name__ == "__main__":
    app.run_server(debug=True)


nodename nor servname provided, or not known


SystemExit: 1


To exit: use 'exit', 'quit', or Ctrl-D.



# Translating to other languages

In [None]:
response = client.responses.create(
    model="gpt-4o-mini",
    instructions=instructions,
    input="How would I declare a variable for a last name?",
)


In [None]:
# Make sure requests package is installed  
import requests 
import os
import json

# Load the API key from the environment variable
api_key = os.getenv("OPENAI_API_KEY")


def process_audio_with_gpt_4o(base64_encoded_audio, output_modalities, system_prompt):
    # Chat Completions API end point 
    url = "https://api.openai.com/v1/chat/completions"

    # Set the headers
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Construct the request data
    data = {
        "model": "gpt-4o-mini",
        "modalities": output_modalities,
        "audio": {
            "voice": "alloy",
            "format": "wav"
        },
        "messages": [
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": base64_encoded_audio,
                            "format": "wav"
                        }
                    }
                ]
            }
        ]
    }
    
    request_response = requests.post(url, headers=headers, data=json.dumps(data))
    if request_response.status_code == 200:
        return request_response.json()
    else:  
        print(f"Error {request_response.status_code}: {request_response.text}")
        return
    

# Translating to non-english

In [14]:
import os
import openai

In [17]:
from dotenv import load_dotenv

load_dotenv("../")
api_key = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI(api_key = api_key)

In [24]:
from openai import OpenAI

client = OpenAI()
path = os.path.expanduser("~/Downloads/Amazon_V3_treated.wav")
# path = os.path.expanduser("~/Downloads/The Slightly Curious Studio 3 1-25m_x2.m4a")
audio_file = open(path, "rb")

transcription = client.audio.transcriptions.create(
    # model="gpt-4o-mini-transcribe", 
    model = "whisper-1",
    file=audio_file, 
    response_format="json",
    # response_format = "srt"
    # stream = True
)

In [30]:
transcription.text

"Nature is our greatest ally in the fight against climate change. By partnering with NGOs and experts across Europe, Amazon is exploring new ways to capture carbon, tackle climate change, and protect nature and our communities, right now. Amazon's RightNow Climate Fund is restoring vital ecosystems, protecting historical woodlands, and reintroducing species, like beavers, back into cities after they'd been gone for centuries. And ensuring the next generation is ready to take on the challenge for the future."

In [31]:
from openai import OpenAI
client = OpenAI()

language_from = "english"
language_to = "spanish"
transcribed_text = transcription.text

spanish_response = client.responses.create(
    model="gpt-4.1-mini",
    input=f"The following is an body of {language_from} text. Translate it to {language_to}: {transcribed_text}"
)

print(spanish_response.output_text)

La naturaleza es nuestra mayor aliada en la lucha contra el cambio climático. Al asociarse con ONG y expertos de toda Europa, Amazon está explorando nuevas formas de capturar carbono, combatir el cambio climático y proteger la naturaleza y nuestras comunidades, ahora mismo. El Fondo Climático RightNow de Amazon está restaurando ecosistemas vitales, protegiendo bosques históricos y reintroduciendo especies, como los castores, en las ciudades después de que hubieran desaparecido durante siglos. Además, está asegurando que la próxima generación esté lista para enfrentar el desafío del futuro.


what about translating spanish to french:

In [33]:
from openai import OpenAI
client = OpenAI()

language_from = "spanish"
language_to = "french"
transcribed_text = spanish_response.output_text

french_response = client.responses.create(
    model="gpt-4.1-mini",
    input=f"The following is an body of {language_from} text. Translate it to {language_to}: {transcribed_text}"
)

print(french_response.output_text)

La nature est notre plus grand allié dans la lutte contre le changement climatique. En s'associant avec des ONG et des experts de toute l'Europe, Amazon explore de nouvelles façons de capturer le carbone, de combattre le changement climatique et de protéger la nature ainsi que nos communautés, dès maintenant. Le Fonds Climatique RightNow d'Amazon restaure des écosystèmes vitaux, protège des forêts historiques et réintroduit des espèces, comme les castors, dans les villes après avoir disparu pendant des siècles. Et il s'assure que la prochaine génération soit prête à relever le défi de l'avenir.


do I want to keep outputting an srt or do I want to output a json instead?

In [36]:
transcription_srt = client.audio.transcriptions.create(
    # model="gpt-4o-mini-transcribe", 
    model = "whisper-1",
    file=audio_file, 
    # response_format="json",
    response_format = "srt"
    # stream = True
)

type(transcription_srt)

<class 'str'>

In [45]:
transcription = client.audio.transcriptions.create(
    # model="gpt-4o-mini-transcribe", 
    model = "whisper-1",
    file=audio_file, 
    response_format="verbose_json",
    timestamp_granularities = ["word", "segment"]
    # response_format = "srt"
    # stream = True
)

transcription.__dict__.keys()

dict_keys(['duration', 'language', 'text', 'segments', 'usage', 'words', '_request_id'])

In [47]:
print(transcription.segments)

[TranscriptionSegment(id=0, avg_logprob=-0.27510523796081543, compression_ratio=1.5872340202331543, end=3.9000000953674316, no_speech_prob=0.0048235934227705, seek=0, start=0.0, temperature=0.0, text=' Nature is our greatest ally in the fight against climate change.', tokens=[50364, 20159, 307, 527, 6636, 23356, 294, 264, 2092, 1970, 5659, 1319, 13, 50614]), TranscriptionSegment(id=1, avg_logprob=-0.27510523796081543, compression_ratio=1.5872340202331543, end=8.140000343322754, no_speech_prob=0.0048235934227705, seek=0, start=5.159999847412109, temperature=0.0, text=' By partnering with NGOs and experts across Europe,', tokens=[50614, 3146, 31290, 365, 46454, 293, 8572, 2108, 3315, 11, 50764]), TranscriptionSegment(id=2, avg_logprob=-0.27510523796081543, compression_ratio=1.5872340202331543, end=12.819999694824219, no_speech_prob=0.0048235934227705, seek=0, start=8.880000114440918, temperature=0.0, text=' Amazon is exploring new ways to capture carbon, tackle climate change,', tokens=[

In [48]:
transcription_srt

"1\n00:00:00,000 --> 00:00:05,000\nNature is our greatest ally in the fight against climate change.\n\n2\n00:00:05,000 --> 00:00:08,000\nBy partnering with NGOs and experts across Europe,\n\n3\n00:00:08,000 --> 00:00:13,000\nAmazon is exploring new ways to capture carbon, tackle climate change,\n\n4\n00:00:13,000 --> 00:00:17,000\nand protect nature and our communities, right now.\n\n5\n00:00:17,000 --> 00:00:22,000\nAmazon's RightNow Climate Fund is restoring vital ecosystems,\n\n6\n00:00:22,000 --> 00:00:24,000\nprotecting historical woodlands,\n\n7\n00:00:24,000 --> 00:00:27,000\nand reintroducing species, like beavers,\n\n8\n00:00:27,000 --> 00:00:31,000\nback into cities after they'd been gone for centuries.\n\n9\n00:00:31,000 --> 00:00:36,000\nAnd ensuring the next generation is ready to take on the challenge for the future.\n\n\n"

how does translating an srt work?

In [53]:
language_from = "english"
language_to = "spanish"
transcribed_text = transcription.text
words_not_for_translation = "'Amazon', 'RightNow Climate Fund'"

print(f"Below is an srt with {language_from} text. Translate it to an srt in {language_to}. Don't translate words that don't have a {language_to} translation, some examples are: {words_not_for_translation}. \n\n {transcription_srt}")

Below is an srt with english text. Translate it to an srt in spanish. Don't translate words that don't have a spanish translation, some examples are: 'Amazon', 'RightNow Climate Fund'. 

 1
00:00:00,000 --> 00:00:05,000
Nature is our greatest ally in the fight against climate change.

2
00:00:05,000 --> 00:00:08,000
By partnering with NGOs and experts across Europe,

3
00:00:08,000 --> 00:00:13,000
Amazon is exploring new ways to capture carbon, tackle climate change,

4
00:00:13,000 --> 00:00:17,000
and protect nature and our communities, right now.

5
00:00:17,000 --> 00:00:22,000
Amazon's RightNow Climate Fund is restoring vital ecosystems,

6
00:00:22,000 --> 00:00:24,000
protecting historical woodlands,

7
00:00:24,000 --> 00:00:27,000
and reintroducing species, like beavers,

8
00:00:27,000 --> 00:00:31,000
back into cities after they'd been gone for centuries.

9
00:00:31,000 --> 00:00:36,000
And ensuring the next generation is ready to take on the challenge for the future.





In [55]:
from openai import OpenAI
client = OpenAI()

language_from = "english"
language_to = "spanish"
transcribed_text = transcription.text
words_not_for_translation = "'SAMY'" # testing if I just leave on filler word

spanish_response_srt = client.responses.create(
    model="gpt-4.1-mini",
    input=f"Below is an srt with {language_from} text. Translate it to an srt in {language_to}. Don't translate words that don't have a {language_to} translation, some examples are: {words_not_for_translation}. \n\n {transcription_srt}"
)

print(spanish_response_srt.output_text)

1
00:00:00,000 --> 00:00:05,000
La naturaleza es nuestra mayor aliada en la lucha contra el cambio climático.

2
00:00:05,000 --> 00:00:08,000
Al asociarse con ONGs y expertos en toda Europa,

3
00:00:08,000 --> 00:00:13,000
Amazon está explorando nuevas formas de capturar carbono, combatir el cambio climático,

4
00:00:13,000 --> 00:00:17,000
y proteger la naturaleza y nuestras comunidades, ahora mismo.

5
00:00:17,000 --> 00:00:22,000
El Fondo Climático RightNow de Amazon está restaurando ecosistemas vitales,

6
00:00:22,000 --> 00:00:24,000
protegiendo bosques históricos,

7
00:00:24,000 --> 00:00:27,000
y reintroduciendo especies, como castores,

8
00:00:27,000 --> 00:00:31,000
de nuevo en las ciudades después de que hubieran desaparecido por siglos.

9
00:00:31,000 --> 00:00:36,000
Y asegurando que la próxima generación esté lista para asumir el desafío del futuro.


for now the path of least resistance is probably keeping all as an srt and translating the srt.

how would I deal with word inputs for no translation list?

In [78]:
input_words = "word1, word2, word3,word4,  word5,word6"
no_translation = 'word3'

pattern = r', ?'

words_split = re.split(pattern, input_words)
words_list = [no_translation] + words_split
words_quote_list = [f"'{word}'" for word in words_list]
", ".join(words_quote_list)


"'word3', 'word1', 'word2', 'word3', 'word4', ' word5', 'word6'"

What if empty input for words not to translate?

In [79]:
input_words = ""
no_translation = 'word3'

pattern = r', ?'

words_split = re.split(pattern, input_words)
words_list = [no_translation] + words_split
words_quote_list = [f"'{word}'" for word in words_list]
", ".join(words_quote_list)

"'word3', ''"

what does an empty text string look like? Is it null or is it an empty string?