In [4]:
# Loading my OPENAI KEY as an environment variable
from dotenv import load_dotenv
load_dotenv()

True

## Audio transcription

We begin by  transcribing the audio from the input

In [5]:
import whisper
import os

audio_input_path = "audio/audio_to_transcribe.mp3" 

model = whisper.load_model("base.en")

In [6]:
result = model.transcribe(audio_input_path)
print(result["text"])

 Today we are going to be looking at how to use the VITN application to view your exam schedule. We will begin by opening the VITN application. Once it is opened up, we tap on the refresh button in order to get the latest data and keep it updated. Then we open the sidebar by tapping on the sidebar icon, go into academics, down and exam schedule. From here we can select our exam and now view the exam venue, see details, time and date. Thank you.


In [7]:
transcript = result

In [8]:
print(result)

{'text': ' Today we are going to be looking at how to use the VITN application to view your exam schedule. We will begin by opening the VITN application. Once it is opened up, we tap on the refresh button in order to get the latest data and keep it updated. Then we open the sidebar by tapping on the sidebar icon, go into academics, down and exam schedule. From here we can select our exam and now view the exam venue, see details, time and date. Thank you.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 3.8000000000000003, 'text': ' Today we are going to be looking at how to use the VITN application to view your exam', 'tokens': [50363, 6288, 356, 389, 1016, 284, 307, 2045, 379, 703, 284, 779, 262, 569, 2043, 45, 3586, 284, 1570, 534, 2814, 50553], 'temperature': 0.0, 'avg_logprob': -0.22808042346921742, 'compression_ratio': 1.7233201581027668, 'no_speech_prob': 0.13063941895961761}, {'id': 1, 'seek': 0, 'start': 3.8000000000000003, 'end': 4.8, 'text': ' schedule.', 'tokens': [5

In [9]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain, TransformChain
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage

## Steps generation

1. We generate steps taken in the video, alongside the timestamps for when the step begins.
2. We will use the start time of each step to display the step_text in our video

In [10]:
# Prompt template to generate the narrator text
topic_template = """
You are given the transcript of a video of me using an app.
You are to convert the given text of actions performed in the video into an organized list of steps and properly number them.
You are to also timestamp each of the steps as per the transcript. 
The output must be a python list of tuples like [(step_text, timestamp), (step_text, timestamp)] where each step_text is a step and timestamp is the start of the text in seconds . Be informative and do not make up things.

App details:
```
{app_desc}
```

Tutorial Description:
```
{tutorial_desc}
```

The Transcript:
```
{transcript}
```
Do not give any additional text. No talk, just go. 
"""

In [11]:
chat = ChatOpenAI(temperature=0, model_name = "gpt-4")
system_message_prompt = SystemMessage(content="You are an expert at making tutorial videos and very good at defining tasks in simple terms.")
human_message_prompt = HumanMessagePromptTemplate(prompt=PromptTemplate(
                                                  template=topic_template,
                                                  input_variables=["app_desc","tutorial_desc", "transcript"]))

In [12]:
# Chain to write the create the steps from the input text
chat_prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
steps_chain = LLMChain(llm=chat, prompt=chat_prompt_template, output_key='steps')

In [13]:
# Description of the app we are using

# USER INPUT
app_desc = """VITian is an application built for students of VIT to access their student data. 
We can use the app to find out examination allotments."""

# Tutorial Description

#USER INPUT
tutorial_desc = """This is a video tutorial on how to use the VITian app to check exam schedule and timings."""

# transcript has been acquired from the whisper module

In [14]:
# testing the steps chain + prompt engineering
op = steps_chain(inputs={"app_desc": app_desc, "tutorial_desc": tutorial_desc, "transcript": transcript}, return_only_outputs=True)

In [15]:
print(op)

{'steps': 'Here is the list of steps with timestamps:\n\n```python\n[\n    ("Open the VITN application", 4.8),\n    ("Tap on the refresh button to get the latest data", 9.08),\n    ("Open the sidebar by tapping on the sidebar icon", 15.2),\n    ("Go into academics, down and exam schedule", 21.04),\n    ("Select your exam to view the exam venue, details, time and date", 23.4)\n]\n```'}


In [17]:
print(op['steps'])
steps = op['steps']

Here is the list of steps with timestamps:

```python
[
    ("Open the VITN application", 4.8),
    ("Tap on the refresh button to get the latest data", 9.08),
    ("Open the sidebar by tapping on the sidebar icon", 15.2),
    ("Go into academics, down and exam schedule", 21.04),
    ("Select your exam to view the exam venue, details, time and date", 23.4)
]
```


## Extracting tuples from the output

In [18]:
steps = """Here is the list of steps with timestamps:

```python
[
    ("Open the VITN application", 4.8),
    ("Tap on the refresh button to get the latest data", 9.08),
    ("Open the sidebar by tapping on the sidebar icon", 15.2),
    ("Go into academics, then exam schedule", 21.04),
    ("Select your exam to view the exam venue, details, time and date", 23.4)
]
```"""

In [19]:
import re

tuples = re.findall(r'\(([^)]+)', steps)
# array of timestamps of each step
timers = [0]

for i in tuples:
    text = re.findall(r'[^"]+', i)
    # print(text[0])
    timer = float(re.findall(r'\d+\.\d+', i)[0])
    # print(timer)
    print((text[0],timer))
    timers.append(timer)
    print("=========")

# we want the starting timestamps, not the end
timers.pop()

('Open the VITN application', 4.8)
('Tap on the refresh button to get the latest data', 9.08)
('Open the sidebar by tapping on the sidebar icon', 15.2)
('Go into academics, then exam schedule', 21.04)
('Select your exam to view the exam venue, details, time and date', 23.4)


23.4

In [20]:
file = open('text/items.txt','w')
for item in tuples:
	file.write(item+"\n")
file.close()

In [21]:
timers

[0, 4.8, 9.08, 15.2, 21.04]

## Narrator Script Generation

In [22]:
# Prompt template to generate teh narrator text
topic_template = """
You are to narrate a tutorial video about using an app. Be informative and do not make up things.
You are given a description about the app as well as the tutorial topic.
You will then be given the list of steps taken in the video alongside the timestamps where they start.
Your narration must abide by the timestamps. You can make a maximum of 3 words per second in the time window for each step.
Generate text for the narrator to read out to the viewer and nothing more.
Output must be a python list of strings where each string is the text for each step.
Output format:
```
[step_1_text, step_2_text, step_3_text, step_4_text]
```

App details:
```
{app_desc}
```

Tutorial Description:
```
{tutorial_desc}
```

The steps(with timestamps) being performed in the video are:
```
{steps}
```
"""

In [23]:
chat = ChatOpenAI(temperature=0, model_name = "gpt-4")
system_message_prompt = SystemMessage(content="You are a narrator with experience in making tech tutorial videos.")
human_message_prompt = HumanMessagePromptTemplate(prompt=PromptTemplate(
                                                  template=topic_template,
                                                  input_variables=["app_desc", "tutorial_desc", "steps"]))

In [24]:
# Chain to write the script for the narrator
chat_prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
script_chain = LLMChain(llm=chat, prompt=chat_prompt_template, output_key='script')

In [25]:
# testing the scripts + prompt engineering

In [26]:
# testing the steps chain + prompt engineering
op2 = script_chain(inputs={"app_desc": app_desc, "tutorial_desc": tutorial_desc, "steps": steps}, return_only_outputs=True)

In [27]:
print(op2['script'])
script = op2['script']

["Welcome to this tutorial. Let's start by opening the VITian application.", "Now, tap on the refresh button to fetch the latest data.", "Next, open the sidebar by tapping on the sidebar icon.", "Navigate to academics, then select exam schedule.", "Finally, select your exam to view the venue, details, time and date."]


In [46]:
import re

# extract strings inside double quotes
strings = re.findall(r'"([^"]*)"', script)

script = strings

In [48]:
for i in script:
    print(i)
    print("======================")

Welcome to this tutorial. Let's start by opening the VITian application.
Now, tap on the refresh button to fetch the latest data.
Next, open the sidebar by tapping on the sidebar icon.
Navigate to academics, then select exam schedule.
Finally, select your exam to view the venue, details, time and date.


# Generate the audio

### Suno - Bark

In [49]:
# from bark import SAMPLE_RATE, generate_audio, preload_models
# from IPython.display import Audio

# preload_models()

In [50]:
# # generate audio from text
# text_prompt = """
#      Hello, my name is Suno. And, uh — and I like pizza. [laughs] 
#      But I also have other interests such as playing tic tac toe.
# """
# # Male Speaker
# audio_array = generate_audio(text_prompt, history_prompt="v2/en_speaker_6")

# # # Female Speaker
# # audio_array = generate_audio(text_prompt, history_prompt="v2/en_speaker_6")
# Audio(audio_array, rate=SAMPLE_RATE) 

In [51]:
#Bark is waay too slow

In [52]:
# # testing purposes
# temp = ["Welcome to this tutorial. Let's start by opening the VITian application.", "Now, tap the refresh button to fetch the latest data.", "Next, open the sidebar by tapping on the sidebar icon.", "Navigate to academics, then select exam schedule.", "Finally, select your exam to view the venue, details, time and date."]
# temp
# script = temp

In [53]:
# # testing purposes
# steps = """Here is the list of steps with timestamps:

# ```python
# [
#     ("Open the VITN application", 4.8),
#     ("Tap on the refresh button to get the latest data", 9.08),
#     ("Open the sidebar by tapping on the sidebar icon", 15.2),
#     ("Go into academics, then exam schedule", 21.04),
#     ("Select your exam to view the exam venue, details, time and date", 23.4)
# ]
# ```"""

In [54]:
# # start a timer
# import time

# start_time = time.time()
# j = 0
# for i in temp:
#     # print the message being said
#     print(i)
#     # print the time the message starts
#     print(time.time() - start_time)
#     if time.time() - start_time >= timers[j]:
#         tts(i)
#     else:
#         time.sleep(timers[j] - (time.time() - start_time))
#         tts(i)
    
#     j+=1
    
#     print("===============")

## pyttsx3

In [55]:
import pyttsx3
engine = pyttsx3.init()
engine.say("Speech Testing")
engine.runAndWait()

In [56]:
# # start a timer
# import time

# start_time = time.time()
# j = 0
# for i in temp:
#     # print the message being said
#     print(i)
#     # print the time the message starts
#     print(time.time() - start_time)
#     if time.time() - start_time >= timers[j]:
#         engine.say(i)
#         engine.runAndWait()
        
#     else:
#         time.sleep(timers[j] - (time.time() - start_time))
#         engine.say(i)
#         engine.runAndWait()
    
#     j+=1
    
#     print("===============")

### TTS Conclusions

Suno bark has very good quality but too much time taken. <Br>
pyttsx3 is fast but the quality is not good. <Br>
ttsvoice is decent and fast but does not have a save to file option <Br>
Need to explore Google TTS 

In [57]:
script

["Welcome to this tutorial. Let's start by opening the VITian application.",
 'Now, tap on the refresh button to fetch the latest data.',
 'Next, open the sidebar by tapping on the sidebar icon.',
 'Navigate to academics, then select exam schedule.',
 'Finally, select your exam to view the venue, details, time and date.']

In [58]:
j = 0
audio_file_paths = []
for i in script:
    # print the message being said
    print(i)
    # print the time the message starts
    output_path = "audio/Audio_Step_" + str(j) + ".mp3"
    engine.save_to_file(i, output_path)
    engine.runAndWait()    
    audio_file_paths.append(output_path)
    j+=1    
    print("===========================================")

Welcome to this tutorial. Let's start by opening the VITian application.
Now, tap on the refresh button to fetch the latest data.
Next, open the sidebar by tapping on the sidebar icon.
Navigate to academics, then select exam schedule.
Finally, select your exam to view the venue, details, time and date.


In [59]:
audio_file_paths

['audio/Audio_Step_0.mp3',
 'audio/Audio_Step_1.mp3',
 'audio/Audio_Step_2.mp3',
 'audio/Audio_Step_3.mp3',
 'audio/Audio_Step_4.mp3']

## Using pydub to merge into a singular audio file with appropriate timings

In [60]:
from pydub import AudioSegment

In [61]:
from pydub import AudioSegment

def merge_audio_with_silence(filenames, start_times):
    # Load the first audio file
    combined_audio = AudioSegment.from_file(filenames[0])
    
    # Add the required silence before the first audio file
    silence_duration = start_times[0] * 1000  # Convert seconds to milliseconds
    combined_audio = AudioSegment.silent(duration=silence_duration) + combined_audio
    
    # For each subsequent audio file...
    for i in range(1, len(filenames)):
        # Load the audio file
        audio = AudioSegment.from_file(filenames[i])
        # print("Now reading " + filenames[i])
        
        # Calculate the required silence duration
        len_so_far = len(combined_audio) / 1000.0
        silence_duration = (start_times[i] - len_so_far) * 1000  # Convert seconds to milliseconds
        # print("len_so_far (in seconds): " , len_so_far)
        # print("we gotta start at (in seconds): ", start_times[i])
        # print("silence_duration (in milliseconds): " , silence_duration)
        if silence_duration > 0:
            silence = AudioSegment.silent(duration=silence_duration)
        else:
            silence = AudioSegment.empty()
        
        # Append the silence and the audio to the combined audio
        combined_audio += silence + audio
        # print("============================================================")
    
    return combined_audio


In [62]:
print(timers)
print(audio_file_paths)

[0, 4.8, 9.08, 15.2, 21.04]
['audio/Audio_Step_0.mp3', 'audio/Audio_Step_1.mp3', 'audio/Audio_Step_2.mp3', 'audio/Audio_Step_3.mp3', 'audio/Audio_Step_4.mp3']


In [63]:
# Merge the audio files
merged_audio = merge_audio_with_silence(audio_file_paths, timers)

# Save the merged audio to a file
output_path = "audio/merged_audio.mp3"
merged_audio.export(output_path)

<_io.BufferedRandom name='audio/merged_audio.mp3'>