In [2]:
# Loading my OPENAI KEY as an environment variable
from dotenv import load_dotenv
load_dotenv()

True

## Audio transcription

We begin by  transcribing the audio from the input

In [7]:
import whisper
import os

audio_input_path = "audio/audio_to_transcribe.mp3" 

model = whisper.load_model("base.en")

In [8]:
result = model.transcribe(audio_input_path)
print(result["text"])

 Today we are going to be looking at how to use the VITN application to view your exam schedule. We will begin by opening the VITN application. Once it is opened up, we tap on the refresh button in order to get the latest data and keep it updated. Then we open the sidebar by tapping on the sidebar icon, go into academics, down and exam schedule. From here we can select our exam and now view the exam venue, see details, time and date. Thank you.


In [78]:
transcript = result

In [79]:
print(result)

{'text': ' Today we are going to be looking at how to use the VITN application to view your exam schedule. We will begin by opening the VITN application. Once it is opened up, we tap on the refresh button in order to get the latest data and keep it updated. Then we open the sidebar by tapping on the sidebar icon, go into academics, down and exam schedule. From here we can select our exam and now view the exam venue, see details, time and date. Thank you.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 3.8000000000000003, 'text': ' Today we are going to be looking at how to use the VITN application to view your exam', 'tokens': [50363, 6288, 356, 389, 1016, 284, 307, 2045, 379, 703, 284, 779, 262, 569, 2043, 45, 3586, 284, 1570, 534, 2814, 50553], 'temperature': 0.0, 'avg_logprob': -0.22808042346921742, 'compression_ratio': 1.7233201581027668, 'no_speech_prob': 0.13063941895961761}, {'id': 1, 'seek': 0, 'start': 3.8000000000000003, 'end': 4.8, 'text': ' schedule.', 'tokens': [5

In [80]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain, TransformChain
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage

## Steps generation

1. We generate steps taken in the video, alongside the timestamps for when the step begins.
2. We will use the start time of each step to display the step_text in our video

In [81]:
# Prompt template to generate the narrator text
topic_template = """
You are given the transcript of a video of me using an app.
You are to convert the given text of actions performed in the video into an organized list of steps and properly number them.
You are to also timestamp each of the steps as per the transcript. 
The output must be a python list of tuples like [(step_text, timestamp), (step_text, timestamp)] where each step_text is a step and timestamp is the start of the text in seconds . Be informative and do not make up things.

App details:
```
{app_desc}
```

Tutorial Description:
```
{tutorial_desc}
```

The Transcript:
```
{transcript}
```
Do not give any additional text. No talk, just go. 
"""

In [82]:
chat = ChatOpenAI(temperature=0, model_name = "gpt-4")
system_message_prompt = SystemMessage(content="You are an expert at making tutorial videos and very good at defining tasks in simple terms.")
human_message_prompt = HumanMessagePromptTemplate(prompt=PromptTemplate(
                                                  template=topic_template,
                                                  input_variables=["app_desc","tutorial_desc", "transcript"]))

In [83]:
# Chain to write the create the steps from the input text
chat_prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
steps_chain = LLMChain(llm=chat, prompt=chat_prompt_template, output_key='steps')

In [84]:
# Description of the app we are using

# USER INPUT
app_desc = """VITian is an application built for students of VIT to access their student data. 
We can use the app to find out examination allotments."""

# Tutorial Description

#USER INPUT
tutorial_desc = """This is a video tutorial on how to use the VITian app to check exam schedule and timings."""

# transcript has been acquired from the whisper module

In [85]:
# testing the steps chain + prompt engineering
op = steps_chain(inputs={"app_desc": app_desc, "tutorial_desc": tutorial_desc, "transcript": transcript}, return_only_outputs=True)

In [86]:
print(op)

{'steps': 'Here is the list of steps with timestamps:\n\n```python\n[\n    ("Open the VITN application", 4.8),\n    ("Tap on the refresh button to get the latest data", 9.08),\n    ("Open the sidebar by tapping on the sidebar icon", 15.2),\n    ("Go into academics, down and exam schedule", 21.04),\n    ("Select your exam to view the exam venue, details, time and date", 23.4)\n]\n```'}


In [95]:
print(op['steps'])
steps = op['steps']

Here is the list of steps with timestamps:

```python
[
    ("Open the VITN application", 4.8),
    ("Tap on the refresh button to get the latest data", 9.08),
    ("Open the sidebar by tapping on the sidebar icon", 15.2),
    ("Go into academics, down and exam schedule", 21.04),
    ("Select your exam to view the exam venue, details, time and date", 23.4)
]
```


## Narrator Script Generation

In [92]:
# Prompt template to generate teh narrator text
topic_template = """
You are to narrate a tutorial video about using an app. Be informative and do not make up things.
You are given a description about the app as well as the tutorial topic.
You will then be given the list of steps taken in the video alongside the timestamps where they start.
Your narration must abide by the timestamps.
Generate text for the narrator to read out to the viewer and nothing more.

App details:
```
{app_desc}
```

Tutorial Description:
```
{tutorial_desc}
```

The steps(with timestamps) being performed in the video are:
```
{steps}
```
"""

In [96]:
chat = ChatOpenAI(temperature=0, model_name = "gpt-4")
system_message_prompt = SystemMessage(content="You are a narrator with experience in making tech tutorial videos.")
human_message_prompt = HumanMessagePromptTemplate(prompt=PromptTemplate(
                                                  template=topic_template,
                                                  input_variables=["app_desc", "tutorial_desc", "steps"]))

In [97]:
# Chain to write the script for the narrator
chat_prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
script_chain = LLMChain(llm=chat, prompt=chat_prompt_template, output_key='script')

In [98]:
# testing the scripts + prompt engineering

In [102]:
# testing the steps chain + prompt engineering
op2 = script_chain(inputs={"app_desc": app_desc, "tutorial_desc": tutorial_desc, "steps": steps}, return_only_outputs=True)

In [105]:
print(op2['script'])

[0:00-4.7]
"Hello and welcome to this tutorial. Today, we will be learning how to use the VITian app to check your exam schedule and timings. This app is specifically designed for students of VIT to access their student data. Let's get started."

[4.8-9.07]
"Firstly, open the VITian application on your device. You will be greeted with the home screen of the app."

[9.08-15.1]
"Next, tap on the refresh button located at the top right corner of the screen. This will ensure that you have the latest data available."

[15.2-21.03]
"Now, open the sidebar by tapping on the sidebar icon, usually represented by three horizontal lines, located at the top left corner of the screen."

[21.04-23.3]
"Once the sidebar is open, navigate to the 'Academics' section. Scroll down and select 'Exam Schedule'."

[23.4-End]
"Finally, select your exam from the list to view the exam venue, details, time, and date. And there you have it! You can now easily check your exam schedule and timings using the VITian ap

# Generate the audio

In [2]:
from bark import SAMPLE_RATE, generate_audio, preload_models
from IPython.display import Audio

preload_models()



Downloading text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading coarse_2.pt:   0%|          | 0.00/3.93G [00:00<?, ?B/s]

Downloading fine_2.pt:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to C:\Users\anand/.cache\torch\hub\checkpoints\encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [00:02<00:00, 33.2MB/s]


In [3]:
# generate audio from text
text_prompt = """
     Hello, my name is Suno. And, uh — and I like pizza. [laughs] 
     But I also have other interests such as playing tic tac toe.
"""
# Male Speaker
audio_array = generate_audio(text_prompt, history_prompt="v2/en_speaker_6")

# # Female Speaker
# audio_array = generate_audio(text_prompt, history_prompt="v2/en_speaker_6")
Audio(audio_array, rate=SAMPLE_RATE) 

100%|██████████| 562/562 [00:21<00:00, 26.25it/s]
100%|██████████| 29/29 [13:05<00:00, 27.09s/it]
