In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pytube import YouTube
from moviepy.editor import VideoFileClip
import os
import time
import string
from natsort import natsorted
import json

import openai
import langchain
from openai import OpenAI
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Pinecone
from unidecode import unidecode
from tqdm.auto import tqdm

In [3]:
os.environ["OPENAI_API_KEY"] = 'your key'
os.environ['PINECONE_API_KEY'] = 'your key'
os.environ['ANYSCALE_API_KEY'] = 'your key'
os.environ['ANYSCALE_API_BASE']  =  "your key"

In [4]:
'''
This code is essential for the functioning of PyDub package
'''
from pydub import utils, AudioSegment

def get_prober_name():
    return "C://ffmpeg/bin/ffprobe.exe"


AudioSegment.converter = "C://ffmpeg/bin/ffmpeg.exe"                  
utils.get_prober_name = get_prober_name

## Download YouTube videos, convert into audio, chunk it and transcribe it

In [5]:
def convert_time_to_timestamp(x):
    x = str(round(int(x)/60000,2))                 #1195000 -> 19.92
    x1, x2 = x.split('.')                          #19.92 -> [19,92]
    x1, x2 = int(x1), int(x2)
    if x2>0:
        x2 = int((x2/100)*60)                      #92 -> 55
        
    timestamp = f"{str(x1)}:{str(x2)}"             #19:55
    return timestamp

def create_llm():
    '''
    This step helps create a LLM chain with a prompt.
    This is specifically used to preprocess and remove any mistakes from text transcribed by OpenAI Whisper model.
    '''
    
    prompt_text = """Follow the list of instructions numbered below.\
    1) Please review the below transcript and correct any spelling mistakes or grammatical errors in the transcript. \
    2) There are chances that the transcript might contain organization names or political leader names. Make sure that those names are correct. \
    3) It's possible that the audio transcript may begin abruptly or be incomplete. Give proper starting or closure to the transcript if required. \
    4) If there is any other language text please translate them into English.\
    \n\
    {transcript}\
    Format the response as follows, 'text':response
"""

    llm = langchain.llms.OpenAI()
    llm_chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template(template=prompt_text))
    return llm_chain

def save_transcript(audio_chunk_file_path):
    audio_chunk_folder_name = audio_chunk_file_path.split('/')[0]
    audio_chunk_sub_folder_name = audio_chunk_file_path.split('/')[1]
    audio_chunk_file_name = audio_chunk_file_path.split('/')[2]

    transcript_folder_name = 'transcripts'
    transcript_sub_folder_name = audio_chunk_sub_folder_name
    transcript_file_name = audio_chunk_file_name.split('.')[0]+'.txt'
    transcript_file_path = f"{transcript_folder_name}/{transcript_sub_folder_name}/{transcript_file_name}"
    
    '''
    If the transcript is not saved already, transcripts the audio chunk and saves
    else reads the transcript txt file and sends back the content
    '''
    if not os.path.exists(f"{transcript_folder_name}/{transcript_sub_folder_name}"):
        os.mkdir(f"{transcript_folder_name}/{transcript_sub_folder_name}")
    
    if not os.path.exists(transcript_file_path):
        content = transcript_audio_chunk(audio_chunk_file_path)
        with open(transcript_file_path, 'w', encoding='utf-8') as file:
            file.write(content)
            
        print('Converted audio chunk to transcript and saved as txt file')
        return content
            
    else:
        with open(transcript_file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
        print('Loaded transcript from audio chunk file')
        return content
    

def transcript_audio_chunk(audio_chunk_file_path):
    '''
    Converts 1 minute audio chunks into text
    '''
    client = openai.OpenAI()
    audio_file= open(audio_chunk_file_path, "rb")
    transcript = client.audio.transcriptions.create(
      model="whisper-1", 
      file=audio_file, 
      response_format="text"
    )
    
#     llm_txt = llm_chain({"text":transcript})
#     modified_transcript = llm_txt['text'].replace('\n','')
    return transcript

In [6]:
def convert_mp4_to_mp3(mp4_file_path, mp3_file_path):
    '''
    Converts the video from mp4 to mp3 format
    '''
    if not os.path.exists(mp3_file_path):
        video_clip = VideoFileClip(mp4_file_path)
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(mp3_file_path)
    
        video_clip.close()
        audio_clip.close()
    
        print('Completed converting mp4 to mp3')
    else:
        pass
    
def chunk_audio_clips(mp3_file_name, yt):
    '''
    Chunks the audio clips into 1 minute duration and saves at a location
    '''
    total_duration_ms = (yt.length+1)*1000
    one_minute_ms = 1 * 60 * 1000
    
    chunk_ids = []
    chunk_timestamps = []
    chunk_transcripts = []
    
    ## Reads the audio file
    song = AudioSegment.from_mp3(f"{mp3_file_name}.mp3")
    
    '''
    Only if the audio clip is not available we chunk it.
    Else we fetch the audio clip and send it for transcribing
    '''
    
    if not os.path.exists(f'audio_chunks/{mp3_file_name}'):
        os.mkdir(f'audio_chunks/{mp3_file_name}')

        for chunk_no,start in enumerate(range (0, total_duration_ms, one_minute_ms)):
            chunked_mp3_file_name = f'{mp3_file_name}_{chunk_no+1}.mp3'
            chunked_mp3_file_path = f'audio_chunks/{mp3_file_name}/{chunked_mp3_file_name}'

            chunk_ids.append(chunk_no+1)
            
            end = min(total_duration_ms,start+one_minute_ms)
            song_1_minute_chunk = song[start:end]
            chunk_timestamps.append(str(end))

            song_1_minute_chunk.export(chunked_mp3_file_path, format="mp3")
            
            chunk_transcript = save_transcript(chunked_mp3_file_path)
            chunk_transcripts.append(chunk_transcript)
            
        print('Converted audio into transcript')
        return chunk_ids,chunk_timestamps,chunk_transcripts
    
    else:
        file_paths = []
        for audio_chunk in os.listdir(f'audio_chunks/{mp3_file_name}'):
            file_paths.append(f'audio_chunks/{mp3_file_name}/{audio_chunk}')

        sorted_file_paths = natsorted(file_paths)

        total_duration = 0

        for chunk_no,file_path in enumerate(sorted_file_paths):
            chunk_ids.append(chunk_no+1)
            audio = AudioSegment.from_file(file_path)

            # Get the duration in seconds
            duration = len(audio)
            total_duration+=duration
            
            chunk_transcript = save_transcript(file_path)
            chunk_transcripts.append(chunk_transcript)

            chunk_timestamps.append(total_duration)
            
        print('Converted audio into transcript')    
        return chunk_ids,chunk_timestamps,chunk_transcripts
            
    print('Chunked audio file into 1 minute clips')

def preprocess_title(file_name):
    '''
    Receives the youtube video title
    Creates two titles: 1) Removes punctuation marks (In local, the file name is stored without punctuation marks)
                        2) Replaces spaces with underscore and converts text to lower case (this for the new file name)
    '''
    restricted_characters = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
    system_file_name = ''.join([char for char in file_name if char not in restricted_characters])
    preprocessed_file_name = system_file_name.lower().strip().replace(' ','_')
    preprocessed_file_name = preprocessed_file_name 
    return system_file_name, preprocessed_file_name

def download_from_youtube(url):
    '''
    Downloads a video from YouTube, Preprocesses the title
    '''
    yt = YouTube(url)
    try:
        yt_downloader = yt.streams.filter(progressive=True, file_extension='mp4')
    except:
        return [],[],[],"no video"
    
    actual_video_title = yt.title 
    print(f"Started {actual_video_title}")
    
    system_title, preprocessed_title = preprocess_title(actual_video_title)
    system_mp4_title = system_title + '.mp4'
    preprocessed_mp4_title = preprocessed_title + '.mp4'
    mp3_title = preprocessed_title + '.mp3'
    
    if not os.path.exists(preprocessed_mp4_title):
        try:
            yt_downloader.first().download(filename=preprocessed_mp4_title)
        except Exception as e:
            print(e)
        print(f"Downloaded and renamed {actual_video_title} video successfully")
    
    convert_mp4_to_mp3(preprocessed_mp4_title,mp3_title)
    chunk_ids, chunk_timestamps, chunk_transcripts = chunk_audio_clips(preprocessed_title, yt)
    
    return chunk_ids,chunk_timestamps,chunk_transcripts,yt.title

In [7]:
def get_credentials(llm_model_name):
    if llm_model_name.lower().startswith("gpt"):
        return os.environ["OPENAI_API_BASE"], os.environ["OPENAI_API_KEY"]
    else:
        return os.environ["ANYSCALE_API_BASE"], os.environ["ANYSCALE_API_KEY"]

In [8]:
## Till
## Is Gujarat model a disaster

In [9]:
urls = ['https://www.youtube.com/watch?v=J-V7wR4YPOw&t=30s',
        'https://www.youtube.com/watch?v=0QWC_N6Hi5s',
        'https://www.youtube.com/watch?v=_2-JIfqStOc',
        'https://youtu.be/QP20QMad1eE?si=3nCvhGGwChJi3i8H',
        'https://youtu.be/ocdFuuAaApE?si=ABFOYQVKi9neKsw-',
        'https://youtu.be/0DlSRxEq0W4?si=bLDDXSCZWq1iVeX7',
        'https://youtu.be/A8jyW_6hCGU?si=VP7Dc6aSBQsWCgCZ',
        'https://youtu.be/7WcCeymKCq4?si=3f7OYNmdh3TinN46',
        'https://youtu.be/qJleik1x5k0?si=xEasBJuXxti1RymB',
        'https://youtu.be/ddBooxrxp10?si=fLXcynQiyKxa0YIG',
        'https://youtu.be/GMGGQZ9vmb0?si=p-mLBy-02rlUKqRj',
        'https://youtu.be/SfiD81EdbMs?si=w3qZ_ZJQfHQkx1fR',
        'https://youtu.be/sr7BIdpcG10?si=FsNc0juX1zAmNTCH',
        'https://youtu.be/XYOZ4RzrHW0?si=2CpLgpdFHz3V7kmm',
        'https://youtu.be/og_ob8Xdlvk?si=fhnwgyCr1hewoGNH',
        'https://youtu.be/1WYWwE4Axd8?si=MamiWA5ub-ys5Hs6',
        'https://youtu.be/KYFdxuLFVyQ?si=1dma__aPfVGc2h0F',
        'https://youtu.be/qQPRWA9JNNQ?si=pGwgVHfvuxqrPg4Q',
        'https://youtu.be/Vx4hyGpzVyI?si=P1E8e5nG4AMgR0Le',
        'https://youtu.be/i7Dwe2qm__M?si=ew35we2eyle7-uC0',
        'https://youtu.be/yzDF_KX4Zjg?si=sm4tSY5QUS2pTjXo',
        'https://youtu.be/uZm1au4W7tA?si=5eugIZiq-5UYHozM',
        'https://youtu.be/31rAi_XvyNU?si=ckZ--6awhVr92PVd',
        'https://youtu.be/IU_91PJMZvg?si=Z1dUDjrfQsCacOM5',
        'https://youtu.be/n60GZmfFde8?si=SMGu_4rgHXfsRH3b',
        'https://youtu.be/HAsBtOjGOYQ?si=why0_3RouamxHOEC',
        'https://youtu.be/fLXsf4SWdsI?si=3BhKF6FPBOeAqLsA',
        'https://youtu.be/fkgJFRHS26g?si=FDem_uqWx4W2S7MW',
        'https://youtu.be/qapDyBW_S-w?si=_P3tznMiwLE7uzEO',
        'https://youtu.be/cjhlcIOgUy4?si=cdp3lyB7Bpg9kl2g',
        'https://youtu.be/hOX0YQXTc_E?si=ecPuts-iO0U23piS',
        'https://youtu.be/098cgN5PH0s?si=LxrDCCDY105bGeg6',
        'https://youtu.be/VzUmLAUPSQo?si=ZcboUpHXTOc2CVvm',
        'https://youtu.be/BlDYS_2X5wI?si=YwikQgjZh2MnCXx7',
        'https://youtu.be/kyCopA4K41w?si=BhDjxgEnFhA7e5pu',
        'https://youtu.be/Vh8FaokLWus?si=Ec2_us5oTns7TNzq',
        'https://youtu.be/wO7copUXogI?si=MqZKVlc8uSn3oGUs',
        'https://youtu.be/YWara0eQsDk?si=IMAIYAPbFCqlRcj4',
        'https://youtu.be/7TcG9cPj2gA?si=Ru5Ooqll6RMpYRe7',
        'https://youtu.be/EZkZe-BuXf8?si=Nf8Mic6jWqNK_KtT',
        'https://youtu.be/WD2nz_IKyJM?si=DWiPVOVzGSVi8Usw',
        'https://youtu.be/KDsGPYETAd4?si=wFJL4PWPsZSF9qhg',
        'https://youtu.be/2nIRHB3FtOc?si=RezDl6VP5oFee-J8',
        'https://youtu.be/IIv7SNc3Ass?si=ik4ZeVlpoR4QlX0M',
        'https://youtu.be/iw5AdKOM-IU?si=1nh8NwCD9Mmw9lbP',
        'https://youtu.be/4WSAeT_XDaI?si=WvkPR0P1Iid8o0DL',
        'https://youtu.be/wO28dHUQusU?si=UpEvTgIxsK7_S-xa',
        'https://youtu.be/QRGO9wvNPQc?si=tsopwAkzj8GE0jJk',
        'https://youtu.be/Ko3K8Duq4Eo?si=HpQyD9SNgDbtJfYp',
        'https://youtu.be/bpjKmJnRPV8?si=ouYgOaj6RnL5eszp',
        'https://youtu.be/OKQDst5GUc0?si=6ylXRzU1GjuGDNVf',
        'https://youtu.be/7UdCt_Rsevk?si=jM6p2_KVqXT_6ZRW',
        'https://youtu.be/EIZhJmqIc00?si=sLEURSXvJk40Y4Ir',
        'https://youtu.be/zawAVCpQNgg?si=RGk5p3tujyeyxGVP'
       ]

overall_chunk_ids = []
overall_yt_titles = []
overall_chunk_timestamps = []
overall_chunk_transcripts = []
overall_urls = []

for url in urls:
    chunk_ids, chunk_timestamps, chunk_transcripts, yt_title = download_from_youtube(url)
    chunk_ids = [yt_title+'_'+str(chunk_id) for chunk_id in chunk_ids]
    yt_titles = [yt_title]*len(chunk_ids)
    tmp_urls = [url]*len(chunk_ids)
    overall_chunk_ids.extend(chunk_ids)
    overall_yt_titles.extend(yt_titles)
    overall_chunk_timestamps.extend(chunk_timestamps)
    overall_chunk_transcripts.extend(chunk_transcripts)
    overall_urls.extend(tmp_urls)

Started This Secret B2b Company Makes 400 Crores Per Year By Helping Zomato And Uber: Business Case Study
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Converted audio into transcript
Started Is India Winning or Losing the Electric War?: Business

Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Converted audio into transcript
Started Can football save Saudi Arabia from an economic crisis? : Geopolitical Case Study
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk fil

Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Converted audio into transcript
Started How Masters union is building a Harvard for India? : Business case study
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded 

Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Converted audio into transcript
Started How Bangalore became the SILICON valley of Asia? | Business case study
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded transcript from audio chunk file
Loaded tr

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio into tra

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio into tra

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio into tra

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio into transcript
Started How Nykaa’s Genius Financial Strategy Backfired? : Nykaa Business case study
IncompleteRead(1631743 bytes read, 2571009 more expected)
Downloaded and renamed 

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio into transcript
Started How did China go from POVERTY to becoming 

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio into tra

                                                                                                                       

MoviePy - Done.
Completed converting mp4 to mp3
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to transcript and saved as txt file
Converted audio chunk to

In [30]:
audio_chunks_df = pd.DataFrame(zip(overall_chunk_ids,overall_chunk_timestamps,overall_chunk_transcripts),columns=['chunk_id','timestamp','transcript'])

audio_chunks_df['timestamp'] = audio_chunks_df['timestamp'].apply(convert_time_to_timestamp)
audio_chunks_df['title'] = overall_yt_titles
audio_chunks_df['url'] = overall_urls

In [39]:
llm_model_name = 'llama2'
base_url, api_key = get_credentials(llm_model_name)

client = openai.OpenAI(
    base_url = base_url,
    api_key = api_key
)

def get_embedding(text, model_name):
    '''
    Converts the text into embeddings
    '''
    text = text.replace("\n", " ")
    embedding = client.embeddings.create(
        model=model_name,
        input=[text]
    ).data[0].embedding
    
    return embedding

audio_chunks_df['transcript_embeddings'] = audio_chunks_df['transcript'].apply(lambda x: get_embedding(x, model_name="thenlper/gte-large"))

audio_chunks_df.to_csv('transcripts_with_embeddings.csv',index=False)

## Inserting embeddings into Vector Database

In [36]:
import pinecone

pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment="gcp-starter")

In [37]:
index_name = 'youtube-rag'

# only create index if it does not exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=1024,
        metric='cosine'
    )
    # waits till index is initialized
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

# now connect to the index
index = pinecone.Index(index_name)

In [38]:
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [93]:
# def get_embedding_model(embedding_model_name, model_kwargs, encode_kwargs):
#     if embedding_model_name == "text-embedding-ada-002":
#         embedding_model = OpenAIEmbeddings(
#             model=embedding_model_name,
#             openai_api_key=keys["OPENAI_API_KEY"])
#     else:
#         embedding_model = HuggingFaceEmbeddings(
#             model_name=embedding_model_name,  # also works with model_path
#             model_kwargs=model_kwargs,
#             encode_kwargs=encode_kwargs)
#     return embedding_model

# embedding_model_name = 'thenlper/gte-base'

# embedding_model = get_embedding_model(
#     embedding_model_name=embedding_model_name, 
#     model_kwargs={"device": "cuda"}, 
#     encode_kwargs={"device": "cuda", "batch_size": 100})

In [40]:
audio_chunks_df = pd.read_csv('transcripts_with_embeddings.csv')
audio_chunks_df = audio_chunks_df.dropna(subset=['transcript'])
audio_chunks_df = audio_chunks_df.reset_index().drop('index',axis=1)

#unidecode makes sure that the chunk_id contains only ASCII characters which is required for Pinecone Vector DB index
audio_chunks_df['chunk_id'] = audio_chunks_df['chunk_id'].apply(lambda x:unidecode(preprocess_title(x)[1]))
audio_chunks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037 entries, 0 to 1036
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   chunk_id               1037 non-null   object
 1   timestamp              1037 non-null   object
 2   transcript             1037 non-null   object
 3   title                  1037 non-null   object
 4   url                    1037 non-null   object
 5   transcript_embeddings  1037 non-null   object
dtypes: object(6)
memory usage: 48.7+ KB


In [41]:
audio_chunks_df

Unnamed: 0,chunk_id,timestamp,transcript,title,url,transcript_embeddings
0,this_secret_b2b_company_makes_400_crores_per_y...,1:0,"Hi, everybody. In 2003, Jay Bezos delivered an...",This Secret B2b Company Makes 400 Crores Per Y...,https://www.youtube.com/watch?v=J-V7wR4YPOw&t=30s,"[0.01958036795258522, 0.017185915261507034, -0..."
1,this_secret_b2b_company_makes_400_crores_per_y...,2:0,in a year. And out of these thousands of miner...,This Secret B2b Company Makes 400 Crores Per Y...,https://www.youtube.com/watch?v=J-V7wR4YPOw&t=30s,"[-0.0022682896815240383, 0.019085343927145004,..."
2,this_secret_b2b_company_makes_400_crores_per_y...,3:0,and fell. But the ones who actually made money...,This Secret B2b Company Makes 400 Crores Per Y...,https://www.youtube.com/watch?v=J-V7wR4YPOw&t=30s,"[-0.006876604165881872, 0.012396343983709812, ..."
3,this_secret_b2b_company_makes_400_crores_per_y...,4:0,could be a huge opportunity for you. And if yo...,This Secret B2b Company Makes 400 Crores Per Y...,https://www.youtube.com/watch?v=J-V7wR4YPOw&t=30s,"[0.030099136754870415, 0.01251540333032608, -0..."
4,this_secret_b2b_company_makes_400_crores_per_y...,5:0,so that you can service them. But the moment t...,This Secret B2b Company Makes 400 Crores Per Y...,https://www.youtube.com/watch?v=J-V7wR4YPOw&t=30s,"[0.0014116348465904593, -0.008071042597293854,..."
...,...,...,...,...,...,...
1032,is_gujarat_model_a_disaster_or_a_miracle__guja...,19:0,has actually increased. This clearly states th...,Is Gujarat model a Disaster or a Miracle? : Gu...,https://youtu.be/zawAVCpQNgg?si=RGk5p3tujyeyxGVP,"[0.031933464109897614, 0.007105099502950907, -..."
1033,is_gujarat_model_a_disaster_or_a_miracle__guja...,20:0,"day. In fact, the national average itself is 3...",Is Gujarat model a Disaster or a Miracle? : Gu...,https://youtu.be/zawAVCpQNgg?si=RGk5p3tujyeyxGVP,"[0.01684519834816456, 0.007003523409366608, -0..."
1034,is_gujarat_model_a_disaster_or_a_miracle__guja...,21:0,brief analysis of the state of Gujarat. Now af...,Is Gujarat model a Disaster or a Miracle? : Gu...,https://youtu.be/zawAVCpQNgg?si=RGk5p3tujyeyxGVP,"[0.028295528143644333, 0.0043985722586512566, ..."
1035,is_gujarat_model_a_disaster_or_a_miracle__guja...,22:0,risk analysis. So read through that and you'll...,Is Gujarat model a Disaster or a Miracle? : Gu...,https://youtu.be/zawAVCpQNgg?si=RGk5p3tujyeyxGVP,"[0.0030611043330281973, 0.028293581679463387, ..."


In [42]:
# Inserts the embeddings along with metadata into the Pinecone index in batches

batch_size = 100
for i in tqdm(range(0, len(audio_chunks_df), batch_size)):
    i_end = min(len(audio_chunks_df),i+batch_size)
    batch = audio_chunks_df.iloc[i:i_end]
    
    index_ids = [str(row['chunk_id']) for _,row in batch.iterrows()]
    embeddings = [eval(row['transcript_embeddings']) for _,row in batch.iterrows()]
    
    metadata = [{
        'title': row['title'],
        'timestamp': row['timestamp'],
        'text': row['transcript'],
        'url': row['url']
    } for _,row in batch.iterrows()]
    
    index.upsert(vectors=zip(index_ids, embeddings, metadata))

  0%|          | 0/11 [00:00<?, ?it/s]

## Retreival Augmented Generation using llama2 13B params model

In [43]:
def generate_response(llm_model_name, temperature=0.1, stream=False, query=" ", context=" ", max_retries=3, retry_interval=60):
    '''
    Generate response from an LLM
    '''
    retry_count = 0
    system_content="""
    <<SYS>>You are a helpful assistant. Answer the query based on the context provided.
    
    The query should be answered only based on the context and 
    if context is not available don't hallucinate and return false information
    
    Also make sure to generate response based only on the context most relevant to the query
    
    Always answer as helpfully as possible, while being safe. Your answers should not include
    any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
    Please ensure that your responses are socially unbiased and positive in nature.
    <</SYS>>
    """
    user_content=f"query:{query} context:{context}"
    api_base, api_key = get_credentials(llm_model_name=llm_model_name)
    while retry_count < max_retries:
        try:
            client = openai.OpenAI(
                base_url = api_base,
                api_key = api_key)
            response = client.chat.completions.create(
                model=llm_model_name,
                temperature=temperature,
                stream=stream,
                messages=[
                    {"role": "system", "content": system_content},
                    {"role": "user", "content": user_content}
                ]
            )
            return response.choices[0].message.content

        except Exception as e:
            print(f"Exception: {e}")
            time.sleep(retry_interval)  # default is per-minute rate limits
            retry_count += 1
    return ""

def get_embedding(query, model_name):
    base_url, api_key = get_credentials(model_name)

    client = openai.OpenAI(
        base_url = base_url,
        api_key = api_key
    
    )
    query = query.replace("\n", " ")
    
    embedding = client.embeddings.create(
        model=model_name,
        input=[query]
    ).data[0].embedding
    
    return embedding

def retrieve_relevant_context(embedding, no_of_top_chunks):
    '''
    Identifies the top (no_of_top_chunks) embeddings most similar to the query embedding and
    retrieves the text and metadata corresponding to those embeddings
    '''
    index_name = 'youtube-rag'
    index = pinecone.Index(index_name)
    relevant_chunks = index.query(embedding, top_k=no_of_top_chunks, include_metadata=True)
    context = '\n'.join([chunk['metadata']['text'] for chunk in relevant_chunks['matches']])
    titles = [chunk['metadata']['title'] for chunk in relevant_chunks['matches']]
    urls = [chunk['metadata']['url'] for chunk in relevant_chunks['matches']]
    return context,titles,urls

def rag(query, no_of_top_chunks, llm_model_name="meta-llama/Llama-2-13b-chat-hf", embedding_model_name="thenlper/gte-large", stream=False):
    '''
    Converts the query into embedding
    Retrieves the embeddings similar to query embedding along with the corresponding text chunks which will act as context
    Generate response based on the context
    '''
    embedding = get_embedding(query, embedding_model_name)
    context,titles,urls = retrieve_relevant_context(embedding, no_of_top_chunks)
    rag_response = generate_response(llm_model_name=llm_model_name,
                                    query=query,
                                    context=context)
    rag_response = rag_response.replace('\n','')
    return rag_response,titles,urls

In [44]:
query = 'what has Modi specially done in Gujarat?'
rag(query=query, no_of_top_chunks=5)

('  Hello! As a helpful assistant, I\'ll do my best to provide you with accurate and relevant information based on the context you\'ve provided.Regarding the query "what has Modi specially done in Gujarat?", it is important to note that Narendra Modi has been the Chief Minister of Gujarat for several terms and has implemented various policies and initiatives during his tenure. Some of his notable achievements include:1. Industrial growth: Gujarat has consistently ranked among the top three states in India with respect to ease of doing business, and this has led to significant industrial growth. The state\'s GDP has grown at a rate of 10% from 2001 to 2013, and it is expected to grow at 17.4% this fiscal.2. Ease of doing business: Gujarat has implemented various policies and initiatives to make it easier to do business in the state. This has included simplifying regulatory processes, reducing bureaucratic hurdles, and providing incentives to industries.3. Education: Modi has emphasized 