# Install main dependencies

In [1]:
'''import subprocess
import sys

# Function to install packages using pip
def install_packages(packages):
    subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])

# List of packages to install
packages = [
    'vosk', 'yt-dlp', 'tqdm', 'datasets', 'openai', 'pinecone-client', 'tiktoken',
    'pyarrow==11.0.0', 'flask',
    'langchain', 'langchainhub', 'langchain-openai', 'langchain_community',
    'langchain-pinecone', 'langchain_anthropic'
]

# Install the packages
install_packages(packages)

# If running on Colab, you can uncomment and run these lines to download and set up the model
subprocess.check_call(["wget", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"])
subprocess.check_call(["unzip", "vosk-model-small-en-us-0.15.zip"])
subprocess.check_call(["mv", "vosk-model-small-en-us-0.15", "model"])
subprocess.check_call(["rm", "vosk-model-small-en-us-0.15.zip"])
subprocess.check_call(["rm", "-rf", "audio.wav"])
subprocess.check_call(["rm", "-rf", "transcription.txt"])
subprocess.check_call(["apt-get", "update"])
subprocess.check_call(["apt-get", "install", "-y", "ffmpeg"])

# Now your dependencies are installed, and you can continue with your Streamlit app code'''

'import subprocess\nimport sys\n\n# Function to install packages using pip\ndef install_packages(packages):\n    subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])\n\n# List of packages to install\npackages = [\n    \'vosk\', \'yt-dlp\', \'tqdm\', \'datasets\', \'openai\', \'pinecone-client\', \'tiktoken\',\n    \'pyarrow==11.0.0\', \'flask\',\n    \'langchain\', \'langchainhub\', \'langchain-openai\', \'langchain_community\',\n    \'langchain-pinecone\', \'langchain_anthropic\'\n]\n\n# Install the packages\ninstall_packages(packages)\n\n# If running on Colab, you can uncomment and run these lines to download and set up the model\nsubprocess.check_call(["wget", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"])\nsubprocess.check_call(["unzip", "vosk-model-small-en-us-0.15.zip"])\nsubprocess.check_call(["mv", "vosk-model-small-en-us-0.15", "model"])\nsubprocess.check_call(["rm", "vosk-model-small-en-us-0.15.zip"])\nsubprocess.check_call(["rm"

# Install main packages and modules

In [2]:
import os
import re
import json
import time
import wave
import yt_dlp
import sqlite3
import pinecone
import requests
import tiktoken
import subprocess

from uuid import uuid4
from typing import List
from tqdm.auto import tqdm
from datasets import load_dataset

from vosk import Model, KaldiRecognizer

from pinecone import Pinecone, ServerlessSpec

from langchain.chains import RetrievalQA
from langchain_pinecone import PineconeVectorStore
from langchain.agents import Tool, create_react_agent
from langchain_openai import OpenAIEmbeddings, ChatOpenAI, OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.conversation.memory import ConversationBufferWindowMemory


from langchain.output_parsers import ListOutputParser
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Set API Keys

In [3]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

# Get the API keys
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')

# If running on CoLab, comment out above and uncomment below

# from google.colab import files, userdata

# PINECONE_API_KEY=userdata.get('PINECONE_API_KEY')
# OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')
# LANGCHAIN_API_KEY=userdata.get('LANGCHAIN_API_KEY')

# Speech-to-text
- ### User input URL
- ### Download YouTube video upon user URL input
- ### Extract and transcribe audio

In [4]:
from vosk import Model, KaldiRecognizer
from tabulate import tabulate
from IPython.display import Image, display

# Ensure ffmpeg is in PATH
os.environ['PATH'] += os.pathsep + '/usr/local/bin'

def is_valid_youtube_url(url):
    """Validate if the given URL is a YouTube URL."""
    pattern = r'^(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+$'
    return re.match(pattern, url) is not None

def download_video(url):
    """Download the audio of the YouTube video as a .wav file."""
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': 'audio.%(ext)s',
        'ffmpeg_location': '/usr/local/bin'
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return 'audio.wav'
    except Exception as e:
        print(f"An error occurred during download: {e}")
        return None

def convert_audio(input_file, output_file):
    """Convert audio to the required format for transcription."""
    command = [
        '/usr/local/bin/ffmpeg',
        '-i', input_file,
        '-acodec', 'pcm_s16le',
        '-ac', '1',
        '-ar', '16000',
        '-y', output_file
    ]
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error converting audio: {e}")
        return False

def transcribe_audio(audio_file):
    """Transcribe audio using the Vosk model."""
    model_path = "model"
    if not os.path.exists(model_path):
        print("Speech recognition model not found. Please make sure you've downloaded it.")
        return None

    model = Model(model_path)
    recognizer = KaldiRecognizer(model, 16000)

    def is_valid_wave_file(wave_file):
        """Check if the wave file has the correct properties."""
        return (wave_file.getnchannels() == 1 and
                wave_file.getsampwidth() == 2 and
                wave_file.getcomptype() == "NONE")
    
    try:
        # Open the original file to check format
        with wave.open(audio_file, "rb") as wf:
            if not is_valid_wave_file(wf):
                print("Converting audio to the correct format...")
                converted_file = "converted_audio.wav"
                if not convert_audio(audio_file, converted_file):
                    return None
                audio_file = converted_file

        # Reopen the audio file (either original or converted)
        with wave.open(audio_file, "rb") as wf:
            results = []
            total_frames = wf.getnframes()
            with tqdm(total=total_frames, desc="Transcribing") as pbar:
                while True:
                    data = wf.readframes(4000)
                    if not data:
                        break
                    if recognizer.AcceptWaveform(data):
                        part_result = json.loads(recognizer.Result())
                        results.append(part_result['text'])
                    pbar.update(4000)

                # Final result
                part_result = json.loads(recognizer.FinalResult())
                results.append(part_result['text'])

            return " ".join(results)
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None

def get_youtube_video_info(url):
    """Extract YouTube video info and M3U8 URLs without downloading the video."""
    ydl_opts = {
        'quiet': True,
        'skip_download': True,
        'extract_flat': False,
        'force_generic_extractor': True,
        'no_warnings': True
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            info.pop('description', None)

            # Extract M3U8 URLs from formats
            formats = info.get('formats', [])
            m3u8_urls = [fmt['url'] for fmt in formats if 'm3u8' in fmt.get('url', '')]
            info['m3u8_urls'] = m3u8_urls

        return info
    except Exception as e:
        print(f"An error occurred while extracting YouTube video info: {e}")
        return None

def display_youtube_info(info):
    """Display YouTube video metadata in a pretty table format using tabulate."""
    if not info:
        print("No YouTube video info to display.")
        return

    youtube_info = [
        ["Title", info.get("title", "N/A")],
        ["Thumbnail URL", info.get("thumbnail", "N/A")],
        ["Uploader", info.get("uploader", "N/A")],
        ["Uploader ID", info.get("uploader_id", "N/A")],
        ["Upload Date", info.get("upload_date", "N/A")],
        ["Duration", info.get("duration", "N/A")],
        ["View Count", info.get("view_count", "N/A")],
        ["Like Count", info.get("like_count", "N/A")]
    ]

    thumbnail_url = info.get("thumbnail")
    if thumbnail_url:
        display(Image(url=thumbnail_url, width=580))
    
    print("\nYouTube Video Info:")
    print(tabulate(youtube_info, headers=["Field", "Value"], tablefmt="grid", stralign="left"))

def main():
    """Main function to handle the workflow."""
    while True:
        video_url = input("Enter the YouTube video URL: ")
        if is_valid_youtube_url(video_url):
            break
        else:
            print("Invalid YouTube URL. Please enter a valid URL.")

    print("Extracting YouTube video info...")
    youtube_info = get_youtube_video_info(video_url)
    if youtube_info:
        display_youtube_info(youtube_info)

    print("Downloading video...")
    audio_file = download_video(video_url)

    if audio_file and os.path.exists(audio_file):
        print("Transcribing audio...")
        transcription = transcribe_audio(audio_file)

        if transcription:
            print("\nTranscription:")
            print(transcription)

            # Save transcription to a file
            with open('transcription.txt', 'w') as f:
                f.write(transcription)

            print("Transcription saved to 'transcription.txt'.")
        else:
            print("Transcription failed. Please check the error messages above.")
    else:
        print("Failed to download the video. Please check the URL and try again.")

if __name__ == "__main__":
    main()

Enter the YouTube video URL:  https://www.youtube.com/watch?v=qQviI1d_hFA


Extracting YouTube video info...



YouTube Video Info:
+---------------+-------------------------------------------------------+
| Field         | Value                                                 |
| Title         | Michio Kaku: Quantum computing is the next revolution |
+---------------+-------------------------------------------------------+
| Thumbnail URL | https://i.ytimg.com/vi/qQviI1d_hFA/maxresdefault.jpg  |
+---------------+-------------------------------------------------------+
| Uploader      | Big Think                                             |
+---------------+-------------------------------------------------------+
| Uploader ID   | @bigthink                                             |
+---------------+-------------------------------------------------------+
| Upload Date   | 20230818                                              |
+---------------+-------------------------------------------------------+
| Duration      | 677                                                   |
+---------------+

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from model/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from model/graph/HCLr.fst model/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo model/graph/phones/word_boundary.int


Converting audio to the correct format...


Transcribing:   0%|          | 0/10836846 [00:00<?, ?it/s]


Transcription:
we all know the digital computers changed virtually every aspect of our life well the arrival of quantum computers could be even more historic than that we're now and the initial stages of the next revolution we're talking about a new generation of computers the ultimate computer a computer that computer on adam the ultimate constituents of batter itself the question is who's involved in this race to perfect quantum computers and the answer is everyone all the big players are part of this race because if they're not silicon valley could become the next rust belt also anyone who's interested in security is interested in the quantum computers they can crack almost any code that is based on digital technology that's why the f b i the cia and all national governments are following this is very closely what their computers will change everything the economy how he saw problems the way we interact with the universe you name it won't to computers will be there i'm doctor or mi

# Load data into database

In [5]:
# Define the path to the transcription file
transcription_file = 'transcription.txt'

# Read the transcription text
with open(transcription_file, 'r') as file:
    transcription_text = file.read()

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('transcriptions.db')
cursor = conn.cursor()

# Create the transcriptions table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS transcriptions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        speaker TEXT,
        text TEXT,
        timestamp TEXT
    )
''')

# Insert the transcription text into the table
cursor.execute('''
    INSERT INTO transcriptions (speaker, text, timestamp)
    VALUES (?, ?, ?)
''', ('Transcript', transcription_text, '2024-07-15 10:00:00'))

# Commit the transaction and close the connection
conn.commit()
conn.close()

print("Data has been successfully loaded into the database.")


Data has been successfully loaded into the database.


In [6]:
# Connect to the SQLite database
conn = sqlite3.connect('transcriptions.db')
cursor = conn.cursor()

# Query the data
cursor.execute('SELECT * FROM transcriptions')
rows = cursor.fetchall()

for row in rows:
    print(row)

conn.close()


(1, 'Transcript', "we all know the digital computers changed virtually every aspect of our life well the arrival of quantum computers could be even more historic than that we're now and the initial stages of the next revolution we're talking about a new generation of computers the ultimate computer a computer that computer on adam the ultimate constituents of batter itself the question is who's involved in this race to perfect quantum computers and the answer is everyone all the big players are part of this race because if they're not silicon valley could become the next rust belt also anyone who's interested in security is interested in the quantum computers they can crack almost any code that is based on digital technology that's why the f b i the cia and all national governments are following this is very closely what their computers will change everything the economy how he saw problems the way we interact with the universe you name it won't to computers will be there i'm doctor or

In [7]:
tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [8]:
# Function to calculate the length of text in terms of tokens
def tiktoken_len(text):
    return len(text.split())

# Connect to the SQLite database
conn = sqlite3.connect('transcriptions.db')
cursor = conn.cursor()

# Query the data
cursor.execute('SELECT text FROM transcriptions WHERE id = 1')
transcription_text = cursor.fetchone()[0]

# Close the connection
conn.close()

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    length_function=tiktoken_len
)

# Split the transcription text into chunks
chunks = text_splitter.split_text(transcription_text)[:3]

# Print the first 3 chunks
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}:")
    print(chunk)
    print()


Chunk 1:
we all know the digital computers changed virtually every aspect of our life well the arrival of quantum computers could be even more historic than that we're now and the initial stages of the next revolution we're talking about a new generation of computers the ultimate computer a computer that computer on adam the ultimate constituents of batter itself the question is who's involved in this race to perfect quantum computers and the answer is everyone all the big players are part of this race because if they're not silicon valley could become the next rust belt also anyone who's interested in security is interested in the quantum computers they can crack almost any code that is based on digital technology that's why the f b i the cia and all national governments are following this is very closely what their computers will change everything the economy how he saw problems the way we interact with the universe you name it won't to computers will be there i'm doctor or michio ka

In [9]:
# transcription_text

In [10]:
tiktoken_len(chunks[0]), tiktoken_len(chunks[1]), tiktoken_len(chunks[2])

(500, 500, 430)

# Text embedding

In [11]:
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=openai_api_key
)

# Indexing

In [12]:
# configure client
pc = Pinecone(api_key=pinecone_api_key)

In [13]:
spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [14]:
index_name = 'langchain-retrieval-augmentation'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10086}},
 'total_vector_count': 10086}

# Data processing

In [15]:
batch_limit=50

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    length_function=len  # You can replace with your custom token length function if needed
)

# Connect to SQLite database
conn = sqlite3.connect('transcriptions.db')
cursor = conn.cursor()

# Fetch data from SQLite database
cursor.execute('SELECT id, speaker, text FROM transcriptions')
data = cursor.fetchall()

# Close the database connection
conn.close()

# Initialize lists for texts and metadatas
texts = []
metadatas = []

# Process each record fetched from the database
for i, record in enumerate(tqdm(data)):
    # Metadata fields for this record
    metadata = {
        #'index': str(record[0]),  # Assuming id is the first column
        'speaker': record[1],  # Replace with actual source if available
        'text': record[2],  # Example title based on id
    }

    # Split text into chunks
    record_texts = text_splitter.split_text(record[2])  # Assuming text is the third column

    # Create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j,
        "text": text,
        **metadata
    } for j, text in enumerate(record_texts)]

    # Append texts and metadatas to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    # Check if batch limit is reached, then embed and upsert
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)

        # Assuming `index` is where you want to upsert (not defined in the snippet)
        index.upsert(vectors=zip(ids, embeds, metadatas))

        # Clear lists after upserting
        texts = []
        metadatas = []

# Process any remaining texts in the lists
if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

print("Data processing completed.")


  0%|          | 0/28 [00:00<?, ?it/s]

Data processing completed.


In [16]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10562}},
 'total_vector_count': 10562}

# Initialize Pinecone Vector Store

In [17]:
# Define the metadata field that contains your text
text_field = 'text'

# Initialize the Pinecone vector store object
vectorstore = PineconeVectorStore(index, embed, text_field)

In [18]:
query = input("Please enter your query: ")

vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

Please enter your query:  What is a qubit?


[Document(metadata={'chunk': 10.0, 'speaker': 'unknown'}, page_content="we all know the digital computers changed virtually every aspect of our life well the arrival of quantum computers could be even more historic than that we're now and the initial stages of the next revolution we're talking about a new generation of computers the ultimate computer a computer that computer on adam the ultimate constituents of batter itself the question is who's involved in this race to perfect quantum computers and the answer is everyone all the big players are part of this race because if they're not silicon valley could become the next rust belt also anyone who's interested in security is interested in the quantum computers they can crack almost any code that is based on digital technology that's why the f b i the cia and all national governments are following this is very closely what their computers will change everything the economy how he saw problems the way we interact with the universe you n

# Retrieve and set memory

In [19]:
# completion llm
llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
    )

# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [20]:
qa.invoke(query)

{'query': 'What is a qubit?',
 'result': 'A qubit is the basic unit of quantum information in quantum computing. Unlike classical bits that can be in a state of either 0 or 1, a qubit can exist in a superposition of both 0 and 1 simultaneously. This unique property allows quantum computers to perform multiple calculations at the same time, making them potentially much more powerful than classical computers for certain tasks.'}

# Multi querying

In [21]:
tools = [
    Tool(
        name='Knowledge Base',
        func=qa.invoke,
        description=(
            'use this tool when answering general knowledge queries to get '
            'more information about the topic'
        )
    )
]

In [22]:
from langchain.agents import create_react_agent
from langchain_core.prompts import PromptTemplate

# Define the prompt template (if not already defined)
template = '''Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}'''

# Create the prompt template
prompt = PromptTemplate.from_template(template)

# Create the React agent
agent = create_react_agent(
    llm=llm,
    tools=tools,
    prompt=prompt
)

# Optionally, you might use `AgentExecutor` if you need to manage interactions
from langchain.agents import AgentExecutor

agent_executor = AgentExecutor(agent=agent, tools=tools)

# Asking questions concerning the video

In [23]:
response = agent_executor.invoke({"input": query})
print(response)

{'input': 'What is a qubit?', 'output': 'A qubit is the basic unit of quantum information in quantum computing, capable of existing in a superposition of both 0 and 1 states simultaneously.'}


In [24]:
response = agent_executor.invoke({"input": "Who is Michio Kaku?"})
print(response)

{'input': 'Who is Michio Kaku?', 'output': 'Michio Kaku is a theoretical physicist, professor at the City University of New York, and author known for his work in theoretical physics and popularizing science.'}


In [25]:
response = agent_executor.invoke({"input": "How fast is a quantum computer?"})
print(response)

{'input': 'How fast is a quantum computer?', 'output': 'Quantum computers have the potential to be infinitely faster than digital computers.'}


# Ask a complete random question not related at all with the video

In [26]:
response = agent_executor.invoke({"input": "history of portugal in XV century?"})
print(response)

{'input': 'history of portugal in XV century?', 'output': 'In the 15th century, Portugal experienced a significant period of exploration and expansion, led by figures like Prince Henry the Navigator and Vasco da Gama. This period marked a turning point in Portuguese exploration and trade dominance, with the establishment of new trade routes and the development of new ships like the caravel.'}


In [27]:
import pinecone
import time

# Initialize Pinecone client directly
pinecone_client = pinecone.Pinecone(api_key=pinecone_api_key, environment='us-west1-gcp')

index_name = 'langchain-multi-query'

if index_name not in pinecone_client.list_indexes().names():  # Use .names() to get list of index names
    # Define index configuration
    index_spec = {
        'dimension': 1536,
        'metric': 'cosine'
    }
    pinecone_client.create_index(name=index_name, **index_spec)  # Pass index name as 'name'
    while not pinecone_client.describe_index(index_name).status['ready']:
        time.sleep(1)

index = pinecone_client.Index(index_name)

In [28]:
len(texts)

17

In [29]:
batch_size = 1

for i in tqdm(range(0, len(texts), batch_size)):
    i_end = min(i+batch_size, len(texts))
    ids = [str(uuid4()) for _ in range(i_end-i)]
    embeds = embed.embed_documents(texts[i:i_end])
    index.upsert(vectors=zip(ids, embeds, metadatas[i:i_end]))
    time.sleep(1)


  0%|          | 0/17 [00:00<?, ?it/s]

In [30]:
text_field = "text"

vectorstore = PineconeVectorStore(index, embed, text_field)

In [31]:
llm = ChatOpenAI(temperature=0.0, openai_api_key=openai_api_key)

In [32]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [33]:
retriever = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=llm)

In [34]:
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [35]:
question = "How fast is a quantum computer?"

texts = retriever.invoke(input=question)
len(texts)

INFO:langchain.retrievers.multi_query:Generated queries: ['What is the speed of a quantum computer?', 'Can you provide information on the velocity of a quantum computer?', 'What is the rate at which a quantum computer operates?']


2

In [36]:
texts

[Document(metadata={'chunk': 9.0, 'speaker': 'Transcript'}, page_content="we all know the digital computers changed virtually every aspect of our life well the arrival of quantum computers could be even more historic than that we're now and the initial stages of the next revolution we're talking about a new generation of computers the ultimate computer a computer that computer on adam the ultimate constituents of batter itself the question is who's involved in this race to perfect quantum computers and the answer is everyone all the big players are part of this race because if they're not silicon valley could become the next rust belt also anyone who's interested in security is interested in the quantum computers they can crack almost any code that is based on digital technology that's why the f b i the cia and all national governments are following this is very closely what their computers will change everything the economy how he saw problems the way we interact with the universe you

In [37]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

QA_PROMPT = PromptTemplate(
    input_variables=["query", "contexts"],
    template="""You are a helpful assistant who answers user queries using the contexts provided. If the question cannot be answered using the information provided say "I don't know".

    Contexts:
    {contexts}

    Question: {query}
    Answer:"""
)

qa_chain = LLMChain(llm=llm, prompt=QA_PROMPT, verbose=True)

output = qa_chain(inputs={"query": question, "contexts": "\n---\n".join([d.page_content for d in texts])})
print(output["text"])

  warn_deprecated(
  warn_deprecated(




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful assistant who answers user queries using the contexts provided. If the question cannot be answered using the information provided say "I don't know".

    Contexts:
    we all know the digital computers changed virtually every aspect of our life well the arrival of quantum computers could be even more historic than that we're now and the initial stages of the next revolution we're talking about a new generation of computers the ultimate computer a computer that computer on adam the ultimate constituents of batter itself the question is who's involved in this race to perfect quantum computers and the answer is everyone all the big players are part of this race because if they're not silicon valley could become the next rust belt also anyone who's interested in security is interested in the quantum computers they can crack almost any code that is based on digital technology that's why the f

In [38]:
from langchain.chains import TransformChain

def retrieval_transform(inputs: dict) -> dict:
    texts = retriever.get_relevant_documents(query=inputs["question"])
    texts = [d.page_content for d in texts]
    texts_dict = {
        "query": inputs["question"],
        "contexts": "\n---\n".join(texts)
    }
    return texts_dict

retrieval_chain = TransformChain(
    input_variables=["question"],
    output_variables=["query", "contexts"],
    transform=retrieval_transform
)

In [39]:
from langchain.chains import SequentialChain

rag_chain = SequentialChain(
    chains=[retrieval_chain, qa_chain],
    input_variables=["question"],
    output_variables=["query", "contexts", "text"],
    verbose=True
)

In [40]:
output = rag_chain({"question": question})
print(output["text"])



[1m> Entering new SequentialChain chain...[0m


  warn_deprecated(
INFO:langchain.retrievers.multi_query:Generated queries: ['What is the speed of a quantum computer?', 'Can you provide information on the velocity of a quantum computer?', 'What is the rate at which a quantum computer operates?']




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful assistant who answers user queries using the contexts provided. If the question cannot be answered using the information provided say "I don't know".

    Contexts:
    we all know the digital computers changed virtually every aspect of our life well the arrival of quantum computers could be even more historic than that we're now and the initial stages of the next revolution we're talking about a new generation of computers the ultimate computer a computer that computer on adam the ultimate constituents of batter itself the question is who's involved in this race to perfect quantum computers and the answer is everyone all the big players are part of this race because if they're not silicon valley could become the next rust belt also anyone who's interested in security is interested in the quantum computers they can crack almost any code that is based on digital technology that's why the f

# Custom Multiquery

## Prompt A
"""Your task is to generate 3 different queries that aim to answer the user question from multiple perspectives.
Every query MUST tackle the question from a different viewpoint, we want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}"""

## Prompt B
"""Your task is to generate 3 different search queries that aim to answer the question from multiple perspectives. The user questions are focused on Quantum Computing, AI, future technology and related subjects.
Every query MUST tackle the question from a different viewpoint, we want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}"""

In [41]:
class SimpleListOutputParser(ListOutputParser):
    def parse(self, text: str) -> List[str]:
        # Split the text into lines and handle potential empty strings
        return [line.strip() for line in text.split("\n") if line.strip()]

output_parser = SimpleListOutputParser()

template = """
Your task is to generate 3 different queries that aim to answer the user question from multiple perspectives.
Every query MUST tackle the question from a different viewpoint, we want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}
"""

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template=template
)

llm = OpenAI(temperature=0.3, openai_api_key=openai_api_key)

llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Debug: Test the LLMChain output
test_question = "What are the effects of climate change?"
result = llm_chain.invoke(test_question)
print("LLMChain output:", result)

# Assuming vectorstore is defined elsewhere
retriever = MultiQueryRetriever(
    retriever=vectorstore.as_retriever(),
    llm_chain=llm_chain,
    parser_key="lines"  # This should match the attribute name in the output parser
)

# Debug: Print the type and content of 'question'
print("Type of question:", type(question))
print("Content of question:", question)

texts = retriever.get_relevant_documents(query=question)
print("Number of retrieved texts:", len(texts))

LLMChain output: {'question': 'What are the effects of climate change?', 'text': ['1. How does climate change impact the environment and natural ecosystems?', '2. What are the social and economic consequences of climate change?', '3. In what ways does climate change affect human health and well-being?']}
Type of question: <class 'str'>
Content of question: How fast is a quantum computer?


INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the speed of a quantum computer compared to a traditional computer?', '2. How does the speed of a quantum computer differ from a classical computer?', '3. Can a quantum computer perform calculations faster than a traditional computer?']


Number of retrieved texts: 2


In [42]:
class SimpleListOutputParser(ListOutputParser):
    def parse(self, text: str) -> List[str]:
        # Split the text into lines and handle potential empty strings
        return [line.strip() for line in text.split("\n") if line.strip()]

output_parser = SimpleListOutputParser()

template = """
Your task is to generate 3 different search queries that aim to answer the question from multiple perspectives. The user questions are focused on Quantum Computing, AI, future technology and related subjects.
Every query MUST tackle the question from a different viewpoint, we want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
Original question: {question}
"""

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template=template
)

llm = OpenAI(temperature=0.3, openai_api_key=openai_api_key)

llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Debug: Test the LLMChain output
test_question = "What are the effects of climate change?"
result = llm_chain.invoke(test_question)
print("LLMChain output:", result)

# Assuming vectorstore is defined elsewhere
retriever = MultiQueryRetriever(
    retriever=vectorstore.as_retriever(),
    llm_chain=llm_chain,
    parser_key="lines"  # This should match the attribute name in the output parser
)

# Debug: Print the type and content of 'question'
print("Type of question:", type(question))
print("Content of question:", question)

texts = retriever.get_relevant_documents(query=question)
print("Number of retrieved texts:", len(texts))

LLMChain output: {'question': 'What are the effects of climate change?', 'text': ['1. How is climate change impacting the development of quantum computing technology?', '2. What role can AI play in mitigating the effects of climate change?', '3. What are the potential future advancements in technology that could help combat climate change?']}
Type of question: <class 'str'>
Content of question: How fast is a quantum computer?


INFO:langchain.retrievers.multi_query:Generated queries: ['1. What are the advantages of using quantum computers over traditional computers?', '2. How does the speed of a quantum computer compare to that of a supercomputer?', '3. Can quantum computers solve problems faster than classical computers?']


Number of retrieved texts: 3


# BLEU and ROUGE scores

In [59]:
# !pip install sacrebleu nltk rouge-score

In [60]:
import sacrebleu
from rouge_score import rouge_scorer
import nltk
from tabulate import tabulate

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/f.nuno/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
# Function to calculate BLEU score
def calculate_bleu(reference, candidate):
    reference_tokens = nltk.word_tokenize(reference)
    candidate_tokens = nltk.word_tokenize(candidate)
    bleu_score = sacrebleu.corpus_bleu([candidate], [[reference]])
    return bleu_score.score

# Function to calculate ROUGE scores
def calculate_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores

In [62]:
# Example reference and candidate texts
reference = "LangChain is a framework for developing applications powered by language models."
candidate = "LangChain helps in integrating and managing language models."

# Calculate BLEU score
bleu_score = calculate_bleu(reference, candidate)
print(f"BLEU score: {bleu_score}")

# Calculate ROUGE scores
rouge_scores = calculate_rouge(reference, candidate)
print(f"ROUGE scores: {rouge_scores}")

BLEU score: 13.664845439199912
ROUGE scores: {'rouge1': Score(precision=0.375, recall=0.2727272727272727, fmeasure=0.3157894736842105), 'rouge2': Score(precision=0.14285714285714285, recall=0.1, fmeasure=0.11764705882352941), 'rougeL': Score(precision=0.375, recall=0.2727272727272727, fmeasure=0.3157894736842105)}


In [63]:
# Prepare data for tabulation
table_data = [
    ["Metric", "Precision", "Recall", "F-measure"],
    ["BLEU", "-", "-", f"{bleu_score:.2f}"],
    ["ROUGE-1", f"{rouge_scores['rouge1'].precision:.3f}", f"{rouge_scores['rouge1'].recall:.3f}", f"{rouge_scores['rouge1'].fmeasure:.3f}"],
    ["ROUGE-2", f"{rouge_scores['rouge2'].precision:.3f}", f"{rouge_scores['rouge2'].recall:.3f}", f"{rouge_scores['rouge2'].fmeasure:.3f}"],
    ["ROUGE-L", f"{rouge_scores['rougeL'].precision:.3f}", f"{rouge_scores['rougeL'].recall:.3f}", f"{rouge_scores['rougeL'].fmeasure:.3f}"]
]

In [64]:
# Print table
print(tabulate(table_data, headers="firstrow", tablefmt="pretty"))

+---------+-----------+--------+-----------+
| Metric  | Precision | Recall | F-measure |
+---------+-----------+--------+-----------+
|  BLEU   |     -     |   -    |   13.66   |
| ROUGE-1 |   0.375   | 0.273  |   0.316   |
| ROUGE-2 |   0.143   | 0.100  |   0.118   |
| ROUGE-L |   0.375   | 0.273  |   0.316   |
+---------+-----------+--------+-----------+
