<a href="https://colab.research.google.com/github/AlaFalaki/ANN-languageDetecor/blob/master/Module%206/Lesson%2004-Recreating_the_Google_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q langchain==0.0.208 openai tiktoken newspaper3k

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# let's set up the keys

import os

# Register the three credentials in one call; swap every placeholder for your
# own value before running the rest of the notebook.
os.environ.update(
    {
        "GOOGLE_CSE_ID": "<Custom_Search_Engine_ID>",
        "GOOGLE_API_KEY": "<Google_API_Key>",
        "OPENAI_API_KEY": "<OpenAI_Key>",
    }
)

In [None]:
from langchain import LLMChain, PromptTemplate
from langchain.llms import OpenAI

# Baseline: ask the bare LLM directly, without giving it any search results.
llm = OpenAI(temperature=0)

# Prompt with a single slot; {question} is filled in when the chain runs.
template = (
    "You are an assistant that answer the following question "
    "correctly and honestly: {question}\n\n"
)
prompt_template = PromptTemplate(
    template=template,
    input_variables=["question"],
)

question_chain = LLMChain(prompt=prompt_template, llm=llm)

# Last expression of the cell, so the completion is displayed as the output.
question_chain.run("what is the latest fast and furious movie?")

'\nThe latest Fast and Furious movie is Fast & Furious 9, which is set to be released in May 2021.'

In [None]:
# first, we create a tool that allows us to use Google search.
# we'll use it to retrieve the first 10 results

from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper

TOP_N_RESULTS = 10
search = GoogleSearchAPIWrapper()


def top_n_results(query):
    """Run `query` through the Google search wrapper, asking for TOP_N_RESULTS results."""
    return search.results(query, num_results=TOP_N_RESULTS)


# Wrap the helper in a LangChain Tool so it can be invoked through `tool.run(...)`.
tool = Tool(
    func=top_n_results,
    name="Google Search",
    description="Search Google for recent results.",
)

In [None]:
# this is how we can use the tool. For each result, we have:
# 1. the result title
# 2. its URL
# 3. and the snippet that we would see if we were on the Google UI

query = "what is the latest fast and furious movie?"

results = tool.run(query)

for result in results:
    # Not every entry is guaranteed to carry all three keys (the wrapper leaves
    # out "snippet" when Google returns none, and a search with no hits yields a
    # placeholder entry), so read them with .get() instead of risking a KeyError.
    print(result.get("title", ""))
    print(result.get("link", ""))
    print(result.get("snippet", ""))
    print("-" * 50)

Fast & Furious movies in order | chronological and release order ...
https://www.radiotimes.com/movies/fast-and-furious-order/
Mar 22, 2023 ... Fast & Furious Presents: Hobbs & Shaw (2019); F9 (2021); Fast and Furious 10 (2023). Tokyo Drift also marks the first appearance of Han Lue, a ...
--------------------------------------------------
FAST X | Official Trailer 2 - YouTube
https://www.youtube.com/watch?v=aOb15GVFZxU
Apr 19, 2023 ... Fast X, the tenth film in the Fast & Furious Saga, launches the final ... witnessed it all and has spent the last 12 years masterminding a ...
--------------------------------------------------
Fast & Furious 10: Release date, cast, plot and latest news on Fast X
https://www.radiotimes.com/movies/fast-and-furious-10-release-date/
Apr 17, 2023 ... Fast X is out in cinemas on 19th May 2023 – find out how to rewatch all the Fast & Furious movies in order, and read our Fast & Furious 9 review ...
--------------------------------------------------
Fast & Fur

In [None]:
# let's visit all the URLs from the results and use the newspaper library
# to download their texts. The library won't work on some URLs, e.g.
# if the content is a PDF file or if the website uses anti-bot
# mechanisms.

import newspaper

# One {"url": ..., "text": ...} entry per page whose article text could be extracted.
pages_content = []

for result in results:
    url = result.get("link")
    if not url:
        # nothing to download for an entry without a link
        continue

    try:
        article = newspaper.Article(url)
        article.download()
        article.parse()
    except Exception as err:
        # Best effort: a page that cannot be fetched or parsed is skipped, but we
        # say so instead of hiding it. Catching Exception (not a bare `except:`)
        # still lets KeyboardInterrupt / SystemExit stop the cell.
        print(f"Skipping {url}: {err}")
        continue

    # keep only pages that actually produced some text
    if len(article.text) > 0:
        pages_content.append({"url": url, "text": article.text})

print(len(pages_content))

8


In [None]:
# we split the article texts into small chunks. While doing so, we keep track of each
# chunk metadata (i.e. the URL where it comes from). Each metadata is a dictionary and
# we need to use the "source" key for the document source so that the chain
# that we'll create later knows where to retrieve the source.

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=4000,
)

# One Document per chunk, each tagged under "source" with the URL of the page
# the chunk was cut from.
docs = [
    Document(page_content=piece, metadata={"source": page["url"]})
    for page in pages_content
    for piece in text_splitter.split_text(page["text"])
]
len(docs)

24

In [None]:
# then, we embed both the chunks and the query

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# The query and the chunks are embedded through the same `embeddings` object.
query_embedding = embeddings.embed_query(query)
docs_embeddings = embeddings.embed_documents(
    [chunk_doc.page_content for chunk_doc in docs]
)

In [None]:
# next, we compute the cosine similarities between the document vectors and
# the query vector using numpy and sklearn. We are interested only in the top
# `top_k` chunks (2 here) for now because we'll later put them in a prompt and
# the prompt size is limited.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_indices(list_of_doc_vectors, query_vector, top_k):
    """Rank document vectors by cosine similarity to a query vector.

    Args:
        list_of_doc_vectors: the document vectors; converted with ``np.array``.
        query_vector: a single vector; reshaped to one row before comparison.
        top_k: how many indices to keep.

    Returns:
        A numpy array with the indices of the ``top_k`` highest-scoring
        documents, ordered from most to least similar.
    """
    doc_matrix = np.array(list_of_doc_vectors)
    query_row = np.array(query_vector).reshape(1, -1)

    # one similarity score per document
    scores = cosine_similarity(query_row, doc_matrix).flatten()

    # argsort is ascending, so reverse it before cutting off the first top_k
    ranked = np.argsort(scores)[::-1]
    return ranked[:top_k]

# number of chunks that will be passed on to the question-answering prompt
top_k = 2
best_indexes = get_top_k_indices(docs_embeddings, query_embedding, top_k)
# Index `docs` directly so the selected chunks keep the ranking produced by
# get_top_k_indices (most similar first) and no per-document membership scan of
# the index array is needed.
best_k_documents = [docs[i] for i in best_indexes]

In [None]:
# we are now ready to create a question answering chain that leverages
# sources, and we'll use the load_qa_with_sources_chain function for that

from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI

# "stuff" chain type on a fresh OpenAI LLM with temperature 0
chain = load_qa_with_sources_chain(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
)

In [None]:
# last, let's generate the response to our query
response = chain(
    {"input_documents": best_k_documents, "question": query},
    return_only_outputs=True,
)

# The answer and its sources are separated by a "SOURCES:" marker in the output.
# partition() splits on the first marker only and leaves the sources empty when
# the marker is absent, so an output with zero or several markers no longer
# raises ValueError the way a two-name unpack of split() does.
response_text, _marker, response_sources = response["output_text"].partition("SOURCES:")
response_text = response_text.strip()
response_sources = response_sources.strip()

print(f"Answer: {response_text}")
print(f"Sources: {response_sources}")

Answer: The latest Fast and Furious movie is Fast X, scheduled for release on May 19, 2023.
Sources: https://www.radiotimes.com/movies/fast-and-furious-10-release-date/, https://en.wikipedia.org/wiki/Fast_%26_Furious


In [None]:
# display the raw chain output: a dict whose "output_text" holds the answer and sources
response

{'output_text': ' The latest Fast and Furious movie is Fast X, scheduled for release on May 19, 2023.\nSOURCES: https://www.radiotimes.com/movies/fast-and-furious-10-release-date/, https://en.wikipedia.org/wiki/Fast_%26_Furious'}