<a href="https://colab.research.google.com/github/3lueLightning/tutorials/blob/main/rag_pair_programming_part1_condensed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -q langchain langchain_community langchain_openai \
langchain_text_splitters docarray

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.6/990.6 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.2/270.2 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.2/140.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m384.0/384.0 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import getpass
import os
import pickle
import re
import urllib
import urllib.request
from pprint import pprint

from bs4 import SoupStrainer
from langchain.docstore.document import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter



In [None]:
# Ask for the key through a hidden prompt so it is never written in the
# notebook, and publish it under the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY = getpass.getpass()

··········


# 1. Loading the data

In [None]:
# this site allows crawlers
MOVIES_URL = "https://www.empireonline.com/movies/features/best-movies-2/"

# Smoke test: download the whole page once and show the start of its text.
# In real life I'd run this once and, if it succeeded, delete the entire cell.
try:
  full_page = WebBaseLoader(MOVIES_URL).load()
  print("Loaded site sucessfully, data sample: " +\
        full_page[0].page_content[:50])
except Exception:
  # Deliberately broad so the tutorial keeps going whatever the loader raises.
  # `Exception` (unlike a bare `except:`) still lets KeyboardInterrupt and
  # SystemExit through, so the cell can be interrupted.
  full_page = []
  print("The page didn't load, but no worries we have a backup ;)")


def is_target_element(elem: str, attrs: dict) -> bool:
    """
    Tell whether an HTML tag is one of the two pieces we want to scrape.

    A tag qualifies when its name and its "class" attribute both match one of
    the (tag, class) pairs below: the <div> holding a movie description or the
    <h3> holding a movie title. The class must match exactly.
    """
    wanted = (
        # movie description
        ("div", "listicleItem_listicle-item__content__Lxn1Y"),
        # movie title
        ("h3", "listicleItem_listicle-item__title__BfenH"),
    )
    # attrs is only consulted once the tag name already matches
    return any(
        elem == tag and attrs.get("class") == css_class
        for tag, css_class in wanted
    )

# Restrict parsing to the elements accepted by is_target_element
strainer = SoupStrainer(is_target_element)

movie_scraper = WebBaseLoader(
    MOVIES_URL,
    bs_kwargs = {
        "parse_only": strainer
    }
)

# Guard the download so the notebook keeps running if something happens to
# the page; the empty list lets a later cell detect the failure.
try:
  # only here is the page actually loaded
  movie_reviews_raw = movie_scraper.load()
except Exception:
  # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
  # SystemExit are not swallowed and the cell can still be interrupted.
  movie_reviews_raw = []

Loaded site sucessfully, data sample: The 100 Best Movies Of All Time | Movies | %%chann


In [None]:
# No need to read this cell, it is just a backup system in case the scraping
# fails: either nothing was scraped or the text does not start as expected.
if (
    not movie_reviews_raw
    or movie_reviews_raw[0].page_content[:25] != "100 Reservoir Dogs\nMaking"
):
  BACKUP_MOVIES_PKL_URL = "https://tutorials-public.s3.eu-west-1.amazonaws.com/movie_reviews_raw.pkl"
  # The timeout keeps the cell from hanging forever if the host is unreachable.
  with urllib.request.urlopen(BACKUP_MOVIES_PKL_URL, timeout=30) as response:
      # SECURITY: unpickling executes whatever code the payload contains.
      # Only keep this if you trust the bucket above; a plain-data format
      # (e.g. JSON) would be the safe replacement.
      movie_reviews_raw = pickle.load(response)
  print("loaded list of top 10 best movies from backup system")
else:
  print("successfully scrapped list of top 10 best movies")

successfully scrapped list of top 10 best movies


# 2. Process data

In [None]:
def split_movies(page: Document) -> list[Document]:
  """
  Turn the scraped page into one Document per movie review.

  The page text is split on newlines and lines starting with "Read" are
  dropped. The remaining lines are taken to alternate title, review. A
  leading rank prefix is stripped from each title. Each resulting Document
  holds "<title>: <review>" and carries the page metadata plus "rank"
  (counting down from 100) and "name".
  """
  rank_prefix = re.compile(r'^\d*\)? ')
  lines = [
      line
      for line in page.page_content.strip().split("\n")
      if not line.startswith("Read")
  ]
  # even positions hold the titles, odd positions the matching reviews
  title_lines, description_lines = lines[::2], lines[1::2]

  docs = []
  for rank, raw_title, description in zip(
      range(100, 0, -1), title_lines, description_lines
  ):
    title = rank_prefix.sub("", raw_title)
    docs.append(
        Document(
            f"{title}: {description}",
            metadata={**page.metadata, "rank": rank, "name": title},
        )
    )
  return docs


# movie_reviews_raw is a list; only its first element is split into reviews
movie_reviews = split_movies(movie_reviews_raw[0])
print("extracted", len(movie_reviews), "movie reviews")

extracted 100 movie reviews


# 3. Set up database

In [None]:
# OpenAI offers several embedding models. This one turns the text into longer
# vectors (here of length 3072), which carry more information about the
# original text. It is also more expensive and needs more space to store.
EMBEDDING_MODEL_NAME = "text-embedding-3-large"

# Text is cut into chunks, sized in characters since length_function is len
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=1000,
)
embeder = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

# from_documents is the method that inserts our list of documents in the DB
index = VectorstoreIndexCreator(
    text_splitter=text_splitter,
    embedding=embeder,
    vectorstore_cls=DocArrayInMemorySearch,
).from_documents(movie_reviews)

retriever = index.vectorstore.as_retriever()




# 4. Create RAG chain

In [None]:
# The chat model uses the API key set up earlier in the notebook.
# note: we are not calling the OpenAI API directly, we go through langchain

LLM_MODEL_NAME = "gpt-3.5-turbo"
llm = ChatOpenAI(
    model=LLM_MODEL_NAME,
    max_retries=2,
    max_tokens=None,
    timeout=None,
    # higher temperature means more original answers
    temperature=1,
)

In [None]:
# System prompt: the persona and ground rules the LLM follows for every reply
system_message = """
When asked a question reply as if you were the wizard of movies with the \
knowledge about movies. Try to be funny were possible but base you answers in \
the information provided in the context section.\
"""

# Human prompt: {input} and {context} are placeholders filled in per question
human_message = """
User question:
{input}

-----------------------------------------
Context:
{context}
"""

chat_template = ChatPromptTemplate.from_messages(
    [
        ("system", system_message),
        ("human", human_message),
    ]
)

# First build the chain that feeds documents and prompt to the LLM, then put
# the retriever in front of it.
combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=chat_template)
chat_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

In [None]:
# Ask the chain a question; the reply text is read from the "answer" key
adventure_movies = chat_chain.invoke(
    dict(input="Can you recommend me an adventure movie?")
)
# pprint (rather than plain print) wraps the long answer so it fits the screen
pprint(adventure_movies["answer"])

('Ah, dear movie enthusiast, for an adventure movie that will have you '
 'crossing deserts, battling Nazis, and searching for mythical artifacts, I '
 'bestow upon you the epic tale of "Raiders of the Lost Ark"! Picture this: A '
 'Han Solo-esque archaeologist on a quest for the power of God, fighting off '
 'the evilest villains of all time - the Nazis! Directed by the masterful '
 "Spielberg and penned by Lucas, it's the ultimate blend of action, humor, and "
 'heart-pounding adventure. So, grab your whip and fedora, and prepare for a '
 'cinematic journey that will leave you on the edge of your seat!')


In [None]:
surrealist_movies = chat_chain.invoke(
    dict(input="Which surrealist movies should I watch ?")
)
# Show every entry of the result, each under a dashed banner with its key
for key, val in surrealist_movies.items():
  print(f"{'-' * 10} {key} {'-' * 10}")
  pprint(val)

---------- input ----------
'Which surrealist movies should I watch ?'
---------- context ----------
[Document(metadata={'source': 'https://www.empireonline.com/movies/features/best-movies-2/', 'rank': 61, 'name': "Pan's Labyrinth"}, page_content="Pan's Labyrinth: Guillermo Del Toro's fairy tale for grown-ups, as pull-no-punches brutal as it is gorgeously, baroquely fantastical. There's an earthy, primal feel to his fairy-world here, alien and threatening rather than gasp-inducing and 'magical', thanks in no small part to the truly cheese-dream nightmarish demon-things Del Toro conjures up, sans CGI, with the assistance of performer Doug Jones."),
 Document(metadata={'source': 'https://www.empireonline.com/movies/features/best-movies-2/', 'rank': 97, 'name': 'Amelie'}, page_content="Amelie: Jean-Pierre Jeunet's fourth feature – his second as a solo artist divorced from Marc Caro – saw the\xa0Delicatessen,\xa0The City of Lost Children\xa0and\xa0Alien: Resurrection filmmaker leave behind