### **Load Environment variables from .env file**

In [2]:
from langchain.llms import AzureOpenAI
import openai
import tiktoken
from dotenv import load_dotenv, find_dotenv
import os
from IPython.display import display, HTML, JSON, Markdown

# read local .env file
_ = load_dotenv(find_dotenv())

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv(
    "OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

#### Init LLM model

In [3]:
def init_llm(model=OPENAI_MODEL_NAME,
             deployment_name=OPENAI_DEPLOYMENT_NAME,
             temperature=0,
             max_tokens=800,
             stop="<|im_end|>",
             ):

    llm = AzureOpenAI(deployment_name=deployment_name,
                      model=model,
                      temperature=temperature,
                      max_tokens=max_tokens,
                      model_kwargs={"stop": ["<|im_end|>"]})
    return llm

In [4]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.vectorstores import FAISS

#### Load data, movies reviews

In [5]:
file = "documents/imdb_clean.csv"
loader = CSVLoader(file_path=file)
# LangChain Document is created per line in CSV file, no need
docs = loader.load()
print(len(docs))

2532


In [6]:
# from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings.openai import OpenAIEmbeddings

#### Create in memory vector store to use along with LLM model 


In [7]:
# init in-memory vector store
# this could take several minutes
embeddings = OpenAIEmbeddings(
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME, chunk_size=1)
db = DocArrayInMemorySearch.from_documents(
    docs,
    embeddings
)

#### Run similarity search on the vector store

In [8]:
qdocs = db.similarity_search(
    "Shawshank redemption", k=5)
for doc in qdocs:
    display(Markdown(doc.page_content))

: 0
title: The Shawshank Redemption
director: Frank Darabont
release_year: 1994
runtime: 142
genre: Drama
rating: 9.3
metascore: 82
gross(M): 28.34

: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Drama
rating: 8.6
metascore: 61
gross(M): 136.8

: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Crime
rating: 8.6
metascore: 61
gross(M): 136.8

: 211
title: Prisoners
director: Denis Villeneuve
release_year: 2013
runtime: 153
genre: Drama
rating: 8.1
metascore: 70
gross(M): 61.0

: 3
title: Schindler's List
director: Steven Spielberg
release_year: 1993
runtime: 195
genre: Biography
rating: 9.0
metascore: 95
gross(M): 96.9

In [13]:
# prepare context for sending to LLM
# concatenate all retrieved documents 
context_for_llm = "".join([qdocs[i].page_content for i in range(len(qdocs))])
print(context_for_llm)

: 0
title: The Shawshank Redemption
director: Frank Darabont
release_year: 1994
runtime: 142
genre: Drama
rating: 9.3
metascore: 82
gross(M): 28.34: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Drama
rating: 8.6
metascore: 61
gross(M): 136.8: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Crime
rating: 8.6
metascore: 61
gross(M): 136.8: 211
title: Prisoners
director: Denis Villeneuve
release_year: 2013
runtime: 153
genre: Drama
rating: 8.1
metascore: 70
gross(M): 61.0: 3
title: Schindler's List
director: Steven Spielberg
release_year: 1993
runtime: 195
genre: Biography
rating: 9.0
metascore: 95
gross(M): 96.9


#### Call LLM to process the query

In [17]:
llm = init_llm()

query = f""" Which movie genre is the prevalent in this text: {context_for_llm}? 
Answer only based on the text.

<|im_end|>
"""

response = llm(query)

display(Markdown(response))

Drama is the prevalent movie genre in this text.

#### Using LangChain RetrievalQA 

In [18]:
# define a retiever as a vector store
retriever = db.as_retriever()

qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=False)

In [19]:
response = qa_stuff.run(query)
display(Markdown(response))

 Drama

In [20]:
query = f" Summarize the movies in the text: {context_for_llm}?"
response = qa_stuff.run(query)
display(Markdown(response))

 The text describes four movies: The Shawshank Redemption, The Green Mile, Prisoners, and Schindler's List. The Shawshank Redemption is a drama movie directed by Frank Darabont and released in 1994. It has a runtime of 142 minutes, a rating of 9.3, a metascore of 82, and a gross of 28.34 million dollars. The Green Mile is another movie directed by Frank Darabont and released in 1999. It has a runtime of 189 minutes, a rating of 8.6, a metascore of 61, and a gross of 136.8 million dollars. It is classified as a drama, but there are also versions of the movie that are classified as crime and fantasy. Prisoners is a drama movie directed by Denis Villeneuve and released in 2013. It has a runtime of 153 minutes, a rating of 8.1, a metascore of 70, and a gross of 61.0 million dollars. Schindler's List is a biography movie directed by Steven Spielberg and released in 1993. It has a runtime of 195 minutes, a rating of 9.0, a metascore of 95, and a gross of 96.9 million dollars.