Reference

https://python.langchain.com/docs/use_cases/question_answering/quickstart

https://colab.research.google.com/github/langchain-ai/langchain/blob/master/cookbook/Multi_modal_RAG.ipynb#scrollTo=140580ef-5db0-43cc-a524-9c39e04d4df0

https://www.youtube.com/watch?v=cBpdiQ3gljM

https://github.com/dorianbrown/rank_bm25

Other:

https://python.langchain.com/docs/expression_language/cookbook/retrieval

https://python.langchain.com/docs/use_cases/question_answering/

In [1]:
# %pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai bs4

In [2]:
# %pip install chromadb 

In [3]:
# os.environ["OPENAI_API_KEY"] = getpass.getpass()

# import dotenv

# dotenv.load_dotenv()


In [5]:

# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()

In [18]:
import os
# Switch on LangChain tracing for every chain run in this kernel session.
# NOTE(review): the LANGCHAIN_API_KEY assignment earlier in this notebook is
# commented out — confirm the key is supplied by the environment, otherwise
# tracing presumably cannot upload runs.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [10]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Single Porter stemmer instance, reused by stem_tokens() for every token.
stemmer = nltk.stem.porter.PorterStemmer()
# Translation table for str.translate(): each ASCII punctuation code point
# maps to None, which deletes that character from the string.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def stem_tokens(tokens):
    """Return a list holding the stem of each token, in input order.

    Each token is passed through the module-level `stemmer`.
    """
    return list(map(stemmer.stem, tokens))

def normalize(text):
    """Turn raw text into a list of lowercase, punctuation-free, stemmed tokens.

    Steps, in the order they are applied:
      1. lowercase the text,
      2. delete every character listed in `remove_punctuation_map`,
      3. tokenize with `nltk.word_tokenize`,
      4. stem each token with `stem_tokens`.

    Args:
        text: The string to normalize.

    Returns:
        The list of stemmed tokens produced by `stem_tokens`.
    """
    cleaned = text.lower().translate(remove_punctuation_map)
    return stem_tokens(nltk.word_tokenize(cleaned))

# Shared TF-IDF vectorizer used by cosine_sim(): tokens come from normalize(),
# features are unigrams and bigrams, and scikit-learn's built-in English
# stop-word list is applied.
vectorizer = TfidfVectorizer(
    tokenizer=normalize,
    stop_words='english',
    ngram_range=(1, 2),
)

def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity between two strings.

    The module-level `vectorizer` is re-fitted on just these two texts, so the
    vocabulary and IDF weights are specific to this pair.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity score: entry [0, 1] of the 2x2 row-by-row product of
        the TF-IDF matrix with its transpose.

    Raises:
        ValueError: propagated from `vectorizer.fit_transform` (presumably when
            neither text yields any token after normalization and stop-word
            filtering — confirm against the scikit-learn version in use).
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # TfidfVectorizer L2-normalizes rows by default (norm='l2'), so the product
    # of the matrix with its transpose holds pairwise cosine similarities.
    # `.toarray()` replaces the sparse-matrix `.A` shorthand, which was
    # deprecated in SciPy 1.11 and removed in SciPy 1.14; `@` is explicit
    # matrix multiplication for both sparse matrices and sparse arrays.
    return (tfidf @ tfidf.T).toarray()[0, 1]

# Free-text query that is matched against the program file names below and
# later passed to the RAG chain.
user_query = "I want to study Applied Mathematics"
# Built with os.path.join so the relative path resolves on every OS; a literal
# with backslashes is a single (non-existent) file name on Linux/macOS.
folder_path = os.path.join("Database", "University", "University_of_Waterloo", "Academic_Programs")

# Substrings deleted from file names before scoring (matching is case-sensitive).
words_to_remove = ["programs", "Studies"]
def remove_unwanted_words(filename, words_to_remove):
    """Return `filename` with every occurrence of each unwanted word deleted.

    Words are removed one after another, in the order given, using exact
    (case-sensitive) substring replacement.

    Args:
        filename: The string to clean.
        words_to_remove: Iterable of substrings to delete.

    Returns:
        The cleaned string.
    """
    stripped = filename
    for unwanted in words_to_remove:
        stripped = stripped.replace(unwanted, "")
    return stripped

# List the folder once, keep only .txt files, and sort the names so that the
# index of a score always maps back to the same file. Listing the directory a
# second time is not guaranteed to return entries in the same order.
txt_filenames = sorted(
    name for name in os.listdir(folder_path)
    if name.lower().endswith(".txt")
)
if not txt_filenames:
    raise FileNotFoundError(f"No .txt files found in {folder_path!r}")

# The text compared against the query is each file's name, without its
# extension and without the unwanted words; file contents are not read here.
documents = [
    remove_unwanted_words(os.path.splitext(name)[0], words_to_remove)
    for name in txt_filenames
]

# Score each name by similarity to the query, divided by a term that grows
# with the name's length so that longer names are penalized.
scores = []
for doc in documents:
    score = cosine_sim(user_query, doc)
    weighted_score = score / (0.5 + 0.5 * len(doc))
    scores.append(weighted_score)

# Rank (index, score) pairs best-first and keep the top-ranked file.
sorted_scores = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
idx, score = sorted_scores[0]
document_path = os.path.join(folder_path, txt_filenames[idx])
# Optional: inspect the full ranking.
# for idx, score in sorted_scores:
#     print(f"Document: {txt_filenames[idx]}, Score: {score}")



In [20]:
# Load the best-matching program file (document_path, chosen in the ranking
# cell) as LangChain documents.
loader = TextLoader(document_path)
docs = loader.load()
# Split the text into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
# Embed the chunks with OpenAIEmbeddings and index them in a Chroma store.
# NOTE(review): the OPENAI_API_KEY assignment earlier in this notebook is
# commented out — presumably the key must already be in the environment.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retrieve and generate using the relevant chunks of the selected document.
retriever = vectorstore.as_retriever()
# Prompt pulled from the LangChain Hub by name.
prompt = hub.pull("rlm/rag-prompt")
# Chat model that answers the question; configured with temperature=0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

def format_docs(docs):
    """Join the `page_content` of each document into one string.

    Consecutive documents are separated by a blank line ("\\n\\n"); an empty
    input yields an empty string.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# RAG chain, read top to bottom:
#   1. The input string is fanned out into a dict: "context" is produced by
#      running the retriever on it and joining the hits with format_docs;
#      "question" receives the input unchanged via RunnablePassthrough.
#   2. That dict is fed to the prompt.
#   3. The prompt's output is sent to the LLM.
#   4. StrOutputParser turns the model output into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [22]:
# Run the whole chain on the query; the chain ends in StrOutputParser, so the
# displayed cell output is the answer string.
rag_chain.invoke(user_query)

'With a degree in Applied Mathematics, you can pursue careers in research, consulting, and development in various industries such as financial institutions, universities, government, and engineering firms. Graduates have gone on to work as Research Assistants, Strategy Analysts, Data Analysts, Software Engineers, and more. Co-op opportunities also provide relevant paid work experience to explore different career areas and types of employers.'

In [15]:
# Cleanup: delete the collection backing `vectorstore`.
# NOTE(review): run this last — `retriever` and `rag_chain` are built on this
# store, so presumably they stop working until the indexing cell is re-run.
# The recorded execution counts show this cell was executed out of order.
vectorstore.delete_collection()