## Setup

In [1]:
!pip install langchain
!pip install gpt4all
!pip install qdrant-client
!pip install sentence-transformers
!pip install pypdf

Collecting langchain
  Downloading langchain-0.1.14-py3-none-any.whl.metadata (13 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Using cached SQLAlchemy-2.0.29-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Using cached aiohttp-3.9.3-cp311-cp311-win_amd64.whl.metadata (7.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Using cached dataclasses_json-0.6.4-py3-none-any.whl.metadata (25 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-community<0.1,>=0.0.30 (from langchain)
  Downloading langchain_community-0.0.31-py3-none-any.whl.metadata (8.4 kB)
Collecting langchain-core<0.2.0,>=0.1.37 (from langchain)
  Downloading langchain_core-0.1.40-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Using cached langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)
Collecti

## Set huggingface cache home (optional)

In [1]:
import os
cache_dir="huggingface/"
os.environ["HF_HOME"] = cache_dir

model_dir = "H:\\RAGT\\Financial-RAG-GPU-less-Mistral-Langchain\\models\\mistral-7b-instruct-v0.1.Q4_0.gguf"
os.environ["MODEL"] = model_dir

## Check Device (assuming no GPU is available)

I have a GPU available but for you it will print "cpu" if you don't.

In [2]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

## Enable to see Logs while running

In [3]:
import logging

# Configure logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a logger
logger = logging.getLogger(__name__)

## Load Data, preprocessing and Creating Qdrant Index

In [18]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceEmbeddings
import re

def preprocess_text(text):
    text_lower = text.lower()
    text_no_punctuation = re.sub(r'[^\w\s\$\%\.\,\"\'\!\?\(\)]', '', text_lower)
    text_normalized_tabs = re.sub(r'(\t)+', '', text_no_punctuation)
    return text_normalized_tabs

# loader = PyPDFLoader("data/msft_annual_2023_report.pdf")
loader = PyPDFLoader("data/Recipes.pdf")
# loader = TextLoader(sample_texts)
# loader = WebBaseLoader("https://cleartax.in/s/top-performing-nps-schemes")
documents = loader.load()
for x in range(len(documents)):
    # do preprocessing
    documents[x].page_content=preprocess_text(documents[x].page_content)
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0,separator="\n")
docs = text_splitter.split_documents(documents)

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5", 
                                   model_kwargs = {'device': "cpu"})  # forcefully setting device as cpu
qdrant = Qdrant.from_documents(
    docs,
    embeddings,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="msft_data",
    force_recreate=True
)

2024-04-05 14:57:40,068 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
2024-04-05 14:57:40,079 - urllib3.connectionpool - DEBUG - Resetting dropped connection: huggingface.co
2024-04-05 14:57:40,490 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /BAAI/bge-large-en-v1.5/resolve/main/modules.json HTTP/1.1" 200 0
2024-04-05 14:57:40,646 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /BAAI/bge-large-en-v1.5/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0
2024-04-05 14:57:40,789 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /BAAI/bge-large-en-v1.5/resolve/main/README.md HTTP/1.1" 200 0
2024-04-05 14:57:40,929 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /BAAI/bge-large-en-v1.5/resolve/main/modules.json HTTP/1.1" 200 0
2024-04-05 14:57:41,078 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /BAAI/bge-large-e

## Run if not able to download model using GPT4All directly.

In [None]:
# # download model

# !mkdir models
# !wget https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf -O models/mistral-7b-instruct-v0.1.Q4_0.gguf

## Directly calling GPT4All module 

In [19]:
from gpt4all import GPT4All
model = GPT4All(model_name="H:\RAGT\Financial-RAG-GPU-less-Mistral-Langchain\models\mistral-7b-instruct-v0.1.Q4_0.gguf",
                             n_threads = 8,
                             allow_download=False) # set allow_download as true to fetch it.

In [13]:
prompt = "Eiffel tower location:"
print(model.generate(prompt, max_tokens=20))

 Paris, France

The Eiffel Tower is an iconic landmark located in Paris,


## Creating prompts, tuning parameters and creating Chain using Langchain

In [20]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import PromptTemplate, LLMChain
from langchain_community.llms import GPT4All

# template = '''[INST]: You are a financial expert analyst bot, below presents a context from which the a question will be asked, give your valuable insights as well.[\INST]\n
# Context: {context}.\n
# Question: {question}\n
# Answer: '''
template = '''[INST]: You are a chef with extensive knowledge of various recipes. Based on the context provided, which could either be specific ingredients the user has on hand or a type of dish the user desires, offer a suitable recipe that matches the request.[\INST]\n
# As a chef renowned for your vast repertoire of recipes, utilize the provided context to craft a recipe. Whether the inquiry involves specific ingredients available to the user or a particular dish they're craving, draw upon your culinary expertise to recommend a recipe that aligns with their needs.
Context: {context}.\n
Question: {question}\n
Answer: '''
rag_prompt = PromptTemplate(template=template, input_variables=["context","question"])

callbacks = [StreamingStdOutCallbackHandler()]

llm = GPT4All(
            model="H:\RAGT\Financial-RAG-GPU-less-Mistral-Langchain\models\mistral-7b-instruct-v0.1.Q4_0.gguf",
            max_tokens=50,
            n_threads=8, 
            temp=0.3,
            top_p=0.2,
            top_k=40,
            n_batch=8,
            seed=100,
            allow_download=False,  # Prevent downloading
            verbose=True)

llm_chain = LLMChain(prompt=rag_prompt, llm=llm, verbose=True)

## Define format_docs for formatting context candidates.

In [21]:
def format_docs(query):
    found_docs = qdrant.similarity_search_with_score(query,k=1)
    return "\n\n".join(doc[0].page_content for doc in found_docs)

## Run Queries

In [22]:
%%time
query = "What can I make with chicken, tomatoes, and rice?"
context = "The user has chicken, tomatoes, and rice available and is looking for a recipe that uses these ingredients."
resp = llm_chain.invoke(
    input={"question": query,
           "context": context
          }
)
print(resp['text'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]: You are a chef with extensive knowledge of various recipes. Based on the context provided, which could either be specific ingredients the user has on hand or a type of dish the user desires, offer a suitable recipe that matches the request.[\INST]

Context: The user has chicken, tomatoes, and rice available and is looking for a recipe that uses these ingredients..

Question: What can I make with chicken, tomatoes, and rice?

Answer: [0m


Exception ignored on calling ctypes callback function: <function LLModel._prompt_callback at 0x0000028290CE2980>
Traceback (most recent call last):
  File "c:\Users\CYTech Student\AppData\Local\Programs\Python\Python311\Lib\site-packages\gpt4all\_pyllmodel.py", line 526, in _prompt_callback
    @staticmethod

KeyboardInterrupt: 



[1m> Finished chain.[0m

One option could be to make Chicken Fried Rice. Here's the recipe:

Ingredients:
- 1 cup uncooked white rice
- 2 cups water
- Salt
- 1 tablespoon vegetable oil
- 1 pound boneless, skinless chicken breast, cut into bite-sized pieces
- 2 medium tomatoes, diced
- 3 green onions, sliced
- Soy sauce (optional)

Instructions:
1. In a medium saucepan, bring water to a boil over high heat. Add rice and salt, stirring occasionally.
2. Reduce the heat to low, cover, and simmer for 18-20 minutes or until the rice is cooked through and the water has been absorbed.
3. In a large skillet, heat oil over medium-high heat. Add chicken and cook until browned on all sides, about 5-7 minutes.
4. Add tomatoes and green onions to the skillet with the chicken. Cook for another 2-3 minutes or until the vegetables are tender.
5. Stir in the cooked rice
CPU times: total: 2h 24min 43s
Wall time: 56min 58s


In [16]:
%%time
query = "who were the top performers in 2023 for microsoft?"
resp = llm_chain.invoke(
    input={"question":query,
           "context": format_docs(query)
          }
)
print(resp['text'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]: You are a financial expert analyst bot, below presents a context from which the a question will be asked, give your valuable insights as well.[\INST]

Context: and selling our other products and service s and income taxes.  
highlights from fiscal year 2023 compared with fiscal year 2022 included  
 microsoft cloud revenue increased 22% to $111.6  billion.  
 office commercial products and cloud services revenue increased 10% driven by office 365 commercial 
growth of 13%.  
 office consumer products and cloud services revenue increased 2% and microsoft 365 consumer subscribers 
increased to 67.0  million.  
 linkedin revenue increased 10%.  
 dynamics products and cloud services revenue increased 16% driven by dynamics 365 growth of 24%.  
 server products and cloud services revenue increased 19% driven by azure and other cloud services growth 
of 29%.  
 windows original equipment manufacturer li

In [None]:
%%time
query = "who were the worst performers in 2023 for microsoft?"
resp = llm_chain.invoke(
    input={"question":query,
           "context": format_docs(query)
          }
)
print(resp['text'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]: You are a financial expert analyst bot, below presents a context from which the a question will be asked, give your valuable insights as well.[\INST]

Context: and selling our other products and service s and income taxes.  
highlights from fiscal year 2023 compared with fiscal year 2022 included  
 microsoft cloud revenue increased 22% to $111.6  billion.  
 office commercial products and cloud services revenue increased 10% driven by office 365 commercial 
growth of 13%.  
 office consumer products and cloud services revenue increased 2% and microsoft 365 consumer subscribers 
increased to 67.0  million.  
 linkedin revenue increased 10%.  
 dynamics products and cloud services revenue increased 16% driven by dynamics 365 growth of 24%.  
 server products and cloud services revenue increased 19% driven by azure and other cloud services growth 
of 29%.  
 windows original equipment manufacturer li

In [None]:
query = "what was the revenue in united states versus in other countries in 2023?"
resp = llm_chain.invoke(
    input={"question":query,
           "context": format_docs(query)
          }
)
print(resp['text'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]: You are a financial expert analyst bot, below presents a context from which the a question will be asked, give your valuable insights as well.[\INST]

Context: year ended june  30, 2023  2022  2021  
        
united states (a) $ 106,744   $ 100,218   $ 83,953   
other countries   105,171    98,052    84,135         
total  $  211,915   $  198,270   $  168,088           
(a) includes billings to oems and certain multinational organizations because of the nature of these businesses and the 
impracticability of determining the geographic source of the revenue.  
revenue, classified by significant product and service offerings, was as follows  
  
(in millions)          
        
year ended june  30, 2023  2022  2021  
        
server products and cloud services  $ 79,970   $ 67,350   $ 52,589   
office products and cloud services   48,728    44,862    39,872   
windows   21,507    24,732    22,488   


## Other Queries ideas

query = "what was the expenses for research and development in 2023?"

query = "what is microsoft 365 consumer count in 2023?"

query = "what was the revenue generated by windows?"

query = "what was the main focus for 2023?"

query = "what's the overall stock information?"

query = "what sector and companies did microsoft invest in 2023?"

query = "how many people were laid off by microsoft and what were they offered?"