In [1]:
from langchain.document_loaders import  UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from pymilvus import MilvusClient, DataType, Collection, connections
from langchain.chains import RetrievalQAWithSourcesChain
import langchain
from langchain.prompts import PromptTemplate
from langchain_milvus import Milvus
load_dotenv()

True

In [2]:
llm = ChatGroq(
    model="llama3-70b-8192",
    temperature=0.6
    )

In [9]:
urls=[
    "https://www.moneycontrol.com/news/business/tata-motors-mahindra-gain-certificates-for-production-linked-payouts-11281691.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
    "https://www.moneycontrol.com/news/business/stocks/buy-tata-motors-target-of-rs-743-kr-choksey-11080811.html"
]
loader = UnstructuredURLLoader(urls=urls)

In [10]:
data = loader.load()
len(data)

3

In [5]:
splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n','\n',' '],
    chunk_size=300,
    chunk_overlap=30
)
chunks = splitter.split_documents(data)

In [20]:
messages = [
    (
        "system",
        f"{data[0].page_content}",
    ),
    ("human", "Read this passage and answer my question:\nWhat are the number of firms that will apply for certification?"),
]
ai_msg = llm.invoke(messages)
ai_msg.content

'According to the passage, the government expects 23 more firms to apply for certification by the end of September, in addition to the 6 companies that have already applied (including Tata Motors and Mahindra). Therefore, the total number of firms that will apply for certification is 29.'

In [3]:
encoder = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2")

  from tqdm.autonotebook import tqdm, trange


In [10]:
vectors = encoder.embed_documents([chunk.page_content for chunk in chunks],)
len(vectors),len(vectors[0])

(69, 384)

In [11]:
client = MilvusClient(
    uri="http://localhost:19530"
)


In [22]:
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)


In [23]:
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="source_id",datatype=DataType.INT16)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=len(vectors[0]))


{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'source_id', 'description': '', 'type': <DataType.INT16: 3>}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'enable_dynamic_field': True}

In [24]:
client.create_collection(
    collection_name="finance_articles", 
    schema=schema, 
)

In [25]:
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="vector",
    metric_type="COSINE",
    index_type="FLAT",
    index_name="article_vectors"
)


In [26]:
client.create_index(
    collection_name="finance_articles",
    index_params=index_params,
    sync=False # Whether to wait for index creation to complete before returning. Defaults to True.
)



In [28]:
res = client.describe_index(
    collection_name="finance_articles",
    index_name="article_vectors"
)
print(res)


{'index_type': 'FLAT', 'metric_type': 'COSINE', 'field_name': 'vector', 'index_name': 'article_vectors', 'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}


In [29]:
source_url_map = {i:urls[i] for i in range(len(urls))}
url_source_map = {v:k for k,v in source_url_map.items()}

In [30]:
vector_data = []
for i in range(len(vectors)):
    vector_data.append({'id':i,
                        'source_id':url_source_map[chunks[i].metadata['source']],
                        'vector':vectors[i]})

In [31]:
res = client.insert(
    collection_name="finance_articles",
    data=vector_data
)



In [11]:
question = "What are the number of firms that will apply for certification by the end of September ?"
query = encoder.embed_query(question)

In [35]:
client.load_collection("finance_articles")

In [37]:
res = client.search(
    collection_name="finance_articles", # Replace with the actual name of your collection
    # Replace with your query vector
    data=[query],
    limit=5, # Max. number of search results to return
    search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
    output_fields=['source_id']
)


In [41]:
res[0]

[{'id': 9, 'distance': 0.7440287470817566, 'entity': {'source_id': 0}},
 {'id': 4, 'distance': 0.43733227252960205, 'entity': {'source_id': 0}},
 {'id': 7, 'distance': 0.42282533645629883, 'entity': {'source_id': 0}},
 {'id': 5, 'distance': 0.39404892921447754, 'entity': {'source_id': 0}},
 {'id': 8, 'distance': 0.3491817116737366, 'entity': {'source_id': 0}}]

In [15]:
connections.connect(
  alias="default", 
  host='localhost', 
  port='19530'
)

In [16]:
total_records = Collection("finance_articles").num_entities
total_records

69

In [4]:
index_params = {
    "index_type": 'IVF_FLAT',
    "params": {"nlist": 128},
    "metric_type": "COSINE"
}

In [5]:
vector_store = Milvus(
    embedding_function=encoder,
    collection_name="finance_articles",
    auto_id=True,
    index_params=index_params
)

In [63]:
vector_store.add_documents(chunks)

[453178359009251338,
 453178359009251339,
 453178359009251340,
 453178359009251341,
 453178359009251342,
 453178359009251343,
 453178359009251344,
 453178359009251345,
 453178359009251346,
 453178359009251347,
 453178359009251348,
 453178359009251349,
 453178359009251350,
 453178359009251351,
 453178359009251352,
 453178359009251353,
 453178359009251354,
 453178359009251355,
 453178359009251356,
 453178359009251357,
 453178359009251358,
 453178359009251359,
 453178359009251360,
 453178359009251361,
 453178359009251362,
 453178359009251363,
 453178359009251364,
 453178359009251365,
 453178359009251366,
 453178359009251367,
 453178359009251368,
 453178359009251369,
 453178359009251370,
 453178359009251371,
 453178359009251372,
 453178359009251373,
 453178359009251374,
 453178359009251375,
 453178359009251376,
 453178359009251377,
 453178359009251378,
 453178359009251379,
 453178359009251380,
 453178359009251381,
 453178359009251382,
 453178359009251383,
 453178359009251384,
 453178359009

In [11]:
question = "What are the total number of firms that will apply for certification by the end September?"
vector_store.similarity_search(question,k=4,expr=f"source in {urls[1:2]}")

[Document(metadata={'source': 'https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html', 'pk': 453178359009251375}, page_content='Forum\n\nMC 30\n\nNews\n\nBusiness\n\nMarkets\n\nStocks\n\nIncome Tax Calculator\n\nElection Schedule 2024\n\nIndia News\n\nEconomy\n\nMutual Funds\n\nPersonal Finance\n\nIPO News\n\nStartups\n\nStocks: A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z | Others'),
 Document(metadata={'source': 'https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html', 'pk': 453178359009251379}, page_content='Overdrive\n\nTopper Learning\n\nAbout us | Contact Us | Advisory Alert | Advertise with Us | Support | Disclaimer | Privacy Policy | Cookie Policy | Terms & Conditions | Careers | Financial Terms (Glossary) | FAQs | Sitemap | RSS Feed | Investors'),
 Document(metadata={'source': 'https://www.moneycon

In [35]:
PROMPT_TEMPLATE = """Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags and avoid using first person pronouns.
If you don't know the answer, just say that you don't know, don't try to make up an answer or just say you cannot find it if it is really not present there.
<context>
{summaries}
</context>

<question>
{question}
</question>

The response should be specific and use statistics or numbers when possible.

Assistant:
"""
prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["summaries", "question"]
)

In [36]:
chain = RetrievalQAWithSourcesChain.from_chain_type(llm=llm,
                                             retriever=vector_store.as_retriever(search_kwargs={'k':2,'expr':f"source in {urls[:2]}"}),
                                             chain_type="stuff",
                                             return_source_documents=True,
                                             chain_type_kwargs={"prompt":prompt}
                                             )

In [37]:
question = "What are the total number of firms that will apply for certification by the end of January 2025 ?"
chain(question)

{'question': 'What are the total number of firms that will apply for certification by the end of January 2025 ?',
 'answer': "According to the provided context, the government expects 23 more firms to apply for the certification by the end of September 2023, in addition to the 6 firms (including Tata Motors and Mahindra) that have already applied. There is no information available to predict the number of firms that will apply by the end of January 2025. Therefore, it's not possible to provide a specific answer to this question.",
 'sources': '',
 'source_documents': [Document(metadata={'source': 'https://www.moneycontrol.com/news/business/tata-motors-mahindra-gain-certificates-for-production-linked-payouts-11281691.html', 'pk': 453178359009251347}, page_content='Apart from Tata Motors and Mahindra, four other companies have applied for certification which is crucial for receiving the incentives, the government said, without naming the companies.\n\nThe government also said it expects 

In [31]:
langchain.debug=False