In [74]:
import os
import requests
from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pymongo import MongoClient
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, pipeline
from peft import get_peft_model, LoraConfig, TaskType
from langchain_community.document_loaders.mongodb import MongodbLoader
from langchain.vectorstores import Weaviate
from bs4 import BeautifulSoup  
from langchain_community.document_loaders import UnstructuredURLLoader, UnstructuredPDFLoader, PyPDFLoader
from datetime import datetime
from urllib.parse import urlparse, unquote
import weaviate
from weaviate.connect import ConnectionParams
from weaviate.collections.classes.config import CollectionConfig, Property, DataType, VectorizerConfig
import asyncio
from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults
from langchain_community.document_loaders import WebBaseLoader
from langchain.agents import Tool, initialize_agent
from langchain.agents.agent_types import AgentType
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA 
import torch
import accelerate
import streamlit as st
from dotenv import load_dotenv
load_dotenv()

True

In [91]:

# Use defined Langchain agent to search documents automatically
# The agent is combined search tool with a LLM
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Or another quantized variant
token = os.getenv("LLaMA_API_KEY") # huggingface token

tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",  # Automatically use GPU if available
                                                torch_dtype=torch.float16, token=token)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256) 
llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 70.76it/s]
Device set to use cpu


In [114]:

# Use structured search tool
search_tool = DuckDuckGoSearchResults(output_format="list")
# query = "Find 3 useful documents (including web articles, PDFs, and others) about battery materials in supply chains, at least of them should be from a paper in PDF"
query = "find the site locations of BYD's battery material lithium, return as (latitude, longitude)"
search_results = search_tool.invoke(query)

# Print webpage links
print("Webpage Links:")
for result in search_results[:100]: 
    print("-", result["link"])

# Combine snippets for summarization
combined_snippets = "\n".join(f"{r['title']}: {r['snippet']}" for r in search_results[:3])

# LLM prompt
summary_prompt = f"""You are a helpful assistant.

Here are some search result summaries:
{combined_snippets}

Please summarize the most useful insights in 3 bullet points.
"""

response = llm(summary_prompt)
print("\nSummary:")
print(response)

Webpage Links:
- https://money.usnews.com/investing/news/articles/2025-02-13/exclusive-chinas-byd-holds-mining-rights-in-brazils-lithium-valley-documents-show
- https://procurementmag.com/supply-chain-management/byd-expands-into-brazils-lithium-valley-with-acquisition
- https://engineerine.com/byd-blade-battery/
- https://interestingengineering.com/transportation/china-byd-secures-lithium-mining

Summary:
* **BYD secures lithium mining rights in Brazil:**  BYD, a major Chinese electric vehicle (EV) manufacturer, has acquired mining rights covering a significant area in Brazil's lithium-rich Jequitinhonha Valley, strategically positioning itself near its planned EV factory in Bahia.

* **Strategic location for EV production:** This acquisition directly supports BYD's new EV factory in Bahia, reducing transportation costs and securing a crucial raw material supply for its EV production.

* **Conflicting reports on BYD's battery technology:** While BYD is investing in lithium mining, othe

In [48]:
# === Step 1: Load Raw Data ===
# Load web resource with Langchain, not including PDF
url = "https://python.langchain.com/docs/how_to/document_loader_pdf/"
loader = WebBaseLoader(url)
documents = loader.load()
print(documents)

# Load PDF with Langchain (works for both local and online PDF)
file_path = "https://jessvb.github.io/assets/pdf/Autonomous_Vehicles_Tech_Today_Tomorrow.pdf"
loader = PyPDFLoader(file_path)

async def load_pages():
    pages = []
    async for page in loader.alazy_load():
        pages.append(page)
    return pages

pages = await load_pages()
full_text = "\n".join([p.page_content for p in pages])
metadata = pages[0].metadata if pages else {}

print(f"{pages[0].metadata}\n")
print(pages[0].page_content) 



  blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)


{'producer': 'PyPDF', 'creator': 'Elsevier', 'creationdate': '2018-03-14T06:25:57+00:00', 'author': 'Jessica Van Brummelen', 'crossmarkdomains[1]': 'elsevier.com', 'crossmarkdomains[2]': 'sciencedirect.com', 'crossmarkdomainexclusive': 'true', 'crossmarkmajorversiondate': '2010-04-23', 'elsevierwebpdfspecifications': '6.5', 'keywords': 'Automotive sensors; Autonomous vehicles; Intelligent vehicles; Localization and mapping; Machine vision; Sensor fusion', 'moddate': '2018-03-14T06:25:57+00:00', 'subject': 'Transportation Research Part C, 89 (2018) 384-406. doi:10.1016/j.trc.2018.02.012', 'title': 'Autonomous vehicle perception_ The technology of today and tomorrow', 'doi': '10.1016/j.trc.2018.02.012', 'robots': 'noindex', 'source': 'https://jessvb.github.io/assets/pdf/Autonomous_Vehicles_Tech_Today_Tomorrow.pdf', 'total_pages': 23, 'page': 0, 'page_label': '384'}

Contents lists available atScienceDirect
Transportation Research Part C
journal homepage: www.elsevier.com/locate/trc
Revie

  pages = await load_pages()


In [None]:
# # Extract sub-links on a homepage of website

# from urllib.parse import urljoin

# visited = set()
# max_depth = 2  # Prevents infinite loops

# def crawl_nested_tabs(url, base_url, depth=0):
#     if url in visited or depth > max_depth:
#         return []

#     visited.add(url)

#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, "html.parser")
#     except:
#         return []

#     text = soup.get_text(separator="\n", strip=True)
#     results = [(url, text)]

#     # Find nested links (e.g., subtabs, buttons, inner menus)
#     links = [urljoin(base_url, a["href"]) for a in soup.find_all("a", href=True)]

#     for link in links:
#         if base_url in link:  # Stay within the domain
#             results += crawl_nested_tabs(link, base_url, depth + 1)

#     return results

# homepage = "https://example.com"
# data = crawl_nested_tabs(homepage, base_url=homepage)



In [10]:
print(len(pages))

23


In [11]:
# Data store in MongoDB
# --- Helper functions ---
def generate_title_from_url(url: str) -> str:
    path = urlparse(url).path
    title = path.strip("/").split("/")[-1].replace("_", " ")
    return unquote(title).title()

def generate_title_from_filename(filepath: str) -> str:
    name = os.path.basename(filepath).replace("_", " ").replace("-", " ")
    return os.path.splitext(name)[0].title()


myclient = MongoClient("mongodb://localhost:27017/")
mydb = myclient["test_db"]
mycol = mydb["test1"]

for doc in documents:
    insert_docu = {
        "content": doc.page_content,
        "metadata": {
            "source": url,
            "title": generate_title_from_url(url),
            "doc_type": "webpage",
            "timestamp": datetime.utcnow()
        }
    }
    doc_obj = Document(
        page_content=insert_docu["content"],
        metadata=insert_docu.get("metadata", {})    
    ) 
    query = {
        "page_content": doc_obj.page_content,
        "metadata.title": doc_obj.metadata.get("title")
    }
    if mycol.count_documents(query) == 0:
        mycol.insert_one(insert_docu)
    else:
        print("Document already exists.")

single_document = {
    "content": full_text,
    "metadata": {
        **metadata,
        "doc_type": "pdf",
        "source": file_path,
        "title": generate_title_from_filename(file_path),
        "timestamp": datetime.utcnow()
    }
}

for doc in [single_document]:
    doc_obj = Document(
        page_content=doc["content"],
        metadata=doc.get("metadata", {})    
    )
    query = {
        "page_content": doc_obj.page_content,
        "metadata.title": doc_obj.metadata.get("title")
    }
    if mycol.count_documents(query) == 0:
        mycol.insert_one(doc)
    else:
        print("Document already exists.")

print("✅ Webpage and PDF stored with full metadata.")  


✅ Webpage and PDF stored with full metadata.


  "timestamp": datetime.utcnow()
  "timestamp": datetime.utcnow()


In [28]:
# Data retrieve from MongoDB based on timestamp and convert to Langchain format, splitting and embedding
def data_retrieval_by_time(client):
    db_name = client['test_db']
    col_name = db_name['test1']
    start_time = datetime(2025, 1, 1)
    end_time = datetime(2025, 5, 1)
    data = col_name.find({"metadata.timestamp": {
            "$gte": start_time,
            "$lte": end_time
        }   
    })
    return data

results = data_retrieval_by_time(myclient)

# Convert to Langchain format
langchain_documents = [
    Document(
        page_content=doc["content"],
        metadata=doc.get("metadata", {})
    ) for doc in results
]

# Splitting
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

split_docs = splitter.split_documents(langchain_documents)
# print(f"Total chunks created: {len(split_docs)}")
# print("Example chunk:\n", split_docs[0].page_content)
# print("Metadata:", split_docs[0].metadata)

# Embedding
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

texts = [doc.page_content for doc in split_docs]
text_embeddings = hf.embed_documents(texts)

print(f"Vector length: {len(text_embeddings)}")  # Should match model dimension, e.g., 433
print(f"First 5 values: {text_embeddings[:5]}")

In [32]:
# # Delete entire class from Weaviate
# client = weaviate.connect_to_local()  # ✅ Version 4.x
# client.collections.delete("Supply_chain_material")

In [33]:

import weaviate.classes as wvc

client = weaviate.connect_to_local()  # ✅ Version 4.x
# Define collection name
collection_name = "Supply_chain_material" # Collection name must have the first letter capitalized in Weaviate

# Check if collection/class already exists
existing_collections = client.collections.list_all()

if collection_name not in existing_collections:
    questions = client.collections.create(
        name=collection_name,
        properties=[
            wvc.config.Property(
                name="content",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="source",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="timestamp",
                data_type=wvc.config.DataType.DATE,
            )
        ]
    )


  client = weaviate.connect_to_local()  # ✅ Version 4.x


In [34]:
# Store/retrieve data to/from Weaviate
from uuid import uuid4

# Get the collection object
collection = client.collections.get("supply_chain_material")

# Store documents and vectors
for doc, vector in zip(split_docs, text_embeddings):
    data_obj = {
        "content": doc.page_content
    }

    # Insert the document with vector
    collection.data.insert(properties=data_obj, vector=vector, uuid=uuid4())

In [108]:
# Get the collection object
collection = client.collections.get(collection_name)

query_question = 'Which sensors are currently used in prominent research and commercial vehicles?'
query_vector = hf.embed_query(query_question)
results = collection.query.near_vector(query_vector, limit=3)
retrieved_chunks = [obj.properties["content"] for obj in results.objects]

# LLM prompt

context = "\n".join(retrieved_chunks)
prompt = f"""Answer the question using the following context:

Context:
{context}

Question:
{query_question}

Answer:"""
response = llm(prompt)
print(response)

Llama.generate: 471 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    3282.41 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   21003.77 ms /   187 runs   (  112.32 ms per token,     8.90 tokens per second)
llama_perf_context_print:       total time =   21102.14 ms /   188 tokens



Prominent research and commercial vehicles such as Audi’s Research Vehicle, AutoNOMOS’ s MadeInGermany, Carnegie Mellon’s Urban Challenge entry, “Boss”, Ford’s Hybrid Fusion research vehicle, and Volkswagen Passat use a variety of sensors. These include:
LIDAR (Light Detection and Ranging) for surrounding perception, radar (Radio Detection and Ranging) for long-range detection, sonar for detecting obstacles in close proximity, cameras (Vision Stereovision Infrared) for visual perception, and other sensors such as GPS and accelerometers.
All of these sensors work together to provide a comprehensive view of the vehicle’s surroundings, allowing it to make informed decisions about how to navigate its environment.


In [None]:
# Fine tune pre-trained LLM--LLaMA2 with LoRA algorithm

# Step 1: Connect to Weaviate and fetch documents

response = client.query.get(collection_name, ["content"]).with_limit(1000).do()
documents = [doc["content"] for doc in response["data"]["Get"][collection_name]]

# Step 2: Prepare HuggingFace dataset
dataset = Dataset.from_dict({"text": documents})

# Step 3: Load pre-trained tokenizer and model
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Or another quantized variant
token = "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" # huggingface token
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",  # Automatically use GPU if available
                                                torch_dtype=torch.bfloat16, token=token,load_in_8bit=True) 

# Step 4: Apply LoRA for lightweight fine-tuning
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

# Step 5: Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 6: Set up Trainer
training_args = TrainingArguments(
    output_dir="./llama2-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    logging_steps=10,
    save_steps=100,
    num_train_epochs=3,
    fp16=True,
    logging_dir="./logs",
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Step 7: Train the model
trainer.train()

# Step 8: Save model
model.save_pretrained("llama2-finetuned")
tokenizer.save_pretrained("llama2-finetuned")


In [112]:

# import google.generativeai as genai


# # Configure the API
# genai.configure(api_key=os.environ["Google_API_KEY"])

# # Load the Gemini model
# model = genai.GenerativeModel("models/gemini-1.5-flash")

# # Ask a question
# question = "find BYD battery material production sites, list top 10"
# response = model.generate_content(question)

# # Print the response
# print("Answer:")
# print(response.text)

# from langchain.llms import LlamaCpp

# llm = LlamaCpp(
#     model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
#     n_ctx=2048,
#     temperature=0.7,
#     max_tokens=256,
#     verbose=False
# )

# response = llm("how many countries in the world?")
# print(response)


llama_context: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized



There are currently 195 recognized sovereign states in the world. This number includes 193 Member States of the United Nations, 2 Observer States (the Holy See and Palestine), and 2 territories (Taiwan and Western Sahara) that are not fully recognized as sovereign states but have been granted observer status in international organizations.
The list of sovereign states in the world is as follows:
1. Afghanistan
2. Albania
3. Algeria
4. Andorra
5. Angola
6. Antigua and Barbuda
7. Argentina
8. Armenia
9. Australia
10. Austria
11. Azerbaijan
12. Bahamas
13. Bahrain
14. Bangladesh
15. Barbados
16. Belarus
17. Belgium
18. Belize
19. Benin
20. Bhutan
21. Bolivia
22. Bosnia and Herzegovina
23. Botswana
24. Brazil
25. Brunei
26. Bulgaria
27. Burkina Faso
28. Burundi
29. Cambodia
30. Cameroon
31. Canada
32. Central African Republic
33. Chad
34. Chile
35. China
36. Colombia
37. Comoros
38. Congo (Brazzaville)
39. Costa Rica
40. Côte d'Ivoire
41. Croatia
42. Cuba
43. Cyprus
44. Czech Republic
45.