In [17]:
import os
import requests
from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pymongo import MongoClient
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, pipeline
from peft import get_peft_model, LoraConfig, TaskType
from langchain_community.document_loaders.mongodb import MongodbLoader
from langchain.vectorstores import Weaviate
from bs4 import BeautifulSoup  
from langchain_community.document_loaders import UnstructuredURLLoader, UnstructuredPDFLoader, PyPDFLoader
from datetime import datetime
from urllib.parse import urlparse, unquote
import weaviate
from weaviate.connect import ConnectionParams
from weaviate.collections.classes.config import CollectionConfig, Property, DataType, VectorizerConfig
import asyncio
from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults
from langchain_community.document_loaders import WebBaseLoader
from langchain.agents import Tool, initialize_agent
from langchain.agents.agent_types import AgentType
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_huggingface import HuggingFacePipeline
import torch
import accelerate

In [18]:
# Use defined Langchain agent to search documents automatically
# The agent is combined search tool with a LLM
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Or another quantized variant
token = "" # huggingface token

tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",  # Automatically use GPU if available
                                                torch_dtype=torch.float16, token=token)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256) 
llm = HuggingFacePipeline(pipeline=pipe)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 70.51it/s]
Device set to use cpu


In [8]:
# Define tool
# # LLaMA2 does not have the output format compatible with Langchain when using initialize_agent, we need to re-format mannually, but openAI's output format is compatible
# search_tool = DuckDuckGoSearchRun()
# tools = [
#     Tool(
#         name="DuckDuckGo Search",
#         func=search_tool.run,
#         description="Useful for finding recent web pages"
#     )
# ]

# # Use LLaMA2 LLM instead of OpenAI

# agent = initialize_agent(
#     tools=tools,
#     llm=llm,
#     agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
#     verbose=True
# )
# response = agent.run("Find 3 useful links about battery materials in supply chains")
# print(response)


# Use structured search tool
search_tool = DuckDuckGoSearchResults(output_format="list")
query = "Find 3 useful documents (including web articles, PDFs, and others) about battery materials in supply chains, at least of them should be from a paper in PDF"
search_results = search_tool.invoke(query)

# Print webpage links
print("Webpage Links:")
for result in search_results[:3]:  # Only top 3
    print("-", result["link"])

# Combine snippets for summarization
combined_snippets = "\n".join(f"{r['title']}: {r['snippet']}" for r in search_results[:3])

# LLM prompt
summary_prompt = f"""You are a helpful assistant.

Here are some search result summaries:
{combined_snippets}

Please summarize the most useful insights in 3 bullet points.
"""

response = llm(summary_prompt)
print("\nSummary:")
print(response)

Webpage Links:
- https://pubs.acs.org/doi/10.1021/acsenergylett.4c01300
- https://www.weforum.org/publications/powering-the-future-overcoming-battery-supply-chain-challenges-with-circularity/
- https://www.researchgate.net/publication/390319076_Critical_Materials_for_EV_Batteries_Challenges_Opportunities_and_Policymakers

Summary:
You are a helpful assistant.

Here are some search result summaries:
Insights into the Critical Materials Supply Chain of the Battery Market ...: This paper delves into the critical materials supply chain of the battery market with an emphasis on long-term energy security. The study recognizes electric vehicle battery packs as reservoirs of "locked reserves" for extended periods, typically 10 years or more. A comprehensive understanding of material flows and end-of-life battery management is essential to establish a sustainable ...
Battery circularity | World Economic Forum: The lack of effective tracking systems for battery materials hinders responsible sour

In [9]:
# === Step 1: Load Raw Data ===
# Load web resource with Langchain, not including PDF
url = "https://python.langchain.com/docs/how_to/document_loader_pdf/"
loader = WebBaseLoader(url)
documents = loader.load()
print(documents)

# Load PDF with Langchain (works for both local and online PDF)
file_path = "https://jessvb.github.io/assets/pdf/Autonomous_Vehicles_Tech_Today_Tomorrow.pdf"
loader = PyPDFLoader(file_path)

async def load_pages():
    pages = []
    async for page in loader.alazy_load():
        pages.append(page)
    return pages

pages = await load_pages()

# Combine all page contents into one document
full_text = "\n".join([p.page_content for p in pages])
metadata = pages[0].metadata if pages else {}

print(f"{pages[0].metadata}\n")
print(pages[0].page_content) 



  blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)


{'producer': 'PyPDF', 'creator': 'Elsevier', 'creationdate': '2018-03-14T06:25:57+00:00', 'author': 'Jessica Van Brummelen', 'crossmarkdomains[1]': 'elsevier.com', 'crossmarkdomains[2]': 'sciencedirect.com', 'crossmarkdomainexclusive': 'true', 'crossmarkmajorversiondate': '2010-04-23', 'elsevierwebpdfspecifications': '6.5', 'keywords': 'Automotive sensors; Autonomous vehicles; Intelligent vehicles; Localization and mapping; Machine vision; Sensor fusion', 'moddate': '2018-03-14T06:25:57+00:00', 'subject': 'Transportation Research Part C, 89 (2018) 384-406. doi:10.1016/j.trc.2018.02.012', 'title': 'Autonomous vehicle perception_ The technology of today and tomorrow', 'doi': '10.1016/j.trc.2018.02.012', 'robots': 'noindex', 'source': 'https://jessvb.github.io/assets/pdf/Autonomous_Vehicles_Tech_Today_Tomorrow.pdf', 'total_pages': 23, 'page': 0, 'page_label': '384'}

Contents lists available atScienceDirect
Transportation Research Part C
journal homepage: www.elsevier.com/locate/trc
Revie

In [None]:
# # Extract sub-links on a homepage of website

# from urllib.parse import urljoin

# visited = set()
# max_depth = 2  # Prevents infinite loops

# def crawl_nested_tabs(url, base_url, depth=0):
#     if url in visited or depth > max_depth:
#         return []

#     visited.add(url)

#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, "html.parser")
#     except:
#         return []

#     text = soup.get_text(separator="\n", strip=True)
#     results = [(url, text)]

#     # Find nested links (e.g., subtabs, buttons, inner menus)
#     links = [urljoin(base_url, a["href"]) for a in soup.find_all("a", href=True)]

#     for link in links:
#         if base_url in link:  # Stay within the domain
#             results += crawl_nested_tabs(link, base_url, depth + 1)

#     return results

# homepage = "https://example.com"
# data = crawl_nested_tabs(homepage, base_url=homepage)



In [10]:
print(len(pages))

23


In [11]:
# Data store in MongoDB
# --- Helper functions ---
def generate_title_from_url(url: str) -> str:
    path = urlparse(url).path
    title = path.strip("/").split("/")[-1].replace("_", " ")
    return unquote(title).title()

def generate_title_from_filename(filepath: str) -> str:
    name = os.path.basename(filepath).replace("_", " ").replace("-", " ")
    return os.path.splitext(name)[0].title()


myclient = MongoClient("mongodb://localhost:27017/")
mydb = myclient["test_db"]
mycol = mydb["test1"]

for doc in documents:
    insert_docu = {
        "content": doc.page_content,
        "metadata": {
            "source": url,
            "title": generate_title_from_url(url),
            "doc_type": "webpage",
            "timestamp": datetime.utcnow()
        }
    }
    doc_obj = Document(
        page_content=insert_docu["content"],
        metadata=insert_docu.get("metadata", {})    
    ) 
    query = {
        "page_content": doc_obj.page_content,
        "metadata.title": doc_obj.metadata.get("title")
    }
    if mycol.count_documents(query) == 0:
        mycol.insert_one(insert_docu)
    else:
        print("Document already exists.")

single_document = {
    "content": full_text,
    "metadata": {
        **metadata,
        "doc_type": "pdf",
        "source": file_path,
        "title": generate_title_from_filename(file_path),
        "timestamp": datetime.utcnow()
    }
}

for doc in [single_document]:
    doc_obj = Document(
        page_content=doc["content"],
        metadata=doc.get("metadata", {})    
    )
    query = {
        "page_content": doc_obj.page_content,
        "metadata.title": doc_obj.metadata.get("title")
    }
    if mycol.count_documents(query) == 0:
        mycol.insert_one(doc)
    else:
        print("Document already exists.")

print("✅ Webpage and PDF stored with full metadata.")  


✅ Webpage and PDF stored with full metadata.


  "timestamp": datetime.utcnow()
  "timestamp": datetime.utcnow()


In [12]:
# Data retrieve from MongoDB based on timestamp and convert to Langchain format, splitting and embedding
def data_retrieval_by_time(client):
    db_name = client['test_db']
    col_name = db_name['test1']
    start_time = datetime(2025, 1, 1)
    end_time = datetime(2025, 5, 1)
    data = col_name.find({"metadata.timestamp": {
            "$gte": start_time,
            "$lte": end_time
        }   
    })
    return data

results = data_retrieval_by_time(myclient)

# Convert to Langchain format
langchain_documents = [
    Document(
        page_content=doc["content"],
        metadata=doc.get("metadata", {})
    ) for doc in results
]

# Splitting
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

split_docs = splitter.split_documents(langchain_documents)
# print(f"Total chunks created: {len(split_docs)}")
# print("Example chunk:\n", split_docs[0].page_content)
# print("Metadata:", split_docs[0].metadata)

# Embedding
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

texts = [doc.page_content for doc in split_docs]
text_embeddings = hf.embed_documents(texts)

print(f"Vector length: {len(text_embeddings)}")  # Should match model dimension, e.g., 433
print(f"First 5 values: {text_embeddings[:5]}")

Vector length: 433
First 5 values: [[-0.0043886140920221806, -0.017521359026432037, 0.025291312485933304, 0.03053566999733448, -0.0004389022942632437, 0.028207436203956604, 0.011579020880162716, 0.02672705240547657, 0.03842499107122421, 0.01722649857401848, -0.0670296922326088, 0.012565222568809986, 0.005071254447102547, -0.03175431862473488, 0.0170876644551754, -0.004057024605572224, -0.0041424185037612915, -0.021515296772122383, -0.04955199360847473, -0.052555929869413376, 0.01107547152787447, 0.03909304738044739, 0.028485585004091263, -0.02290114387869835, 0.03350146859884262, 0.022235026583075523, 0.001642092363908887, 0.009964756667613983, 0.01040856447070837, 0.006959304679185152, 0.05446632206439972, -0.04878581687808037, 0.009880749508738518, 0.020284287631511688, 1.5946853864079458e-06, -0.044569071382284164, -0.03408395126461983, 0.047009196132421494, 0.002549940487369895, -0.012526975013315678, 0.0698142796754837, -0.04923154041171074, -0.05681130290031433, 0.021649336442351

In [13]:

import weaviate.classes as wvc

client = weaviate.connect_to_local()  # ✅ Version 4.x
# Define collection name
collection_name = "Supply_chain_material" # Collection name must have the first letter capitalized in Weaviate

# Check if collection already exists
existing_collections = client.collections.list_all()

if collection_name not in existing_collections:
    questions = client.collections.create(
        name=collection_name,
        properties=[
            wvc.config.Property(
                name="content",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="source",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="timestamp",
                data_type=wvc.config.DataType.DATE,
            )
        ]
    )


In [14]:
# Store/retrieve data to/from Weaviate
from uuid import uuid4

# Get the collection object
collection = client.collections.get("supply_chain_material")

# Store documents and vectors
for doc, vector in zip(split_docs, text_embeddings):
    data_obj = {
        "content": doc.page_content
    }

    # Insert the document with vector
    collection.data.insert(properties=data_obj, vector=vector, uuid=uuid4())

In [83]:
query_question = 'Which sensors are currently used in prominent research and commercial vehicles?'
query_embedding = hf.embed_query(query_question)

# Get the collection object
collection = client.collections.get(collection_name)

# Perform a vector similarity search
results = collection.query.near_vector(
    near_vector=query_embedding,
    limit=3
)

# Print results
for o in results.objects:
    print(o.properties["content"])


Vehicle Vision Stereovision Infrared
Camera
LIDAR Radar Sonar
Research Vehicles
Audi’s Research Vehicle (Gitlin, 2016b; Pachal, 2016; Souppouris, 2014) ✓✓ ✓ ✓ ✓ ✓
AutoNOMOS’ s MadeInGermany (Volkswagen Passat) (Ghring et al., 2013) ✓✓ ✓ ✓
Carnegie Mellon’ s Urban Challenge entry,“Boss” (2007 Chevy Tahoe; 1st place) (Grisleri
and Fedriga, 2010; Urmson et al., 2008)
✓✓ ✓
Ford’ s Hybrid Fusion research vehicle (Gitlin, 2016a) ✓✓ ✓ ✓
perception requiring further improvement as well as information about recent related advancements in technology.
5.1. Automotive sensor research areas and advancements
This section provides a summary of future areas of research to improve AV sensors and information about recent advancements in
AV sensors. The following list summarizes areas related to AV sensors needing further development:
 Improving detection and reducing uncertainty in poor lighting and weather conditions
3.1. Automotive sensor technology overview
As an overview, automotive sensing falls i

In [None]:
# Fine tune pre-trained LLM--LLaMA2 with LoRA algorithm

# Step 1: Connect to Weaviate and fetch documents

response = client.query.get(collection_name, ["content"]).with_limit(1000).do()
documents = [doc["content"] for doc in response["data"]["Get"][collection_name]]

# Step 2: Prepare HuggingFace dataset
dataset = Dataset.from_dict({"text": documents})

# Step 3: Load pre-trained tokenizer and model
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Or another quantized variant
token = "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" # huggingface token
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",  # Automatically use GPU if available
                                                torch_dtype=torch.bfloat16, token=token,load_in_8bit=True) 

# Step 4: Apply LoRA for lightweight fine-tuning
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

# Step 5: Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 6: Set up Trainer
training_args = TrainingArguments(
    output_dir="./llama2-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    logging_steps=10,
    save_steps=100,
    num_train_epochs=3,
    fp16=True,
    logging_dir="./logs",
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Step 7: Train the model
trainer.train()

# Step 8: Save model
model.save_pretrained("llama2-finetuned")
tokenizer.save_pretrained("llama2-finetuned")
