In [1]:
import os
import uuid
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import pdfplumber
from docx import Document
import os
import json
import csv
import xml.etree.ElementTree as ET
from openpyxl import load_workbook
import pandas as pd
from dotenv import load_dotenv, dotenv_values 

In [2]:
# Configuration cell. load_dotenv() must run before the os.getenv lookup below
# so that values supplied through a dotenv file are visible in the environment.
from dotenv import load_dotenv, dotenv_values 
load_dotenv()

# Model identifier used as the default `model` argument of ask_groq_llm.
GROQ_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
# os.getenv returns None when the variable is unset; nothing here checks for
# that, so a missing key only surfaces when a request is actually sent.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

In [3]:
def sliding_window(text, window_size=1000, overlap=500):
    """Split ``text`` into overlapping fixed-size character windows.

    Args:
        text: The string to split.
        window_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``window_size`` so that every step moves forward.

    Returns:
        A list of substrings in document order. The last chunk may be shorter
        than ``window_size``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``window_size`` is not positive or ``overlap`` is not
            smaller than ``window_size`` (the window would never advance and
            the loop would not terminate).
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if overlap >= window_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than window_size ({window_size})"
        )

    step = window_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + window_size
        chunks.append(text[start:end])
        # Stop as soon as a window reaches the end of the text; otherwise the
        # following windows would only repeat the tail already captured.
        if end >= len(text):
            break
    return chunks

def chunk_file(file_path, row_chunk_size=10):
    """Read a file and split it into payload dicts, one per chunk.

    The splitting strategy is chosen from the lower-cased file extension:

    * ``.txt`` / ``.docx`` / ``.pdf``: the whole text is passed to
      ``sliding_window`` with its default settings.
    * ``.json``: one chunk per list item, per top-level key/value pair of an
      object, or a single chunk for any other JSON value.
    * ``.xml``: one chunk per direct child of the root element.
    * ``.csv`` / ``.xlsx``: one CSV-formatted chunk per ``row_chunk_size`` rows.

    Args:
        file_path: Path of the file to read.
        row_chunk_size: Number of table rows per chunk for tabular files.

    Returns:
        A list of ``{"text": <chunk>, "file": <base name of file_path>}`` dicts.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    file_name = os.path.basename(file_path)
    suffix = os.path.splitext(file_path)[1].lower()

    def as_payloads(pieces):
        # Attach the originating file name to every piece of text.
        payloads = []
        for piece in pieces:
            payloads.append({"text": piece, "file": file_name})
        return payloads

    if suffix == ".txt":
        with open(file_path, "r", encoding="utf-8") as handle:
            content = handle.read()
        return as_payloads(sliding_window(content))

    if suffix == ".docx":
        paragraphs = Document(file_path).paragraphs
        return as_payloads(sliding_window("\n".join(p.text for p in paragraphs)))

    if suffix == ".pdf":
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may yield nothing for a page; treat that as empty.
            page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
        return as_payloads(sliding_window("".join(page_texts)))

    if suffix == ".json":
        with open(file_path, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
        if isinstance(parsed, list):
            records = parsed
        elif isinstance(parsed, dict):
            records = [{key: value} for key, value in parsed.items()]
        else:
            records = [parsed]
        return as_payloads([json.dumps(record, ensure_ascii=False) for record in records])

    if suffix == ".xml":
        root = ET.parse(file_path).getroot()
        return as_payloads([ET.tostring(child, encoding='unicode') for child in root])

    if suffix in (".csv", ".xlsx"):
        reader = pd.read_csv if suffix == ".csv" else pd.read_excel
        frame = reader(file_path)
        row_blocks = (
            frame.iloc[offset:offset + row_chunk_size]
            for offset in range(0, len(frame), row_chunk_size)
        )
        return as_payloads(
            [block.to_csv(index=False, lineterminator='\n') for block in row_blocks]
        )

    raise ValueError(f"Unsupported file extension: {suffix}")

In [4]:
# Collect the payload chunks of every regular file in the corpus directory.
# A file that fails to parse is reported and skipped so the rest still load.
chunks = []
for file in os.listdir('../texts'):
    file_path = os.path.join('../texts', file)
    if not os.path.isfile(file_path):
        continue  # skip anything that is not a regular file
    try:
        chunks.extend(chunk_file(file_path))
        # The printed count is the running total over all files so far.
        print(f"File: {file}, Chunks: {len(chunks)}")
    except Exception as e:
        print(f"Error processing {file}: {e}")

File: battery_technology_data.xlsx, Chunks: 10
File: battery_tech_unique_data.json, Chunks: 110
File: graphene_battery.txt, Chunks: 160
File: how_lithium_ion_batteries_work_doj.pdf, Chunks: 161
File: how_to_prolong_lithium_ion_batteries.pdf, Chunks: 165
File: liquid-batteries.txt, Chunks: 206
File: outlook_on_lithium_batteries.pdf, Chunks: 278
File: paper_battery_tech.pdf, Chunks: 477
File: summary_britannica.pdf, Chunks: 558


In [6]:
# Load the sentence-embedding model and embed the text of every chunk.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# `chunks` holds payload dicts ({"text": ..., "file": ...}); pass the encoder
# only the text strings instead of depending on how it treats dict inputs.
chunk_texts = [chunk["text"] for chunk in chunks]
embeddings = model.encode(chunk_texts, convert_to_tensor=True)
print(f"Generated {len(embeddings)} embeddings.")
print(embeddings)

Generated 558 embeddings.
tensor([[-0.0093,  0.0307, -0.0964,  ..., -0.0519,  0.0797,  0.0123],
        [-0.0262,  0.0354, -0.0795,  ..., -0.0707,  0.0807,  0.0245],
        [-0.0099,  0.0234, -0.0970,  ..., -0.0655,  0.0765,  0.0050],
        ...,
        [-0.0930,  0.0909, -0.0533,  ..., -0.0308,  0.0004,  0.0326],
        [-0.0776,  0.0843, -0.0835,  ..., -0.0349, -0.0131,  0.1084],
        [-0.0792,  0.0708, -0.0515,  ..., -0.0400, -0.0076,  0.1469]])


In [None]:
# Connection settings. The endpoint can be overridden through the QDRANT_URL
# environment variable; the fallback is the cluster this notebook was written
# against. The API key is read from the environment only.
QDRANT_URL = os.getenv(
    "QDRANT_URL",
    "https://1ae2c6f5-15a4-49e9-af8d-96ab7348e31e.eu-central-1-0.aws.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")


client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)
collection_name = "battery_chunks"
vector_size = embeddings.shape[1]

# Start from an empty collection on every run. recreate_collection is
# deprecated in qdrant-client, so drop and create explicitly instead.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)

# Every chunk needs exactly one vector; a mismatch would pair texts with the
# wrong embeddings or silently drop entries.
assert len(embeddings) == len(chunks), "embeddings and chunks are out of sync"

# Point ids are the chunk positions; each embedding row is converted to a
# plain list of floats before being handed to the client.
points = [
    PointStruct(id=i, vector=embedding.tolist(), payload=chunk)
    for i, (embedding, chunk) in enumerate(zip(embeddings, chunks))
]

client.upsert(collection_name=collection_name, points=points)

  client.recreate_collection(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [151]:
# Retrieval sanity check: embed a sample query and list the three closest
# chunks with their similarity score and source file.
query = "Paper battery technology"
query_vector = model.encode(query)

response = client.query_points(collection_name=collection_name, query=query_vector, limit=3)

for point in response.points:
    print("\nScore: " + format(point.score, ".4f"))
    print("File:", point.payload.get('file'))


Score: 0.7259
File: paper_battery_tech.pdf

Score: 0.7067
File: paper_battery_tech.pdf

Score: 0.6953
File: paper_battery_tech.pdf


In [152]:
# Question for the retrieval + generation steps in the following cells.
# Rebinds `query` and `query_vector`, which later cells read.
query = "What type of battery is BAT-0002?"
query_vector = model.encode(query)

In [153]:
# Fetch the 20 nearest stored chunks for the current query vector.
response = client.query_points(collection_name=collection_name, query=query_vector, limit=20)

# Keep only the non-empty "text" payload values, then join them into a single
# context string with a blank line between chunks.
retrieved_chunks = list(
    filter(None, (point.payload.get("text") for point in response.points))
)
context = "\n\n".join(retrieved_chunks)

In [154]:
def generate_prompt(context: str, query: str) -> str:
    """Build the prompt sent to the LLM from retrieved context and a question.

    Args:
        context: Retrieved text to ground the answer in.
        query: The user's question.

    Returns:
        The prompt: a role line, an instruction, the context, the question and
        a trailing ``Answer:`` cue, separated by blank lines.
    """
    sections = [
        "You are a technical expert.",
        "Use the following context to answer the user's question.",
        f"Context:\n{context}",
        f"Question:\n{query}",
        "Answer:",
    ]
    return "\n\n".join(sections)

In [155]:
def ask_groq_llm(prompt, model=GROQ_MODEL, key=GROQ_API_KEY, temperature=0.7, timeout=60):
    """Send a prompt to the Groq chat-completions endpoint and return the reply.

    Args:
        prompt: Text sent as the user message.
        model: Model identifier placed in the request body.
        key: API key sent as a bearer token.
        temperature: Sampling temperature placed in the request body.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection raises instead of blocking the notebook forever.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a technical expert."},
            {"role": "user", "content": prompt}
        ],
        "temperature": temperature
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP errors before trying to read the body as a completion.
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

In [156]:
# Ask the LLM with the retrieved context, then list every retrieved hit with
# its similarity score and source file.
prompt = generate_prompt(context, query)
answer = ask_groq_llm(prompt, model=GROQ_MODEL, key=GROQ_API_KEY)
print(f"\nAnswer: {answer}", end="\n\n")
for point in response.points:
    score = point.score
    file = point.payload.get("file", "Unknown")
    # The empty trailing item produces the blank line after each hit.
    print(f"Score: {score:.4f}", f"File: {file}", "", sep="\n")


Answer: According to the provided context, BAT-0002 is a Li-ion battery.

Score: 0.5829
File: battery_technology_data.xlsx

Score: 0.5747
File: battery_technology_data.xlsx

Score: 0.5737
File: battery_technology_data.xlsx

Score: 0.5687
File: battery_technology_data.xlsx

Score: 0.5667
File: battery_technology_data.xlsx

Score: 0.5607
File: battery_technology_data.xlsx

Score: 0.5590
File: battery_technology_data.xlsx

Score: 0.5549
File: battery_technology_data.xlsx

Score: 0.5527
File: battery_technology_data.xlsx

Score: 0.5482
File: battery_technology_data.xlsx

Score: 0.5385
File: summary_britannica.pdf

Score: 0.5339
File: summary_britannica.pdf

Score: 0.5221
File: summary_britannica.pdf

Score: 0.5177
File: summary_britannica.pdf

Score: 0.5157
File: battery_tech_unique_data.json

Score: 0.5130
File: summary_britannica.pdf

Score: 0.5126
File: battery_tech_unique_data.json

Score: 0.5124
File: summary_britannica.pdf

Score: 0.5090
File: battery_tech_unique_data.json

Score: 0