In [None]:
# Install the package that provides the `langchain_pinecone` import used below.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): the version is unpinned, so a fresh environment may pull a
# different release than the one this notebook was written against — consider pinning.
%pip install langchain-pinecone

In [8]:
import getpass
import hashlib
import os

import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone
from tqdm import tqdm

# Obtain the Pinecone API key without ever writing it into the notebook source.
#
# The key is read from the PINECONE_API_KEY environment variable. An existing
# value is respected (never overwritten). If the variable is unset or empty,
# the user is prompted through getpass so the secret is not stored in the cell.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    pinecone_api_key = getpass.getpass("Pinecone API key: ").strip()
    if not pinecone_api_key:
        raise ValueError(
            "No Pinecone API key provided: set the PINECONE_API_KEY environment "
            "variable or enter the key when prompted."
        )
    # Export the entered key so that any code looking up PINECONE_API_KEY in the
    # environment sees the same value as `pinecone_api_key`.
    os.environ['PINECONE_API_KEY'] = pinecone_api_key

# The key is deliberately not echoed: ending the cell with a bare
# `pinecone_api_key` expression would persist the secret in the saved output.

'<redacted>'

In [9]:
# Path of the articles JSON file, relative to the notebook's working directory.
relative_path = os.path.join('.', 'final_results.json')

# Splitter reused by the chunking cell further down, configured with
# chunk_size=2000 and chunk_overlap=200 (sizes are presumably counted in
# characters by the splitter's default length function — confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

# Load the JSON file into a DataFrame
data = pd.read_json(relative_path)

# Fail early, with a message naming the file, if the columns read below are
# absent. The exception type matches what the row lookups would raise anyway.
missing_columns = {'article_text', 'title'} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"{relative_path} is missing required column(s): {sorted(missing_columns)}"
    )

# One Document per article: the full article text plus its title as metadata.
documents = [
    Document(
        page_content=row['article_text'],
        metadata={
            # A missing title becomes "" — the same rule the chunking cell
            # applies — so the metadata never carries NaN.
            'title': row['title'] if pd.notnull(row['title']) else "",
        }
    )
    for _, row in data.iterrows()
]

In [10]:
# Accumulates one Document per text chunk across all articles.
split_documents = []

# Walk the articles behind a progress bar. Each article's text is cut into
# chunks, and every chunk becomes a Document tagged with the article's title
# (an empty string when the title is missing).
for _, row in tqdm(data.iterrows(), desc="Splitting documents", total=data.shape[0]):
    article_text = row['article_text']
    title = "" if pd.isnull(row['title']) else row['title']
    chunks = text_splitter.split_text(article_text)

    split_documents.extend(
        Document(page_content=chunk, metadata={'title': title})
        for chunk in chunks
    )

Splitting documents: 100%|██████████| 1741/1741 [00:00<00:00, 1848.55it/s]


In [11]:
def _stable_chunk_ids(docs):
    """Return one deterministic vector ID per document, in the same order.

    Each ID is the SHA-256 hex digest of the document's title and text,
    followed by ``-<n>`` where ``n`` counts earlier documents in ``docs``
    with the same digest. The suffix keeps IDs unique within one run even
    when two chunks are byte-identical, so every chunk is still stored.
    """
    occurrences = {}
    ids = []
    for doc in docs:
        payload = f"{doc.metadata.get('title', '')}\n{doc.page_content}"
        digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
        seen = occurrences.get(digest, 0)
        occurrences[digest] = seen + 1
        ids.append(f"{digest}-{seen}")
    return ids


# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)

# Connect to the Pinecone index
index_name = "elrond-index"
index = pc.Index(index_name)

# Embedding model used to vectorise every chunk.
# NOTE(review): the index's dimension must match this model's output size — confirm.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Upsert the chunks under content-derived IDs. When no IDs are supplied the
# vector store is expected to generate fresh random ones (verify against the
# langchain-pinecone docs), so every re-run of this cell would add another
# copy of each chunk. With stable IDs a re-run overwrites the same records.
vectorstore = PineconeVectorStore.from_documents(
    split_documents,
    embeddings,
    pinecone_api_key=pinecone_api_key,
    index_name=index_name,
    ids=_stable_chunk_ids(split_documents),
)



In [24]:
# Question to answer from the indexed articles.
query = "What is U-boot?"

# Similarity retriever over the vector store, returning the top 6 matches.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)
retrieved_docs = retriever.invoke(query)

# Chat model that will produce the answer.
llm = ChatGroq(model="llama3-70b-8192")

# Prompt template. {question} and {context} are placeholders; nothing is
# substituted into them in this cell.
prompt = """
You are an assistant for question-answering tasks specifically on Embedded Linux and its components like U-boot, Linux kernel, hardware and software stack. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. Answer the questions with 512 words top.

Question: {question} 

Context: {context} 

Answer:
"""

 Document(page_content='U-Boot\n \n\nIntroduction\n U-Boot is an open-source bootloader commonly used in embedded devices. It has its origins in a very simple bootloader designed for the PowerPC architecture which was publicly released in 2000 under the name of PPCBoot. Shortly thereafter it was renamed U-Boot (short for Das Universal Boot) to reflect its evolution into a multi architectural bootloader. Today, U-Boot is a fully-fledged bootloader supporting more than a dozen architectures, several filesystems, and a handful of interfaces. It features a console interface through the serial port with low-level commands and environment variables that provide high flexibility when configuring the boot process. The most remarkable achievement, however, is its good driver assortment, which has established it as the preferred bootloader for most embedded platforms. Toradex also uses U-Boot as the bootloader for its images. You can find the code in our repositories. This article explains how t