In [1]:
import os
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv
load_dotenv(dotenv_path="/teamspace/studios/this_studio/groq-chatbot/.env")

# Credentials and service endpoints come from the process environment
# (populated by the load_dotenv call above); a value is None when its
# variable is not set.
llamaparseApi = os.environ.get("LLAMA_CLOUD_API_KEY")
groqApi = os.environ.get("GROQ_API")
qdrantApi = os.environ.get("QDRANT_API_KEY")
qdrantUrl = os.environ.get("QDRANT_URL")

In [3]:
from llama_parse import LlamaParse
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant
import pickle


def loadOrParseData(dataFile="./parsedData.pkl", pdfFile="./syllabusCSE.pdf"):
    """Return the parsed syllabus documents, backed by an on-disk pickle cache.

    If ``dataFile`` exists and holds a non-empty result it is returned
    directly. Otherwise ``pdfFile`` is parsed with LlamaParse (markdown
    output) and the result is written to ``dataFile`` for later runs.

    Args:
        dataFile: Path of the pickle cache. Defaults to the original
            hard-coded location.
        pdfFile: Path of the PDF to parse on a cache miss.

    Returns:
        The list of documents returned by ``LlamaParse.load_data`` (or the
        previously cached copy of it). May be empty if parsing produced
        nothing.
    """
    if os.path.exists(dataFile):
        try:
            # NOTE(security): pickle.load can execute arbitrary code embedded
            # in the file. Only point dataFile at a cache written by this
            # notebook, never at a file from an untrusted source.
            with open(dataFile, "rb") as f:
                cachedData = pickle.load(f)
        except (EOFError, pickle.UnpicklingError):
            # Truncated/corrupt cache (e.g. an interrupted earlier write):
            # treat it as a miss and rebuild below.
            cachedData = None

        # An empty cached result is useless downstream, so re-parse instead
        # of returning it forever.
        if cachedData:
            return cachedData

    # Written as explicit concatenation so the exact whitespace that is sent
    # to the parser (trailing space, indented continuation lines) is visible.
    parsingInstruction = (
        "The provided document is the syllabus of B. Tech for Computer Science & Engineering (CSE) of NIT- Delhi. \n"
        "        This form provides details of the subject courses that will be taught to a student of B. Tech, Computer Science & Engineering in NIT- Delhi.\n"
        "        Try to be precise while answering the questions"
    )
    parser = LlamaParse(api_key=llamaparseApi, result_type="markdown", parsing_instruction=parsingInstruction)
    parsedData = parser.load_data(pdfFile)

    # Only persist a non-empty result; caching an empty list would make a
    # single failed parse permanent (presumably LlamaParse returns [] when it
    # swallows a parsing error - confirm against its `ignore_errors` option).
    if parsedData:
        with open(dataFile, "wb") as f:
            pickle.dump(parsedData, f)

    return parsedData

# Load the parsed syllabus documents (from the pickle cache when it exists).
parsedData = loadOrParseData()

In [4]:
# Path of the raw markdown dump of the parsed documents.
markdownFile = "./output.md"

def clean_file(input_path, output_path):
    """Write a copy of ``input_path`` with all invalid UTF-8 bytes removed.

    If ``input_path`` does not exist yet, it is first created by dumping the
    text of every parsed document (one per line) obtained from
    ``loadOrParseData()``.

    Args:
        input_path: Markdown file to clean; generated on demand when missing.
        output_path: Destination for the cleaned text (overwritten).
    """
    if not os.path.exists(input_path):
        # Only load the parsed documents when the dump actually has to be
        # built; the dump goes to input_path so that the read below always
        # finds the file this function was asked to clean.
        parsed_data = loadOrParseData()
        with open(input_path, 'w', encoding='utf-8', errors='replace') as f:
            for doc in parsed_data:
                f.write(doc.text + '\n')

    # Read raw bytes so undecodable sequences can be dropped explicitly
    # instead of raising during a text-mode read.
    with open(input_path, 'rb') as f:
        content = f.read()

    with open(output_path, 'w', encoding='utf-8', errors='ignore') as f:
        f.write(content.decode('utf-8', errors='ignore'))

# Strip undecodable bytes from the markdown dump before it is loaded for indexing.
clean_file(input_path=markdownFile, output_path='cleaned_output.md')

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.embeddings import OllamaEmbeddings
# import nltk
# nltk.download()


def vectorDB(markdownFile="./cleaned_output.md", collectionName="rag",
             chunkSize=2000, chunkOverlap=100, forceRecreate=False):
    """Chunk the cleaned markdown, embed it with Ollama and upload it to Qdrant.

    Args:
        markdownFile: Cleaned markdown file to index.
        collectionName: Name of the target Qdrant collection.
        chunkSize: Maximum characters per chunk for the text splitter.
        chunkOverlap: Characters shared between consecutive chunks.
        forceRecreate: Forwarded to Qdrant as ``force_recreate``. The default
            (False) matches the original call, which did not pass the flag.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        FileNotFoundError: If ``markdownFile`` does not exist. Raised before
            any embedding or upload work is attempted.
    """
    # Fail early with a clear message instead of an error from deep inside
    # the loader.
    if not os.path.exists(markdownFile):
        raise FileNotFoundError(
            f"Cleaned markdown file not found: {markdownFile}"
        )

    markdownLoader = UnstructuredMarkdownLoader(markdownFile)
    documents = markdownLoader.load()

    textSplitter = RecursiveCharacterTextSplitter(chunk_size=chunkSize, chunk_overlap=chunkOverlap)
    docs = textSplitter.split_documents(documents)

    # Nothing to index: stop before contacting Ollama/Qdrant with an empty batch.
    if not docs:
        print("No parsed data available.")
        return

    embeddings = OllamaEmbeddings(
        model="znbang/bge:small-en-v1.5-f16",
        base_url="http://localhost:11434"
    )

    Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        api_key=qdrantApi,
        url=qdrantUrl,
        collection_name=collectionName,
        force_recreate=forceRecreate,
    )

    print('Vector DB created successfully!')

# Run the indexing pipeline: load the cleaned markdown, chunk it, embed it and upload it to Qdrant.
vectorDB()

Vector DB created successfully!


In [15]:
from langchain_community.embeddings import OllamaEmbeddings

# Sanity check: embed a single sample question through the local Ollama
# server and peek at the first ten components of the resulting vector.
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="znbang/bge:small-en-v1.5-f16",
)
text = "What is the syllabus of Operating System?"
queryRes = embeddings.embed_query(text)
queryRes[:10]

[-0.4520740807056427,
 0.027799703180789948,
 0.08549992740154266,
 -0.8991678357124329,
 -0.021244749426841736,
 -0.4902018904685974,
 -0.4642481207847595,
 0.09723646938800812,
 0.22709813714027405,
 -0.27400046586990356]