In [None]:
%pwd

In [None]:
import os

# Remember the directory the kernel started in. A bare relative chdir is not
# idempotent: every re-run of this cell would climb one more level. Anchoring
# on the remembered start directory makes repeated runs land in the same place.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [None]:
%pwd

In [None]:
import json
import os
from typing import List

import requests
from dotenv import load_dotenv
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import AIMessage
from langchain_core.outputs import ChatGeneration, ChatResult
from langchain_core.prompts import ChatPromptTemplate
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec


In [None]:
def pdf_text_extractor(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files matching
            the glob ``*.pdf`` are read with ``PyPDFLoader``.

    Returns:
        The value returned by ``DirectoryLoader.load()`` for those files.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [None]:
# Raw string literal: in a regular literal "\C", "\E" and "\d" are invalid
# escape sequences (a warning on current Python, slated to become an error).
# The resulting path value is exactly the same as before.
# NOTE(review): this is a machine-specific absolute path; consider pointing at
# the data directory relative to the project root instead.
extracted_data = pdf_text_extractor(r'D:\CHATBOT-01\Educational-chatbot\data')

In [None]:
# Preview only the first few loaded documents; displaying the entire list
# floods the notebook output and bloats the saved file.
extracted_data[:3]

In [None]:
# Number of items the loader returned.
len(extracted_data)

In [None]:
from typing import List
from langchain.schema import Document

In [None]:
def filter_min_docs(docs: List[Document]) -> List[Document]:
    """Rebuild each document with its metadata reduced to the ``source`` key.

    Args:
        docs: Documents whose ``page_content`` is kept unchanged.

    Returns:
        A new list of ``Document`` objects, in the same order as ``docs``.
        Each carries ``{"source": ...}`` as its only metadata; the value is
        ``None`` when the original metadata has no ``source`` entry.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]
        

In [None]:
# Strip every metadata field except "source" before chunking.
min_docs = filter_min_docs(extracted_data)

In [None]:
# Preview only the first few filtered documents; displaying the entire list
# floods the notebook output and bloats the saved file.
min_docs[:3]

In [None]:
def text_split(min_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks for embedding.

    Args:
        min_docs: Documents to split.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            20, the value previously hard-coded here.

    Returns:
        The chunks produced by ``split_documents`` for ``min_docs``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(min_docs)

In [None]:
# Chunk the filtered documents and show how many chunks were produced.
texts_chunk = text_split(min_docs)
len(texts_chunk)


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Create the embedding wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the model
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


embedding = download_embeddings()



In [None]:
# Embed a sample query as a smoke test and display the resulting vector.
vector = embedding.embed_query("Hello world")
vector

In [None]:
# The embedding length is expected to match the `dimension` the Pinecone index is created with.
print("length of vector:", len(vector))

In [None]:
from dotenv import load_dotenv
import os
# Load variables from a .env file so that the os.getenv() lookups for the API keys can find them.
load_dotenv()

In [None]:
# Read the API keys from the process environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")

# os.environ accepts only strings: assigning a missing key (None) raises an
# opaque TypeError. Re-export only the keys that were found and name the
# missing ones explicitly instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("DEEPSEEK_API_KEY", DEEPSEEK_API_KEY),
):
    if _key_value is None:
        print(f"Warning: {_key_name} is not set; add it to your .env file.")
    else:
        os.environ[_key_name] = _key_value

In [None]:
# Never print the secret itself: notebook outputs are saved with the file and
# easily end up in version control. Report only whether the key is available.
print("DEEPSEEK_API_KEY loaded:", bool(os.getenv("DEEPSEEK_API_KEY")))


In [None]:
from pinecone import Pinecone

# Create the Pinecone client with the key read from the environment.
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Display the client object as a quick check that it was constructed.
pc

In [None]:
from pinecone import ServerlessSpec

index_name = "educational-chatbot"

# Create the serverless index only when it does not exist yet.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # Expected to equal the length of the vectors produced by `embedding`.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Bind the handle outside the `if`: previously `index` was only assigned when
# the index had just been created, leaving it undefined on every later run.
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks with `embedding` and store them in the Pinecone index.
# NOTE(review): re-running this cell presumably uploads the same chunks again;
# confirm whether duplicate vectors are acceptable.
vector_store = PineconeVectorStore.from_documents(
    documents=texts_chunk, embedding=embedding, index_name=index_name
)

In [None]:
# Similarity-search retriever configured with k=3 results per query.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
# Smoke-test the retriever with a sample question and display what it returns.
retriever_doc = retriever.invoke("Explain computer vision")
retriever_doc

In [None]:
# OpenRouter exposes an OpenAI-style chat API whose roles are "user" and
# "assistant", while LangChain message types are "human" and "ai".
_OPENROUTER_ROLES = {"human": "user", "ai": "assistant", "system": "system"}


class OpenRouterChat(BaseChatModel):
    """Custom LangChain chat model backed by the OpenRouter chat-completions API.

    Attributes are declared as annotated fields because ``BaseChatModel`` is a
    pydantic model; un-declared attributes cannot be assigned in ``__init__``.

    Args:
        api_key: OpenRouter API key sent as a bearer token.
        model_name: OpenRouter model identifier. Defaults to ``openai/gpt-4o``.
    """

    api_key: str
    model_name: str = "openai/gpt-4o"
    url: str = "https://openrouter.ai/api/v1/chat/completions"
    # Seconds to wait for the HTTP response; without a timeout a stalled
    # connection would block the notebook cell indefinitely.
    request_timeout: float = 60.0

    def __init__(self, api_key: str, model_name: str = "openai/gpt-4o", **kwargs):
        # Keep the original (api_key, model_name) call signature while letting
        # pydantic validate and store the declared fields.
        super().__init__(api_key=api_key, model_name=model_name, **kwargs)

    def _call(self, messages, **kwargs):
        """Send ``messages`` to OpenRouter and return the reply text.

        Args:
            messages: LangChain messages exposing ``type`` and ``content``.
            **kwargs: Optional ``stop`` sequences are forwarded to the API;
                other keyword arguments are ignored.

        Returns:
            The ``content`` string of the first choice in the response.

        Raises:
            RuntimeError: If the API answers with a non-200 status code.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        data = {
            "model": self.model_name,
            "messages": [
                # Unknown message types are passed through unchanged.
                {"role": _OPENROUTER_ROLES.get(m.type, m.type), "content": m.content}
                for m in messages
            ],
        }
        stop = kwargs.get("stop")
        if stop:
            data["stop"] = stop

        response = requests.post(
            self.url, headers=headers, json=data, timeout=self.request_timeout
        )
        if response.status_code != 200:
            # Include the status code so callers can recognise e.g. 429 responses.
            raise RuntimeError(
                f"OpenRouter API error ({response.status_code}): {response.text}"
            )

        return response.json()["choices"][0]["message"]["content"]

    def _generate(self, messages, stop=None, run_manager=None, **kwargs):
        """Entry point that ``BaseChatModel`` requires subclasses to implement.

        Wraps the text returned by ``_call`` in a ``ChatResult`` holding a
        single ``AIMessage`` generation.
        """
        content = self._call(messages, stop=stop, **kwargs)
        return ChatResult(
            generations=[ChatGeneration(message=AIMessage(content=content))]
        )

    @property
    def _llm_type(self):
        """Identifier string LangChain uses for this model type."""
        return "openrouter-chat"


# Initialize the OpenRouter Chat Model
# The key was previously referenced without ever being defined (NameError);
# read it from the environment and fail with a clear message when absent.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to your .env file before "
        "creating the chat model."
    )
ChatModel = OpenRouterChat(api_key=OPENROUTER_API_KEY, model_name="openai/gpt-4o")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate 

In [None]:
# Adjacent string literals are concatenated verbatim, so every sentence needs
# its own trailing space; otherwise the model receives "student.Use ONLY ..."
# and "question.Keep ...".
system_prompt = (
    "You are a friendly, expert educational assistant. Your goal is to provide "
    "clear and accurate information to a student. "
    "Use ONLY the following pieces of retrieved context to answer the question. "
    "If the context does not contain the answer, politely say that you don't "
    "have the necessary information to answer that question. "
    "Keep your answer to a maximum of three concise sentences."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions (with the {context}
# placeholder) followed by the user's question in {input}.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [None]:
# Combine the chat model and prompt into a document-answering chain, then put the retriever in front of it.
question_answering_chain = create_stuff_documents_chain(ChatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [None]:
try:
    response = rag_chain.invoke({"input": "What is Computational Thinking?"})
    print(response["answer"])
except Exception as e:
    # Only quota / rate-limit failures get a friendly hint; every other error
    # is re-raised unchanged so real problems stay visible.
    # The chat model calls OpenRouter, so the hint names that service.
    if "insufficient_quota" in str(e) or "429" in str(e):
        print("OpenRouter API quota exceeded. Please check your OpenRouter account billing and quota.")
    else:
        raise

In [None]:
# Never print the secret itself: notebook outputs are saved with the file and
# easily end up in version control. Report only whether the key is available.
print("DEEPSEEK_API_KEY loaded:", bool(DEEPSEEK_API_KEY))