In [2]:
from collections import defaultdict

In [81]:
from langchain.chains import (
    ConversationalRetrievalChain,
    RetrievalQA,
    RetrievalQAWithSourcesChain,
)
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain_google_community import (
    VertexAIMultiTurnSearchRetriever,
    VertexAISearchRetriever,
)
from langchain_google_vertexai import VertexAI

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
import re
from google.cloud import storage
import json
import vertexai


In [4]:


def read_json_from_gcs(bucket_name, file_path):
    """Download one object from Google Cloud Storage and parse it as JSON.

    Args:
        bucket_name: Name of the bucket holding the object.
        file_path: Path of the object (blob) inside the bucket.

    Returns:
        Whatever ``json.loads`` produces for the object's text content.
    """
    # A fresh client is created per call; it uses whatever credentials the
    # environment provides.
    blob = storage.Client().bucket(bucket_name).blob(file_path)
    raw_text = blob.download_as_text()
    return json.loads(raw_text)

# Bucket and object path of the topics file that drives the rest of the notebook.
bucket_name, file_path = "lossless-learning", "topics.json"

# Parsed JSON content; later iterated as domain -> sub-domain -> topic.
data = read_json_from_gcs(bucket_name=bucket_name, file_path=file_path)


In [56]:
# ---- Settings (the trailing `@param` markers are notebook form annotations) ----
PROJECT_ID = "ardent-sun-453501-d5"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
DATA_STORE_ID = "book_store"  # @param {type:"string"}
DATA_STORE_LOCATION = "global"  # @param {type:"string"}
MODEL = "gemini-2.0-flash"  # @param {type:"string"}

# ---- Vertex AI initialisation and the LLM used to write answers ----
vertexai.init(project=PROJECT_ID, location=LOCATION)
llm = VertexAI(model_name=MODEL)

# ---- Retriever over the Vertex AI Search data store ----
# Note that the data store location ("global") differs from the Vertex AI
# location ("us-central1") used above.
retriever = VertexAISearchRetriever(
    project_id=PROJECT_ID,
    location_id=DATA_STORE_LOCATION,
    data_store_id=DATA_STORE_ID,
    beta=True,
    max_documents=4,
    get_extractive_answers=True,
    max_extractive_answer_count=5,
    max_extractive_segment_count=5,
)

In [156]:

# System message for the question-answering chain. The fragments are joined
# with no separator, so any spacing between sentences must live inside the
# fragments themselves. `{context}` and `{input}` are template placeholders.
# NOTE(review): the text (including the "Guding" spelling) is kept verbatim,
# because changing it changes what is sent to the model.
system_prompt = "".join(
    [
        "Guding Principle - Follow the below instructions strictly:-",
        "You are a helpful AI that provides detailed and descriptive answers using LaTeX formatting. ",
        "Use proper LaTeX structure for equations with `$$...$$`, and text formatting. ",
        "Use at least 200 words in your response, and answer based only on the context provided. ",
        "If you don't know the answer, say you don't know.\n\n",
        "Context: {context}",
    ]
)

# Two-message chat prompt: the system instructions, then the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Combine the LLM + prompt into a documents chain, then put the retriever in
# front of it. `chain.invoke({"input": ...})` is what the rest of the notebook calls.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)


# Result accumulators. They are created in the same cell as the loop so that
# re-running the cell starts from empty containers.
docs = defaultdict(list)         # domain -> list of per-passage book records
summary_doc = defaultdict(list)  # domain -> list of per-topic summary records
null_topics = []                 # topics for which the chain returned no context

# Walk the topics structure (domain -> sub-domain -> topic), ask the retrieval
# chain to explain every topic, and record both the generated summary and the
# source passages that were retrieved for it.
for domain, sub_domains in data.items():
    print('started domain .....', domain)
    for sub_domain, topics in sub_domains.items():
        # Iterating the container directly yields its keys when it is a dict.
        for topic in topics:
            search_query = "Explain the {}".format(topic)
            # `res` is intentionally left as a plain module-level name: later
            # cells inspect the result of the last query through it.
            res = chain.invoke({"input": search_query})

            # A summary record is stored for every topic, with or without context.
            summary_doc[domain].append(
                {'domain': domain, 'sub_domain': sub_domain, 'topic': topic,
                 'summary': res['answer']}
            )

            context_docs = res['context']
            if not context_docs:
                # Nothing was retrieved; remember the topic for follow-up.
                null_topics.append(topic)
                continue

            # Record at most the first five retrieved passages.
            for passage in context_docs[:5]:
                raw_source = passage.metadata['source']

                # The code expects a source of the form "<uri ending in .pdf><digits>"
                # and treats the trailing digits as the page number.
                match = re.search(r'^(.*\.pdf)(\d+)$', raw_source)
                if match:
                    source = match.group(1)
                    page_number = int(match.group(2))
                else:
                    # Unrecognised source format: keep the record with blanks.
                    source = ''
                    page_number = ''

                # Fix: key by the current domain *value*. The original used the
                # literal string 'domain', which put every record in one bucket
                # and was inconsistent with `summary_doc[domain]` above.
                # NOTE(review): the 'page_oevrview' key is misspelled but kept
                # as-is because it is part of the emitted JSON schema; confirm
                # with downstream consumers before renaming.
                docs[domain].append(
                    {'domain': domain, 'sub_domain': sub_domain, 'topic': topic,
                     'resource_type': 'book', 'gcs_uri': source,
                     'page_no': page_number, 'page_oevrview': passage.page_content}
                )

    print('Domain done .....', domain)

started domain ..... Foundational Mathematics
Domain done ..... Foundational Mathematics
started domain ..... Programming Fundamentals
Domain done ..... Programming Fundamentals
started domain ..... Classical Machine Learning
Domain done ..... Classical Machine Learning
started domain ..... Deep Learning
Domain done ..... Deep Learning
started domain ..... MLOps
Domain done ..... MLOps


In [157]:
# Persist the per-passage book records as pretty-printed JSON in the working
# directory. `json` is already imported in the notebook's import cell.
with open("data_firstore.json", mode="w") as records_fh:
    json.dump(docs, records_fh, indent=4)

In [158]:
# Persist the per-topic summaries as pretty-printed JSON in the working directory.
with open("data_summary.json", mode="w") as summary_fh:
    json.dump(summary_doc, summary_fh, indent=4)

In [89]:
# Inspect the retrieved passages of the most recent chain call
# (`res` is whatever the last executed `chain.invoke` returned).
res["context"]

{'input': 'Explain the Differentiation in calculus',
 'context': [Document(metadata={'id': '61193cd79056eb817128e4722ba3de53', 'source': 'gs://lossless-learning/books/Calculus by Gilbert Strang.pdf55'}, page_content='The slope is the rate at which y changes with x. The derivative of a function is its &quot;rate of change.&quot; I mention that physics books use x(t)for distance. Darn it. To emphasize the definition of a derivative, here it is again with y and x: Ay - y(x + Ax)-y(x)- distance up -- - - dy = lim - AY = yl(x).'),
  Document(metadata={'id': '61193cd79056eb817128e4722ba3de53', 'source': 'gs://lossless-learning/books/Calculus by Gilbert Strang.pdf51'}, page_content='On the graph of f(t), the distance up is divided by the distance across. That gives the average slope Af /At. The left sides of (1) and (2)are instantaneous speeds dfldt. They give the slope at the instant t. This is the derivative dfldt (when At and Af shrink to zero). Look again.'),
  Document(metadata={'id': '6

In [92]:
# Generated answer text of the most recent chain call.
a = res["answer"]

In [93]:
# Collapse every doubled backslash in the answer into a single one, then
# print the result as plain text.
clean_s = "\\".join(a.split("\\\\"))
print(clean_s)


Differentiation in calculus concerns finding the derivative of a function. The derivative, often denoted as $f'(x)$ or $\frac{dy}{dx}$, represents the instantaneous rate of change of a function with respect to its variable. Geometrically, the derivative $f'(a)$ at a point $a$ is the slope of the tangent line to the graph of the function $f$ at that point.

**Formal Definition of the Derivative:**

The derivative of a function $y = f(x)$ is defined as the limit of the difference quotient as the change in $x$ (denoted as $\Delta x$ or $h$) approaches zero:

$\qquad \frac{dy}{dx} = f'(x) = \lim_{\Delta x \to 0} \frac{f(x + \Delta x) - f(x)}{\Delta x}$

This limit, if it exists, gives the instantaneous rate of change of $f(x)$ at the point $x$.

**Notations:**

Various notations are used to represent the derivative:

*   $f'(x)$ (Lagrange's notation)
*   $\frac{dy}{dx}$ (Leibniz's notation)
*   $y'$

Leibniz's notation $\frac{dy}{dx}$ emphasizes the idea of an infinitesimally small change 