In [None]:
import langchain
import google.genai
from dotenv import load_dotenv
!pip install langchain_google_genai -q
!pip install -U langchain langchain-core langchain-community -q

In [None]:
!pip install pymupdf -q
!pip install -U langchain-text-splitters -q

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
import getpass
import os

# Only prompt for the key when the environment does not already provide one;
# getpass hides the typed value so it never appears in the cell output.
if os.environ.get("GOOGLE_API_KEY") is None:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

# Chat model shared by the rest of the notebook.
# max_tokens and timeout are passed as None (no explicit limit set here);
# failed requests are retried up to two times.
model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=1.0,
    max_retries=2,
    max_tokens=None,
    timeout=None,
)

Enter your Google AI API key: ··········


In [None]:
# Smoke test: send a trivial prompt to confirm the API key and model work,
# and display only the text of the reply.
model.invoke("hello").content

'Hi there! How can I help you today?'

## Data Ingestion

In [None]:
# Data load: point a PyMuPDF-backed loader at the report PDF.
from langchain_community.document_loaders import PyMuPDFLoader

# NOTE(review): absolute path under /content — presumably a Colab upload
# location; adjust when running elsewhere.
file_path = "/content/Entrepreneurial Intention Prediction Report.pdf"
loader = PyMuPDFLoader(file_path=file_path)



In [None]:
# Parse the PDF into a list of Document objects (PyMuPDFLoader yields one
# Document per page in its default mode).
docs = loader.load()
if not docs:
    # Fail with a clear message instead of an IndexError / empty index later on.
    raise ValueError("The PDF loader returned no pages; nothing to index.")

# Concatenate the text of EVERY page. The previous version built the list of
# all page texts and then kept only element [0], so everything after the first
# page was silently dropped from the index. A blank line between pages gives
# the recursive splitter a natural break at each page boundary.
document = "\n\n".join(page.page_content for page in docs)

In [None]:
# Text splitting: cut the document text into chunks for embedding.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# NOTE(review): 100-character chunks with no overlap produce fragments such as
# a lone 'business,' chunk in the displayed output — consider larger values.
CHUNK_SIZE = 100
CHUNK_OVERLAP = 0

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
texts = text_splitter.split_text(document)

In [None]:
# Display the chunks produced by the splitter to inspect where the text was cut.
texts

['Entrepreneurial Intention Prediction \nReport \nName: Ayush Vishwakarma\u200b',
 'Dataset: GEM APS Global Individual Level Data\u200b',
 'Target Variable: Entrepreneurial Intention (futsupno)\u200b\nDate: 07 October 2025 \n \n1. Introduction',
 'The objective of this analysis is to model and predict entrepreneurial intention among individuals',
 'based on demographic, attitudinal, and social network features. The GEM 2020 Adult',
 'Population Survey (APS) dataset provides various attributes related to personal characteristics,',
 'motivations, and perceptions regarding entrepreneurship.',
 'The target variable, futsupno, indicates whether an individual intends to engage in',
 'entrepreneurial activities in the near future. A value of 1 represents intention to start a',
 'business,',
 'while 0 indicates no intention. \n \n2. Selected Features',
 'Seventeen predictors were used for modeling:',
 '●\u200b Demographic: age, gender, hhsize, gemhhinc, gemeduc\u200b',
 '●\u200b Entrepreneuri

In [None]:
# Embedding: Gemini embedding model used for the chunks (and later by the
# vector store, which receives this same `embeddings` object).
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
)

# Embed every chunk once as a sanity check; the displayed length is the
# number of embedding vectors returned.
vector = embeddings.embed_documents(texts)
len(vector)

16

In [None]:
# Vector store: embed the chunks and keep them in an in-memory index.
from langchain_core.vectorstores import InMemoryVectorStore

vectorstore = InMemoryVectorStore.from_texts(
    texts,
    embedding=embeddings,
)

# Retriever view of the store, used by the RAG chain; it can also be queried
# directly with `retriever.invoke(query)`.
retriever = vectorstore.as_retriever()

# Sanity-check retrieval with a sample question. The hits get their own name
# so the `docs` list of loaded PDF pages is not overwritten by search results.
query = "what is Dataset"
retrieved_docs = vectorstore.similarity_search(query, k=4)

# Display the results
for rank, hit in enumerate(retrieved_docs, start=1):
    print(f"Document {rank}:")
    print(hit.page_content)
    print("-" * 50)

Document 1:
Population Survey (APS) dataset provides various attributes related to personal characteristics,
--------------------------------------------------
Document 2:
Dataset: GEM APS Global Individual Level Data​
--------------------------------------------------
Document 3:
while 0 indicates no intention. 
 
2. Selected Features
--------------------------------------------------
Document 4:
●​ Demographic: age, gender, hhsize, gemhhinc, gemeduc​
--------------------------------------------------


In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text for the question-answering step. The {question} and {context}
# placeholders are filled in when the prompt is formatted.
template = (
    "You are an assistant for question-answering tasks.\n"
    "Use the following pieces of retrieved context to answer the question.\n"
    "If you don't know the answer, just say that you don't know.\n"
    "Use ten sentences maximum and keep the answer concise.\n"
    "Question: {question}\n"
    "Context: {context}\n"
    "Answer:\n"
)

In [None]:
# Wrap the raw template string in a chat prompt and display it to confirm
# which input variables were detected.
prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse ten sentences maximum and keep the answer concise.\nQuestion: {question}\nContext: {context}\nAnswer:\n"), additional_kwargs={})])

In [None]:
from langchain_core.runnables import RunnablePassthrough


def _format_docs(documents):
    """Join the text of retrieved documents into a single context string.

    Args:
        documents: iterable of objects exposing a ``page_content`` string
            (the Documents returned by the retriever).

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# RAG chain: the incoming question is sent to the retriever and also passed
# through unchanged. The retrieved Documents are flattened to plain text before
# being placed in the prompt; piping the retriever straight into the prompt
# would interpolate the Python repr of the Document list (ids, metadata, quoting)
# instead of the passage text.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
)
rag_chain.invoke("dataset")

AIMessage(content='The GEM APS Global Individual Level Data dataset offers a variety of personal attributes. This dataset is also known as the Population Survey (APS) dataset. It includes information on personal characteristics. The dataset has seventeen predictors that were used for modeling.', additional_kwargs={}, response_metadata={'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash-lite', 'safety_ratings': [], 'model_provider': 'google_genai'}, id='lc_run--019b4c64-19de-71e3-951b-12027123fd10-0', usage_metadata={'input_tokens': 298, 'output_tokens': 47, 'total_tokens': 345, 'input_token_details': {'cache_read': 0}})