In [5]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.text import TextLoader
import os
from dotenv import load_dotenv

In [None]:
load_dotenv() 

In [2]:
# List all folders in the specified directory
folder_names = os.listdir(r"D:\Langchain TuT\Apr23html")
folder_names[:10]

['AYUSH',
 'Cabinet',
 'Cabinet_Committee_on_Economic_Affairs_CC',
 'Competition_Commission_of_India',
 'Department_of_Atomic_Energy',
 'Department_of_Space',
 'Ministry_of_Agriculture_Farmers_Welfare',
 'Ministry_of_Chemicals_and_Fertilizers',
 'Ministry_of_Civil_Aviation',
 'Ministry_of_Coal']

In [3]:
# DOCS (2-D List) -> 0TH DIMENSION CONTAINS FOLDER NAME -> 1ST DIMENSION CONTAINS FILES IN THOSE FOLDERS

# Function to load documents from a specified directory
def load_documents(parent_folder):
    loader = DirectoryLoader(parent_folder, glob="**/*.txt", loader_cls=TextLoader)
    docs = loader.load()
    return docs

# Load documents from all folders
docs = []
for i in folder_names:
    parent_folder_path = os.path.join(r"D:\Langchain TuT\Apr23html", i)
    docs.append(load_documents(parent_folder_path))

In [4]:
# Example access to the second folder's documents
docs[1]

[Document(metadata={'source': 'D:\\Langchain TuT\\Apr23html\\Cabinet\\Cabinet_approves_National_Quantum_Missio.txt'}, page_content="Cabinet approves National Quantum Mission to scale-up scientific & industrial R&D for quantum technologies\n\nNational Quantum Mission received cabinet approval at a total cost of Rs. 6003.65 crore, to scale up scientific and industrial R&D, for accelerating Quantum Technology led economic growth and leverage India into a leading nation in the area\nThe Union Cabinet, chaired by the Hon'ble Prime Minister Shri Narendra Modi, today approved the National Quantum Mission (NQM) at a total cost of Rs.6003.65 crore from 2023-24 to 2030-31, aiming to seed, nurture and scale up scientific and industrial R&D and create a vibrant & innovative ecosystem in Quantum Technology (QT). This will accelerate QT led economic growth, nurture the ecosystem in the country and make India one of the leading nations in the development of Quantum Technologies & Applications (QTA).\

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text_with_metadata(docs, chunk_size=300, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks_with_metadata = []  # List to store the final chunked text

    for folder in docs:
        for txt_file in folder: # iterate over each text file "txt_file" is merely a variable name
            file_name = txt_file.metadata.get('file_name')  # Get file name from metadata
            content = txt_file.page_content  # Get the text content
            chunks = text_splitter.split_text(content)  # Chunk the text content

            # Append each chunk with metadata
            for chunk in chunks:
                chunks_with_metadata.append({
                    'file_name': file_name,
                    'chunk': chunk
                })
    
    return chunks_with_metadata

# Chunk the loaded documents
chunks = chunk_text_with_metadata(docs)

In [6]:
for chunk in chunks:
    print(chunk)

{'file_name': None, 'chunk': 'AIIA hosts walkthrough for Working Group of C20 on Integrated Holistic Health'}
{'file_name': None, 'chunk': 'MoU signed between AIIA and University of Amrita Vishwa Vidhyapeetham for collaboration in the field of Research and Academics in Ayurveda'}
{'file_name': None, 'chunk': 'All India Institute of Ayurveda (AIIA) under the Ministry of AYUSH hosted a walkthrough for the “Working Group of C20 on Integrated Holistic Health on 8th April 2023. A large group of over 400 delegates participated in the event and the institute showcased\xa0how an integrated approach in health care'}
{'file_name': None, 'chunk': 'an integrated approach in health care can be effectively implemented.'}
{'file_name': None, 'chunk': 'The C-20 is one of the eight official engagement groups of the G-20 forum. The 19 countries are Argentina, Australia, Brazil, Canada, China, Germany, France, India, Indonesia, Italy, Japan, Mexico, the Russian Federation, Saudi Arabia, South Africa, Sou

In [7]:
len(chunks)

16946

In [8]:
path = r"D:\Langchain TuT\Apr23html"
files = []

# Read all text files in the directory and store their contents
for file_name in os.listdir(path):
    if file_name.endswith('.txt'):
        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            files.append({
               'file_name': file_name,
               'content': content
            })

In [9]:
files

[]

In [11]:
# Base directory for folders
base_directory = r"\\GRATITUDE\Users\Public\Documents\Apr23html"

# Print all sub-directory paths
for i in folder_names:
    directory_path = os.path.join(base_directory, i)
    print(directory_path)

# %% 
# Check the total number of chunks with metadata


\\GRATITUDE\Users\Public\Documents\Apr23html\AYUSH
\\GRATITUDE\Users\Public\Documents\Apr23html\Cabinet
\\GRATITUDE\Users\Public\Documents\Apr23html\Cabinet_Committee_on_Economic_Affairs_CC
\\GRATITUDE\Users\Public\Documents\Apr23html\Competition_Commission_of_India
\\GRATITUDE\Users\Public\Documents\Apr23html\Department_of_Atomic_Energy
\\GRATITUDE\Users\Public\Documents\Apr23html\Department_of_Space
\\GRATITUDE\Users\Public\Documents\Apr23html\Ministry_of_Agriculture_Farmers_Welfare
\\GRATITUDE\Users\Public\Documents\Apr23html\Ministry_of_Chemicals_and_Fertilizers
\\GRATITUDE\Users\Public\Documents\Apr23html\Ministry_of_Civil_Aviation
\\GRATITUDE\Users\Public\Documents\Apr23html\Ministry_of_Coal
\\GRATITUDE\Users\Public\Documents\Apr23html\Ministry_of_Commerce_Industry
\\GRATITUDE\Users\Public\Documents\Apr23html\Ministry_of_Communications
\\GRATITUDE\Users\Public\Documents\Apr23html\Ministry_of_Consumer_Affairs_Food_Public
\\GRATITUDE\Users\Public\Documents\Apr23html\Ministry_of_Coo

In [13]:
len(chunks)

16946

In [8]:
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec # Import Pinecone and ServerlessSpec
from dotenv import load_dotenv
import os

load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")  # Add your Pinecone API key 
google_api_key = os.getenv("GOOGLE_API_KEY")  # Add your Google Generative AI key

pinecone_api_key = pinecone_api_key # Use your API key
pc = Pinecone(api_key=pinecone_api_key)

CREATE INDEX IN PINECONE  

In [19]:
# Create a new Pinecone index if not already existing
index_name = "test3"
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)

In [20]:
# Generate embeddings using HuggingFace model, in short, convert text to vectors
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")

In [21]:
vector_store = PineconeVectorStore(index=index, embedding=embeddings) # Create a PineconeVectorStore object, kis index mei store hona and kya store hona

COMMENTED OUT THE CELL ONLY BECAUSE WE DON'T WANNA STORE THE CHUNKS ALL OVER AGAIN IN THE DB

-> HOWEVER THE CELL RAN THE FIRST TIME IN ORDER TO STORE THE EMBEDDINGS ON TO THE DATABASE 

-> BUT AFTER THAT WE COMMENTEND OUT THE CELL SO THAT IT MAY NOT RUN AGAIN!!!

In [18]:
# vector_store.add_texts(texts=[chunk["chunk"] for chunk in chunks]) 

# ek baar vector store add karne ke baad usnhi ko vaapas se pull kr skte, since test3 mei already stored they vectors, to h hame vapas se load krne ki jarurat nhi padi

KeyboardInterrupt: 

In [22]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 6}, threshold=0.7) # Simple retrieval of data, not RAG

SIMPLE RETRIEVER OF THE MOST SIMILAR CONTENTS FROM THE TEXT FILES BASED ON THE USER'S QUESTION

In [23]:
retrieved_docs = retriever.invoke("Tell me about G20 MAACS meeting?")
len(retrieved_docs)

6

In [24]:
# Print the top 6 retrieved documents
print(retrieved_docs[:6])

[Document(id='1e205225-2e6f-41e3-a246-295e557bfa85', metadata={}, page_content='among G20 members on specific thematic areas was discussed during the meeting.'), Document(id='e9acc3b0-3a2a-4b54-9fe9-5b1e7cf730fb', metadata={}, page_content='The 2nd G20 FMCBG meeting will comprise of three sessions:-'), Document(id='c26c5543-f47f-411d-868a-779c90dbb3e7', metadata={}, page_content='In the run-up to the G20 FMCBG meeting, the G20 Finance and Central Bank Deputies met with the major MDBs on April 12, 2023 to discuss the status of implementation of recommendations of the G20 Independent Panel of MDBs’ CAF. These updates will contribute to the preparation of the G20 Roadmap on'), Document(id='1b5ebec4-7135-4802-bb94-7cc00a8fabdc', metadata={}, page_content='The meeting today provided an opportunity for diagnostics industry partners to put forward their recommendations to G20 Member States so that they can be taken into consideration during the G20 second health working group meeting which ha

In [9]:
# Generate a response using Google Generative AI
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-1.0-pro",
    google_api_key=google_api_key,
    temperature=0.01, # HIGHER THE TEMP. LESSER THE ACCURACY OF THE MODEL
)

# Prompt template for summarizing retrieved documents
test_promt = """You are an expert document summarizer for UPSC aspirants. You will be given a context, read the whole context properly, and then summarize the important points based on the question asked. If the question is not clear, you can ask for more information.

{context}

Question: {question}

Answer:"""

In [31]:
 # Print the retrieved document content
for i in retrieved_docs:
    print(i.page_content)

among G20 members on specific thematic areas was discussed during the meeting.
The 2nd G20 FMCBG meeting will comprise of three sessions:-
In the run-up to the G20 FMCBG meeting, the G20 Finance and Central Bank Deputies met with the major MDBs on April 12, 2023 to discuss the status of implementation of recommendations of the G20 Independent Panel of MDBs’ CAF. These updates will contribute to the preparation of the G20 Roadmap on
The meeting today provided an opportunity for diagnostics industry partners to put forward their recommendations to G20 Member States so that they can be taken into consideration during the G20 second health working group meeting which has outlined “Strengthening cooperation in pharmaceutical
These sessions were followed by presentations by L20 Chair (Labour 20) and B20 Chair (Business 20) on outcomes of their inception meetings. The L20 convenes trade union leaders from the G20 countries, in representation of millions of workers worldwide. It contributes to

In [32]:
# Format the documents retrieved based on the question, in a way that they donot comeprise of meta data 
def format_docs(docs):
    return ''.join([doc.page_content for doc in docs])

Used the "format_docs" down here, to create the perfect response

In [33]:
response = llm.invoke(test_promt.format(context=format_docs(retrieved_docs), question="Tell me about the G20 meeting?"))
print(response)

content='The G20 meeting discussed specific thematic areas among member states. The meeting comprised three sessions:\n\n1. Diagnostics industry partners presented recommendations to G20 Member States for consideration in the second health working group meeting.\n2. Presentations by L20 (Labour 20) and B20 (Business 20) on outcomes of their inception meetings.\n3. Thematic discussions and deliberations that will be reflected in the Communique of G20 EMPOWER and provided as recommendations to G20 leaders.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGL

In [34]:
# Pull prompt template from Langchain hub for RAG (Retrieve and Generate) chain
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# Define the prompt for RAG
prompt_template = """You are an expert document summarizer for UPSC aspirants. You will be given a context, read the whole context properly, and summarize the important points.

{context}

Question: {question}

Answer:"""

# Create a custom prompt template
custom_rag_prompt = PromptTemplate.from_template(prompt_template)

# Create a RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# %% 
# Invoke the RAG chain for a question
print(rag_chain.invoke("Tell me about Yoga Mahotsav"))

Yoga Mahotsav is an event organized to promote inclusivity and celebrate diversity through Yoga activities. It aims to encourage people from all walks of life to incorporate Yoga into their daily routines and experience its benefits. The event includes various activities such as Common Yoga Protocol (CYP) practice, workshops, and demonstrations.


In [None]:
vector_store.add_texts(texts=[chunk["chunk"] for chunk in chunks]) 

# ek baar vector store add karne ke baad usnhi ko vaapas se pull kr skte, since test3 mei already stored they vectors, to h hame vapas se load krne ki jarurat nhi padi