In [None]:
from transformers import BertTokenizer
# Understanding tokens in an LLM: split a sample sentence with the
# pretrained "bert-base-uncased" tokenizer and show the resulting tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = 'The ball is on the table.'
# `tokenize` returns the token strings (not ids) for the sentence.
tokens = tokenizer.tokenize(text)
print (tokens)

In [None]:
# Encode the sample sentence into token ids; return_tensors='pt' asks the
# tokenizer for a PyTorch tensor rather than a plain Python list.
encoded_input = tokenizer.encode(text, return_tensors='pt')
print(encoded_input) 


In [None]:
# Decode the first row of the encoded ids back into text to inspect the
# encode -> decode round trip.
decoded_text = tokenizer.decode(encoded_input[0])
print(decoded_text)

In [None]:
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import DocugamiLoader
from langchain_google_genai import ChatGoogleGenerativeAI
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

gcp_key = os.getenv('GOOGLE_GEMINI_API_KEY')

# Fail fast so later cells do not hit confusing authentication errors.
if not gcp_key:
    raise ValueError("GOOGLE_GEMINI_API_KEY not found in environment variables.")
# Confirm presence only. The key itself must never be printed: cell outputs are
# saved inside the notebook file and would leak the secret when it is shared.
print("GOOGLE_GEMINI_API_KEY loaded successfully.")

In [None]:
from google.cloud import aiplatform
from google.oauth2 import service_account
import os 
# Service-account key file, named once and reused for both the environment
# variable and the explicit credentials object.
SERVICE_ACCOUNT_FILE = '../../acoustic-realm-385717-f4e225be9213.json'
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_FILE
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE)

# Initialise the Vertex AI SDK for the target project and region.
project = 'acoustic-realm-385717'
location = 'us-central1'
aiplatform.init(
    project=project,
    location=location,
    credentials=credentials,
)

# Print one summary line per model returned by the listing call.
models = aiplatform.Model.list()
for model in models:
    print(f"Model ID: {model.name}, Model Display Name: {model.display_name}, Model Version: {model.version_id}")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.google_palm import GooglePalmEmbeddings
from langchain.llms import GooglePalm
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
import chromadb
import os 
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import numpy as np 


def load_file():
    """Load the PDF documents matched by ``**/*.pdf`` under ``files_loc``.

    Reads the module-level ``files_loc`` variable, so that name must be
    defined before this function is called.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()``.
    """
    documents = DirectoryLoader(path=files_loc, glob="**/*.pdf").load()
    print(f"Loaded {len(documents)} files.")
    return documents

def splitting(loaded_file):
    """Split loaded documents into overlapping text chunks.

    Args:
        loaded_file: Documents to split (as returned by ``load_file``).

    Returns:
        The chunks produced by a ``CharacterTextSplitter`` that breaks on
        newlines with ``chunk_size=2000`` and ``chunk_overlap=200``.
    """
    splitter = CharacterTextSplitter(
        chunk_overlap=200,
        chunk_size=2000,
        separator="\n",
    )
    return splitter.split_documents(loaded_file)
#def generate_embeddings()
def text_embedding(split_texts, embeddings):
    """Embed the text of every chunk and return the resulting vectors.

    Args:
        split_texts: Iterable of chunk objects exposing a ``page_content``
            attribute.
        embeddings: Object exposing ``embed_documents(list_of_str)``.

    Returns:
        The value returned by ``embeddings.embed_documents`` (one entry per
        chunk), ``[]`` when there is nothing to embed, or ``None`` when the
        embedding call raises.
    """
    text_contents = [text.page_content for text in split_texts]
    if not text_contents:
        # Nothing to send to the embedding backend; skip the call entirely.
        print('No text chunks to embed.')
        return []

    # The try block guards the embedding call itself, which is the step that
    # can actually fail, rather than only the preview print.
    try:
        document_embeddings = embeddings.embed_documents(text_contents)
    except Exception as e:
        print(f"Error during embedding: {e}")
        return None

    if document_embeddings:
        # Preview one vector. The index is clamped so that corpora with fewer
        # than 35 chunks no longer raise IndexError and lose a good result.
        sample_index = min(34, len(document_embeddings) - 1)
        print(f'One of the embeddings: {document_embeddings[sample_index]}')
    print('Embedding generation successful.')

    return document_embeddings

def vectorDB(split_texts, embeddings):
    """Build a persisted Chroma vector store from the document chunks.

    The previous body called ``Chroma.as_retriever()`` on the class itself,
    which cannot work without an instance, and left the real construction
    disabled inside a string literal. This version restores that construction.

    Args:
        split_texts: Document chunks to index.
        embeddings: Embedding model used by Chroma to vectorise the chunks.

    Returns:
        The created ``Chroma`` vector store, or ``None`` if creation failed
        (the error is printed).
    """
    try:
        vector_store = Chroma.from_documents(
            documents=split_texts,
            embedding=embeddings,
            collection_name='Seneca_Collection',
            persist_directory="../data",
        )
    except Exception as e:
        print(f"Error creating vector store: {e}")
        return None

    print("Vector store created successfully.")
    return vector_store

def similaritySearch(vctordb):
    """Run a fixed similarity query against the given vector store.

    Uses the ``vctordb`` argument rather than a module-level variable, so the
    function works with whatever store the caller passes in.

    Args:
        vctordb: Object exposing ``similarity_search(query, k=...)``, or
            ``None`` when no store could be created.

    Returns:
        The top three results returned by the store, or ``[]`` when
        ``vctordb`` is ``None``.
    """
    if vctordb is None:
        print("No vector store available; skipping similarity search.")
        return []

    # Retrieve the three chunks most similar to the fixed probe question.
    results = vctordb.similarity_search("What is the main theme of the document?", k=3)
    print(f"Similarity search results: {results}")
    return results

def retrievel_qa_chain(vector_store):
    """Create a retrieval-augmented QA chain on top of a vector store.

    ``create_retrieval_chain`` takes a retriever and a combine-documents
    chain; it has no ``llm`` keyword. The LLM is therefore wrapped in a
    "stuff documents" chain first. A Gemini chat model is used because the
    requested model name is a Gemini model, not a PaLM one.

    Reads the module-level ``gcp_key`` for authentication.

    Args:
        vector_store: Vector store exposing ``as_retriever()``.

    Returns:
        The retrieval chain. Invoke it with ``{"input": question}``.
    """
    # Local imports keep this function usable even if the cell that imports
    # these names at module level has not been run.
    from langchain.chains.combine_documents import create_stuff_documents_chain
    from langchain.prompts import PromptTemplate
    from langchain_google_genai import ChatGoogleGenerativeAI

    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=gcp_key)
    print("Language model initialized.")
    print("Creating retrieval QA chain...")
    # The stuff-documents chain fills `{context}` with the retrieved chunks;
    # `{input}` is the key the retrieval chain uses for the user question.
    prompt = PromptTemplate.from_template(
        "Answer the question using only the context below.\n\n"
        "Context:\n{context}\n\n"
        "Question: {input}"
    )
    combine_docs_chain = create_stuff_documents_chain(llm, prompt)
    retrievel_qa = create_retrieval_chain(vector_store.as_retriever(), combine_docs_chain)
    print("Retrieval QA chain created.")
    return retrievel_qa

# Pipeline driver: load PDFs -> split into chunks -> embed -> build the vector
# store -> run a sample similarity search. The names assigned here are
# module-level; load_file() reads `files_loc`, so it must be set before the call.
if __name__ == "__main__": 
    gcp_key = os.getenv('GOOGLE_GEMINI_API_KEY') 
    # Directory searched for PDF files by load_file().
    files_loc = '../data/'
    # Embedding model shared by text_embedding() and vectorDB().
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=gcp_key)
    loaded_files = load_file()
    split_texts = splitting(loaded_files)
    # NOTE(review): embeddings_texts is not used by any later statement in this cell.
    embeddings_texts = text_embedding(split_texts,embeddings)
    vector_database = vectorDB(split_texts, embeddings)
    similarity_search = similaritySearch(vector_database)
    


In [None]:
from langchain.schema.messages import HumanMessage, AIMessage, SystemMessage
from langchain.chat_models import ChatGooglePalm
 
# Function to chat with the model
def chat_with_model(query):
    """Send a single user query to the Gemini chat model and return the reply.

    The system prompt is sent together with the user message, and the model
    is called exactly once. The earlier version built the message list but
    never used it, and called the model twice.

    Reads the module-level ``gcp_key`` for authentication.

    Args:
        query: The user's question as a string.

    Returns:
        The message object returned by ``chat.invoke``; its ``content``
        attribute is also printed.
    """
    chat = ChatGoogleGenerativeAI(model="gemini-2.5-pro", google_api_key=gcp_key)
    messages = [
        SystemMessage(content='You are an AI assistant. Provide concise and accurate answers to user queries.'),
        HumanMessage(content=query),
    ]

    # One round trip only: each call is billed and adds latency.
    answer = chat.invoke(messages)
    print(answer.content)
    return answer

# Demo: ask the chat model a single sample question.
if __name__ == '__main__':
    query_tg = 'how many oceans are there on earth?'
    # `response` holds whatever chat_with_model returns.
    response = chat_with_model(query_tg) 