In [None]:
%%capture

!pip install sentence_transformers
!pip install git+https://github.com/LucknowAI/Lucknow-LLM

In [None]:
from sentence_transformers import SentenceTransformer
# Load the 'paraphrase-MiniLM-L6-v2' sentence-embedding model; the progress
# bars in this cell's output are its files being fetched.
# NOTE(review): a later cell builds `sentence_model` from the same model name,
# and none of the functions visible in this notebook read this `model`
# binding — confirm whether this extra load is still needed.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import numpy as np
from lucknowllm import UnstructuredDataLoader, split_into_segments, GeminiModel
from google.colab import userdata

In [None]:
# Name of the sentence-transformers model used to embed chunks and queries.
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
# Gemini API key, read from the Colab `userdata` store under the name 'key'.
API_KEY =  userdata.get('key')
# Gemini model identifier handed to GeminiModel.
GEMINI_NAME = 'gemini-1.0-pro'
# Dataset folder and file names (presumably the lucknowllm dataset location
# for the Lucknow Mahotsav text — confirm against the loader call).
FOLDER_NAME = 'Cultural_Festival_of_Lucknow'
FILE_NAME = "Lucknow_Mahotsav.txt"
# Number of most-similar chunks retrieved for each query.
TOP_N = 3

In [None]:
# Shared model handles: the embedding model used for chunks and queries, and
# the Gemini client that `generate_gemini_response` calls.
sentence_model = SentenceTransformer(MODEL_NAME)
gemini_model = GeminiModel(model_name=GEMINI_NAME, api_key=API_KEY)

In [None]:
def load_and_preprocess_data(folder_name=None, file_name=None):
    """Load one document set and split it into text segments.

    Args:
        folder_name: Dataset folder to load. Defaults to the module-level
            ``FOLDER_NAME`` when omitted.
        file_name: File inside that folder. Defaults to the module-level
            ``FILE_NAME`` when omitted.

    Returns:
        A flat list containing every segment produced by
        ``split_into_segments`` for each loaded document's ``'data'`` field,
        in document order.
    """
    # Resolve defaults at call time so the config cell stays the single
    # source of truth instead of repeating the literals here.
    if folder_name is None:
        folder_name = FOLDER_NAME
    if file_name is None:
        file_name = FILE_NAME

    loader = UnstructuredDataLoader()
    external_database = loader.get_data(folder_name=folder_name, file_name=file_name)

    chunks = []
    for document in external_database:
        chunks.extend(split_into_segments(document['data']))
    return chunks

In [None]:
def embed_text_data(model, text_data):
    """Encode ``text_data`` with the given embedding model.

    Args:
        model: Any object exposing an ``encode`` method.
        text_data: Input forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode(text_data)`` returns.
    """
    embeddings = model.encode(text_data)
    return embeddings

In [None]:
def cosine_similarity(a, b):
    """Pairwise cosine similarity between the rows of ``a`` and the rows of ``b``.

    Args:
        a: Array-like of shape (m, d); a 1-D vector is treated as one row.
        b: Array-like of shape (n, d); a 1-D vector is treated as one row.

    Returns:
        An (m, n) array whose entry [i, j] is the cosine similarity between
        ``a[i]`` and ``b[j]``. Pairs involving a zero-length vector get 0.0
        instead of NaN.
    """
    a = np.atleast_2d(a)
    b = np.atleast_2d(b)

    dots = np.dot(a, b.T)
    norms = np.linalg.norm(a, axis=1)[:, np.newaxis] * np.linalg.norm(b, axis=1)

    # Guarded divide: a zero norm would otherwise produce NaN (plus a
    # RuntimeWarning), and NaN sorts to the end in np.argsort, so a reversed
    # ranking would put the degenerate vector first.
    similarities = np.zeros(dots.shape, dtype=np.result_type(dots, norms))
    np.divide(dots, norms, out=similarities, where=norms != 0)
    return similarities

In [None]:
def find_top_n_similar(query_vec, data_vecs, top_n=3):
    """Return the indices of the rows in ``data_vecs`` most similar to ``query_vec``.

    Args:
        query_vec: 1-D query embedding.
        data_vecs: 2-D array with one embedding per row.
        top_n: Maximum number of indices to return.

    Returns:
        Row indices into ``data_vecs`` ordered from highest to lowest cosine
        similarity, truncated to ``top_n`` entries.
    """
    # cosine_similarity works on 2-D inputs, so lift the query to one row.
    query_row = query_vec[None, :]
    scores = cosine_similarity(query_row, data_vecs)[0]

    # argsort is ascending; reverse it to get the best matches first.
    ranked_desc = np.argsort(scores)[::-1]
    return ranked_desc[:top_n]

In [None]:
def generate_gemini_response(prompt):
    """Send ``prompt`` to the module-level ``gemini_model`` and return its reply.

    Args:
        prompt: Prompt forwarded unchanged to ``gemini_model.generate_content``.

    Returns:
        Whatever ``gemini_model.generate_content(prompt)`` returns.
    """
    response = gemini_model.generate_content(prompt)
    return response

In [None]:
def main(queries):
    """Answer each query with retrieval-augmented generation.

    The document is loaded and split into chunks, the chunks and queries are
    embedded with ``sentence_model``, and for every query the ``TOP_N`` most
    similar chunks are placed in a prompt sent to Gemini.

    Args:
        queries: A list of question strings, or a single question string.

    Returns:
        ``None`` when no queries are given; the single model response when
        exactly one query is given (kept for callers that pass a one-element
        list and print the result); otherwise a list with one response per
        query, in input order.
    """
    # Accept a bare string as a single query; iterating over its embedding
    # would otherwise walk over scalars instead of query vectors.
    if isinstance(queries, str):
        queries = [queries]
    if len(queries) == 0:
        return None

    chunks = load_and_preprocess_data()
    embedded_data = embed_text_data(sentence_model, chunks)
    embedded_queries = embed_text_data(sentence_model, queries)

    answers = []
    for query, query_vec in zip(queries, embedded_queries):
        top_indices = find_top_n_similar(query_vec, embedded_data, TOP_N)
        # Give the model every retrieved chunk, not only the best match.
        contexts = "\n\n".join(str(chunks[index]) for index in top_indices)

        prompt = f"You are an expert question answering system, I'll give you a question and context, and you'll return the answer. Query: {query} Contexts: {contexts}"
        answers.append(generate_gemini_response(prompt))

    # Collect answers and return after the loop so every query is processed.
    return answers[0] if len(answers) == 1 else answers

In [None]:
# Example question; `main` receives it wrapped in a list of queries.
queries = ["What is the duration of Lucknow Mahotsav, and when does it usually take place?"]
# Run the retrieval + Gemini pipeline and print what it returns.
res = main(queries)
print(res)

The duration of Lucknow Mahotsav is 10 days, and it usually takes place in the month of November or December.
