# Installing required libraries

In [None]:
# Python dependencies for the key-value cache, the vector index, the embedder
# and the LLM clients used in the cells below.
!pip install redis faiss-cpu numpy torch transformers openai
# Install a local Redis server and start it in the background (daemonized),
# so that CacheBase can connect to localhost:6379.
!sudo apt-get install redis-server
!redis-server --daemonize yes
!pip install langchain_community tiktoken
!pip install together

# Importing libraries

In [None]:
import redis
import faiss
import heapq
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from typing import List, Tuple, Any
from transformers import AutoTokenizer, AutoModel
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
import torch
import tiktoken
from sklearn.cluster import DBSCAN
import pandas as pd
from together import Together

# Cache Base Storage

In [None]:
class CacheBase:
    """Thin key-value wrapper around a Redis connection.

    Values are written as given and read back as text: Redis hands back raw
    bytes, which ``get`` decodes so callers always receive ``str``.
    """

    def __init__(self, redis_host: str = 'localhost', redis_port: int = 6379):
        """Open a client for the Redis server at ``redis_host:redis_port``."""
        self.redis_client = redis.Redis(host=redis_host, port=redis_port)

    def set(self, key: str, value: str):
        """Store ``value`` under ``key``, overwriting any previous value."""
        self.redis_client.set(key, value)

    def get(self, key: str) -> str:
        """Return the text stored under ``key``, or ``None`` if it is absent."""
        stored = self.redis_client.get(key)
        # The client returns bytes; decode so the result matches the declared
        # ``str`` return type instead of leaking ``b'...'`` to callers.
        if isinstance(stored, bytes):
            return stored.decode('utf-8')
        return stored

    def delete(self, key: str):
        """Remove ``key`` from the store."""
        self.redis_client.delete(key)

    def clear(self):
        """Drop every key in the currently selected Redis database."""
        self.redis_client.flushdb()

# Vector Base Storage

In [None]:
class VectorBase:
    """In-memory FAISS inner-product index over fixed-size vectors.

    Inner product equals cosine similarity when the stored and query vectors
    are L2-normalized.
    """

    def __init__(self, dimension: int):
        """Create an empty flat inner-product index of width ``dimension``."""
        self.dimension = dimension
        self.index = faiss.IndexFlatIP(dimension)  # Inner product is equivalent to cosine similarity for normalized vectors

    @staticmethod
    def _as_faiss_matrix(vectors) -> np.ndarray:
        """Return ``vectors`` as a 2-D, C-contiguous float32 array.

        FAISS only accepts float32 row-major matrices, so float64 input or a
        single 1-D vector is converted here instead of failing inside FAISS.
        """
        return np.ascontiguousarray(np.atleast_2d(vectors), dtype=np.float32)

    def add(self, vectors: np.ndarray):
        """Append one vector per row of ``vectors`` to the index."""
        self.index.add(self._as_faiss_matrix(vectors))

    def search(self, query: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(scores, indices)`` of the ``k`` best matches per query row."""
        return self.index.search(self._as_faiss_matrix(query), k)

    def clear(self):
        """Discard all stored vectors by replacing the index with an empty one."""
        self.index = faiss.IndexFlatIP(self.dimension)

# Embedding model

In [None]:
class Embedder:
    """Sentence embedder built on a Hugging Face transformer encoder."""

    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'):
        """Load the tokenizer and model for ``model_name``.

        ``dimension`` is taken from the model's hidden size and is the width
        of every vector returned by ``embed``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.dimension = self.model.config.hidden_size

    def embed(self, texts: List[str]) -> np.ndarray:
        """Return one L2-normalized embedding per entry of ``texts``.

        Token states are mean-pooled using the attention mask, so padding
        added to align a batch does not dilute the shorter texts' vectors.
        The result has one row per text and ``dimension`` columns.
        """
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)

        token_states = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension: 1 for real tokens,
        # 0 for padding.
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero
        embeddings = (summed / token_counts).numpy()

        # Normalize for cosine similarity; the clip keeps an all-zero vector
        # from turning into NaNs.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / np.clip(norms, 1e-12, None)

# Large Language Model (LLM)

In [None]:
class LLM:
    """Chat-completion client for a model hosted on Together."""

    def __init__(self, api_key: str = "",
                 model: str = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"):
        """Create the Together client.

        Args:
            api_key: Together API key. Left empty by default so that no
                secret is committed with the notebook.
                NOTE(review): the SDK presumably falls back to the
                TOGETHER_API_KEY environment variable when this is empty —
                confirm.
            model: Identifier of the chat model used by ``generate``.
        """
        self.client = Together(api_key=api_key)
        self.model = model

    def generate(self, user_prompt: str) -> str:
        """Send ``user_prompt`` as a single user message and return the reply text."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": user_prompt}],
        )
        return completion.choices[0].message.content

# Cache Manager
Involves two main components: the cache base and the vector base storages.

In [None]:
class CacheManager:
    """Semantic cache that sits in front of an LLM.

    Each cached query is stored three ways: its response under the query text
    in ``cache_base``, its embedding in ``vector_base``, and the query text
    under ``faiss_index:<n>`` so a vector hit can be mapped back to the query.
    """

    def __init__(self, cache_base: "CacheBase", vector_base: "VectorBase",
                 embedder: "Embedder", llm: "LLM", cache_size=10000, save=True):
        """Bind the storage backends, embedder and LLM.

        Args:
            cache_base: Key-value store for query text and responses.
            vector_base: Vector index holding one embedding per cached query.
            embedder: Produces the embeddings used for similarity search.
            llm: Called to answer queries that miss the cache.
            cache_size: Maximum number of entries ``get_from_cache`` will add.
            save: When false, cache misses are never written back.
        """
        self.cache_base = cache_base
        self.vector_base = vector_base
        self.cache_size = cache_size
        self.embedder = embedder
        self.save = save
        self.counter = 0  # number of entries added; also the next vector id
        self.saved_tokens = [0, 0]  # [query tokens, response tokens] served from cache
        self.llm = llm
        self.total_hits = 0
        self.patterns = []  # To store identified clusters
        self.patterns_meta = []
        self.Ts = 100000  # upper bound on the token total of a single cluster
        self.Tp = 0.1  # lower bound on a cluster's share of all points
        self.min_samples = 3  # DBSCAN min_samples; also the warm-up size
        self.eps = 1  # DBSCAN neighbourhood radius
        self.tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

    @staticmethod
    def _to_text(value) -> str:
        """Return ``value`` as ``str``: decode bytes, map ``None`` to ``""``."""
        if value is None:
            return ""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return str(value)

    def add_to_cache(self, query: str, response: str):
        """Store ``query``/``response`` and index the query's embedding."""
        self.cache_base.set(query, response)
        embedding = self.embedder.embed([query])
        self.vector_base.add(embedding)
        # Reverse mapping from vector position to query text.
        self.cache_base.set(f"faiss_index:{self.counter}", query)
        self.counter += 1

    def clear(self):
        """Empty both stores and reset all counters."""
        self.cache_base.clear()
        self.vector_base.clear()
        self.counter = 0
        self.total_hits = 0
        self.saved_tokens = [0, 0]

    def __get_token_count(self, text: str) -> int:
        """Return the number of tokens in ``text`` (``str`` or UTF-8 bytes)."""
        return len(self.tokenizer.encode(self._to_text(text)))

    def get_from_cache(self, query: str, similarity_threshold: float = 0.9) -> str:
        """Answer ``query`` from the cache if possible, otherwise via the LLM.

        A cached response is returned when the nearest stored query scores at
        least ``similarity_threshold``. On a miss the LLM is called and, if
        saving is enabled, the cache has room and ``cluster_queries`` approves,
        the new pair is cached. The return value is always ``str``.
        """
        query_embedding = self.embedder.embed([query])
        distances, indices = self.vector_base.search(query_embedding, 1)
        faiss_index = int(indices[0][0])

        # A negative index means the search found no neighbour (FAISS uses -1
        # for missing results), so only look up the store for a real,
        # sufficiently similar match.
        if faiss_index >= 0 and distances[0][0] >= similarity_threshold:
            cached_query = self._to_text(self.cache_base.get(f"faiss_index:{faiss_index}"))
            cached_response = self.cache_base.get(cached_query) if cached_query else None
            if cached_response is not None:
                # Decode so hits return the same type as LLM responses.
                cached_response = self._to_text(cached_response)
                self.total_hits += 1
                self.saved_tokens[0] += self.__get_token_count(query)
                self.saved_tokens[1] += self.__get_token_count(cached_response)
                return cached_response

        response = self.llm.generate(query)

        # Strict comparison: ``counter`` entries are already stored, so adding
        # at ``counter == cache_size`` would exceed the limit by one.
        if self.save and self.counter < self.cache_size:
            if self.cluster_queries(query, response):
                self.add_to_cache(query, response)

        return response

    def populate_cache(self, queries: list[str], responses: list[str]):
        """Add every ``(query, response)`` pair to the cache unconditionally."""
        for query, response in zip(queries, responses):
            self.add_to_cache(query, response)

    def cluster_queries(self, query, response):
        """Decide whether the new ``query``/``response`` pair should be cached.

        The cached queries plus the new one are clustered with DBSCAN. The
        pair is rejected (``False``) if any resulting cluster holds less than
        ``Tp`` of all points or its token total exceeds ``Ts``; otherwise it
        is accepted. Until ``min_samples`` entries exist, everything is
        accepted.

        NOTE(review): every cached query is re-embedded on each call, so the
        cost grows linearly with the cache size.
        """
        if self.counter < self.min_samples:
            return True

        embeddings = []
        saved_tokens = []

        # Distinct loop names keep the ``query``/``response`` arguments intact
        # for the new point appended after the loop.
        for index in range(self.counter):
            raw_query = self.cache_base.get(f"faiss_index:{index}")
            if raw_query is None:
                continue  # mapping entry missing; nothing to cluster
            cached_query = self._to_text(raw_query)
            cached_response = self._to_text(self.cache_base.get(cached_query))
            saved_tokens.append(
                self.__get_token_count(cached_query) + self.__get_token_count(cached_response)
            )
            embeddings.append(self.embedder.embed([cached_query]).flatten())

        embeddings.append(self.embedder.embed([query]).flatten())
        saved_tokens.append(self.__get_token_count(query) + self.__get_token_count(response))

        embeddings = np.array(embeddings)
        saved_tokens = np.array(saved_tokens)

        dbscan = DBSCAN(eps=self.eps, min_samples=self.min_samples)
        clusters = dbscan.fit_predict(embeddings)

        for cluster in np.unique(clusters):
            cluster_indices = np.where(clusters == cluster)[0]
            cluster_proportion = len(cluster_indices) / len(clusters)
            cluster_tokens = saved_tokens[cluster_indices].sum()

            if cluster_proportion < self.Tp or cluster_tokens > self.Ts:
                return False

        return True

# Initialization

In [None]:
# Build the components with their default settings (Redis on localhost:6379,
# the default embedding model, the default LLM).
cache_base = CacheBase()
embedder = Embedder()
# The vector index is sized from the embedder so stored vectors and the index
# share the same dimensionality.
vector_base = VectorBase(dimension= embedder.dimension)
llm = LLM()



In [None]:
cache_manager = CacheManager(cache_base,vector_base,embedder,llm,save=True)

# Clear The Cache Manager

In [None]:
cache_manager.clear()

# Number of the cached data

In [None]:
cache_manager.counter

5

# Hit ratio

In [None]:
# Cache hits per cached entry. Guarded because ``counter`` is 0 right after
# ``cache_manager.clear()``, which would otherwise raise ZeroDivisionError.
# NOTE(review): this divides by the number of cached entries, not by the
# number of lookups — confirm that this is the intended definition.
(cache_manager.total_hits / cache_manager.counter) if cache_manager.counter else 0.0

0.4

# Total number of saved tokens

In [None]:
cache_manager.saved_tokens

[0, 0]

In [None]:
%%time
cache_manager.get_from_cache("What is the capital of France?")

CPU times: user 83.9 ms, sys: 32.7 ms, total: 117 ms
Wall time: 1.07 s


'The capital of France is Paris.'

In [None]:
%%time
cache_manager.get_from_cache("Where the capital of France is located?")

CPU times: user 18.5 ms, sys: 949 µs, total: 19.4 ms
Wall time: 21.8 ms


b'The capital of France is Paris.'

In [None]:
%%time
cache_manager.get_from_cache("What is the openai?")

CPU times: user 63.5 ms, sys: 4.01 ms, total: 67.5 ms
Wall time: 4.28 s


"OpenAI is a non-profit artificial intelligence (AI) research organization that aims to ensure that AI is beneficial to humanity. It was founded in 2015 by a group of researchers and entrepreneurs, including Elon Musk, Sam Altman, and others.\n\nOpenAI's mission is to:\n\n1. **Advance the state of AI**: OpenAI's research focuses on developing and applying AI to help humans learn, work, and create. They aim to push the boundaries of what is possible with AI, while ensuring that the technology is safe, transparent, and beneficial to society.\n2. **Make AI more accessible**: OpenAI provides open-source AI tools, models, and datasets to the research community, making it easier for others to build upon their work and advance the field.\n3. **Ensure AI is beneficial to humanity**: OpenAI is committed to ensuring that AI is used for the greater good, rather than for malicious purposes. They work to develop AI systems that are transparent, explainable, and accountable, and that align with huma

In [None]:
%%time
cache_manager.get_from_cache("tell me about openai?")

CPU times: user 18.9 ms, sys: 0 ns, total: 18.9 ms
Wall time: 20.7 ms


b"OpenAI is a non-profit artificial intelligence (AI) research organization that aims to ensure that AI is beneficial to humanity. It was founded in 2015 by a group of researchers and entrepreneurs, including Elon Musk, Sam Altman, and others.\n\nOpenAI's mission is to:\n\n1. **Advance the state of AI**: OpenAI's research focuses on developing and applying AI to help humans learn, work, and create. They aim to push the boundaries of what is possible with AI, while ensuring that the technology is safe, transparent, and beneficial to society.\n2. **Make AI more accessible**: OpenAI provides open-source AI tools, models, and datasets to the research community, making it easier for others to build upon their work and advance the field.\n3. **Ensure AI is beneficial to humanity**: OpenAI is committed to ensuring that AI is used for the greater good, rather than for malicious purposes. They work to develop AI systems that are transparent, explainable, and accountable, and that align with hum

# Retrieving the dataset

In [None]:
!gdown 1t0ojkPMjY1Euaz3Ve66_EVBFFazfKLeK

Downloading...
From: https://drive.google.com/uc?id=1t0ojkPMjY1Euaz3Ve66_EVBFFazfKLeK
To: /content/similar_quora_general_dataset_final.csv
  0% 0.00/7.53M [00:00<?, ?B/s]100% 7.53M/7.53M [00:00<00:00, 110MB/s]


In [None]:
df = pd.read_csv('similar_quora_general_dataset_final.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.2  10000 non-null  int64 
 1   Unnamed: 0.1  10000 non-null  int64 
 2   Unnamed: 0    10000 non-null  int64 
 3   id            10000 non-null  int64 
 4   qid1          10000 non-null  int64 
 5   qid2          10000 non-null  int64 
 6   question1     10000 non-null  object
 7   question2     10000 non-null  object
 8   is_duplicate  10000 non-null  int64 
 9   answer        10000 non-null  object
dtypes: int64(7), object(3)
memory usage: 781.4+ KB


In [None]:
df.drop(columns=['Unnamed: 0.2',	'Unnamed: 0.1',	'Unnamed: 0',	'id',	'qid1',	'qid2'],inplace=True)

In [None]:
df.head()

Unnamed: 0,question1,question2,is_duplicate,answer
0,What is it like to be undergraduate students?,What is it like to be an undergraduate student?,1,Being an Undergrad student can vary greatly d...
1,Money: What would a world without money be like?,What would the world be like if money didn't e...,1,A world
2,What are some Punjabi jokes?,What are some good Punjabi jokes?,1,1. Why don't you ever see a bald Sikh man at ...
3,How was the International Space Station built?,How was international space station made?,1,The construction of theInternationalSpaceStat...
4,What are the best Skullcandy earbuds?,What are the best Skullcandy earbuds and headp...,1,"The ""best"" skull candy headphones may vary de..."


Populating the cache storage with question1

In [None]:
cache_manager.populate_cache(df['question1'],df['answer'])

Evaluating the embedding model performance on question2

In [None]:
for query in df['question2']:
  cache_manager.get_from_cache(query)
  print(cache_manager.total_hits)

# Loading the dataset

In [None]:
!kaggle datasets download praneshmukhopadhyay/amazon-questionanswer-dataset

Dataset URL: https://www.kaggle.com/datasets/praneshmukhopadhyay/amazon-questionanswer-dataset
License(s): CC0-1.0
Downloading amazon-questionanswer-dataset.zip to /content
100% 424M/426M [00:19<00:00, 25.8MB/s]
100% 426M/426M [00:19<00:00, 23.1MB/s]


In [None]:
unzipped_file_path = "/content/amazon-questionanswer-dataset.zip"
!unzip -q {unzipped_file_path}

In [None]:
df_questions = pd.read_csv("/content/multi_questions.csv")

In [None]:
df_answers = pd.read_csv("/content/multi_answers.csv")

# Warming up the KNN classifier and CacheManager

In [None]:
# Pair every question with its answer(s); questions without an answer are
# dropped by the inner join.
merged_df = pd.merge(df_questions, df_answers, on='QuestionID', how='inner')

n_samples_per_category = 10

# Draw a fixed-size, reproducible sample from each category.
# ``DataFrameGroupBy.sample`` replaces ``groupby(...).apply(lambda x: x.sample(...))``,
# which avoids passing the grouping column through ``apply``. The rows drawn
# for a given seed may differ from the ``apply`` version.
# Like the original, this raises if a category has fewer rows than requested.
sampled_queries_with_answers = (
    merged_df.groupby('Category')
    .sample(n=n_samples_per_category, random_state=42)
    .reset_index(drop=True)
)

  sampled_queries_with_answers = merged_df.groupby('Category').apply(lambda x: x.sample(n=n_samples_per_category, random_state=42))


In [None]:
cache_warmup, test = train_test_split(sampled_queries_with_answers, test_size=0.2, random_state=42)

# Pre-Processing

In [None]:
import re

def pre_processing1(query: str):
    """Normalize ``query`` with plain string rules.

    Lowercases the text, strips every character that is neither a word
    character nor whitespace, removes digit runs, and collapses whitespace
    to single spaces with no leading or trailing blanks.
    """
    lowered = query.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    # split() with no arguments drops empty fields, so joining on one space
    # also trims the ends.
    return ' '.join(without_digits.split())

In [None]:
assistant_content = """You are a text preprocessing assistant. Your task is to clean and prepare the given queries for further evaluation. Do not provide answers or additional information—only process the text to make it clearer and free from errors.

              Please follow these steps:
              1. Convert the text to lowercase.
              2. Remove any special characters, punctuation.
              3. Remove common stopwords (if applicable).
              4. Dont change the meaning of the sentence.
              5. Return the cleaned text as a single string without any additional comments or outputs.

              The input is delimited by < >.
              """

In [None]:
from together import Together
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from textblob import TextBlob

def pre_processing(query: str):
    """Clean ``query`` by asking a Together-hosted chat model to rewrite it.

    The instructions in the module-level ``assistant_content`` are sent first,
    followed by the query wrapped in ``<`` and ``>``. Returns the text of the
    model's first choice.

    NOTE(review): the instructions are sent with the "assistant" role rather
    than "system" — confirm this is intentional.
    """
    together_client = Together(api_key="")

    conversation = [
        {"role": "assistant", "content": assistant_content},
        {"role": "user", "content": f"<{query}>"},
    ]
    completion = together_client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free",
        messages=conversation,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
confusing_queries = [
    "Whre th captiol of Frnace is locatd?",
    "Wha t ime do the stors cloes?",
    "Cna you tell me whts the weathr like in New Yrok?",

    "Where the captial of Fraunce iz?",
    "How many pple liv in the UK?",
    "What ar the best rstrnts in Pariss?",

    "What is the capital of France?!#@?",
    "How many countries??!! are there in Europe??",
    "What is the time???!! in Tokyo???",

    "Where is the capital of France is located?",
    "What is the bestest way to learn coding?",
    "How much does it cost for the tickets for concert?",

    "Dove si trova la capitale di France?",
    "Where is the Hauptstadt of Germany?",
    "Qual é o preço do bilhete para o cinema?",

    "Can you please tell me what is the capital city of France?",
    "I would like to know what time it is now at this moment in New York.",
    "What is the current weather condition right now in Paris?",

    "What’s the thing with the Eiffel Tower?",
    "Tell me about the place where people go to see shows.",
    "What do you think about the food in that one country?",

    "Can you tell me, with a detailed explanation, about the capital of France, including its history, culture, and what it’s like to live there?",
    "What is the name of the city that is known for its art, cuisine, and history, particularly in relation to a famous landmark known as the Eiffel Tower?"
]


In [None]:
for query in confusing_queries:
  print(pre_processing(query))