In [None]:
!pip install redis faiss-cpu numpy torch transformers openai
!sudo apt-get install redis-server
!redis-server --daemonize yes
!pip install langchain_community langchain_openai tiktoken

In [None]:
from langchain_community.callbacks import get_openai_callback
from langchain_openai import ChatOpenAI

In [None]:
from typing import Any, List, Optional, Tuple

import faiss
import numpy as np
import pandas as pd
import redis
import tiktoken
import torch
from transformers import AutoModel, AutoTokenizer

In [None]:
import os
from getpass import getpass

# Never hardcode the key in the notebook. Reuse a key that is already present
# in the environment and only prompt (without echo) when none is set; the
# previous unconditional assignment replaced a valid key with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
class CacheBase:
    """Thin key-value wrapper around a Redis client.

    Values are written as given and read back as text: the client returns
    raw ``bytes``, which ``get`` decodes so callers always receive ``str``
    (or ``None`` when the key does not exist).
    """

    def __init__(self, redis_host: str = 'localhost', redis_port: int = 6379):
        """Connect to the Redis server at ``redis_host``:``redis_port``."""
        self.redis_client = redis.Redis(host=redis_host, port=redis_port)

    def set(self, key: str, value: str):
        """Store ``value`` under ``key``, overwriting any previous value."""
        self.redis_client.set(key, value)

    def get(self, key: str) -> Optional[str]:
        """Return the text stored under ``key``, or ``None`` if it is absent."""
        raw = self.redis_client.get(key)
        # The client hands back bytes unless it was configured to decode;
        # normalise here so the declared return type actually holds.
        if isinstance(raw, bytes):
            return raw.decode('utf-8')
        return raw

    def delete(self, key: str):
        """Remove ``key`` from the store."""
        self.redis_client.delete(key)

In [None]:
class VectorBase:
    """In-memory FAISS index used for nearest-neighbour search by inner product."""

    def __init__(self, dimension: int):
        """Create an empty flat inner-product index for ``dimension``-sized vectors."""
        self.index = faiss.IndexFlatIP(dimension)  # Inner product is equivalent to cosine similarity for normalized vectors

    @staticmethod
    def _as_faiss_matrix(vectors) -> np.ndarray:
        """Coerce input to the C-contiguous float32 2-D layout FAISS requires."""
        return np.ascontiguousarray(np.atleast_2d(vectors), dtype=np.float32)

    def add(self, vectors: np.ndarray):
        """Append one vector (1-D) or a batch of vectors (2-D) to the index."""
        self.index.add(self._as_faiss_matrix(vectors))

    def search(self, query: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(scores, ids)`` of the ``k`` best matches for each query row."""
        return self.index.search(self._as_faiss_matrix(query), k)

In [None]:
class Embedder:
    """Sentence embedder producing L2-normalised vectors from a Hugging Face model."""

    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'):
        """Load the tokenizer and encoder weights for ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return one unit-length row per input text.

        Token vectors are averaged using the attention mask so that padding
        added to shorter texts in a batch does not dilute their embeddings.
        """
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)

        token_embeddings = outputs.last_hidden_state
        # (batch, seq) -> (batch, seq, 1) so the mask broadcasts over the hidden size.
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid 0/0 on an all-padding row
        embeddings = (summed / token_counts).numpy()

        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / np.maximum(norms, 1e-12)  # Normalize for cosine similarity

In [None]:
class LLM:
    """Minimal wrapper exposing a chat model as a prompt-in, text-out call."""

    def __init__(self):
        """Create the underlying chat client for the pinned model version."""
        self.llm = ChatOpenAI(model_name="gpt-4o-mini-2024-07-18")

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model and return the reply text."""
        message = self.llm.invoke(prompt)
        return message.content

In [None]:
class CacheManager:
    """Semantic cache: answers a query from a similar cached query when possible.

    Each cached query is stored twice in the key-value store: once as
    ``query -> response`` and once as ``faiss_index:<n> -> query`` so that a
    vector-search result can be mapped back to its text. On a miss the answer
    is taken from the DataFrame (if enabled) or generated by the LLM, then cached.
    """

    # Collaborator types are quoted so this cell does not depend on the order
    # in which the other class cells were executed.
    def __init__(self, cache_base: "CacheBase", vector_base: "VectorBase", embedder: "Embedder", df: pd.DataFrame, llm: "LLM", use_df: bool = True):
        """
        :param cache_base: Cache object for key-value caching
        :param vector_base: VectorBase object for FAISS-based vector search
        :param embedder: Embedder object for generating embeddings
        :param df: DataFrame containing the dataset with questions and answers
        :param llm: LLM object for generating responses when not found in cache or DataFrame
        :param use_df: Boolean flag to decide if we should use the DataFrame for answers
        """
        self.cache_base = cache_base
        self.vector_base = vector_base
        self.embedder = embedder
        self.df = df  # Store DataFrame for looking up responses
        self.use_df = use_df  # Flag to determine if we use df or LLM
        self.llm = llm  # LLM for generating responses if not found in DataFrame
        self.counter = 0  # Keeps track of indices for mapping queries to FAISS indices
        self.saved_tokens = [0, 0]  # [prompt_tokens_saved, completion_tokens_saved]
        self.tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

    @staticmethod
    def _to_text(value):
        """Decode ``bytes`` read from the store to ``str``; pass anything else through."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def add_to_cache(self, query: str, response: str):
        """Cache ``response`` for ``query`` and index the query's embedding."""
        self.cache_base.set(query, response)

        embedding = self.embedder.embed([query])
        self.vector_base.add(embedding)

        # The vector's position in the index is the only link back to its text.
        self.cache_base.set(f"faiss_index:{self.counter}", query)
        self.counter += 1

    def get_token_count(self, text: str) -> int:
        """Helper method to count tokens in a text using tiktoken."""
        return len(self.tokenizer.encode(self._to_text(text)))

    def get_response_from_df(self, query: str) -> Optional[str]:
        """Return the answer of the first row whose question1 or question2 equals ``query``."""
        if self.df is None:
            return None
        matches = self.df[(self.df['question1'] == query) | (self.df['question2'] == query)]
        if matches.empty:
            return None
        return matches['answer'].values[0]

    def _lookup_cached_response(self, faiss_index: int) -> Optional[str]:
        """Resolve an index position to its cached response text, or ``None`` if stale."""
        cached_query = self._to_text(self.cache_base.get(f"faiss_index:{faiss_index}"))
        if not cached_query:
            return None
        return self._to_text(self.cache_base.get(cached_query))

    def get_from_cache(self, query: str, similarity_threshold: float = 0.9) -> Optional[str]:
        """Answer ``query`` from the cache, falling back to the DataFrame and then the LLM.

        A cached answer is used when the best match's inner-product score is at
        least ``similarity_threshold`` and its entry can still be resolved. In
        that case the token counts of the query and the cached answer are added
        to ``saved_tokens``. Otherwise a new answer is obtained, cached, and
        returned. The result is always text, regardless of where it came from.
        """
        query_embedding = self.embedder.embed([query])
        distances, indices = self.vector_base.search(query_embedding, 1)

        faiss_index = int(indices[0][0])
        # A negative id means the index had no vector to return (empty index).
        if faiss_index >= 0 and distances[0][0] >= similarity_threshold:
            cached_response = self._lookup_cached_response(faiss_index)
            # A missing mapping or response is treated as a miss rather than
            # returning None, so stale entries never leave the caller empty-handed.
            if cached_response is not None:
                self.saved_tokens[0] += self.get_token_count(query)
                self.saved_tokens[1] += self.get_token_count(cached_response)
                return cached_response

        # Look up the response in the DataFrame first
        if self.use_df:
            response = self.get_response_from_df(query)
            if response:
                self.add_to_cache(query, response)  # Cache the result
                return response

        # If not found in the DataFrame, use the LLM to generate the response
        response = self.llm.generate(query)
        print(f'query : {query} response from LLM {response}')
        self.add_to_cache(query, response)
        return response


In [None]:
# Fetch the dataset from Google Drive by file id; per the recorded output it
# is saved as similar_quora_general_dataset_final.csv in the working directory.
!gdown 1t0ojkPMjY1Euaz3Ve66_EVBFFazfKLeK

Downloading...
From: https://drive.google.com/uc?id=1t0ojkPMjY1Euaz3Ve66_EVBFFazfKLeK
To: /content/similar_quora_general_dataset_final.csv
  0% 0.00/7.53M [00:00<?, ?B/s] 28% 2.10M/7.53M [00:00<00:00, 20.6MB/s]100% 7.53M/7.53M [00:00<00:00, 54.2MB/s]


In [None]:
# Load the question-pair dataset. The columns used later are question1,
# question2, is_duplicate and answer.
# NOTE(review): the "Unnamed: 0*" columns are presumably leftover saved
# indexes from earlier CSV round-trips — confirm before relying on them.
df = pd.read_csv("similar_quora_general_dataset_final.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.2  10000 non-null  int64 
 1   Unnamed: 0.1  10000 non-null  int64 
 2   Unnamed: 0    10000 non-null  int64 
 3   id            10000 non-null  int64 
 4   qid1          10000 non-null  int64 
 5   qid2          10000 non-null  int64 
 6   question1     10000 non-null  object
 7   question2     10000 non-null  object
 8   is_duplicate  10000 non-null  int64 
 9   answer        10000 non-null  object
dtypes: int64(7), object(3)
memory usage: 781.4+ KB


In [None]:
# Keep only the first 1000 rows (presumably to bound the runtime of the
# caching and lookup loops below). This rebinds `df`, so the full frame is
# only recoverable by re-running the load cell.
df = df.head(1000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.2  1000 non-null   int64 
 1   Unnamed: 0.1  1000 non-null   int64 
 2   Unnamed: 0    1000 non-null   int64 
 3   id            1000 non-null   int64 
 4   qid1          1000 non-null   int64 
 5   qid2          1000 non-null   int64 
 6   question1     1000 non-null   object
 7   question2     1000 non-null   object
 8   is_duplicate  1000 non-null   int64 
 9   answer        1000 non-null   object
dtypes: int64(7), object(3)
memory usage: 78.2+ KB


In [None]:
# Wire up the components. The index dimension must equal the embedder's
# output size (384 here, paired with the default embedding model).
embedder = Embedder()
llm = LLM()
cache_base = CacheBase()
vector_base = VectorBase(dimension=384)

cache_manager = CacheManager(
    cache_base=cache_base,
    vector_base=vector_base,
    embedder=embedder,
    df=df,
    llm=llm,
    use_df=True,
)



In [None]:
# Preview the first rows.
# NOTE(review): the recorded output below lists an "Unnamed: 0.3" column that
# the df.info() output above does not show — it looks stale; re-run to refresh.
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,answer
0,0,0,279065,279065,548630,548631,What is it like to be undergraduate students?,What is it like to be an undergraduate student?,1,Being an Undergrad student can vary greatly d...
1,1,1,87099,87099,172963,172964,Money: What would a world without money be like?,What would the world be like if money didn't e...,1,A world
2,2,2,314999,314999,618180,618181,What are some Punjabi jokes?,What are some good Punjabi jokes?,1,1. Why don't you ever see a bald Sikh man at ...
3,3,3,230470,230470,454154,454155,How was the International Space Station built?,How was international space station made?,1,The construction of theInternationalSpaceStat...
4,4,4,179067,179067,353704,353705,What are the best Skullcandy earbuds?,What are the best Skullcandy earbuds and headp...,1,"The ""best"" skull candy headphones may vary de..."


In [None]:
%%time
# First lookup of a question1 taken verbatim from the dataset.
# NOTE(review): the recorded result is a plain str with no LLM log line, which
# suggests it was answered from the DataFrame fallback and then cached — confirm.
cache_manager.get_from_cache("What is it like to be undergraduate students?")

CPU times: user 35.4 ms, sys: 3.01 ms, total: 38.4 ms
Wall time: 38.6 ms


" Being an Undergrad student can vary greatly depending on the individual, their major, institution, and personal experiences. However, in general, Under grads can expect a challenging yet rewarding experience filled with academic coursework, extracurricular activities and socializing opportunities. \nThe academic side of things typically involves taking multiple classes per semester that are related your major or area you'd also interested in exploring further . There will usually homework assignments , exams , papers that you'll need prepare for so keeping up good study habits important .\nYou will have more freedom as an Under grad student than when studying at higher levels; however this comes along side more responsibilities such as managing time effectively between classes, studying for examinations or working part-time jobs if needed\nMany colleges offer various clubs and organizations that allow Students  opportunity participate sports teams that align their interest  pursue ho

In [None]:
%%time
cache_manager.get_from_cache("What is it like to be an undergraduate student?	")

CPU times: user 20.7 ms, sys: 978 µs, total: 21.7 ms
Wall time: 24.8 ms


b" Being an Undergrad student can vary greatly depending on the individual, their major, institution, and personal experiences. However, in general, Under grads can expect a challenging yet rewarding experience filled with academic coursework, extracurricular activities and socializing opportunities. \nThe academic side of things typically involves taking multiple classes per semester that are related your major or area you'd also interested in exploring further . There will usually homework assignments , exams , papers that you'll need prepare for so keeping up good study habits important .\nYou will have more freedom as an Under grad student than when studying at higher levels; however this comes along side more responsibilities such as managing time effectively between classes, studying for examinations or working part-time jobs if needed\nMany colleges offer various clubs and organizations that allow Students  opportunity participate sports teams that align their interest  pursue h

In [None]:
%%time
# A question that is not in the dataset: the recorded output shows the
# "response from LLM" log line, i.e. the answer was generated and then cached.
cache_manager.get_from_cache("What is the capital of France?")

query : What is the capital of France? response from LLM The capital of France is Paris.
CPU times: user 70.1 ms, sys: 0 ns, total: 70.1 ms
Wall time: 899 ms


'The capital of France is Paris.'

In [None]:
%%time
# Reworded version of the previous question: the recorded output has no LLM
# log line, so the answer generated above was reused from the cache.
cache_manager.get_from_cache("Where the capital of France is located?")

CPU times: user 20.8 ms, sys: 0 ns, total: 20.8 ms
Wall time: 22 ms


b'The capital of France is Paris.'

In [None]:
# Function to cache question1 with the answers (using the DataFrame)
def cache_question1_with_answer(cache_manager, df):
    """Cache every row's ``question1`` together with its ``answer``.

    Progress is printed as "Epoch N" roughly every 10% of the rows.

    :param cache_manager: object exposing ``add_to_cache(query, response)``
    :param df: DataFrame with ``question1`` and ``answer`` columns
    """
    # At least 1, so frames with fewer than 10 rows do not divide by zero.
    epoch_interval = max(1, int(df.shape[0] * 0.1))
    epoch = 1
    # Count rows by position rather than by index label, so progress is also
    # correct for frames whose index is not a 0-based integer range.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        cache_manager.add_to_cache(row['question1'], row['answer'])
        if position % epoch_interval == 0:
            print(f"Epoch {epoch}")
            epoch += 1

# Function to test question2 and compute hit ratio based on 'is_duplicate' label
def test_question2_for_hits(cache_manager, df):
    total_questions = len(df)
    hit_count = 0
    duplicate_count = 0
    epoch = 1
    epoch_interval = int(df.shape[0] * 0.1)
    for idx, row in df.iterrows():
        query = row['question2']
        is_duplicate = row['is_duplicate']

        cached_response = cache_manager.get_from_cache(query, similarity_threshold=0.9)

        if cached_response:
            if is_duplicate == 1:
                hit_count += 1  # Correct hit for duplicate
            duplicate_count += 1

        if (idx + 1) % epoch_interval == 0:
          print(f"Epoch {epoch}")
          epoch += 1

    hit_ratio = hit_count / duplicate_count if duplicate_count > 0 else 0
    return hit_ratio

In [None]:
import time

# Cache question1 with 'answer' from the dataset.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time_question1 = time.perf_counter()
cache_question1_with_answer(cache_manager, df)
end_time_question1 = time.perf_counter()
print(f"Time for question1: {end_time_question1 - start_time_question1:.2f} seconds")


Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Time for question1: 215.19 seconds


In [None]:
# Test the hit ratio for question2 (check cache hits).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time_question2 = time.perf_counter()
hit_ratio = test_question2_for_hits(cache_manager, df)
end_time_question2 = time.perf_counter()
print(f"Time for question2: {end_time_question2 - start_time_question2:.2f} seconds")

print(f"Hit Ratio: {hit_ratio}")

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Time for question2: 290.08 seconds
Hit Ratio: 1.0
