In [1]:
# import classes
from classes.QuerySimilaritySearch import QuerySimilaritySearch
from classes.OpenaiConnector import OpenaiConnector
from classes.ChatSimilarity import ChatSimilarity

In [None]:
openai_api_key = '' #Enter your API Key

In [2]:
chat_similarity = ChatSimilarity(
                    "data/chat_data/conversations.json",
                      openai_api_key,)

### Index Generation

In [3]:
# Generate similar queries and embeddings for the supplied chats
chat_similarity.generate_similar_queries() #this function takes a few mins to run
# import copy
# chat_thread_similar_queries = copy.deepcopy(chat_similarity.chat_thread_similar_queries)
# chat_similarity.chat_thread_similar_queries = copy.deepcopy(chat_thread_similar_queries)

  0%|          | 0/8 [00:00<?, ?it/s]

In [4]:
# let us look at the chat threads
chat_similarity.chat_thread_similar_queries.keys()

dict_keys(['International Travel', 'Nas Top Tracks', "Backpacking in India's Beauty", 'Movie Recommendations Assistant', 'Help Find Missing Beagle', 'Center Aligned Table', 'Resume Modification - Consulting/Marketing', 'Cover Letter Assistance Request'])

In [5]:
# queries for backpacking in India
chat_similarity.chat_thread_similar_queries['Backpacking in India\'s Beauty']['queries']

['Can you recommend any offbeat natural destinations to explore?',
 'Which season would you suggest for trekking in the Himalayas?',
 'Are there any national parks or wildlife reserves worth visiting?',
 "Is sustainable tourism a priority in the places you've recommended?",
 'Please tell me about the vegan food scene in these regions.',
 'Are there any opportunities for volunteering or community work in these areas?',
 'How do I prepare myself for the high altitudes while trekking?',
 'What are the must-visit spiritual sites for backpackers?',
 "I'm interested in adventure sports. Any suggestions?",
 'Can you suggest eco-friendly accommodations or homestays?',
 'If I have extra days, which additional',
 'Are there any off-the-beaten-track destinations in India that offer a mix of natural beauty and vibrant backpacker culture?',
 'Do you have any recommendations for eco-friendly places with scenic landscapes in India?',
 'Have you heard of any lesser-known places in North India that are

In [6]:
# now that the embeddings have been generated, we need to prepare it to be fed to FAISS
chat_similarity.prepare_embeddings_for_faiss()

  0%|          | 0/8 [00:00<?, ?it/s]

In [7]:
# Create a vector DB for similar queries using FAISS
faiss_index = QuerySimilaritySearch(chat_similarity.embedding_matrix,chat_similarity.index_mapping,'cosine_similarity')
# going with a flat cosine-similarity index. Can switch to hnsw when required 

### Testing

The following cells will test the model's predictions. Please note a line is printed before retuning the chat-thread.
This line informs the user how many candidates were returned by the high recall model, before sending it to the high precision model

In [8]:
user_query = '''I want to watch a movie tonight''' # Expected Output : Movie Recommendations Assistant
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

Movie Recommendations Assistant is the single candidate


'Movie Recommendations Assistant'

In [9]:
user_query = '''Help me plan a trip''' # Expected Output : Backpacking in India's Beauty
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

multiple candidate : ['International Travel', "Backpacking in India's Beauty"]


  0%|          | 0/2 [00:00<?, ?it/s]

'International Travel'

In [10]:
user_query = '''Help me plan an international trip''' # Expected Output : International Travel
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

multiple candidate : ['International Travel', "Backpacking in India's Beauty"]


  0%|          | 0/2 [00:00<?, ?it/s]

'International Travel'

In [11]:
user_query = '''Plan an iteniary for Hungary''' # Expected Output : International Travel
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

multiple candidate : ['International Travel', "Backpacking in India's Beauty"]


  0%|          | 0/2 [00:00<?, ?it/s]

'International Travel'

In [13]:
user_query = '''Help me modify my CV''' # expected output: Resume Modification - Consulting/Marketing
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

multiple candidate : ['Resume Modification - Consulting/Marketing', 'Cover Letter Assistance Request']


  0%|          | 0/2 [00:00<?, ?it/s]

'Resume Modification - Consulting/Marketing'

In [15]:
user_query = '''Help me modify my cover letter''' # expected output: Cover Letter Assistance Request
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

multiple candidate : ['Resume Modification - Consulting/Marketing', 'Cover Letter Assistance Request']


  0%|          | 0/2 [00:00<?, ?it/s]

'Cover Letter Assistance Request'

In [16]:
user_query = '''My cat is missing since the past week''' # expected output: Help Find Missing Beagle
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

Help Find Missing Beagle is the single candidate


'Help Find Missing Beagle'

In [17]:
user_query = '''What are the trekking options around bangalore''' # expected output: "Backpacking in India's Beauty"
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

Backpacking in India's Beauty is the single candidate


"Backpacking in India's Beauty"

In [18]:
user_query = '''I want to play rugby''' # expected output: None
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

Backpacking in India's Beauty is the single candidate


"Backpacking in India's Beauty"

In [19]:
user_query = '''Convert the following python function into an equation in latex: 
                
                def quadratic(x,y):
                    return x^2 + (2*x*y) + y^2
                '''
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)
'''
This query can be assigned to Center Aligned Table as I was using that thread to create latex tables
However, as there was no python code anywhere in the thread, the model fails to get this right
'''

No chat threads are similar


'\nThis query can be assigned to Center Aligned Table as I was using that thread to create latex tables\nHowever, as there was no python code anywhere in the thread, the model fails to get this right\n'

In [20]:
user_query = '''Format the my equation into latex'''
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)
'''
This query is correctly assigned to  Center Aligned Table as I was using that thread to create latex tables
'''

Center Aligned Table is the single candidate


'\nThis query is correctly assigned to  Center Aligned Table as I was using that thread to create latex tables\n'

In [21]:
user_query = '''Where should I do my Scuba from? Andamans or Ko Tao
                '''
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)

'''
Expected Output : International Travel. Conflicting between 'International Travel' and Backpacking in India's Beauty
However, it makes sense to classify this under international travel. Another valid response would be None
'''


multiple candidate : ['International Travel', "Backpacking in India's Beauty"]


  0%|          | 0/2 [00:00<?, ?it/s]

"\nExpected Output : International Travel. Conflicting between 'International Travel' and Backpacking in India's Beauty\nHowever, it makes sense to classify this under international travel. Another valid response would be None\n"

In [22]:
user_query = '''Which are the best Safari spots in the world''' # Expected Output : International Travel 
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)
'''
Expected Output : International Travel. Conflicting between 'International Travel' and Backpacking in India's Beauty
However, it makes sense to classify this under international travel as the query mention's the world
'''


multiple candidate : ['International Travel', "Backpacking in India's Beauty", 'Cover Letter Assistance Request']


  0%|          | 0/3 [00:00<?, ?it/s]

"\nExpected Output : International Travel. Conflicting between 'International Travel' and Backpacking in India's Beauty\nHowever, it makes sense to classify this under international travel as the query mention's the world\n"

In [23]:
user_query = '''Tell me more about animal communicators'''
similar_chat_titles = chat_similarity.get_similar_chats(user_query,faiss_index)
chat_similarity.return_most_similar_chat(user_query,similar_chat_titles)
'''
Animal communicators assist in finding missing dogs
'''

Help Find Missing Beagle is the single candidate


'\nAnimal communicators assist in finding missing dogs\n'

### Improvements & exploration

1. experiment with passing ChatSimilarity.relevence_comparision_prompt to the system message prompt
2. Prompts can be optimized to run on smaller models?
3. Found an interesting example in [OpenAI's codebook repo](https://github.com/openai/openai-cookbook/blob/main/examples/Semantic_text_search_using_embeddings.ipynb)
4. [Using Cross encoders for reranking](https://weaviate.io/blog/cross-encoders-as-reranker)