Importing Libraries

In [None]:
# Standard library
import html
import json
import os
import re
import sys

# Numerical operations and cosine similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# LLM and embedding clients
import google.generativeai as genai
import openai
from sentence_transformers import SentenceTransformer

# HTTP requests
import requests

# Web application framework
import streamlit as st

# Tokenization and stopwords for natural language processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Base LLM

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Device the model and its inputs are moved onto (e.g., "cuda" for GPU)
device = "cuda"

# Load the pre-trained model and tokenizer from the Mistral-7B-Instruct-v0.1 checkpoint
base_LLM = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

############ User input ############
user_input = "what is your knowledge cutoff?"

# Tokenize the user input; the result is a mapping holding the tokenized
# inputs as PyTorch tensors (return_tensors="pt")
encoded_user_input = tokenizer(user_input, return_tensors="pt")

# Move the model inputs to the specified device (e.g., GPU)
model_inputs = encoded_user_input.to(device)

# Move the entire model to the specified device (e.g., GPU)
base_LLM.to(device)

# Generate a response. The tokenizer output is a mapping, not a tensor, so it
# must be unpacked into keyword arguments rather than passed as `inputs=`.
output = base_LLM.generate(**model_inputs, do_sample=True, max_new_tokens=2048)

# Decode and print the generated response
decoded_response = tokenizer.decode(output[0])
print(decoded_response)

Categories

In [None]:
# Category name -> one-sentence description of that category.
# The insertion order matters: other cells map a similarity index back to a
# category name by position in this dictionary.
categories_data = dict(
    sports_stats='Statistics and performance metrics of professional athletes and sports personalities',
    entertainment_stats='Statistical insights into the achievements and activities of personalities in the entertainment industry',
    current_events='Up-to-the-minute coverage of breaking news on global current events',
    entertainment_updates='Latest developments, news, and highlights about your favorite movies, TV shows, and celebrities',
    research_papers='Exploration of research papers and scholarly articles on various topics',
)

Semantic Search LLM

In [None]:
# Import the SentenceTransformer class from the sentence_transformers library
from sentence_transformers import SentenceTransformer

# Name of the pre-trained sentence-embedding checkpoint
model_HF_name = "Sakil/sentence_similarity_semantic_search"

# Build the SentenceTransformer model from that checkpoint
model_HF = SentenceTransformer(model_HF_name)

# Embed every category description (in dictionary order) and keep the result
# as a tensor (convert_to_tensor=True)
categories_data_embeddings_hf_ = model_HF.encode(
    [*categories_data.values()],
    convert_to_tensor=True,
)

Gemini MultiModel Similarity Search

In [None]:
# importing google.generativeai as genai
import google.generativeai as genai

# Configure the Gemini client. The key is read from the GOOGLE_API_KEY
# environment variable so it never has to be pasted into the notebook; the
# original placeholder is kept as the fallback when the variable is unset.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "Your_GEMINI_API_KEY"))

# Embedding helper backed by the Gemini embedding endpoint
def model_MM(input_text):
    """Embed `input_text` with the Gemini model ``models/embedding-001``.

    Args:
        input_text: Content forwarded unchanged to ``genai.embed_content``
            (the notebook passes lists of strings).

    Returns:
        The value stored under the ``'embedding'`` key of the API response.
    """
    request = {
        "model": "models/embedding-001",
        "content": input_text,
        "task_type": "retrieval_document",
        "title": "Embedding of inputs",
    }
    response = genai.embed_content(**request)
    return response['embedding']

# Embed every category description (in dictionary order) with the Gemini model
categories_data_embeddings_MM_ = model_MM([*categories_data.values()])

Cosine Similarity

In [None]:
# Importing required libraries
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _embeddings_to_numpy(embeddings):
    """Return `embeddings` as a NumPy array that scikit-learn can consume.

    Tensor-like objects (anything exposing ``detach``) are detached, copied to
    host memory and converted; everything else goes through ``np.asarray``.
    """
    if hasattr(embeddings, "detach"):
        embeddings = embeddings.detach().cpu().numpy()
    return np.asarray(embeddings)


def find_best_result(user_prompt, embedding_model_MM, embedding_model_hf, categories_data_embeddings_MM, categories_data_embeddings_hf):
    """Pick the best-matching category for `user_prompt` with two embedding models.

    Args:
        user_prompt: Text to classify.
        embedding_model_MM: Callable taking a list of strings and returning
            their embeddings.
        embedding_model_hf: Object with an ``encode(list_of_str, convert_to_tensor=True)``
            method returning the embeddings.
        categories_data_embeddings_MM: Category embeddings produced by the MM
            model, one row per entry of ``categories_data``.
        categories_data_embeddings_hf: Category embeddings produced by the HF
            model, one row per entry of ``categories_data``.

    Returns:
        Tuple ``(best_result_MM, best_result_hf)`` holding the key of
        ``categories_data`` with the highest cosine similarity under each model.
    """
    # Convert user prompt to embedding vectors (one row each)
    user_prompt_embedding_MM = embedding_model_MM([user_prompt])
    user_prompt_embedding_hf = embedding_model_hf.encode([user_prompt], convert_to_tensor=True)

    # scikit-learn needs host-side arrays; tensors (possibly living on the GPU)
    # are converted first so cosine_similarity does not fail on them.
    similarity_scores_MM = cosine_similarity(
        _embeddings_to_numpy(user_prompt_embedding_MM),
        _embeddings_to_numpy(categories_data_embeddings_MM),
    )
    similarity_scores_hf = cosine_similarity(
        _embeddings_to_numpy(user_prompt_embedding_hf),
        _embeddings_to_numpy(categories_data_embeddings_hf),
    )

    # Index of the highest score for each model
    best_result_index_MM = np.argmax(similarity_scores_MM)
    best_result_index_hf = np.argmax(similarity_scores_hf)

    # Map each index back to its category name (dictionary order)
    category_names = list(categories_data.keys())
    best_result_MM = category_names[best_result_index_MM]
    best_result_hf = category_names[best_result_index_hf]

    return best_result_MM, best_result_hf

Finding Categories for a given text

In [None]:
# Prompt to classify
user_prompt = ""

# Classify the prompt with both embedding back-ends
best_result_MM, best_result_hf = find_best_result(
    user_prompt,
    embedding_model_MM=model_MM,
    embedding_model_hf=model_HF,
    categories_data_embeddings_MM=categories_data_embeddings_MM_,
    categories_data_embeddings_hf=categories_data_embeddings_hf_,
)

# Show the category chosen by each model
print("Best result from MM model:", best_result_MM)
print("Best result from HF model:", best_result_hf)

ArXiv API

In [None]:
# Import the 'requests' module for making HTTP requests and the 're' module for regular expressions.
import requests
import re

# Query the ArXiv API and return the summaries of the matching entries as one string.
def query_arxiv(search_query, start=0, max_results=2, timeout=30):
    """Search ArXiv and merge the summaries of the returned entries.

    Args:
        search_query: Free-text query; sent to the API as ``all:<search_query>``.
        start: Value of the ``start`` query parameter (offset of the first result).
        max_results: Value of the ``max_results`` query parameter.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        The text of every ``<summary>`` element in the response, with newlines
        and tabs replaced by spaces and each summary trimmed, joined by single
        spaces. An empty string when the response contains no summaries.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    # Base URL for the ArXiv API.
    base_url = "http://export.arxiv.org/api/query?"

    query_params = {
        'search_query': f'all:{search_query}',
        'start': start,
        'max_results': max_results,
    }

    response = requests.get(base_url, params=query_params, timeout=timeout)

    # Fail loudly on HTTP errors instead of silently parsing an error page
    # into an empty summary.
    response.raise_for_status()

    # Extract the content between '<summary>' tags; DOTALL lets a summary span lines.
    summaries = re.findall('<summary>(.*?)</summary>', response.text, re.DOTALL)

    # Flatten each summary onto one line, trim it, and merge them all.
    merged_summary = ' '.join(
        summary.replace('\n', ' ').replace('\t', ' ').strip() for summary in summaries
    )

    return merged_summary


Extracting Entities

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def extract_keywords(text):
    """Reduce `text` to a space-separated string of keywords.

    The text is tokenized with ``word_tokenize``, English stopwords are dropped
    (case-insensitively), and the characters ``? ! , . '`` are removed from the
    remaining tokens. Tokens that become empty are discarded, so the result
    never contains doubled or leading/trailing spaces.

    Args:
        text: Input string.

    Returns:
        The surviving tokens joined by single spaces; ``""`` when none survive.
    """
    # Build the stopword set once instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    keywords = []
    for token in word_tokenize(text):
        if token.lower() in english_stopwords:
            continue
        # Remove ? ! , . ' from the token
        cleaned = re.sub(r"[?!,.']", '', token)
        if cleaned:
            keywords.append(cleaned)

    return ' '.join(keywords)

# Demo call: the returned keyword string is shown as the cell's output
prompt = ""
extract_keywords(text=prompt)

Bing News API

In [None]:
import json
import requests
import re

def get_current_information(query, timeout=30):
    """Fetch news descriptions for `query` from the Bing News Search API.

    Args:
        query: Search terms sent as the ``q`` parameter.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up.

    Returns:
        The ``description`` of every item under the response's ``value`` key,
        one per line, with HTML tags and literal ``...`` removed and HTML
        entities decoded. Items without a description contribute an empty line.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    search_url = "https://api.bing.microsoft.com/v7.0/news/search"

    # Read the key from the environment so it is not stored in the notebook;
    # the original placeholder remains the fallback when the variable is unset.
    subscription_key = os.environ.get("BING_SEARCH_V7_SUBSCRIPTION_KEY", "YOUR_BING_API_KEY")

    # Make the API request
    response = requests.get(
        search_url,
        headers={"Ocp-Apim-Subscription-Key": subscription_key},
        params={"q": query, "textDecorations": True, "textFormat": "HTML"},
        timeout=timeout,
    )
    response.raise_for_status()

    # Extract descriptions from the response (missing or null -> empty string)
    articles = response.json().get('value', [])
    descriptions = [article.get('description') or '' for article in articles]

    # Strip HTML tags, drop literal ellipses, then decode every HTML entity
    # (e.g. &#39;, &quot;, &amp;) rather than only the apostrophe.
    cleaned_description = re.sub(r'<.*?>', '', '\n'.join(descriptions))
    cleaned_description = cleaned_description.replace('...', '')
    cleaned_description = html.unescape(cleaned_description)

    return cleaned_description

Cosine Similarity with Gemini MultiModel

In [None]:
def find_best_result_MM(user_prompt, embedding_model_MM, categories_data_embeddings_MM):
    """Return the category whose embedding is most similar to `user_prompt`.

    Args:
        user_prompt: Text to classify.
        embedding_model_MM: Callable taking a list of strings and returning
            their embeddings.
        categories_data_embeddings_MM: Category embeddings, one row per entry
            of ``categories_data`` in dictionary order.

    Returns:
        The key of ``categories_data`` with the highest cosine similarity.
    """
    # Embed the prompt as a single-row batch
    query_vectors = embedding_model_MM([user_prompt])

    # Score the prompt against every category
    scores = cosine_similarity(query_vectors, categories_data_embeddings_MM)

    # Translate the position of the top score into its category name
    category_names = list(categories_data.keys())
    top_index = np.argmax(scores)
    return category_names[top_index]

Classification with Gemini MultiModel and triggering the relevant API

In [None]:
user_prompt = ""

# Classify the prompt once and reuse the label, instead of calling the
# embedding service again for every branch.
category = find_best_result_MM(user_prompt, model_MM, categories_data_embeddings_MM_)

if category == 'research_papers':
    information = query_arxiv(user_prompt)
elif category == 'current_events':
    information = get_current_information(extract_keywords(user_prompt))
else:
    # No retrieval source is wired up for the remaining categories; without
    # this default `information` would be undefined below.
    information = ""

# Build the prompt: retrieved information followed by the question. When
# nothing was retrieved, send the question on its own.
if information:
    prompt_template = f'''
{information}

answer the following question in a detailed manner based on the above information
{user_prompt}
'''
else:
    prompt_template = user_prompt

# Tokenize the augmented prompt so the model actually sees it (the earlier
# `model_inputs` only holds the Base LLM cell's question), then move it to the
# same device as the model.
augmented_inputs = tokenizer(prompt_template, return_tensors="pt").to(device)

# Generate a response; the tokenizer output is a mapping and is unpacked into
# keyword arguments.
output = base_LLM.generate(**augmented_inputs, do_sample=True, max_new_tokens=2048)

# Decode and print the generated response
decoded_response = tokenizer.decode(output[0])
print(decoded_response)

Bye Bye