In [None]:
# Toy knowledge base for the retrieval demo: each entry is a one-sentence
# activity suggestion. The retrieval functions in later cells pick the entry
# most similar to the user's query.
corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new about history, art, or science.",
    "Attend a live music concert and feel the rhythm of your favorite genre, be it rock, jazz, or classical.",
    "Go for a hike in nature and admire the beauty of the natural scenery, from mountains and forests to deserts and beaches.",
    "Have a picnic with friends and share some laughs, good food, and great company in the outdoors.",
    "Explore a new cuisine by dining at an ethnic restaurant and tantalize your taste buds with exotic flavors.",
    "Take a yoga class and stretch your body and mind, promoting relaxation and inner peace.",
    "Join a local sports league and enjoy some friendly competition, while getting exercise and socializing with others who share your passion for the sport.",
    "Attend a workshop or lecture on a topic you're interested in, to expand your knowledge and gain new skills.",
    "Visit an amusement park and experience the thrill of riding roller coasters, bumper cars, and other exciting attractions.",
    "Go stargazing on a clear night and marvel at the wonders of the universe.",
    "Volunteer at a local charity and give back to your community.",
    "Learn a new language and open yourself up to new cultures and experiences.",
    "Take a road trip and explore new places, experiencing the beauty and diversity of the world around you.",
    "Go camping under the stars and reconnect with nature.",
    "Read a book and get lost in a captivating story.",
    "Binge-watch a tv show or movie series and enjoy a relaxing escape.",
    "Try a new hobby or activity, like painting, pottery, or playing a musical instrument.",
    "Spend time with loved ones and create lasting memories.",
    "Take a relaxing bath and unwind after a long day."
]

In [None]:
import numpy as np
from collections import Counter

In [None]:
def cosine_similarity(query, document):
    """Return the bag-of-words cosine similarity between two strings.

    Both texts are lower-cased and split on whitespace (punctuation stays
    attached to its token), turned into term-count vectors over their shared
    vocabulary, and compared by the cosine of the angle between the vectors.

    Args:
        query: The query text.
        document: The document text to compare against.

    Returns:
        The cosine similarity of the two count vectors, or ``0.0`` when either
        text contains no tokens.
    """
    # Term frequencies for each side.
    q_tf = Counter(query.lower().split())
    d_tf = Counter(document.lower().split())

    # Shared vocabulary; a Counter yields 0 for terms it has not seen.
    vocab = sorted(q_tf.keys() | d_tf.keys())
    q_vec = np.array([q_tf[term] for term in vocab])
    d_vec = np.array([d_tf[term] for term in vocab])

    # Vector lengths; a zero length means that side had no tokens at all.
    q_len = np.linalg.norm(q_vec)
    d_len = np.linalg.norm(d_vec)

    if q_len != 0 and d_len != 0:
        return np.dot(q_vec, d_vec) / (q_len * d_len)
    return 0.0  # cosine is undefined for a zero vector; treat as "no similarity"

In [None]:
def return_response(query, corpus):
    """Return the document in ``corpus`` that is most similar to ``query``.

    Similarity is scored with the two-string ``cosine_similarity(query, document)``
    helper defined earlier in this notebook. Note that a later cell imports
    scikit-learn's ``cosine_similarity`` under the same name; if that import has
    run, re-run the cell defining the bag-of-words helper before calling this.

    Args:
        query: The user's query text.
        corpus: A non-empty collection of document strings.

    Returns:
        The highest-scoring document; on ties, the one that appears first.

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    if not corpus:
        raise ValueError("corpus must contain at least one document")
    # Score against the arguments actually passed in (not notebook globals).
    # `max` with a key returns the first maximal element, so ties resolve to
    # the earliest document.
    return max(corpus, key=lambda doc: cosine_similarity(query, doc))

In [None]:
# Example query; a later cell also reads `user_input` when formatting the prompt.
user_input = "I like to hike"
# Bare last expression so the notebook displays the best-matching document.
return_response(user_input, corpus_of_documents)

In [None]:
# Retrieval step: pick the corpus entry most similar to the user's query.
relevant_document = return_response(user_input, corpus_of_documents)
full_response = []  # NOTE(review): not used in the visible cells — confirm whether it is still needed

# Prompt template for the generation step. The `{relevant_document}` and
# `{user_input}` placeholders are filled in later with `prompt.format(...)`.
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

In [None]:
!pip install xformers==0.0.25 numpy==1.23.5 pyarrow==14.0.1 fsspec==2023.6.0 torch -q
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -q
!pip install --no-deps trl peft accelerate bitsandbytes -q

In [None]:
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
import torch

# Loading configuration passed straight through to `FastLanguageModel.from_pretrained`.
max_seq_length = 2048 # Maximum sequence length; any value can be chosen (Unsloth handles RoPE scaling internally).
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Set to True to load with 4-bit quantization and reduce memory usage; disabled here.

# Load the model weights and matching tokenizer by model name.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "glouriousgautam/Qwen2-1.5b-oasstguanaco-qdora-merged",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
from unsloth.chat_templates import get_chat_template

# Wrap the tokenizer with a ChatML chat template that accepts ShareGPT-style
# message dicts ("from"/"value" keys, "human"/"gpt" roles).
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead (presumably the tokenizer's own EOS token — confirm against the Unsloth docs)
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Smoke test: a single-turn conversation unrelated to the retrieval demo,
# used only to check that generation works end to end.
messages = [
    {"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
# Render the conversation through the chat template and tokenize it; the
# resulting tensor is moved to the GPU (a CUDA device is required).
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
# Bare last expression so the notebook displays the decoded output.
tokenizer.batch_decode(outputs)

In [None]:
# Generation step of the RAG demo: fill the prompt template with the user's
# query and the retrieved document, then send it as a single human turn.
messages = [
    {"from": "human", "value": prompt.format(user_input=user_input, relevant_document=relevant_document)},
]
# Render through the chat template, tokenize, and move to the GPU (CUDA required).
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Stream output through a TextStreamer during generation instead of decoding
# at the end; the return value of `generate` is therefore discarded.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

### Method 2: TF-IDF retrieval with scikit-learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List

In [None]:
def get_top_similar_documents(query: str, corpus: List[str], top_n: int = 1) -> List[str]:
    """Return the ``top_n`` corpus documents most similar to ``query`` under TF-IDF.

    The corpus is vectorized with a ``TfidfVectorizer`` (English stop words
    removed), the query is projected into the same space, and documents are
    ranked by cosine similarity to the query.

    Args:
        query: The user's query text.
        corpus: The documents to search.
        top_n: Maximum number of documents to return.

    Returns:
        Up to ``top_n`` original (un-preprocessed) documents, most similar
        first. Ties keep corpus order. Returns an empty list when ``top_n`` is
        not positive or ``corpus`` is empty.
    """
    # Guard first: with top_n == 0 the slice `[-0:]` would select *every*
    # document, and an empty corpus cannot be vectorized.
    if top_n <= 0 or not corpus:
        return []

    # Imported locally under an alias because this notebook also defines its
    # own two-string `cosine_similarity`; whichever cell ran last owns the
    # global name, so do not depend on it here.
    from sklearn.metrics.pairwise import cosine_similarity as tfidf_cosine_similarity

    # Preprocess the query and corpus
    preprocessed_query = query.lower().strip()
    preprocessed_corpus = [doc.lower().strip() for doc in corpus]

    # Fit the vocabulary on the corpus only, then project the query into it.
    vectorizer = TfidfVectorizer(stop_words='english')
    corpus_vectors = vectorizer.fit_transform(preprocessed_corpus)
    query_vector = vectorizer.transform([preprocessed_query])

    # Compute cosine similarities (one row for the query -> flatten to 1-D).
    similarities = tfidf_cosine_similarity(query_vector, corpus_vectors).flatten()
    # Stable sort on the negated scores: descending similarity, with ties
    # resolved deterministically in favour of the earlier document.
    top_indices = (-similarities).argsort(kind="stable")[:top_n]

    return [corpus[i] for i in top_indices]

In [None]:
# Example: retrieve the single best TF-IDF match (default top_n=1) for a sample question.
get_top_similar_documents("What is a leisure activity that you like?", corpus_of_documents)