In [17]:
import ast
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch
import gc
import numpy as np
from tqdm import tqdm

tqdm.pandas()

# RAG: Query based retrival using embeddings
### Overview
The goal of this step is to filter and rank articles based on their embedding similarity scores to a user-provided prompt. This is done using a pre-trained model to convert the prompt into embeddings, and then comparing these embeddings with the embeddings of articles stored in a DataFrame. <br>

The open source model is the same used before in the preprocessing step, which is [`jinaai/jina-embeddings-v3`](https://huggingface.co/jinaai/jina-embeddings-v3).

#### Possibile future improvements
The current filtering approach based on user queries can be expanded by incorporating additional criteria such as number of downloads, likes, or publication date. This a simple implementation which does not require any computational resources.<br>
Moreover, the retrieval performance can be improved by experimenting with alternative similarity metrics such as cosine similarity or Euclidean distance or trying with another model.


In [18]:
embedding_model = "jinaai/jina-embeddings-v3"

def convert_prompt_to_embedding(prompt):
    """
    Converts text prompt to embeddings using a pre-trained model.

    Args:
        prompt (str): Input text to convert to embeddings

    Returns:
        numpy.ndarray: Embedding vector representation of the input text
    """
    model = AutoModel.from_pretrained(embedding_model, trust_remote_code=True).to("cpu") # for now cpu
    embedding = model.encode(prompt, task="text-matching")
    torch.cuda.empty_cache()
    gc.collect()
    return embedding

def compute_score(embeddings, prompt):
    models_embedding = np.array(embeddings)
    return np.dot(models_embedding, prompt)

def filter_by_score(data, prompt, range=10):
    """
    Filters and ranks articles based on embedding similarity scores.

    Args:
        data (pd.DataFrame): DataFrame containing 'embeddings' column
        prompt (numpy.ndarray): Prompt embedding to compare against
        range (int, optional): Number of top articles to return. Defaults to 10.

    Returns:
        pd.DataFrame: Top N articles sorted by similarity score
    """

    data['embeddings'] = data['embeddings'].apply(lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x) #da sistemare

    data['score'] = data['embeddings'].progress_apply(lambda x: compute_score(x, prompt))
    data.sort_values(by='score', ascending=False, inplace=True)
    return data.head(range)


def filter_by_user_prompt(data, user_prompt):
    prompt_embedding = convert_prompt_to_embedding(user_prompt)
    data = filter_by_score(data, prompt_embedding)
    return data

In [19]:
df = pd.read_csv("huggingface_models_embeddings.csv")
df.head()

Unnamed: 0,model_id,base_model,author,readme_file,license,language,downloads,likes,tags,pipeline_tag,library_name,created_at,embeddings
0,ByteDance-Seed/BAGEL-7B-MoT,['Qwen/Qwen2.5-7B-Instruct'],ByteDance-Seed,🥯 BAGEL • Unified Model for Multimodal Underst...,apache-2.0,,8217,931,"bagel-mot, any-to-any, arxiv:2505.14683, base_...",any-to-any,bagel-mot,2025-05-19 23:27:50+00:00,"[0.10378886014223099, -0.051912061870098114, 0..."
1,deepseek-ai/DeepSeek-R1-0528,,deepseek-ai,DeepSeek-R1-0528\nPaper Link👁️\nIntroduction\n...,mit,,41622,1660,"transformers, safetensors, deepseek_v3, text-g...",text-generation,transformers,2025-05-28 09:46:42+00:00,"[0.1293579787015915, -0.03305066004395485, 0.0..."
2,deepseek-ai/DeepSeek-R1-0528-Qwen3-8B,,deepseek-ai,DeepSeek-R1-0528\nPaper Link👁️\nIntroduction\n...,mit,,55792,603,"transformers, safetensors, qwen3, text-generat...",text-generation,transformers,2025-05-29 11:07:47+00:00,"[0.11969968676567078, -0.029439039528369904, 0..."
3,ResembleAI/chatterbox,,ResembleAI,Chatterbox TTS\nMade with ❤️ by\nWe're excit...,mit,['en'],0,522,"chatterbox, text-to-speech, speech generation,...",text-to-speech,chatterbox,2025-04-24 12:03:33+00:00,"[0.24664218723773956, 0.07128866761922836, 0.0..."
4,google/gemma-3n-E4B-it-litert-preview,,google,[!Note]\nThis repository corresponds to the Pr...,gemma,,0,827,"image-text-to-text, arxiv:1905.07830, arxiv:19...",image-text-to-text,,2025-05-18 19:24:14+00:00,"[0.12829765677452087, -0.1102091372013092, 0.0..."


In [20]:
user_prompts = [
    "What is the best model for text generation?",
    "Which model should I use for sentiment analysis?",
    "Find top models for image classification.",
    "Best models for summarization tasks?",
    "What are the newest models for code generation?",
    "Top-performing models for question answering?",
    "Which models support Italian language?",
    "Best lightweight models for mobile deployment.",
    "Which models are most popular on Hugging Face?",
    "Find models optimized for speed and low latency."
]


# Log results
with open("rag_query_results_log.txt", "w", encoding="utf-8") as log_file:
    for i, prompt in enumerate(user_prompts, 1):
        log_file.write(f"\n--- Query {i}: {prompt} ---\n")
        filtered_df = filter_by_user_prompt(df, prompt)
        for _, row in filtered_df.iterrows():
            log_file.write(f"\n{row['model_id']} | score: {row['score']:.4f}\n")
            log_file.write(f"Author: {row['author']}\n")
            log_file.write(f"Pipeline Tag: {row['pipeline_tag']}\n")
            # divider
            log_file.write("-" * 50 + "\n")
        log_file.write("\n")

100%|██████████| 1086/1086 [00:00<00:00, 528730.60it/s]
100%|██████████| 1086/1086 [00:00<?, ?it/s]
100%|██████████| 1086/1086 [00:00<00:00, 42905.05it/s]
100%|██████████| 1086/1086 [00:00<00:00, 48051.21it/s]
100%|██████████| 1086/1086 [00:00<00:00, 108966.42it/s]
100%|██████████| 1086/1086 [00:00<00:00, 175971.19it/s]
100%|██████████| 1086/1086 [00:00<00:00, 134799.63it/s]
100%|██████████| 1086/1086 [00:00<00:00, 216895.11it/s]
100%|██████████| 1086/1086 [00:00<00:00, 108292.86it/s]
100%|██████████| 1086/1086 [00:00<00:00, 166435.77it/s]
