In [None]:
import ast
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch
import gc
import numpy as np
from tqdm import tqdm

tqdm.pandas()

# RAG: Query-based retrieval using embeddings
### Overview
The goal of this step is to filter and rank articles based on their embedding similarity scores to a user-provided prompt. This is done using a pre-trained model to convert the prompt into embeddings, and then comparing these embeddings with the embeddings of articles stored in a DataFrame. <br>

The open source model is the same used before in the preprocessing step, which is [`jinaai/jina-embeddings-v3`](https://huggingface.co/jinaai/jina-embeddings-v3).

#### Possible future improvements
The current filtering approach based on user queries can be expanded by incorporating additional criteria such as number of downloads, likes, or publication date. This is a simple implementation which does not require any additional computational resources.<br>
Moreover, the retrieval performance can be improved by experimenting with alternative similarity metrics, such as cosine similarity or Euclidean distance, or by trying another embedding model.


In [None]:
# System message sent to the language model in filter_by_user_prompt; it asks for a bare 'Yes' or 'No' verdict.
system_content_prompt = """
As an intelligent language model, your role is to accurately determine whether the provided data is relevant to the user's query.
Answer ONLY with 'Yes' or 'No'
"""
# Hugging Face model id passed to AutoTokenizer / AutoModelForCausalLM in filter_by_user_prompt.
language_model = "meta-llama/Llama-3.1-8B-Instruct"
# Hugging Face model id passed to AutoModel in convert_prompt_to_embedding.
embedding_model = "jinaai/jina-embeddings-v3"


def convert_prompt_to_embedding(prompt):
    """
    Converts text prompt to embeddings using a pre-trained model.

    The embedding model named by `embedding_model` is loaded on the CPU for
    the duration of the call and released again before returning, even when
    encoding fails.

    Args:
        prompt (str): Input text to convert to embeddings

    Returns:
        numpy.ndarray: Embedding vector representation of the input text
    """
    model = AutoModel.from_pretrained(embedding_model, trust_remote_code=True).to("cpu")  # for now cpu
    try:
        embedding = model.encode(prompt, task="text-matching")
    finally:
        # Drop the only reference first: while `model` is still bound, the
        # collector below has nothing it is allowed to reclaim.
        del model
        gc.collect()
        torch.cuda.empty_cache()
    return embedding


def compute_score(embeddings, prompt):
    """
    Computes the dot-product similarity between stored embeddings and a prompt embedding.

    Args:
        embeddings (array-like): One embedding vector, or a stack of them
        prompt (array-like): Prompt embedding to compare against

    Returns:
        The result of ``np.dot`` between the embeddings and the prompt
    """
    return np.dot(np.array(embeddings), prompt)


def filter_by_score(data, prompt, range=10):
    """
    Filters and ranks models based on embedding similarity scores.

    The ranking is built on a new DataFrame: the caller's frame is neither
    re-sorted nor given a 'score' column. The only change written back to
    `data` is the one-time parsing of string-encoded embeddings into arrays,
    so that repeated queries on the same frame do not parse them again.

    Args:
        data (pd.DataFrame): DataFrame containing 'embeddings' column
        prompt (numpy.ndarray): Prompt embedding to compare against
        range (int, optional): Number of top models to return. Defaults to 10.

    Returns:
        pd.DataFrame: Top N articles sorted by similarity score
    """
    # Embeddings may be stored as string literals (e.g. after a CSV round trip).
    # TODO: tidy up this conversion (flagged as "to be fixed" by the original author).
    if data['embeddings'].map(lambda value: isinstance(value, str)).any():
        data['embeddings'] = data['embeddings'].apply(
            lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x)

    scores = data['embeddings'].progress_apply(lambda x: compute_score(x, prompt))

    # assign + sort_values both return new frames, leaving the caller's row order untouched.
    ranked = data.assign(score=scores).sort_values(by='score', ascending=False)
    return ranked.head(range)


# Fields concatenated (in this order, newline-separated) for each mode.
_MODEL_CONTENT_FIELDS = (
    'model_id', 'base_model', 'author', 'readme_file', 'license',
    'language', 'tags', 'pipeline_tag', 'library_name',
)
_DATASET_CONTENT_FIELDS = (
    'dataset_id', 'author', 'readme_file', 'tags', 'language',
    'license', 'multilinguality', 'size_categories', 'task-categories',
)


def _field_as_text(value):
    """Render one field as text, turning missing values (None/NaN) into an empty string."""
    if isinstance(value, pd.Series):
        # Column input: convert element by element and keep object dtype so "+" concatenates strings.
        return value.map(_field_as_text).astype(object)
    if value is None or (isinstance(value, float) and value != value):  # NaN != NaN
        return ""
    return str(value)


def build_user_content(data, mode="model"):
    """
    Builds the newline-separated description of an item that is shown to the language model.

    Missing values (None / NaN) are rendered as empty strings instead of
    raising a TypeError or turning the whole result into NaN.

    Args:
        data (pd.Series | pd.DataFrame | dict): One record, or a frame of records,
            holding the metadata fields for the chosen mode
        mode (str, optional): "model" or "dataset". Defaults to "model".

    Returns:
        str | pd.Series: The concatenated fields (a Series when `data` is a DataFrame),
        or an empty string for an unrecognised mode

    Raises:
        KeyError: If `data` lacks one of the fields required by `mode`
    """
    if mode == "model":
        fields = _MODEL_CONTENT_FIELDS
    elif mode == "dataset":
        fields = _DATASET_CONTENT_FIELDS
    else:
        return ""

    content = _field_as_text(data[fields[0]])
    for field in fields[1:]:
        content = content + "\n" + _field_as_text(data[field])
    return content


# Loaded (tokenizer, model) pairs keyed by model id, so repeated queries reuse one copy of the weights.
_LANGUAGE_MODEL_CACHE = {}


def _load_language_model():
    """Return the (tokenizer, model) pair for `language_model`, loading it only on first use."""
    if language_model not in _LANGUAGE_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(language_model, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            language_model,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map='auto'
        )
        _LANGUAGE_MODEL_CACHE[language_model] = (tokenizer, model)
    return _LANGUAGE_MODEL_CACHE[language_model]


def _row_as_text(row):
    """Copy one row into a dict of strings, with missing values (None/NaN) as empty strings."""
    text = {}
    for column, value in row.items():
        missing = value is None or (isinstance(value, float) and value != value)  # NaN != NaN
        text[column] = "" if missing else str(value)
    return text


def _is_relevant(tokenizer, model, user_prompt, item_description):
    """Ask the language model about one item; returns False only when its reply starts with 'No'."""
    messages = [
        {"role": "system", "content": system_content_prompt},
        # The model can only judge relevance if it sees the query next to the item.
        {"role": "user", "content": f"User query: {user_prompt}\n\nData:\n{item_description}"},
    ]
    # add_generation_prompt opens the assistant turn, so the model answers instead of
    # continuing the user message; templating and tokenising in one step avoids
    # adding the special tokens twice.
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_dict=True, return_tensors='pt'
    ).to(model.device)  # follow device_map='auto' rather than assuming 'cuda'

    # A Yes/No verdict needs only a few tokens; greedy decoding keeps it reproducible.
    generated_ids = model.generate(
        **inputs, max_new_tokens=5, do_sample=False, pad_token_id=tokenizer.eos_token_id)

    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_length = inputs['input_ids'].shape[1]
    answer = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens=True)
    return not answer.strip().lower().startswith("no")


def filter_by_user_prompt(data, user_prompt, mode="model"):
    """
    Retrieves the items most similar to a query and keeps those the language model deems relevant.

    The query is embedded, the top candidates are selected with filter_by_score,
    and each candidate is then shown to the language model together with the
    query. Candidates for which the model answers 'No' are discarded.

    Args:
        data (pd.DataFrame): Items with an 'embeddings' column and the metadata
            fields required by build_user_content for the given mode
        user_prompt (str): The user's natural-language query
        mode (str, optional): "model" or "dataset". Defaults to "model".

    Returns:
        pd.DataFrame: The relevant candidates, ordered by similarity score,
        with a fresh 0..n-1 index
    """
    prompt_embedding = convert_prompt_to_embedding(user_prompt)
    data = filter_by_score(data, prompt_embedding)

    tokenizer, model = _load_language_model()

    indices_to_remove = []
    for i, item in tqdm(data.iterrows(), total=len(data), desc="Evaluating data relevance"):
        # Describe the current row only (not the whole frame), with NaN fields blanked out.
        user_content = build_user_content(_row_as_text(item), mode)
        if not _is_relevant(tokenizer, model, user_prompt, user_content):
            indices_to_remove.append(i)

    # Drop once, after the loop: the collected labels refer to the index as it was
    # during iteration and become invalid as soon as the index is reset.
    data = data.drop(indices_to_remove).reset_index(drop=True)

    print(f"Shortlisted {len(data)} relevant items")

    return data

In [None]:
# Load the models table; filter_by_score reads its 'embeddings' column.
df = pd.read_csv("huggingface_models_embeddings.csv")
df.head()

### Example with models from Hugging Face

In [None]:
# Model prompts
user_prompts = [
    "What is the best model for text generation?",
    "Which model should I use for sentiment analysis?",
    "Find top models for image classification.",
    "Best models for summarization tasks?",
    "What are the newest models for code generation?",
    "Top-performing models for question answering?",
    "Which models support Italian language?",
    "Best lightweight models for mobile deployment.",
    "Which models are most popular on Hugging Face?",
    "Find models optimized for speed and low latency."
]

# Log results: one section per query, one entry per shortlisted model
with open("models_results_log.txt", "w", encoding="utf-8") as log_file:
    for i, prompt in enumerate(user_prompts, 1):
        log_file.write(f"\n--- Query {i}: {prompt} ---\n")
        filtered_df = filter_by_user_prompt(df, prompt, mode="model")
        for _, row in filtered_df.iterrows():
            # pandas reads an empty CSV cell as NaN (a float), which cannot be sliced.
            readme = row['readme_file'] if isinstance(row['readme_file'], str) else ""
            log_file.write(f"\n{row['model_id']} | score: {row['score']:.4f}\n")
            log_file.write(f"Author: {row['author']}\n")
            log_file.write(f"Pipeline Tag: {row['pipeline_tag']}\n")
            log_file.write(f"ReadmeFile, first 100 characters: {readme[:100]}\n")
            log_file.write("-" * 50 + "\n")
        log_file.write("\n")

### Example with datasets from Hugging Face

In [None]:
# Rebind `df` to the datasets table; from here on `df` no longer refers to the models table.
df = pd.read_csv("Output/datasets_hg_embeddings.csv")
print(df.shape)

In [None]:
# Preview the first rows of the datasets table.
df.head()

In [None]:
# Dataset prompts
user_prompts = [
    "Which datasets are best for sentiment analysis in Italian?",
    "Find large-scale datasets with multilingual support and open licenses.",
    "What are the top trending datasets for text summarization?",
    "Datasets with detailed README files and active contributors.",
    "Show datasets suitable for low-resource language modeling.",
    "Which datasets support image classification tasks and are under 1GB?",
    "List recently created datasets for question answering in biomedical domain.",
    "Find datasets curated for cross-lingual classification tasks with labeled examples and language identifiers.",
    "Find high-quality datasets for code generation with permissive licenses.",
    "List benchmark NLP datasets commonly used in academic research and model evaluation."
]

# Log results: one section per query, one entry per shortlisted dataset
with open("datasets_results_log.txt", "w", encoding="utf-8") as log_file:
    for i, prompt in enumerate(user_prompts, 1):
        log_file.write(f"\n--- Query {i}: {prompt} ---\n")
        filtered_df = filter_by_user_prompt(df, prompt, mode="dataset")
        for _, row in filtered_df.iterrows():
            # pandas reads an empty CSV cell as NaN (a float), which cannot be sliced.
            readme = row['readme_file'] if isinstance(row['readme_file'], str) else ""
            log_file.write(f"\n{row['dataset_id']} | score: {row['score']:.4f}\n")
            log_file.write(f"Author: {row['author']}\n")
            log_file.write(f"Task Categories: {row['task-categories']}\n")
            log_file.write(f"ReadmeFile, first 100 characters: {readme[:100]}\n")
            # divider
            log_file.write("-" * 50 + "\n")
        log_file.write("\n")