<a href="https://colab.research.google.com/github/Brunozml/artistotllm/blob/main/transformer_similarity_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Retrieving from GitHub**

In [1]:
"""
Transformer-based Text Similarity Analysis Tool

This module uses pre-trained sentence-transformer models to compute semantic similarity
between texts. Each text is encoded into a fixed-length embedding vector, and similarity
is measured as the cosine of the angle between the two embedding vectors.
"""
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import Tuple, Optional, List
import requests

def load_model(model_name: str = 'all-MiniLM-L6-v2') -> Optional["SentenceTransformer"]:
    """
    Load a Sentence-BERT model.

    Args:
        model_name: Name of the pre-trained model to load

    Returns:
        The loaded SentenceTransformer model, or None if loading failed.
        Failures are printed rather than raised, so callers must check
        the result for None before using it.
    """
    try:
        return SentenceTransformer(model_name)
    except Exception as e:
        # Deliberate best-effort boundary: any loading problem is reported
        # on stdout and signalled to the caller through a None return.
        print(f"Error loading model {model_name}: {e}")
        return None

def get_text_embedding(text: str, model: SentenceTransformer) -> np.ndarray:
    """
    Encode one text into its embedding vector.

    Args:
        text: The text to embed
        model: An already-loaded SentenceTransformer model

    Returns:
        The embedding as a numpy array. Text that is empty or only
        whitespace yields an all-zero vector whose length is the model's
        sentence embedding dimension.

    Raises:
        ValueError: If model is None
    """
    if model is None:
        raise ValueError("Model is not loaded properly")

    if text.strip():
        # encode() is given a one-element batch; unwrap the single row.
        encoded_batch = model.encode([text])
        return encoded_batch[0]

    # Nothing but whitespace: return the zero-vector placeholder.
    dimension = model.get_sentence_embedding_dimension()
    return np.zeros(dimension)

def transformer_similarity(text1: str, text2: str, model: Optional[SentenceTransformer] = None,
                         model_name: str = 'all-MiniLM-L6-v2') -> Tuple[float, dict]:
    """
    Compare two texts using transformer-based sentence embeddings.

    Args:
        text1: First text to compare
        text2: Second text to compare
        model: Pre-loaded model (optional, will load if None)
        model_name: Name of model to load if model is None. This value is
            also echoed in the returned metadata as-is; it is not read back
            from a pre-loaded model.

    Returns:
        Tuple of (similarity_score, metadata_dict). The score is the cosine
        similarity of the two embeddings as a plain Python float, or 0.0 when
        either embedding has zero norm. On failure the score is 0.0 and the
        metadata dict contains a single "error" entry instead of the
        embedding details.
    """
    # Load model if not provided
    if model is None:
        model = load_model(model_name)
        if model is None:
            return 0.0, {"error": "Failed to load model"}

    try:
        # Get embeddings
        embedding1 = get_text_embedding(text1, model)
        embedding2 = get_text_embedding(text2, model)

        # Handle zero vectors
        norm1 = np.linalg.norm(embedding1)
        norm2 = np.linalg.norm(embedding2)

        if norm1 == 0 or norm2 == 0:
            similarity = 0.0
        else:
            # Cosine similarity; cast so callers get the Python float the
            # signature promises rather than a NumPy scalar.
            similarity = float(np.dot(embedding1, embedding2) / (norm1 * norm2))

        return similarity, {
            "embedding_dim": len(embedding1),
            "model_name": model_name,
            "embedding1_sample": embedding1[:5].tolist(),
            "embedding2_sample": embedding2[:5].tolist(),
            "embedding1_norm": float(norm1),
            "embedding2_norm": float(norm2)
        }

    except Exception as e:
        # Best-effort contract: report the problem through the metadata
        # dict instead of raising.
        return 0.0, {"error": f"Error computing similarity: {e}"}

def batch_similarity(texts: List[str], model: Optional["SentenceTransformer"] = None,
                    model_name: str = 'all-MiniLM-L6-v2') -> Tuple[np.ndarray, dict]:
    """
    Compute pairwise cosine similarities for a list of texts.

    Args:
        texts: List of texts to compare
        model: Pre-loaded model (optional)
        model_name: Name of model to load if model is None; also echoed
            as-is in the returned metadata

    Returns:
        Tuple of (similarity_matrix, metadata_dict). The matrix is square
        with one row and column per text. Diagonal entries are always 1.0,
        and any off-diagonal pair involving a zero-norm embedding is 0.0.
        On failure an empty array is returned together with a metadata dict
        holding a single "error" entry.
    """
    if model is None:
        model = load_model(model_name)
        if model is None:
            return np.array([]), {"error": "Failed to load model"}

    try:
        # Get all embeddings at once (more efficient)
        embeddings = np.asarray(model.encode(texts))

        # Normalise each row once instead of recomputing both norms for
        # every (i, j) pair. Rows with zero norm stay all-zero so that every
        # similarity involving them comes out as 0.0.
        norms = np.linalg.norm(embeddings, axis=1)
        has_direction = norms != 0
        unit_vectors = np.zeros(embeddings.shape, dtype=np.float64)
        unit_vectors[has_direction] = (
            embeddings[has_direction] / norms[has_direction, np.newaxis]
        )

        # The dot product of unit vectors is the cosine similarity.
        similarity_matrix = unit_vectors @ unit_vectors.T

        # A text is defined to be fully similar to itself, even when its
        # embedding is the zero vector.
        np.fill_diagonal(similarity_matrix, 1.0)

        return similarity_matrix, {
            "num_texts": len(texts),
            "embedding_dim": embeddings.shape[1],
            "model_name": model_name
        }

    except Exception as e:
        # Best-effort contract: report the problem through the metadata
        # dict instead of raising.
        return np.array([]), {"error": f"Error computing batch similarity: {e}"}

def read_file(filepath: str, timeout: float = 30.0) -> Optional[str]:
    """Read text from a file (local or URL) and return its contents.

    Args:
        filepath: Either an http(s) URL or a path to a local file. Local
            files are decoded as UTF-8.
        timeout: Seconds to wait for the server when filepath is a URL.
            Ignored for local files.

    Returns:
        The text content, or None if it could not be read. Failures are
        printed rather than raised.
    """
    if filepath.startswith(('http://', 'https://')):
        # If it's a URL, use requests to fetch the content
        try:
            # Without a timeout, requests would wait indefinitely on an
            # unresponsive server.
            response = requests.get(filepath, timeout=timeout)
            response.raise_for_status()  # Raise an exception for bad status codes
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {filepath}: {e}")
            return None

    # Otherwise, treat it as a local file path
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: Local file not found at {filepath}")
        return None
    except Exception as e:
        print(f"Error reading local file {filepath}: {e}")
        return None


if __name__ == "__main__":
    # Demo: fetch two essays from the project's GitHub repository and report
    # how similar they are.
    # Construct the full URLs for the raw files
    data_path = 'https://raw.githubusercontent.com/Brunozml/artistotllm/main/data/raw/'
    file1 = 'what_to_do.txt'
    file2 = 'how_to_think.txt'

    text1 = read_file(data_path + file1)
    text2 = read_file(data_path + file2)

    # Only proceed if both texts were successfully read
    if text1 is not None and text2 is not None:
        # Load model once for efficiency
        print("Loading transformer model...")
        model = load_model()

        if model is not None:
            # Compare texts
            similarity_score, metadata = transformer_similarity(text1, text2, model)

            if "error" in metadata:
                # transformer_similarity reports failures through the
                # metadata dict with a 0.0 score; show the error rather than
                # presenting that placeholder score as a real result.
                print(metadata["error"])
            else:
                # Print results
                print(f"Text 1: {file1}")
                print(f"Text 2: {file2}")
                print(f"\nTransformer Similarity score: {similarity_score:.4f}")
                print(f"Model used: {metadata.get('model_name', 'Unknown')}")
                print(f"Embedding dimension: {metadata.get('embedding_dim', 'Unknown')}")

                # Optional: Show sample embedding values
                # print("\nSample of embeddings (first 5 dimensions):")
                # print(f"Text 1: {metadata.get('embedding1_sample', [])}")
                # print(f"Text 2: {metadata.get('embedding2_sample', [])}")
        else:
            print("Failed to load transformer model. Please check your sentence-transformers installation.")
    else:
        print("Could not read one or both input files. Exiting.")

Loading transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Text 1: what_to_do.txt
Text 2: how_to_think.txt

Transformer Similarity score: 0.2603
Model used: all-MiniLM-L6-v2
Embedding dimension: 384


**Manually inserting texts**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

def compare_texts(text1, text2, model=None):
    """Compare two texts using a transformer model and return a similarity score.

    Args:
        text1: First text to compare
        text2: Second text to compare
        model: Optional pre-loaded model exposing ``encode``. When omitted,
            'all-MiniLM-L6-v2' is loaded for this call; pass a model to avoid
            reloading it on every comparison.

    Returns:
        Cosine similarity of the two embeddings as a Python float, or 0.0 if
        either embedding has zero norm.
    """
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode both texts in a single batch rather than two separate calls.
    embedding1, embedding2 = model.encode([text1, text2])

    # Guard against dividing by zero, which would otherwise yield NaN.
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    if norm_product == 0:
        return 0.0

    return float(np.dot(embedding1, embedding2) / norm_product)

if __name__ == "__main__":
    # Replace the placeholder content of the two strings below with the
    # texts you want to compare.
    text1 = """
    Paste your first text here
    """
    text2 = """
    Paste your second text here
    """

    # Score the pair and report it to three decimal places.
    score = compare_texts(text1=text1, text2=text2)
    print(f"Similarity score: {score:.3f}")