<a href="https://colab.research.google.com/github/Brunozml/artistotllm/blob/main/text_similarity_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import numpy as np
from typing import Tuple, Set
import requests
from collections import Counter

def get_word_set(text: str) -> Set[str]:
    """
    Return the distinct words that occur in ``text``.

    The text is lowercased first; every maximal run of word characters
    (letters, digits and underscores) then counts as one word, so
    punctuation and whitespace never appear in the result.
    """
    lowered = text.lower()
    return {match.group() for match in re.finditer(r'\w+', lowered)}

def get_word_frequency(text: str) -> dict:
    """
    Compute the relative frequency of every word in ``text``.

    Words are extracted from the lowercased text as runs of word
    characters. The result maps each word to its share of the total
    word count, so the values sum to 1. Text without any words yields
    an empty dictionary.
    """
    tokens = re.findall(r'\w+', text.lower())

    # Nothing to count: bail out before dividing by a zero total.
    if not tokens:
        return {}

    n_tokens = len(tokens)
    return {word: hits / n_tokens for word, hits in Counter(tokens).items()}

def jaccard_similarity(text1: str, text2: str) -> Tuple[float, dict]:
    """
    Score the vocabulary overlap of two texts with the Jaccard index.

    The index is the number of words the texts share divided by the
    number of distinct words across both, using the word sets produced
    by ``get_word_set``. If neither text contains a word the score is 0.0.

    Returns a ``(score, metrics)`` pair, where ``metrics`` holds the set of
    shared words plus the sizes of the combined and per-text vocabularies.
    """
    vocab_a = get_word_set(text1)
    vocab_b = get_word_set(text2)

    common = vocab_a & vocab_b
    combined = vocab_a | vocab_b

    # An empty union means there were no words at all; report 0.0 instead
    # of dividing by zero.
    score = len(common) / len(combined) if combined else 0.0

    metrics = {
        "shared_words": common,
        "total_unique_words": len(combined),
        "text1_unique_words": len(vocab_a),
        "text2_unique_words": len(vocab_b)
    }
    return score, metrics

def cosine_similarity(text1: str, text2: str) -> Tuple[float, dict]:
    """
    Compare two texts using cosine similarity based on word frequencies.

    Each text is turned into a vector of normalized word frequencies
    (via ``get_word_frequency``) over the combined vocabulary, and the
    cosine of the angle between the two vectors is returned.

    Returns:
        A ``(score, metrics)`` pair. ``score`` is a plain Python float in
        the range [0, 1]; it is 0.0 when either text contains no words.
        ``metrics`` holds the two frequency dictionaries under the keys
        ``"text1_freq"`` and ``"text2_freq"``.
    """
    # Get word frequency distributions
    freq1 = get_word_frequency(text1)
    freq2 = get_word_frequency(text2)

    # Sort the combined vocabulary so both vectors share one fixed word
    # order; raw set order varies between runs and would make the
    # floating-point sums differ in their last bits.
    vocabulary = sorted(freq1.keys() | freq2.keys())

    # Create frequency vectors
    vec1 = np.array([freq1.get(word, 0.0) for word in vocabulary])
    vec2 = np.array([freq2.get(word, 0.0) for word in vocabulary])

    # A zero norm means the text had no words; the cosine is undefined
    # there, so the score is reported as 0.0.
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        similarity = 0.0
    else:
        # Convert the NumPy scalar to a built-in float so both branches
        # return the same type, and clamp the tiny rounding overshoot that
        # can occur for (near-)identical texts. Frequencies are never
        # negative, so no lower clamp is needed.
        similarity = min(1.0, float(np.dot(vec1, vec2) / norm_product))

    return similarity, {"text1_freq": freq1, "text2_freq": freq2}

def text_similarity(text1: str, text2: str, method: str = "jaccard") -> Tuple[float, dict]:
    """
    Dispatch a text comparison to the requested similarity measure.

    ``method`` is matched case-insensitively against 'jaccard' and
    'cosine'. The chosen function's ``(score, metrics)`` result is
    returned unchanged; any other method name raises ``ValueError``.
    """
    chosen = method.lower()

    # Reject unsupported names up front so the happy paths stay flat.
    if chosen not in ("jaccard", "cosine"):
        raise ValueError(f"Unknown method: {method}. Available methods: 'jaccard', 'cosine'")

    if chosen == "cosine":
        return cosine_similarity(text1, text2)
    return jaccard_similarity(text1, text2)

def read_file(filepath: str, timeout: float = 10.0):
    """Read text from a file (local or URL) and return its contents.

    Args:
        filepath: A local file path, or a URL starting with ``http://`` or
            ``https://``.
        timeout: Seconds to wait for the server when ``filepath`` is a URL.
            Ignored for local files.

    Returns:
        The text content, or ``None`` if it could not be read. Failures are
        reported with a printed message rather than raised.
    """
    if filepath.startswith(('http://', 'https://')):
        # If it's a URL, use requests to fetch the content
        try:
            # Without a timeout an unresponsive server would block forever;
            # a timeout surfaces as a RequestException and is handled below.
            response = requests.get(filepath, timeout=timeout)
            response.raise_for_status()  # Raise an exception for bad status codes
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {filepath}: {e}")
            return None

    # Otherwise, treat it as a local file path
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: Local file not found at {filepath}")
        return None
    except (OSError, ValueError) as e:
        # OSError covers permission and directory problems; ValueError covers
        # undecodable bytes (UnicodeDecodeError) and malformed paths.
        print(f"Error reading local file {filepath}: {e}")
        return None


if __name__ == "__main__":
    # Both sample texts live in the same raw-content folder on GitHub,
    # so each URL is the shared prefix plus a file name.
    data_path = 'https://raw.githubusercontent.com/Brunozml/artistotllm/main/data/raw/'
    file1 = 'hypewrite_what_to_do.txt'
    file2 = 'gpt_what_to_do.txt'

    text1 = read_file(data_path + file1)
    text2 = read_file(data_path + file2)

    # read_file signals failure with None; without both texts there is
    # nothing to compare.
    if text1 is None or text2 is None:
        print("Could not read one or both input files. Exiting.")
    else:
        # Score the pair with each available method
        jaccard_score, jaccard_metrics = text_similarity(text1, text2, method="jaccard")
        cosine_score, cosine_metrics = text_similarity(text1, text2, method="cosine")

        # Assemble the report first, then emit it line by line
        report_lines = [
            f"Text 1: {file1}",
            f"Text 2: {file2}",
            f"\nJaccard Similarity score: {jaccard_score:.2f}",
            f"Cosine Similarity score: {cosine_score:.2f}",
            f"\nNumber of shared words: {len(jaccard_metrics['shared_words'])}",
            f"Sample of shared words (up to 10): {list(jaccard_metrics['shared_words'])[:10]}",
            f"Text 1 unique words: {jaccard_metrics['text1_unique_words']}",
            f"Text 2 unique words: {jaccard_metrics['text2_unique_words']}",
            f"Total unique words across both texts: {jaccard_metrics['total_unique_words']}",
        ]
        for line in report_lines:
            print(line)

Text 1: hypewrite_what_to_do.txt
Text 2: gpt_what_to_do.txt

Jaccard Similarity score: 0.21
Cosine Similarity score: 0.66

Number of shared words: 66
Sample of shared words (up to 10): ['in', 'an', 'if', 'good', 'learning', 'right', 'at', 'now', 'on', 'and']
Text 1 unique words: 226
Text 2 unique words: 158
Total unique words across both texts: 318


In [None]:
# Variation of the comparison above: both texts are typed in directly
# here instead of being fetched from a URL.

# Insert the text for file 1 here
text1 = """
This is the content of the first file.
It can be multiple lines of text.
We will compare this with the second file.
"""

# Insert the text for file 2 here
text2 = """
This is the content of the second file.
It also has multiple lines.
Let's see how similar they are.
"""

# Only proceed if both texts contain something other than whitespace.
# The triple-quoted templates always begin with a newline, so a plain
# truthiness check would accept a template that was left blank.
if text1.strip() and text2.strip():
    # Compare texts using both methods
    jaccard_score, jaccard_metrics = text_similarity(text1, text2, method="jaccard")
    cosine_score, cosine_metrics = text_similarity(text1, text2, method="cosine")

    # Print results
    print("Text 1 (user-provided content)")
    print("Text 2 (user-provided content)")
    print(f"\nJaccard Similarity score: {jaccard_score:.2f}")
    print(f"Cosine Similarity score: {cosine_score:.2f}")
    print(f"\nNumber of shared words: {len(jaccard_metrics['shared_words'])}")
    print(f"Sample of shared words (up to 10): {list(jaccard_metrics['shared_words'])[:10]}")
    print(f"Text 1 unique words: {jaccard_metrics['text1_unique_words']}")
    print(f"Text 2 unique words: {jaccard_metrics['text2_unique_words']}")
    print(f"Total unique words across both texts: {jaccard_metrics['total_unique_words']}")
else:
    print("One or both input texts are empty. Cannot perform comparison.")