<a href="https://colab.research.google.com/github/Brunozml/artistotllm/blob/main/text_similarity_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import numpy as np
from typing import Tuple, Set
import requests
from collections import Counter

def get_word_set(text: str) -> Set[str]:
    """
    Build the vocabulary (distinct tokens) of a text.

    The text is lowercased and split into maximal runs of word characters,
    so punctuation acts purely as a separator. Note that an apostrophe is
    a separator too: "isn't" contributes the two tokens "isn" and "t".
    """
    lowered = text.lower()
    return {token for token in re.findall(r'\w+', lowered)}

def get_word_frequency(text: str) -> dict:
    """
    Compute the relative frequency of every word in the text.

    Words are lowercase runs of word characters. The result maps each
    word to its share of all tokens (the values sum to 1), or is an
    empty dict when the text contains no words at all.
    """
    counts = Counter(re.findall(r'\w+', text.lower()))

    # No tokens means there is nothing to normalize (and nothing to divide by).
    if not counts:
        return {}

    n_tokens = sum(counts.values())
    return {word: count / n_tokens for word, count in counts.items()}

def jaccard_similarity(text1: str, text2: str) -> Tuple[float, dict]:
    """
    Score two texts by Jaccard similarity of their vocabularies.

    The score is |shared words| / |all distinct words|, a value between
    0 and 1. It is returned together with a dict of supporting metrics:
    the set of shared words and the vocabulary sizes of the union and of
    each individual text.
    """
    vocab_a, vocab_b = get_word_set(text1), get_word_set(text2)

    shared = vocab_a & vocab_b
    combined = vocab_a | vocab_b

    # Two word-less texts have an empty union; report 0.0 instead of dividing by zero.
    score = len(shared) / len(combined) if combined else 0.0

    metrics = {
        "shared_words": shared,
        "total_unique_words": len(combined),
        "text1_unique_words": len(vocab_a),
        "text2_unique_words": len(vocab_b),
    }
    return score, metrics

def cosine_similarity(text1: str, text2: str) -> Tuple[float, dict]:
    """
    Compare two texts using cosine similarity based on word frequencies.

    Each text is turned into a vector of normalized word frequencies over
    the combined vocabulary of both texts, and the cosine of the angle
    between the two vectors is the score.

    Returns a tuple ``(similarity, details)``:
      * ``similarity`` is a plain Python float in the closed range [0, 1];
        it is 0.0 when either text contains no words.
      * ``details`` holds the two frequency distributions under the keys
        "text1_freq" and "text2_freq".
    """
    # Get word frequency distributions
    freq1 = get_word_frequency(text1)
    freq2 = get_word_frequency(text2)

    # Both vectors must be built from the same word order to be comparable.
    vocabulary = list(set(freq1) | set(freq2))

    # Create frequency vectors (a word missing from one text counts as 0)
    vec1 = np.array([freq1.get(word, 0) for word in vocabulary])
    vec2 = np.array([freq2.get(word, 0) for word in vocabulary])

    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    # A zero vector has no direction, so the cosine is undefined; report 0.0.
    if norm_vec1 == 0 or norm_vec2 == 0:
        similarity = 0.0
    else:
        raw = float(np.dot(vec1, vec2) / (norm_vec1 * norm_vec2))
        # Rounding error can push the ratio marginally past 1.0 (e.g. for
        # identical texts); clamp so the documented range always holds.
        similarity = min(1.0, max(0.0, raw))

    return similarity, {"text1_freq": freq1, "text2_freq": freq2}

def text_similarity(text1: str, text2: str, method: str = "jaccard") -> Tuple[float, dict]:
    """
    Dispatch a text comparison to the requested similarity measure.

    ``method`` is matched case-insensitively against 'jaccard' and
    'cosine'; the chosen measure's (score, metrics) tuple is returned
    unchanged. Any other method name raises ValueError.
    """
    chosen = method.lower()

    if chosen == "jaccard":
        return jaccard_similarity(text1, text2)
    if chosen == "cosine":
        return cosine_similarity(text1, text2)

    raise ValueError(f"Unknown method: {method}. Available methods: 'jaccard', 'cosine'")

def read_file(filepath: str, timeout: float = 10.0):
    """
    Read text from a local file or an http(s) URL and return its contents.

    Args:
        filepath: A local path, or a URL starting with 'http://' or
            'https://'. URLs are fetched with ``requests``; local files
            are opened as UTF-8 text.
        timeout: Seconds to wait for the server when ``filepath`` is a
            URL. Ignored for local files.

    Returns:
        The text content, or None if the read failed. Failures are not
        raised: a message describing the problem is printed instead.
    """
    if filepath.startswith(('http://', 'https://')):
        try:
            # requests has no default timeout; without one an unresponsive
            # server would block this call forever.
            response = requests.get(filepath, timeout=timeout)
            response.raise_for_status()  # Raise an exception for bad status codes
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {filepath}: {e}")
            return None

    # Anything that is not an http(s) URL is treated as a local file path.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: Local file not found at {filepath}")
        return None
    except Exception as e:
        # Deliberately broad: this helper is best-effort and reports any
        # other read problem (permissions, decoding, ...) as a None result.
        print(f"Error reading local file {filepath}: {e}")
        return None


if __name__ == "__main__":
    # The two sample essays are fetched as raw files from the project's
    # GitHub repository: shared URL prefix + file name.
    data_path = 'https://raw.githubusercontent.com/Brunozml/artistotllm/main/data/raw/'
    file1 = 'hypewrite_what_to_do.txt'
    file2 = 'gpt_what_to_do.txt'

    # Fetch both texts in order; read_file yields None for a failed read.
    text1, text2 = (read_file(data_path + name) for name in (file1, file2))

    if text1 is None or text2 is None:
        # Nothing to compare unless both reads succeeded.
        print("Could not read one or both input files. Exiting.")
    else:
        # Score the pair with each available measure.
        jaccard_score, jaccard_metrics = text_similarity(text1, text2, method="jaccard")
        cosine_score, cosine_metrics = text_similarity(text1, text2, method="cosine")

        # Report which files were compared, then the two scores.
        print(f"Text 1: {file1}")
        print(f"Text 2: {file2}")
        print(f"\nJaccard Similarity score: {jaccard_score:.2f}")
        print(f"Cosine Similarity score: {cosine_score:.2f}")

        # Vocabulary-overlap details come from the Jaccard metrics only.
        print(f"\nNumber of shared words: {len(jaccard_metrics['shared_words'])}")
        print(f"Sample of shared words (up to 10): {list(jaccard_metrics['shared_words'])[:10]}")
        print(f"Text 1 unique words: {jaccard_metrics['text1_unique_words']}")
        print(f"Text 2 unique words: {jaccard_metrics['text2_unique_words']}")
        print(f"Total unique words across both texts: {jaccard_metrics['total_unique_words']}")

Text 1: hypewrite_what_to_do.txt
Text 2: gpt_what_to_do.txt

Jaccard Similarity score: 0.21
Cosine Similarity score: 0.66

Number of shared words: 66
Sample of shared words (up to 10): ['treat', 'and', 'this', 'isn', 't', 'now', 'a', 'learning', 'doing', 'something']
Text 1 unique words: 226
Text 2 unique words: 158
Total unique words across both texts: 318


In [7]:
def compare_texts(text1, text2):
    """
    Compare similarity between two texts using a simple word overlap approach.

    Each text is reduced to its set of lowercase words (runs of word
    characters). The score is the number of words the two sets share
    divided by the size of the larger set.

    Returns a tuple ``(similarity, common_words)``: a score between 0 and 1
    and the set of shared words. When neither text contains any word the
    score is 0.0.
    """
    import re
    words1 = set(re.findall(r'\w+', text1.lower()))
    words2 = set(re.findall(r'\w+', text2.lower()))

    # Find common words
    common_words = words1 & words2

    # Normalize by the larger vocabulary. If both vocabularies are empty
    # the divisor would be zero, so report no similarity instead of raising.
    larger = max(len(words1), len(words2))
    similarity = len(common_words) / larger if larger else 0.0

    return similarity, common_words

# Sample inputs, pasted inline as triple-quoted strings (edit these to compare
# other passages). text1 stops mid-sentence and text2 reads as a continuation
# of it. Both are module-level names, so running this cell replaces any
# earlier text1/text2.
text1 = """The word "prig" isn't very common now, but if you look up the definition, it will sound familiar. Google's isn't bad: A self-righteously moralistic person who behaves as if superior to others. This sense of the word originated in the 18th century, and its age is an important clue: it shows that although wokeness is a comparatively recent phenomenon, it's an instance of a much older one.There's a certain kind of person who's attracted to a shallow, exacting kind of moral purity, and who demonstrates his purity by attacking anyone who breaks the rules. Every society has these people. All that changes is the rules they enforce. In Victorian England it was Christian virtue. In Stalin's Russia it was orthodox Marxism-Leninism. For the woke, it's social justice.So if you want to understand wokeness, the question to ask is not why people behave this way. Every society has prigs. The question to ask is why our prigs are priggish about these ideas, at this moment. And to answer that we have to ask when and where wokeness began.The answer to the first question is the 1980s. Wokeness is a second, more aggressive wave of political correctness, which started in the late 1980s, died down in the late 1990s, and then returned with a vengeance in the early 2010s, finally peaking after the riots of 2020.This was not the original meaning of "woke," but it's rarely used in the original sense now. Now the pejorative sense is the dominant one. What does it mean now? I've often been asked to define both wokeness and political correctness by people who think they're meaningless labels, so I will. They both have the same definition: An aggressively performative focus on social justice. In other words, it's people being prigs about social justice. And that's the real problem — the performativeness, not the social justice.Racism, for example, is a genuine problem. Not a problem on the scale that the woke believe it to be, but a genuine one. 
I don't think any reasonable person would deny that. The problem with political correctness was not that it focused on marginalized groups, but the shallow, aggressive way in which it did so. Instead of going out into the world and quietly helping members of marginalized groups, the politically correct focused on getting people in trouble for using the wrong words to talk about them.As for where political correctness began, if you think about it, you probably already know the answer. Did it begin outside universities and spread to them from this external source? Obviously not; it has always been most extreme in universities. So where in universities did it begin? Did it begin in math, or the hard sciences, or engineering, and spread from there to the humanities and social sciences? Those are amusing images, but no, obviously it began in the humanities and social sciences.Why there? And why then? What happened in the humanities and social sciences in the 1980s?A successful theory of the origin of"""
text2 = """wokeness would need to explain why it emerged in the humanities and social sciences, and why it emerged then. One possible explanation is that the humanities and social sciences were undergoing a profound shift in the 1980s, driven by the rise of postmodernism and poststructuralism. These intellectual movements, which emphasized the fragmented and provisional nature of knowledge, created a fertile ground for the kind of moral absolutism that characterizes wokeness.
"""

# Score the two passages; compare_texts returns the score together with the
# set of words that occur in both.
similarity_score, shared_words = compare_texts(text1, text2)

# Report: a short preview of each passage, the score to two decimals, and a
# sample of the shared words. shared_words is a set, so which ten words are
# shown is not a stable order.
print(f"Text 1:\n{text1[:100]}...\n")  # Preview: only the first 100 characters
print(f"Text 2:\n{text2[:100]}...\n")
print(f"Similarity score: {similarity_score:.2f}")
print(f"Number of shared words: {len(shared_words)}")
print(f"Sample of shared words (up to 10): {list(shared_words)[:10]}")


Text 1:
The word "prig" isn't very common now, but if you look up the definition, it will sound familiar. Go...

Text 2:
wokeness would need to explain why it emerged in the humanities and social sciences, and why it emer...

Similarity score: 0.10
Number of shared words: 24
Sample of shared words (up to 10): ['and', 'a', 'it', 'these', 'sciences', 'to', 'why', 'social', 'in', 'humanities']
